[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Jun 9 19:06:55 2005 UTC revision 1.3, Fri Jan 6 20:35:01 2006 UTC
# Line 6  Line 6 
6  use Genome;  use Genome;
7  use SimBlocks;  use SimBlocks;
8  use File::Path;  use File::Path;
9  use Location;  use BasicLocation;
10    use Cwd;
11    
12  =head1 Similarity Block Loader  =head1 Similarity Block Loader
13    
14  This script loads the similarity block database from  This script loads the similarity block database from
15  the input files.  the input files. The load process involves two steps:
16    converting the input files into C<dtx> load files
17    (B<generate>), and loading the C<dtx> files into the
18    database (B<load>).
19    
20  The script takes a single parameter-- the directory name.  The script takes a single parameter-- a directory name.
21  The default directory name is taken from the config file  The default directory name is taken from the config file
22  parameter C<$fig_config::SimBlastData>. The output files  parameter C<$fig_config::SimBlastData>. The output files
23  will be produced in the similarity block data directory  will be produced in the similarity block data directory
24  C<$fig_config::SimBlockData>, which will be created if  C<$fig_config::SimBlockData>, which will be created if
25  it does not exist.  it does not exist. The input directory and all its
26    subdirectories will be processed for input files.
27    
28  In addition to the directory name, the following  In addition to the directory name, the following
29  option parameters are supported.  option parameters are supported.
# Line 28  Line 33 
33  =item trace  =item trace
34    
35  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
36  messages. The default is C<1>.  messages. The default is C<2>. Trace messages are sent to
37    the file C<trace.log> in the B<$FIG_Config::tmp>
38  =item logFile  directory.
   
 Name of a file to be used for trace messages, or  
 C<*> to send them to the standard output. If a  
 file is specified, the output directory must  
 already exist. The default is C<*>.  
39    
40  =item load  =item load
41    
# Line 50  Line 50 
50  =back  =back
51    
52  For example, the following command line will process the  For example, the following command line will process the
53  C<Fragment.sort> file in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory and
54  run at a trace level of 3.  run at a trace level of 3.
55    
56  C<< SimLoad /Users/fig/BlastData -trace=3 >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
57    
58    The following command line converts the input files in
59    the default directory into load files but does not load
60    the database and runs at a trace level of 2.
61    
62  Please note that spaces in the file and directory names must  C<< SimLoad -load=no >>
 be escaped as C<\b> to prevent them from being parsed as  
 argument delimiters.  
63    
64  =head2 Input Directory  =head2 Input Directory
65    
66  The following files must exist in the input directory.  The following files must exist in each input directory.
67    
68  =over 4  =over 4
69    
# Line 279  Line 281 
281    
282  # Parse the command line.  # Parse the command line.
283  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,
                                                   logFile => "*",  
284                                                    load => 'yes',                                                    load => 'yes',
285                                                    generate => 'yes'},                                                    generate => 'yes'},
286                                                   @ARGV);                                                   @ARGV);
287  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
288  my $inDirectory = $FIG_Config::simBlastData;  my $inDirectoryTree = $FIG_Config::simBlastData;
289  if ($arguments[0]) {  if ($arguments[0]) {
290      $inDirectory = Cwd::abs_path($arguments[0]);      $inDirectoryTree = Cwd::abs_path($arguments[0]);
291  }  }
292  # Set up tracing.  # Set up tracing.
293  my $traceLevel = $options->{trace};  my $traceLevel = $options->{trace};
294  my $traceOption = $options->{logFile};  TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log");
 my $TracingToFile = ($traceOption ne "*");  
 my $traceDestination = ($TracingToFile ? ">$traceOption" : "TEXT");  
 TSetup("$traceLevel Tracer", $traceDestination);  
295  # Get the output directory.  # Get the output directory.
296  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
297  # Insure that it exists.  # Insure that it exists.
298  if (! -d $outDirectory) {  if (! -d $outDirectory) {
299      Output("Creating output directory $outDirectory.");      Trace("Creating output directory $outDirectory.") if T(2);
300      mkpath($outDirectory);      mkpath($outDirectory);
301    } else {
302        # Here we have an output directory already. Clear any
303        # leftover data from previous runs.
304        my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
305        my $numFiles = @files;
306        if ($numFiles > 0) {
307            Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
308            unlink map { "$outDirectory/$_" } @files;
309        }
310  }  }
311  # Create an error counter.  # Create an error counter and a directory counter.
312  my $errorCount = 0;  my $errorCount = 0;
313    my $dirCount = 0;
314  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
315  if ($options->{generate} eq 'no') {  if ($options->{generate} eq 'no') {
316      # Here we are to use existing output files.      # Here we are to use existing output files.
317      Output("Existing database load files will be used.");      Trace("Existing database load files will be used.") if T(2);
318  } else {  } else {
319      # Here we need to produce new output files.      # Here we need to produce new output files.
320      # Verify that the input directory exists.      # Verify that the input directory exists.
321      if (! -d $inDirectory) {      if (! -d $inDirectoryTree) {
322          Confess("Input directory \"$inDirectory\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
323        }
324        # Loop through the subdirectories.
325        for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {
326            # Verify that this is a directory. If it's ".", we use
327            # the top-level directory.
328            my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :
329                                                        "$inDirectoryTree/$inputDirectory");
330            if (($inputDirectory !~ /\../) && -d $inDirectory) {
331                # Here we have a directory to process. Check for a genome
332                # file.
333                my $genomeFileName = "$inDirectory/Genome.tbl";
334                if (! -e $genomeFileName) {
335                    Trace("$genomeFileName not found. Directory skipped.") if T(1);
336                } else {
337                    # Now we can process the directory and accumulate the error
338                    # count.
339                    $errorCount += ProcessDirectory($inDirectory, $outDirectory);
340                    $dirCount++;
341                }
342            }
343        }
344        Trace("Load files generated from $dirCount directories.") if T(2);
345    }
346    # Check for errors.
347    if ($errorCount > 0) {
348        Trace("$errorCount errors found in input files.") if T(0);
349    } else {
350        # No errors, so it's okay to load the database.
351        if ($options->{load} eq 'yes') {
352            # Here we have no outstanding errors and the user wants us to load
353            # the database. First, we create a similarity block object.
354            my $simBlocks = SimBlocks->new();
355            # Use it to load the database. Note we specify that the tables are to be
356            # dropped and rebuilt.
357            $simBlocks->LoadTables($outDirectory, 1);
358            Trace("Database loaded.") if T(2);
359        }
360      }      }
361    
362    # Process a single input directory.
363    sub ProcessDirectory {
364        my ($inDirectory, $outDirectory) = @_;
365        Trace("Processing directory $inDirectory.") if T(2);
366      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
367      # and build the genome list.      # and add them to the genome list.
368      my %genomes = ();      my %genomes = ();
369      (open GENOMESIN, "<$inDirectory/Genome.tbl") ||      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
370          Confess("Could not open Genome input file in $inDirectory.");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
371      (open GENOMESOUT, ">$outDirectory/Genome.dtx") ||      # Count the genomes read and errors found.
         Confess("Could not open Genome output file in $outDirectory.");  
     # Count the genomes read.  
372      my $genomeCount = 0;      my $genomeCount = 0;
373        my $errorCount = 0;
374      # Loop through the input.      # Loop through the input.
375      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
376          # Echo the genome record to the output.          # Echo the genome record to the output.
# Line 336  Line 385 
385          # Count this genome.          # Count this genome.
386          $genomeCount++;          $genomeCount++;
387      }      }
388      Output("$genomeCount genomes found.");      Trace("$genomeCount genomes found.") if T(2);
389      # Close the files.      # Close the files.
390      close GENOMESIN;      close GENOMESIN;
391      close GENOMESOUT;      close GENOMESOUT;
# Line 347  Line 396 
396      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
397      # script is done a block at a time. The first task is to      # script is done a block at a time. The first task is to
398      # open the output files.      # open the output files.
399      (open BLOCKSOUT, ">$outDirectory/GroupBlock.dtx") ||      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
400          Confess("Could not open block output file in $outDirectory.");      Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
401      (open REGIONSOUT, ">$outDirectory/Region.dtx") ||      Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
402          Confess("Could not open region output file in $outDirectory.");      Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
403      (open INSTANCESOUT, ">$outDirectory/HasInstanceOf.dtx") ||      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
         Confess("Could not open instance output file in $outDirectory.");  
     (open CONTAINSOUT, ">$outDirectory/ContainsRegion.dtx") ||  
         Confess("Could not open contig-region output file in $outDirectory.");  
     (open INCLUDESOUT, ">$outDirectory/IncludesRegion.dtx") ||  
         Confess("Could not open block-region output file in $outDirectory.");  
404      # Determine which file sets we'll be processing.      # Determine which file sets we'll be processing.
405      my @fileSets = ();      my @fileSets = ();
406      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "InterGenic_");
# Line 365  Line 409 
409              push @fileSets, $prefix;              push @fileSets, $prefix;
410          }          }
411      }      }
412        # Set up the duplicate-region check.
413        my %allRegions = ();
414      # Set up some counters.      # Set up some counters.
415      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
416      # Loop through the useful file sets.      # Loop through the useful file sets.
417      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
418          (open BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl") ||          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
419              Confess("Could not open block input file in $inDirectory.");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
420          (open REGIONSIN, "<$inDirectory/${fileSet}Region.tbl") ||          Trace("Processing ${fileSet}Blocks.") if T(2);
             Confess("Could not open region input file in $inDirectory.");  
         Output("Processing ${fileSet}Blocks.");  
421          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
422          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
423          # region record. This is because we finish processing a block when          # region record. This is because we finish processing a block when
# Line 403  Line 447 
447                  # If this region's block ID is invalid, complain                  # If this region's block ID is invalid, complain
448                  # and skip it.                  # and skip it.
449                  if ($regionRecord{blockID} != $blockID) {                  if ($regionRecord{blockID} != $blockID) {
450                      Output("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.");                      Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
451                      $errorCount++;                      $errorCount++;
452                  } else {                  } else {
453                      # Here both files are in sync, which is good. The next step is                      # Here both files are in sync, which is good. The next step is
# Line 411  Line 455 
455                      my $genomeID = $regionRecord{genomeID};                      my $genomeID = $regionRecord{genomeID};
456                      my $contigID = "$genomeID:$regionRecord{contigID}";                      my $contigID = "$genomeID:$regionRecord{contigID}";
457                      if (! exists $genomes{$genomeID}) {                      if (! exists $genomes{$genomeID}) {
458                          Output("Genome $genomeID in region record $regionsCount not found in genome input file.");                          Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
459                          $errorCount++;                          $errorCount++;
460                      } else {                      } else {
461                          # Denote this genome has an instance of this block.                          # Denote this genome has an instance of this block.
# Line 435  Line 479 
479                              $blockData{len} = length $snippet;                              $blockData{len} = length $snippet;
480                          } elsif ($blockData{len} != length $snippet) {                          } elsif ($blockData{len} != length $snippet) {
481                              # Here it is not the first, but the lengths do not match.                              # Here it is not the first, but the lengths do not match.
482                              Output("Snippet for region record $regionsCount does not match block length $blockData{len}.");                              Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
483                              $errorCount++;                              $errorCount++;
484                          } else {                          } else {
485                              # Here everything is legitimate, so we merge the new                              # Here everything is legitimate, so we merge the new
# Line 453  Line 497 
497              # We have now processed all the regions in the block. Insure we found at least              # We have now processed all the regions in the block. Insure we found at least
498              # one.              # one.
499              if (! $regionCounter) {              if (! $regionCounter) {
500                  Output("No regions found for block $blockID at $blocksCount in block input file.");                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
501                  $errorCount++;                  $errorCount++;
502              } else {              } else {
503                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(2);
# Line 466  Line 510 
510                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
511                  # Loop through the regions, writing them out to the region output file.                  # Loop through the regions, writing them out to the region output file.
512                  for my $region (keys %regionMap) {                  for my $region (keys %regionMap) {
513                        if (length($region) > 80) {
514                            Trace("Invalid region key \"$region\".") if T(1);
515                        }
516                      # Get the region's snips.                      # Get the region's snips.
517                      my $source = $regionMap{$region};                      my $source = $regionMap{$region};
518                      my $content = "";                      my $content = "";
# Line 473  Line 520 
520                          $content .= substr $source, $pos, 1;                          $content .= substr $source, $pos, 1;
521                      }                      }
522                      # Get the region's location data.                      # Get the region's location data.
523                      my $location = Location->new($region);                      my $location = BasicLocation->new($region);
524                      # Write this region to the output files.                      # Write this region to the output files.
525                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
526                                            $location->Length, $regionPeg{$region},                                            $location->Right, $location->Length,
527                                            $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
528                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
529                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
530                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", $blockID, $region);
# Line 499  Line 546 
546      close BLOCKSOUT;      close BLOCKSOUT;
547      close INSTANCESOUT;      close INSTANCESOUT;
548      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
549      Output("$blocksCount blocks processed, $regionsCount regions processed.");      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
550      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
551      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash. First, we need to open the files.
552      my $contigsCount = 0;      my $contigsCount = 0;
553      (open CONTIGSOUT, ">$outDirectory/Contig.dtx") ||      Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
554          Confess("Could not open contig output file in $outDirectory.");      Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
     (open CONSISTSOUT, ">$outDirectory/ConsistsOf.dtx") ||  
         Confess("Could not open consists output file in $outDirectory.");  
555      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
556          print CONTIGSOUT "$contigID\n";          print CONTIGSOUT "$contigID\n";
557          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
558          $contigsCount++;          $contigsCount++;
559      }      }
560      Output("$contigsCount contigs found.");      Trace("$contigsCount contigs found.") if T(2);
561      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
562      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
563          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {
564              Output("Genome $genomeID did not have any regions.");              Trace("Genome $genomeID did not have any regions.") if T(1);
565              $errorCount++;              $errorCount++;
566          }          }
567      }      }
568  }      return $errorCount;
 # Check for errors.  
 if ($errorCount > 0) {  
     Output("$errorCount errors found in input files.");  
 } else {  
     # No errors, so it's okay to load the database.  
     if ($options->{load} eq 'yes') {  
         # Here we have no outstanding errors and the user wants us to load  
         # the database. First, we create a similarity block object.  
         my $simBlocks = SimBlocks->new();  
         # Use it to load the database. Note we specify that the tables are to be  
         # dropped and rebuilt.  
         $simBlocks->LoadTables($outDirectory, 1);  
         Output("Database loaded.");  
     }  
569  }  }
570  # Tell the user we're done.  # Tell the user we're done.
571  Output("Processing complete.");  Trace("Processing complete.") if T(0);
572    
573  # Read a region record from the file and parse it into a hash  # Read a region record from the file and parse it into a hash
574  # for return to the caller. If we reach end-of-file, the  # for return to the caller. If we reach end-of-file, the
# Line 562  Line 593 
593      return %retVal;      return %retVal;
594  }  }
595    
 # Write an output line. The output line will be traced at level 0. If  
 # $TracingToFile is set, it will be echoed to the standard output.  
 sub Output {  
     my ($message) = @_;  
     Trace($message) if T(0);  
     if ($TracingToFile) {  
         print "$message\n";  
     }  
 }  
   
596  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3