[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Wed Jul 27 20:05:58 2005 UTC revision 1.4, Wed Jan 11 19:39:20 2006 UTC
# Line 7  Line 7 
7  use SimBlocks;  use SimBlocks;
8  use File::Path;  use File::Path;
9  use BasicLocation;  use BasicLocation;
10    use Cwd;
11    
12  =head1 Similarity Block Loader  =head1 Similarity Block Loader
13    
14  This script loads the similarity block database from  This script loads the similarity block database from
15  the input files.  the input files. The load process involves two steps:
16    converting the input files into C<dtx> load files
17    (B<generate>), and loading the C<dtx> files into the
18    database (B<load>).
19    
20  The script takes a single parameter-- the directory name.  The script takes a single parameter-- a directory name.
21  The default directory name is taken from the config file  The default directory name is taken from the config file
22  parameter C<$fig_config::SimBlastData>. The output files  parameter C<$fig_config::SimBlastData>. The output files
23  will be produced in the similarity block data directory  will be produced in the similarity block data directory
24  C<$fig_config::SimBlockData>, which will be created if  C<$fig_config::SimBlockData>, which will be created if
25  it does not exist.  it does not exist. The input directory and all its
26    subdirectories will be processed for input files.
27    
28  In addition to the directory name, the following  In addition to the directory name, the following
29  option parameters are supported.  option parameters are supported.
# Line 28  Line 33 
33  =item trace  =item trace
34    
35  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
36  messages. The default is C<1>.  messages. The default is C<3>. Trace messages are sent to
37    the file C<trace.log> in the B<$FIG_Config::tmp>
38    directory.
39    
40  =item logFile  =item sql
41    
42  Name of a file to be used for trace messages, or  If specified, SQL activity will be traced at the specified
43  C<*> to send them to the standard output. If a  trace level.
 file is specified, the output directory must  
 already exist. The default is C<*>.  
44    
45  =item load  =item load
46    
# Line 50  Line 55 
55  =back  =back
56    
57  For example, the following command line will process the  For example, the following command line will process the
58  C<Fragment.sort> file in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory and
59  run at a trace level of 3.  run at a trace level of 3.
60    
61  C<< SimLoad /Users/fig/BlastData -trace=3 >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
62    
63  Please note that spaces in the file and directory names must  The following command line converts the input files in
64  be escaped as C<\b> to prevent them from being parsed as  the default directory into load files but does not load
65  argument delimiters.  the database and runs at a trace level of 2.
66    
67    C<< SimLoad -load=no >>
68    
69  =head2 Input Directory  =head2 Input Directory
70    
71  The following files must exist in the input directory.  The following files must exist in each input directory.
72    
73  =over 4  =over 4
74    
# Line 278  Line 285 
285  my $TRAILER = 999999999;  my $TRAILER = 999999999;
286    
287  # Parse the command line.  # Parse the command line.
288  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = Tracer::StandardSetup(['SimBlocks'],
289                                                    logFile => "*",                                                    { load => 'yes',
                                                   load => 'yes',  
290                                                    generate => 'yes'},                                                    generate => 'yes'},
291                                                   @ARGV);                                                   @ARGV);
292  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
293  my $inDirectory = $FIG_Config::simBlastData;  my $inDirectoryTree = $FIG_Config::simBlastData;
294  if ($arguments[0]) {  if ($arguments[0]) {
295      $inDirectory = Cwd::abs_path($arguments[0]);      $inDirectoryTree = Cwd::abs_path($arguments[0]);
296  }  }
 # Set up tracing.  
 my $traceLevel = $options->{trace};  
 my $traceOption = $options->{logFile};  
 my $TracingToFile = ($traceOption ne "*");  
 my $traceDestination = ($TracingToFile ? ">$traceOption" : "TEXT");  
 TSetup("$traceLevel Tracer", $traceDestination);  
297  # Get the output directory.  # Get the output directory.
298  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
299  # Insure that it exists.  # Insure that it exists.
300  if (! -d $outDirectory) {  if (! -d $outDirectory) {
301      Output("Creating output directory $outDirectory.");      Trace("Creating output directory $outDirectory.") if T(2);
302      mkpath($outDirectory);      mkpath($outDirectory);
303    } elsif ($options->{generate} eq 'yes') {
304        # Here we have an output directory already and are going to generate new
305        # load files. Clear any leftover data from previous runs.
306        my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
307        my $numFiles = @files;
308        if ($numFiles > 0) {
309            Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
310            unlink map { "$outDirectory/$_" } @files;
311        }
312  }  }
313  # Create an error counter.  # Create an error counter and a directory counter.
314  my $errorCount = 0;  my $errorCount = 0;
315    my $dirCount = 0;
316  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
317  if ($options->{generate} eq 'no') {  if ($options->{generate} eq 'no') {
318      # Here we are to use existing output files.      # Here we are to use existing output files.
319      Output("Existing database load files will be used.");      Trace("Existing database load files will be used.") if T(2);
320  } else {  } else {
321      # Here we need to produce new output files.      # Here we need to produce new output files.
322      # Verify that the input directory exists.      # Verify that the input directory exists.
323      if (! -d $inDirectory) {      if (! -d $inDirectoryTree) {
324          Confess("Input directory \"$inDirectory\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
325        }
326        # Loop through the subdirectories.
327        for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {
328            # Verify that this is a directory. If it's ".", we use
329            # the top-level directory.
330            my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :
331                                                        "$inDirectoryTree/$inputDirectory");
332            if (($inputDirectory !~ /\../) && -d $inDirectory) {
333                # Here we have a directory to process. Check for a genome
334                # file.
335                my $genomeFileName = "$inDirectory/Genome.tbl";
336                if (! -e $genomeFileName) {
337                    Trace("$genomeFileName not found. Directory skipped.") if T(1);
338                } else {
339                    # Now we can process the directory and accumulate the error
340                    # count.
341                    $errorCount += ProcessDirectory($inDirectory, $outDirectory);
342                    $dirCount++;
343                }
344            }
345        }
346        Trace("Load files generated from $dirCount directories.") if T(2);
347    }
348    # Check for errors.
349    if ($errorCount > 0) {
350        Trace("$errorCount errors found in input files.") if T(0);
351    } else {
352        # No errors, so it's okay to load the database.
353        if ($options->{load} eq 'yes') {
354            # Here we have no outstanding errors and the user wants us to load
355            # the database. First, we create a similarity block object.
356            my $simBlocks = SimBlocks->new();
357            # Use it to load the database. Note we specify that the tables are to be
358            # dropped and rebuilt.
359            $simBlocks->LoadTables($outDirectory, 1);
360            Trace("Database loaded.") if T(2);
361        }
362      }      }
363    
364    # Process a single input directory.
365    sub ProcessDirectory {
366        my ($inDirectory, $outDirectory) = @_;
367        Trace("Processing directory $inDirectory.") if T(2);
368      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
369      # and build the genome list.      # and add them to the genome list.
370      my %genomes = ();      my %genomes = ();
371      (open GENOMESIN, "<$inDirectory/Genome.tbl") ||      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
372          Confess("Could not open Genome input file in $inDirectory.");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
373      (open GENOMESOUT, ">$outDirectory/Genome.dtx") ||      # Count the genomes read and errors found.
         Confess("Could not open Genome output file in $outDirectory.");  
     # Count the genomes read.  
374      my $genomeCount = 0;      my $genomeCount = 0;
375        my $errorCount = 0;
376      # Loop through the input.      # Loop through the input.
377      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
378          # Echo the genome record to the output.          # Echo the genome record to the output.
# Line 336  Line 387 
387          # Count this genome.          # Count this genome.
388          $genomeCount++;          $genomeCount++;
389      }      }
390      Output("$genomeCount genomes found.");      Trace("$genomeCount genomes found.") if T(2);
391      # Close the files.      # Close the files.
392      close GENOMESIN;      close GENOMESIN;
393      close GENOMESOUT;      close GENOMESOUT;
# Line 347  Line 398 
398      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
399      # script is done a block at a time. The first task is to      # script is done a block at a time. The first task is to
400      # open the output files.      # open the output files.
401      (open BLOCKSOUT, ">$outDirectory/GroupBlock.dtx") ||      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
402          Confess("Could not open block output file in $outDirectory.");      Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
403      (open REGIONSOUT, ">$outDirectory/Region.dtx") ||      Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
404          Confess("Could not open region output file in $outDirectory.");      Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
405      (open INSTANCESOUT, ">$outDirectory/HasInstanceOf.dtx") ||      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
         Confess("Could not open instance output file in $outDirectory.");  
     (open CONTAINSOUT, ">$outDirectory/ContainsRegion.dtx") ||  
         Confess("Could not open contig-region output file in $outDirectory.");  
     (open INCLUDESOUT, ">$outDirectory/IncludesRegion.dtx") ||  
         Confess("Could not open block-region output file in $outDirectory.");  
406      # Determine which file sets we'll be processing.      # Determine which file sets we'll be processing.
407      my @fileSets = ();      my @fileSets = ();
408      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "InterGenic_");
# Line 365  Line 411 
411              push @fileSets, $prefix;              push @fileSets, $prefix;
412          }          }
413      }      }
414        # Set up the duplicate-region check.
415        my %allRegions = ();
416      # Set up some counters.      # Set up some counters.
417      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
418      # Loop through the useful file sets.      # Loop through the useful file sets.
419      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
420          (open BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl") ||          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
421              Confess("Could not open block input file in $inDirectory.");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
422          (open REGIONSIN, "<$inDirectory/${fileSet}Region.tbl") ||          Trace("Processing ${fileSet}Blocks.") if T(2);
             Confess("Could not open region input file in $inDirectory.");  
         Output("Processing ${fileSet}Blocks.");  
423          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
424          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
425          # region record. This is because we finish processing a block when          # region record. This is because we finish processing a block when
# Line 403  Line 449 
449                  # If this region's block ID is invalid, complain                  # If this region's block ID is invalid, complain
450                  # and skip it.                  # and skip it.
451                  if ($regionRecord{blockID} != $blockID) {                  if ($regionRecord{blockID} != $blockID) {
452                      Output("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.");                      Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
453                      $errorCount++;                      $errorCount++;
454                  } else {                  } else {
455                      # Here both files are in sync, which is good. The next step is                      # Here both files are in sync, which is good. The next step is
# Line 411  Line 457 
457                      my $genomeID = $regionRecord{genomeID};                      my $genomeID = $regionRecord{genomeID};
458                      my $contigID = "$genomeID:$regionRecord{contigID}";                      my $contigID = "$genomeID:$regionRecord{contigID}";
459                      if (! exists $genomes{$genomeID}) {                      if (! exists $genomes{$genomeID}) {
460                          Output("Genome $genomeID in region record $regionsCount not found in genome input file.");                          Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
461                          $errorCount++;                          $errorCount++;
462                      } else {                      } else {
463                          # Denote this genome has an instance of this block.                          # Denote this genome has an instance of this block.
# Line 435  Line 481 
481                              $blockData{len} = length $snippet;                              $blockData{len} = length $snippet;
482                          } elsif ($blockData{len} != length $snippet) {                          } elsif ($blockData{len} != length $snippet) {
483                              # Here it is not the first, but the lengths do not match.                              # Here it is not the first, but the lengths do not match.
484                              Output("Snippet for region record $regionsCount does not match block length $blockData{len}.");                              Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
485                              $errorCount++;                              $errorCount++;
486                          } else {                          } else {
487                              # Here everything is legitimate, so we merge the new                              # Here everything is legitimate, so we merge the new
# Line 453  Line 499 
499              # We have now processed all the regions in the block. Insure we found at least              # We have now processed all the regions in the block. Insure we found at least
500              # one.              # one.
501              if (! $regionCounter) {              if (! $regionCounter) {
502                  Output("No regions found for block $blockID at $blocksCount in block input file.");                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
503                  $errorCount++;                  $errorCount++;
504              } else {              } else {
505                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
506                  # Write the block record.                  # Write the block record.
507                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
508                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},
# Line 466  Line 512 
512                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
513                  # Loop through the regions, writing them out to the region output file.                  # Loop through the regions, writing them out to the region output file.
514                  for my $region (keys %regionMap) {                  for my $region (keys %regionMap) {
515                        if (length($region) > 80) {
516                            Trace("Invalid region key \"$region\".") if T(1);
517                        }
518                      # Get the region's snips.                      # Get the region's snips.
519                      my $source = $regionMap{$region};                      my $source = $regionMap{$region};
520                      my $content = "";                      my $content = "";
# Line 476  Line 525 
525                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
526                      # Write this region to the output files.                      # Write this region to the output files.
527                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
528                                            $location->Length, $regionPeg{$region},                                            $location->Right, $location->Length,
529                                            $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
530                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
531                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
532                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", $blockID, $region) . "\n";
533                  }                  }
534                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
535                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
# Line 499  Line 548 
548      close BLOCKSOUT;      close BLOCKSOUT;
549      close INSTANCESOUT;      close INSTANCESOUT;
550      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
551      Output("$blocksCount blocks processed, $regionsCount regions processed.");      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
552      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
553      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash. First, we need to open the files.
554      my $contigsCount = 0;      my $contigsCount = 0;
555      (open CONTIGSOUT, ">$outDirectory/Contig.dtx") ||      Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
556          Confess("Could not open contig output file in $outDirectory.");      Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
     (open CONSISTSOUT, ">$outDirectory/ConsistsOf.dtx") ||  
         Confess("Could not open consists output file in $outDirectory.");  
557      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
558          print CONTIGSOUT "$contigID\n";          print CONTIGSOUT "$contigID\n";
559          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
560          $contigsCount++;          $contigsCount++;
561      }      }
562      Output("$contigsCount contigs found.");      Trace("$contigsCount contigs found.") if T(2);
563        # Close the output files.
564        close CONTIGSOUT;
565        close CONSISTSOUT;
566      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
567      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
568          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {
569              Output("Genome $genomeID did not have any regions.");              Trace("Genome $genomeID did not have any regions.") if T(1);
570              $errorCount++;              $errorCount++;
571          }          }
572      }      }
573  }      return $errorCount;
 # Check for errors.  
 if ($errorCount > 0) {  
     Output("$errorCount errors found in input files.");  
 } else {  
     # No errors, so it's okay to load the database.  
     if ($options->{load} eq 'yes') {  
         # Here we have no outstanding errors and the user wants us to load  
         # the database. First, we create a similarity block object.  
         my $simBlocks = SimBlocks->new();  
         # Use it to load the database. Note we specify that the tables are to be  
         # dropped and rebuilt.  
         $simBlocks->LoadTables($outDirectory, 1);  
         Output("Database loaded.");  
     }  
574  }  }
575  # Tell the user we're done.  # Tell the user we're done.
576  Output("Processing complete.");  Trace("Processing complete.") if T(0);
577    
578  # Read a region record from the file and parse it into a hash  # Read a region record from the file and parse it into a hash
579  # for return to the caller. If we reach end-of-file, the  # for return to the caller. If we reach end-of-file, the
# Line 562  Line 598 
598      return %retVal;      return %retVal;
599  }  }
600    
 # Write an output line. The output line will be traced at level 0. If  
 # $TracingToFile is set, it will be echoed to the standard output.  
 sub Output {  
     my ($message) = @_;  
     Trace($message) if T(0);  
     if ($TracingToFile) {  
         print "$message\n";  
     }  
 }  
   
601  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.4

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3