[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Wed Jul 27 20:05:58 2005 UTC revision 1.5, Sun Jan 15 21:32:17 2006 UTC
# Line 7  Line 7 
7  use SimBlocks;  use SimBlocks;
8  use File::Path;  use File::Path;
9  use BasicLocation;  use BasicLocation;
10    use Cwd;
11    
12  =head1 Similarity Block Loader  =head1 Similarity Block Loader
13    
14  This script loads the similarity block database from  This script loads the similarity block database from
15  the input files.  the input files. The load process involves two steps:
16    converting the input files into C<dtx> load files
17    (B<generate>), and loading the C<dtx> files into the
18    database (B<load>).
19    
20  The script takes a single parameter-- the directory name.  The script takes a single parameter-- a directory name.
21  The default directory name is taken from the config file  The default directory name is taken from the config file
22  parameter C<$fig_config::SimBlastData>. The output files  parameter C<$fig_config::SimBlastData>. The output files
23  will be produced in the similarity block data directory  will be produced in the similarity block data directory
24  C<$fig_config::SimBlockData>, which will be created if  C<$fig_config::SimBlockData>, which will be created if
25  it does not exist.  it does not exist. The input directory and all its
26    subdirectories will be processed for input files.
27    
28  In addition to the directory name, the following  In addition to the directory name, the following
29  option parameters are supported.  option parameters are supported.
# Line 28  Line 33 
33  =item trace  =item trace
34    
35  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
36  messages. The default is C<1>.  messages. The default is C<3>. Trace messages are sent to
37    the file C<trace.log> in the B<$FIG_Config::tmp>
38    directory.
39    
40  =item logFile  =item sql
41    
42  Name of a file to be used for trace messages, or  If specified, SQL activity will be traced at the specified
43  C<*> to send them to the standard output. If a  trace level.
 file is specified, the output directory must  
 already exist. The default is C<*>.  
44    
45  =item load  =item load
46    
# Line 50  Line 55 
55  =back  =back
56    
57  For example, the following command line will process the  For example, the following command line will process the
58  C<Fragment.sort> file in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory and
59  run at a trace level of 3.  run at a trace level of 3.
60    
61  C<< SimLoad /Users/fig/BlastData -trace=3 >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
62    
63  Please note that spaces in the file and directory names must  The following command line converts the input files in
64  be escaped as C<\b> to prevent them from being parsed as  the default directory into load files but does not load
65  argument delimiters.  the database and runs at a trace level of 2.
66    
67    C<< SimLoad -load=no >>
68    
69  =head2 Input Directory  =head2 Input Directory
70    
71  The following files must exist in the input directory.  The following files must exist in each input directory.
72    
73  =over 4  =over 4
74    
# Line 278  Line 285 
285  my $TRAILER = 999999999;  my $TRAILER = 999999999;
286    
287  # Parse the command line.  # Parse the command line.
288  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = StandardSetup(['SimBlocks'],
289                                                    logFile => "*",                                                  { load => ['yes', "'no' to suppress loading the database"],
290                                                    load => 'yes',                                                    generate => ['yes', "'no' to suppress generating the load files"]
291                                                    generate => 'yes'},                                                  },
292                                                    "directoryName",
293                                                   @ARGV);                                                   @ARGV);
294  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
295  my $inDirectory = $FIG_Config::simBlastData;  my $inDirectoryTree = $FIG_Config::simBlastData;
296  if ($arguments[0]) {  if ($arguments[0]) {
297      $inDirectory = Cwd::abs_path($arguments[0]);      $inDirectoryTree = Cwd::abs_path($arguments[0]);
298  }  }
 # Set up tracing.  
 my $traceLevel = $options->{trace};  
 my $traceOption = $options->{logFile};  
 my $TracingToFile = ($traceOption ne "*");  
 my $traceDestination = ($TracingToFile ? ">$traceOption" : "TEXT");  
 TSetup("$traceLevel Tracer", $traceDestination);  
299  # Get the output directory.  # Get the output directory.
300  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
301  # Insure that it exists.  # Insure that it exists.
302  if (! -d $outDirectory) {  if (! -d $outDirectory) {
303      Output("Creating output directory $outDirectory.");      Trace("Creating output directory $outDirectory.") if T(2);
304      mkpath($outDirectory);      mkpath($outDirectory);
305    } elsif ($options->{generate} eq 'yes') {
306        # Here we have an output directory already and are going to generate new
307        # load files. Clear any leftover data from previous runs.
308        my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
309        my $numFiles = @files;
310        if ($numFiles > 0) {
311            Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
312            unlink map { "$outDirectory/$_" } @files;
313        }
314  }  }
315  # Create an error counter.  # Create an error counter and a directory counter.
316  my $errorCount = 0;  my $errorCount = 0;
317    my $dirCount = 0;
318  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
319  if ($options->{generate} eq 'no') {  if ($options->{generate} eq 'no') {
320      # Here we are to use existing output files.      # Here we are to use existing output files.
321      Output("Existing database load files will be used.");      Trace("Existing database load files will be used.") if T(2);
322  } else {  } else {
323      # Here we need to produce new output files.      # Here we need to produce new output files.
324      # Verify that the input directory exists.      # Verify that the input directory exists.
325      if (! -d $inDirectory) {      if (! -d $inDirectoryTree) {
326          Confess("Input directory \"$inDirectory\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
327        }
328        # Loop through the subdirectories.
329        for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {
330            # Verify that this is a directory. If it's ".", we use
331            # the top-level directory.
332            my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :
333                                                        "$inDirectoryTree/$inputDirectory");
334            if (($inputDirectory !~ /\../) && -d $inDirectory) {
335                # Here we have a directory to process. Check for a genome
336                # file.
337                my $genomeFileName = "$inDirectory/Genome.tbl";
338                if (! -e $genomeFileName) {
339                    Trace("$genomeFileName not found. Directory skipped.") if T(1);
340                } else {
341                    # Now we can process the directory and accumulate the error
342                    # count.
343                    $errorCount += ProcessDirectory($inDirectory, $outDirectory);
344                    $dirCount++;
345                }
346            }
347        }
348        Trace("Load files generated from $dirCount directories.") if T(2);
349    }
350    # Check for errors.
351    if ($errorCount > 0) {
352        Trace("$errorCount errors found in input files.") if T(0);
353    } else {
354        # No errors, so it's okay to load the database.
355        if ($options->{load} eq 'yes') {
356            # Here we have no outstanding errors and the user wants us to load
357            # the database. First, we create a similarity block object.
358            my $simBlocks = SimBlocks->new();
359            # Use it to load the database. Note we specify that the tables are to be
360            # dropped and rebuilt.
361            $simBlocks->LoadTables($outDirectory, 1);
362            Trace("Database loaded.") if T(2);
363        }
364      }      }
365    
366    # Process a single input directory.
367    sub ProcessDirectory {
368        my ($inDirectory, $outDirectory) = @_;
369        Trace("Processing directory $inDirectory.") if T(2);
370      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
371      # and build the genome list.      # and add them to the genome list.
372      my %genomes = ();      my %genomes = ();
373      (open GENOMESIN, "<$inDirectory/Genome.tbl") ||      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
374          Confess("Could not open Genome input file in $inDirectory.");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
375      (open GENOMESOUT, ">$outDirectory/Genome.dtx") ||      # Count the genomes read and errors found.
         Confess("Could not open Genome output file in $outDirectory.");  
     # Count the genomes read.  
376      my $genomeCount = 0;      my $genomeCount = 0;
377        my $errorCount = 0;
378      # Loop through the input.      # Loop through the input.
379      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
380          # Echo the genome record to the output.          # Echo the genome record to the output.
# Line 336  Line 389 
389          # Count this genome.          # Count this genome.
390          $genomeCount++;          $genomeCount++;
391      }      }
392      Output("$genomeCount genomes found.");      Trace("$genomeCount genomes found.") if T(2);
393      # Close the files.      # Close the files.
394      close GENOMESIN;      close GENOMESIN;
395      close GENOMESOUT;      close GENOMESOUT;
# Line 347  Line 400 
400      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
401      # script is done a block at a time. The first task is to      # script is done a block at a time. The first task is to
402      # open the output files.      # open the output files.
403      (open BLOCKSOUT, ">$outDirectory/GroupBlock.dtx") ||      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
404          Confess("Could not open block output file in $outDirectory.");      Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
405      (open REGIONSOUT, ">$outDirectory/Region.dtx") ||      Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
406          Confess("Could not open region output file in $outDirectory.");      Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
407      (open INSTANCESOUT, ">$outDirectory/HasInstanceOf.dtx") ||      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
         Confess("Could not open instance output file in $outDirectory.");  
     (open CONTAINSOUT, ">$outDirectory/ContainsRegion.dtx") ||  
         Confess("Could not open contig-region output file in $outDirectory.");  
     (open INCLUDESOUT, ">$outDirectory/IncludesRegion.dtx") ||  
         Confess("Could not open block-region output file in $outDirectory.");  
408      # Determine which file sets we'll be processing.      # Determine which file sets we'll be processing.
409      my @fileSets = ();      my @fileSets = ();
410      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "InterGenic_");
# Line 365  Line 413 
413              push @fileSets, $prefix;              push @fileSets, $prefix;
414          }          }
415      }      }
416        # Set up the duplicate-region check.
417        my %allRegions = ();
418      # Set up some counters.      # Set up some counters.
419      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
420      # Loop through the useful file sets.      # Loop through the useful file sets.
421      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
422          (open BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl") ||          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
423              Confess("Could not open block input file in $inDirectory.");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
424          (open REGIONSIN, "<$inDirectory/${fileSet}Region.tbl") ||          Trace("Processing ${fileSet}Blocks.") if T(2);
             Confess("Could not open region input file in $inDirectory.");  
         Output("Processing ${fileSet}Blocks.");  
425          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
426          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
427          # region record. This is because we finish processing a block when          # region record. This is because we finish processing a block when
# Line 403  Line 451 
451                  # If this region's block ID is invalid, complain                  # If this region's block ID is invalid, complain
452                  # and skip it.                  # and skip it.
453                  if ($regionRecord{blockID} != $blockID) {                  if ($regionRecord{blockID} != $blockID) {
454                      Output("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.");                      Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
455                      $errorCount++;                      $errorCount++;
456                  } else {                  } else {
457                      # Here both files are in sync, which is good. The next step is                      # Here both files are in sync, which is good. The next step is
# Line 411  Line 459 
459                      my $genomeID = $regionRecord{genomeID};                      my $genomeID = $regionRecord{genomeID};
460                      my $contigID = "$genomeID:$regionRecord{contigID}";                      my $contigID = "$genomeID:$regionRecord{contigID}";
461                      if (! exists $genomes{$genomeID}) {                      if (! exists $genomes{$genomeID}) {
462                          Output("Genome $genomeID in region record $regionsCount not found in genome input file.");                          Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
463                          $errorCount++;                          $errorCount++;
464                      } else {                      } else {
465                          # Denote this genome has an instance of this block.                          # Denote this genome has an instance of this block.
# Line 435  Line 483 
483                              $blockData{len} = length $snippet;                              $blockData{len} = length $snippet;
484                          } elsif ($blockData{len} != length $snippet) {                          } elsif ($blockData{len} != length $snippet) {
485                              # Here it is not the first, but the lengths do not match.                              # Here it is not the first, but the lengths do not match.
486                              Output("Snippet for region record $regionsCount does not match block length $blockData{len}.");                              Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
487                              $errorCount++;                              $errorCount++;
488                          } else {                          } else {
489                              # Here everything is legitimate, so we merge the new                              # Here everything is legitimate, so we merge the new
# Line 453  Line 501 
501              # We have now processed all the regions in the block. Insure we found at least              # We have now processed all the regions in the block. Insure we found at least
502              # one.              # one.
503              if (! $regionCounter) {              if (! $regionCounter) {
504                  Output("No regions found for block $blockID at $blocksCount in block input file.");                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
505                  $errorCount++;                  $errorCount++;
506              } else {              } else {
507                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
508                  # Write the block record.                  # Write the block record.
509                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
510                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},
# Line 466  Line 514 
514                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
515                  # Loop through the regions, writing them out to the region output file.                  # Loop through the regions, writing them out to the region output file.
516                  for my $region (keys %regionMap) {                  for my $region (keys %regionMap) {
517                        if (length($region) > 80) {
518                            Trace("Invalid region key \"$region\".") if T(1);
519                        }
520                      # Get the region's snips.                      # Get the region's snips.
521                      my $source = $regionMap{$region};                      my $source = $regionMap{$region};
522                      my $content = "";                      my $content = "";
# Line 476  Line 527 
527                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
528                      # Write this region to the output files.                      # Write this region to the output files.
529                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
530                                            $location->Length, $regionPeg{$region},                                            $location->Right, $location->Length,
531                                            $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
532                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
533                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
534                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", $blockID, $region) . "\n";
535                  }                  }
536                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
537                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
# Line 499  Line 550 
550      close BLOCKSOUT;      close BLOCKSOUT;
551      close INSTANCESOUT;      close INSTANCESOUT;
552      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
553      Output("$blocksCount blocks processed, $regionsCount regions processed.");      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
554      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
555      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash. First, we need to open the files.
556      my $contigsCount = 0;      my $contigsCount = 0;
557      (open CONTIGSOUT, ">$outDirectory/Contig.dtx") ||      Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
558          Confess("Could not open contig output file in $outDirectory.");      Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
     (open CONSISTSOUT, ">$outDirectory/ConsistsOf.dtx") ||  
         Confess("Could not open consists output file in $outDirectory.");  
559      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
560          print CONTIGSOUT "$contigID\n";          print CONTIGSOUT "$contigID\n";
561          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
562          $contigsCount++;          $contigsCount++;
563      }      }
564      Output("$contigsCount contigs found.");      Trace("$contigsCount contigs found.") if T(2);
565        # Close the output files.
566        close CONTIGSOUT;
567        close CONSISTSOUT;
568      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
569      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
570          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {
571              Output("Genome $genomeID did not have any regions.");              Trace("Genome $genomeID did not have any regions.") if T(1);
572              $errorCount++;              $errorCount++;
573          }          }
574      }      }
575  }      return $errorCount;
 # Check for errors.  
 if ($errorCount > 0) {  
     Output("$errorCount errors found in input files.");  
 } else {  
     # No errors, so it's okay to load the database.  
     if ($options->{load} eq 'yes') {  
         # Here we have no outstanding errors and the user wants us to load  
         # the database. First, we create a similarity block object.  
         my $simBlocks = SimBlocks->new();  
         # Use it to load the database. Note we specify that the tables are to be  
         # dropped and rebuilt.  
         $simBlocks->LoadTables($outDirectory, 1);  
         Output("Database loaded.");  
     }  
576  }  }
577  # Tell the user we're done.  # Tell the user we're done.
578  Output("Processing complete.");  Trace("Processing complete.") if T(0);
579    
580  # Read a region record from the file and parse it into a hash  # Read a region record from the file and parse it into a hash
581  # for return to the caller. If we reach end-of-file, the  # for return to the caller. If we reach end-of-file, the
# Line 562  Line 600 
600      return %retVal;      return %retVal;
601  }  }
602    
 # Write an output line. The output line will be traced at level 0. If  
 # $TracingToFile is set, it will be echoed to the standard output.  
 sub Output {  
     my ($message) = @_;  
     Trace($message) if T(0);  
     if ($TracingToFile) {  
         print "$message\n";  
     }  
 }  
   
603  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3