[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Jun 9 19:06:55 2005 UTC revision 1.7, Wed Feb 1 03:18:41 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use DBKernel;
8  use SimBlocks;  use SimBlocks;
9  use File::Path;  use File::Path;
10  use Location;  use BasicLocation;
11    use Cwd;
12    
13  =head1 Similarity Block Loader  =head1 Similarity Block Loader
14    
15  This script loads the similarity block database from  This script loads the similarity block database from
16  the input files.  the input files. The load process involves two steps:
17    converting the input files into C<dtx> load files
18  The script takes a single parameter-- the directory name.  (B<generate>), and loading the C<dtx> files into the
19  The default directory name is taken from the config file  database (B<load>).
20  parameter C<$fig_config::SimBlastData>. The output files  
21  will be produced in the similarity block data directory  The script takes a single parameter-- a directory name.
22  C<$fig_config::SimBlockData>, which will be created if  The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
23  it does not exist.  The input files should be in subdirectories called
24    C<Block> under the subdirectories of the input directory.
25    The subdirectory names themselves are considered the name
26    of the close-strain set. So, for example
27    
28        Data/CloseStrainSets/Vibrio/Block
29    
30    would be presumed to contain the Vibrio strains.
31    
32    The output files will be produced in the similarity block
33    data directory C<$fig_config::SimBlockData>, which will be
34    created if it does not exist. The input directory and all its
35    subdirectories will be processed for input files.
36    
37  In addition to the directory name, the following  In addition to the directory name, the following
38  option parameters are supported.  option parameters are supported.
# Line 28  Line 42 
42  =item trace  =item trace
43    
44  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
45  messages. The default is C<1>.  messages. The default is C<3>. Trace messages are sent to
46    the file C<trace.log> in the B<$FIG_Config::tmp>
47    directory.
48    
49  =item logFile  =item sql
50    
51  Name of a file to be used for trace messages, or  If specified, SQL activity will be traced at the specified
52  C<*> to send them to the standard output. If a  trace level.
 file is specified, the output directory must  
 already exist. The default is C<*>.  
53    
54  =item load  =item load
55    
# Line 47  Line 61 
61  C<yes> to generate output files from input files, else  C<yes> to generate output files from input files, else
62  C<no>. The default is C<yes>.  C<no>. The default is C<yes>.
63    
64    =item createDB
65    
66    If specified, the database will be dropped and
67    re-created before loading.
68    
69  =back  =back
70    
71  For example, the following command line will process the  For example, the following command line will process the
72  C<Fragment.sort> file in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory tree
73  run at a trace level of 3.  and run at a trace level of 3.
74    
75  C<< SimLoad /Users/fig/BlastData -trace=3 >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
76    
77  Please note that spaces in the file and directory names must  The following command line converts the input files in
78  be escaped as C<\b> to prevent them from being parsed as  the default directory into load files but does not load
79  argument delimiters.  the database and runs at a trace level of 2.
80    
81    C<< SimLoad -load=no >>
82    
83  =head2 Input Directory  =head2 Input Directory
84    
85  The following files must exist in the input directory.  The following files must exist in each C<Block> directory
86    under the input directory.
87    
88  =over 4  =over 4
89    
# Line 278  Line 300 
300  my $TRAILER = 999999999;  my $TRAILER = 999999999;
301    
302  # Parse the command line.  # Parse the command line.
303  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = StandardSetup(['SimBlocks'],
304                                                    logFile => "*",                                                  { load => ['yes', "'no' to suppress loading the database"],
305                                                    load => 'yes',                                                    generate => ['yes', "'no' to suppress generating the load files"],
306                                                    generate => 'yes'},                                                    createDB => [0, 'drop and create the database before loading']
307                                                    },
308                                                    "directoryName",
309                                                   @ARGV);                                                   @ARGV);
310  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
311  my $inDirectory = $FIG_Config::simBlastData;  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
312  if ($arguments[0]) {  # Check to see if we need to create the database.
313      $inDirectory = Cwd::abs_path($arguments[0]);  if ($options->{createDB}) {
314  }      Trace("Creating database.") if T(2);
315  # Set up tracing.      DBKernel::CreateDB($FIG_Config::simBlocksDB);
316  my $traceLevel = $options->{trace};  }
 my $traceOption = $options->{logFile};  
 my $TracingToFile = ($traceOption ne "*");  
 my $traceDestination = ($TracingToFile ? ">$traceOption" : "TEXT");  
 TSetup("$traceLevel Tracer", $traceDestination);  
317  # Get the output directory.  # Get the output directory.
318  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
319  # Insure that it exists.  # Insure that it exists.
320  if (! -d $outDirectory) {  if (! -d $outDirectory) {
321      Output("Creating output directory $outDirectory.");      Trace("Creating output directory $outDirectory.") if T(2);
322      mkpath($outDirectory);      mkpath($outDirectory);
323    } elsif ($options->{generate} eq 'yes') {
324        # Here we have an output directory already and are going to generate new
325        # load files. Clear any leftover data from previous runs.
326        my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
327        my $numFiles = @files;
328        if ($numFiles > 0) {
329            Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
330            unlink map { "$outDirectory/$_" } @files;
331        }
332  }  }
333  # Create an error counter.  # Create an error counter and a directory counter.
334  my $errorCount = 0;  my $errorCount = 0;
335    my $dirCount = 0;
336  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
337  if ($options->{generate} eq 'no') {  if ($options->{generate} eq 'no') {
338      # Here we are to use existing output files.      # Here we are to use existing output files.
339      Output("Existing database load files will be used.");      Trace("Existing database load files will be used.") if T(2);
340  } else {  } else {
341      # Here we need to produce new output files.      # Here we need to produce new output files.
342      # Verify that the input directory exists.      # Verify that the input directory exists.
343      if (! -d $inDirectory) {      if (! -d $inDirectoryTree) {
344          Confess("Input directory \"$inDirectory\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
345        }
346        # Loop through the subdirectories.
347        for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
348            # Verify that this is a directory.
349            my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
350            if (-d $inDirectory) {
351                # Here we have a directory to process. Check for a genome
352                # file.
353                my $genomeFileName = "$inDirectory/Genome.tbl";
354                if (! -e $genomeFileName) {
355                    Trace("$genomeFileName not found. Directory skipped.") if T(1);
356                } else {
357                    # Now we can process the directory and accumulate the error
358                    # count.
359                    $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
360                    $dirCount++;
361                }
362            }
363        }
364        Trace("Load files generated from $dirCount directories.") if T(2);
365    }
366    # Check for errors.
367    if ($errorCount > 0) {
368        Trace("$errorCount errors found in input files.") if T(0);
369    } else {
370        # No errors, so it's okay to load the database.
371        if ($options->{load} eq 'yes') {
372            # Here we have no outstanding errors and the user wants us to load
373            # the database. First, we create a similarity block object.
374            my $simBlocks = SimBlocks->new();
375            # Use it to load the database. Note we specify that the tables are to be
376            # dropped and rebuilt.
377            $simBlocks->LoadTables($outDirectory, 1);
378            Trace("Database loaded.") if T(2);
379        }
380      }      }
381    
382    # Process a single input directory.
383    sub ProcessDirectory {
384        my ($inDirectory, $outDirectory, $groupName) = @_;
385        Trace("Processing directory $inDirectory.") if T(2);
386      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
387      # and build the genome list.      # and add the genomes to the genome list.
388      my %genomes = ();      my %genomes = ();
389      (open GENOMESIN, "<$inDirectory/Genome.tbl") ||      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
390          Confess("Could not open Genome input file in $inDirectory.");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
391      (open GENOMESOUT, ">$outDirectory/Genome.dtx") ||      # Count the genomes read and errors found.
         Confess("Could not open Genome output file in $outDirectory.");  
     # Count the genomes read.  
392      my $genomeCount = 0;      my $genomeCount = 0;
393        my $errorCount = 0;
394      # Loop through the input.      # Loop through the input.
395      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
396          # Echo the genome record to the output.          # Echo the genome record to the output.
397          print GENOMESOUT $genomeData;          my @fields = Tracer::ParseRecord($genomeData);
398            print GENOMESOUT join("\t", @fields, $groupName). "\n";
399          # Extract the genome ID.          # Extract the genome ID.
400          my ($genomeID) = Tracer::ParseRecord($genomeData);          my $genomeID = $fields[0];
401          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
402          # contig information for the genome is found, we change the value          # contig information for the genome is found, we change the value
403          # to 1. When we're all done with the regions, we can check the          # to 1. When we're all done with the regions, we can check the
# Line 336  Line 406 
406          # Count this genome.          # Count this genome.
407          $genomeCount++;          $genomeCount++;
408      }      }
409      Output("$genomeCount genomes found.");      Trace("$genomeCount genomes found.") if T(2);
410      # Close the files.      # Close the files.
411      close GENOMESIN;      close GENOMESIN;
412      close GENOMESOUT;      close GENOMESOUT;
# Line 347  Line 417 
417      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
418      # script is done a block at a time. The first task is to      # script is done a block at a time. The first task is to
419      # open the output files.      # open the output files.
420      (open BLOCKSOUT, ">$outDirectory/GroupBlock.dtx") ||      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
421          Confess("Could not open block output file in $outDirectory.");      Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
422      (open REGIONSOUT, ">$outDirectory/Region.dtx") ||      Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
423          Confess("Could not open region output file in $outDirectory.");      Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
424      (open INSTANCESOUT, ">$outDirectory/HasInstanceOf.dtx") ||      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
         Confess("Could not open instance output file in $outDirectory.");  
     (open CONTAINSOUT, ">$outDirectory/ContainsRegion.dtx") ||  
         Confess("Could not open contig-region output file in $outDirectory.");  
     (open INCLUDESOUT, ">$outDirectory/IncludesRegion.dtx") ||  
         Confess("Could not open block-region output file in $outDirectory.");  
425      # Determine which file sets we'll be processing.      # Determine which file sets we'll be processing.
426      my @fileSets = ();      my @fileSets = ();
427      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "InterGenic_");
# Line 365  Line 430 
430              push @fileSets, $prefix;              push @fileSets, $prefix;
431          }          }
432      }      }
433        # Set up the duplicate-region check.
434        my %allRegions = ();
435      # Set up some counters.      # Set up some counters.
436      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
437      # Loop through the useful file sets.      # Loop through the useful file sets.
438      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
439          (open BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl") ||          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
440              Confess("Could not open block input file in $inDirectory.");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
441          (open REGIONSIN, "<$inDirectory/${fileSet}Region.tbl") ||          Trace("Processing ${fileSet}Blocks.") if T(2);
             Confess("Could not open region input file in $inDirectory.");  
         Output("Processing ${fileSet}Blocks.");  
442          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
443          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
444          # region record. This is because we finish processing a block when          # region record. This is because we finish processing a block when
# Line 403  Line 468 
468                  # If this region's block ID is invalid, complain                  # If this region's block ID is invalid, complain
469                  # and skip it.                  # and skip it.
470                  if ($regionRecord{blockID} != $blockID) {                  if ($regionRecord{blockID} != $blockID) {
471                      Output("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.");                      Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
472                      $errorCount++;                      $errorCount++;
473                  } else {                  } else {
474                      # Here both files are in sync, which is good. The next step is                      # Here both files are in sync, which is good. The next step is
# Line 411  Line 476 
476                      my $genomeID = $regionRecord{genomeID};                      my $genomeID = $regionRecord{genomeID};
477                      my $contigID = "$genomeID:$regionRecord{contigID}";                      my $contigID = "$genomeID:$regionRecord{contigID}";
478                      if (! exists $genomes{$genomeID}) {                      if (! exists $genomes{$genomeID}) {
479                          Output("Genome $genomeID in region record $regionsCount not found in genome input file.");                          Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
480                          $errorCount++;                          $errorCount++;
481                      } else {                      } else {
482                          # Denote this genome has an instance of this block.                          # Denote this genome has an instance of this block.
# Line 435  Line 500 
500                              $blockData{len} = length $snippet;                              $blockData{len} = length $snippet;
501                          } elsif ($blockData{len} != length $snippet) {                          } elsif ($blockData{len} != length $snippet) {
502                              # Here it is not the first, but the lengths do not match.                              # Here it is not the first, but the lengths do not match.
503                              Output("Snippet for region record $regionsCount does not match block length $blockData{len}.");                              Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
504                              $errorCount++;                              $errorCount++;
505                          } else {                          } else {
506                              # Here everything is legitimate, so we merge the new                              # Here everything is legitimate, so we merge the new
# Line 453  Line 518 
518              # We have now processed all the regions in the block. Insure we found at least              # We have now processed all the regions in the block. Insure we found at least
519              # one.              # one.
520              if (! $regionCounter) {              if (! $regionCounter) {
521                  Output("No regions found for block $blockID at $blocksCount in block input file.");                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
522                  $errorCount++;                  $errorCount++;
523              } else {              } else {
524                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
525                  # Write the block record.                  # Write the block record. Note that the block ID is prefixed by the group name to
526                    # make it unique.
527                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
528                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},
529                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
530                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
531                  # the content strings for each region.                  # the content strings for each region.
532                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
533                  # Loop through the regions, writing them out to the region output file.                  # Loop through the regions, writing them out to the region output file.
534                  for my $region (keys %regionMap) {                  for my $region (keys %regionMap) {
535                        if (length($region) > 80) {
536                            Trace("Invalid region key \"$region\".") if T(1);
537                        }
538                      # Get the region's snips.                      # Get the region's snips.
539                      my $source = $regionMap{$region};                      my $source = $regionMap{$region};
540                      my $content = "";                      my $content = "";
# Line 473  Line 542 
542                          $content .= substr $source, $pos, 1;                          $content .= substr $source, $pos, 1;
543                      }                      }
544                      # Get the region's location data.                      # Get the region's location data.
545                      my $location = Location->new($region);                      my $location = BasicLocation->new($region);
546                      # Write this region to the output files.                      # Write this region to the output files.
547                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
548                                            $location->Length, $regionPeg{$region},                                            $location->Right, $location->Length,
549                                            $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
550                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
551                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
552                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";
553                  }                  }
554                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
555                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
556                      print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";
557                  }                  }
558                  # Count this block's regions.                  # Count this block's regions.
559                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 499  Line 568 
568      close BLOCKSOUT;      close BLOCKSOUT;
569      close INSTANCESOUT;      close INSTANCESOUT;
570      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
571      Output("$blocksCount blocks processed, $regionsCount regions processed.");      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
572      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
573      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash. First, we need to open the files.
574      my $contigsCount = 0;      my $contigsCount = 0;
575      (open CONTIGSOUT, ">$outDirectory/Contig.dtx") ||      Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
576          Confess("Could not open contig output file in $outDirectory.");      Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
     (open CONSISTSOUT, ">$outDirectory/ConsistsOf.dtx") ||  
         Confess("Could not open consists output file in $outDirectory.");  
577      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
578          print CONTIGSOUT "$contigID\n";          print CONTIGSOUT "$contigID\n";
579          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
580          $contigsCount++;          $contigsCount++;
581      }      }
582      Output("$contigsCount contigs found.");      Trace("$contigsCount contigs found.") if T(2);
583        # Close the output files.
584        close CONTIGSOUT;
585        close CONSISTSOUT;
586      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
587      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
588          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {
589              Output("Genome $genomeID did not have any regions.");              Trace("Genome $genomeID did not have any regions.") if T(1);
590              $errorCount++;              $errorCount++;
591          }          }
592      }      }
593  }      return $errorCount;
 # Check for errors.  
 if ($errorCount > 0) {  
     Output("$errorCount errors found in input files.");  
 } else {  
     # No errors, so it's okay to load the database.  
     if ($options->{load} eq 'yes') {  
         # Here we have no outstanding errors and the user wants us to load  
         # the database. First, we create a similarity block object.  
         my $simBlocks = SimBlocks->new();  
         # Use it to load the database. Note we specify that the tables are to be  
         # dropped and rebuilt.  
         $simBlocks->LoadTables($outDirectory, 1);  
         Output("Database loaded.");  
     }  
594  }  }
595  # Tell the user we're done.  # Tell the user we're done.
596  Output("Processing complete.");  Trace("Processing complete.") if T(0);
597    
598  # Read a region record from the file and parse it into a hash  # Read a region record from the file and parse it into a hash
599  # for return to the caller. If we reach end-of-file, the  # for return to the caller. If we reach end-of-file, the
# Line 562  Line 618 
618      return %retVal;      return %retVal;
619  }  }
620    
 # Write an output line. The output line will be traced at level 0. If  
 # $TracingToFile is set, it will be echoed to the standard output.  
 sub Output {  
     my ($message) = @_;  
     Trace($message) if T(0);  
     if ($TracingToFile) {  
         print "$message\n";  
     }  
 }  
   
621  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3