[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Jan 6 20:35:01 2006 UTC revision 1.7, Wed Feb 1 03:18:41 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use DBKernel;
8  use SimBlocks;  use SimBlocks;
9  use File::Path;  use File::Path;
10  use BasicLocation;  use BasicLocation;
# Line 18  Line 19 
19  database (B<load>).  database (B<load>).
20    
21  The script takes a single parameter-- a directory name.  The script takes a single parameter-- a directory name.
22  The default directory name is taken from the config file  The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
23  parameter C<$fig_config::SimBlastData>. The output files  The input files should be in subdirectories called
24  will be produced in the similarity block data directory  C<Block> under the subdirectories of the input directory.
25  C<$fig_config::SimBlockData>, which will be created if  The subdirectory names themselves are considered the name
26  it does not exist. The input directory and all its  of the close-strain set. So, for example
27    
28        Data/CloseStrainSets/Vibrio/Block
29    
30    would be presumed to contain the Vibrio strains.
31    
32    The output files will be produced in the similarity block
33    data directory C<$fig_config::SimBlockData>, which will be
34    created if it does not exist. The input directory and all its
35  subdirectories will be processed for input files.  subdirectories will be processed for input files.
36    
37  In addition to the directory name, the following  In addition to the directory name, the following
# Line 33  Line 42 
42  =item trace  =item trace
43    
44  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
45  messages. The default is C<2>. Trace messages are sent to  messages. The default is C<3>. Trace messages are sent to
46  the file C<trace.log> in the B<$FIG_Config::tmp>  the file C<trace.log> in the B<$FIG_Config::tmp>
47  directory.  directory.
48    
49    =item sql
50    
51    If specified, SQL activity will be traced at the specified
52    trace level.
53    
54  =item load  =item load
55    
56  C<yes> to load the data into the database, else C<no>.  C<yes> to load the data into the database, else C<no>.
# Line 47  Line 61 
61  C<yes> to generate output files from input files, else  C<yes> to generate output files from input files, else
62  C<no>. The default is C<yes>.  C<no>. The default is C<yes>.
63    
64    =item createDB
65    
66    If specified, the database will be dropped and
67    re-created before loading.
68    
69  =back  =back
70    
71  For example, the following command line will process the  For example, the following command line will process the
72  input files in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory tree
73  run at a trace level of 3.  and run at a trace level of 3.
74    
75  C<< SimLoad -trace=3 /Users/fig/BlastData >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
76    
# Line 63  Line 82 
82    
83  =head2 Input Directory  =head2 Input Directory
84    
85  The following files must exist in each input directory.  The following files must exist in each C<Block> directory
86    under the input directory.
87    
88  =over 4  =over 4
89    
# Line 280  Line 300 
300  my $TRAILER = 999999999;  my $TRAILER = 999999999;
301    
302  # Parse the command line.  # Parse the command line.
303  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = StandardSetup(['SimBlocks'],
304                                                    load => 'yes',                                                  { load => ['yes', "'no' to suppress loading the database"],
305                                                    generate => 'yes'},                                                    generate => ['yes', "'no' to suppress generating the load files"],
306                                                      createDB => [0, 'drop and create the database before loading']
307                                                    },
308                                                    "directoryName",
309                                                   @ARGV);                                                   @ARGV);
310  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
311  my $inDirectoryTree = $FIG_Config::simBlastData;  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
312  if ($arguments[0]) {  # Check to see if we need to create the database.
313      $inDirectoryTree = Cwd::abs_path($arguments[0]);  if ($options->{createDB}) {
314  }      Trace("Creating database.") if T(2);
315  # Set up tracing.      DBKernel::CreateDB($FIG_Config::simBlocksDB);
316  my $traceLevel = $options->{trace};  }
 TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log");  
317  # Get the output directory.  # Get the output directory.
318  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
319  # Insure that it exists.  # Insure that it exists.
320  if (! -d $outDirectory) {  if (! -d $outDirectory) {
321      Trace("Creating output directory $outDirectory.") if T(2);      Trace("Creating output directory $outDirectory.") if T(2);
322      mkpath($outDirectory);      mkpath($outDirectory);
323  } else {  } elsif ($options->{generate} eq 'yes') {
324      # Here we have an output directory already. Clear any      # Here we have an output directory already and are going to generate new
325      # leftover data from previous runs.      # load files. Clear any leftover data from previous runs.
326      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
327      my $numFiles = @files;      my $numFiles = @files;
328      if ($numFiles > 0) {      if ($numFiles > 0) {
# Line 322  Line 344 
344          Confess("Input directory \"$inDirectoryTree\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
345      }      }
346      # Loop through the subdirectories.      # Loop through the subdirectories.
347      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
348          # Verify that this is a directory. If it's ".", we use          # Verify that this is a directory.
349          # the top-level directory.          my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
350          my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :          if (-d $inDirectory) {
                                                     "$inDirectoryTree/$inputDirectory");  
         if (($inputDirectory !~ /\../) && -d $inDirectory) {  
351              # Here we have a directory to process. Check for a genome              # Here we have a directory to process. Check for a genome
352              # file.              # file.
353              my $genomeFileName = "$inDirectory/Genome.tbl";              my $genomeFileName = "$inDirectory/Genome.tbl";
# Line 336  Line 356 
356              } else {              } else {
357                  # Now we can process the directory and accumulate the error                  # Now we can process the directory and accumulate the error
358                  # count.                  # count.
359                  $errorCount += ProcessDirectory($inDirectory, $outDirectory);                  $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
360                  $dirCount++;                  $dirCount++;
361              }              }
362          }          }
# Line 361  Line 381 
381    
382  # Process a single input directory.  # Process a single input directory.
383  sub ProcessDirectory {  sub ProcessDirectory {
384      my ($inDirectory, $outDirectory) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
385      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
386      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
387      # and add them to the genome list.      # and add the genomes to the genome list.
388      my %genomes = ();      my %genomes = ();
389      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
390      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
# Line 374  Line 394 
394      # Loop through the input.      # Loop through the input.
395      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
396          # Echo the genome record to the output.          # Echo the genome record to the output.
397          print GENOMESOUT $genomeData;          my @fields = Tracer::ParseRecord($genomeData);
398            print GENOMESOUT join("\t", @fields, $groupName). "\n";
399          # Extract the genome ID.          # Extract the genome ID.
400          my ($genomeID) = Tracer::ParseRecord($genomeData);          my $genomeID = $fields[0];
401          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
402          # contig information for the genome is found, we change the value          # contig information for the genome is found, we change the value
403          # to 1. When we're all done with the regions, we can check the          # to 1. When we're all done with the regions, we can check the
# Line 500  Line 521 
521                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
522                  $errorCount++;                  $errorCount++;
523              } else {              } else {
524                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
525                  # Write the block record.                  # Write the block record. Note that the block ID is prefixed by the group name to
526                    # make it unique.
527                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
528                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},
529                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
530                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
531                  # the content strings for each region.                  # the content strings for each region.
# Line 527  Line 549 
549                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
550                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
551                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
552                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";
553                  }                  }
554                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
555                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
556                      print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";
557                  }                  }
558                  # Count this block's regions.                  # Count this block's regions.
559                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 558  Line 580 
580          $contigsCount++;          $contigsCount++;
581      }      }
582      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);
583        # Close the output files.
584        close CONTIGSOUT;
585        close CONSISTSOUT;
586      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
587      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
588          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3