[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Jan 6 20:35:01 2006 UTC revision 1.9, Thu Feb 2 21:30:05 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use ERDBLoad;
8    use DBKernel;
9  use SimBlocks;  use SimBlocks;
10  use File::Path;  use File::Path;
11  use BasicLocation;  use BasicLocation;
# Line 18  Line 20 
20  database (B<load>).  database (B<load>).
21    
22  The script takes a single parameter-- a directory name.  The script takes a single parameter-- a directory name.
23  The default directory name is taken from the config file  The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
24  parameter C<$fig_config::SimBlastData>. The output files  The input files should be in subdirectories called
25  will be produced in the similarity block data directory  C<Block> under the subdirectories of the input directory.
26  C<$fig_config::SimBlockData>, which will be created if  The subdirectory names themselves are considered the name
27  it does not exist. The input directory and all its  of the close-strain set. So, for example
28    
29        Data/CloseStrainSets/Vibrio/Block
30    
31    would be presumed to contain the Vibrio strains.
32    
33    The output files will be produced in the similarity block
34    data directory C<$fig_config::SimBlockData>, which will be
35    created if it does not exist. The input directory and all its
36  subdirectories will be processed for input files.  subdirectories will be processed for input files.
37    
38  In addition to the directory name, the following  In addition to the directory name, the following
# Line 33  Line 43 
43  =item trace  =item trace
44    
45  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
46  messages. The default is C<2>. Trace messages are sent to  messages. The default is C<3>. Trace messages are sent to
47  the file C<trace.log> in the B<$FIG_Config::tmp>  the file C<trace.log> in the B<$FIG_Config::tmp>
48  directory.  directory.
49    
50    =item sql
51    
52    If specified, SQL activity will be traced at the specified
53    trace level.
54    
55  =item load  =item load
56    
57  C<yes> to load the data into the database, else C<no>.  C<yes> to load the data into the database, else C<no>.
# Line 47  Line 62 
62  C<yes> to generate output files from input files, else  C<yes> to generate output files from input files, else
63  C<no>. The default is C<yes>.  C<no>. The default is C<yes>.
64    
65    =item createDB
66    
67    If specified, the database will be dropped and
68    re-created before loading.
69    
70  =back  =back
71    
72  For example, the following command line will process the  For example, the following command line will process the
73  input files in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory tree
74  run at a trace level of 3.  and run at a trace level of 3.
75    
76  C<< SimLoad -trace=3 /Users/fig/BlastData >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
77    
# Line 63  Line 83 
83    
84  =head2 Input Directory  =head2 Input Directory
85    
86  The following files must exist in each input directory.  The following files must exist in each C<Block> directory
87    under the input directory.
88    
89  =over 4  =over 4
90    
91  =item Genome.tbl  =item genome.tbl
92    
93  This is a tab-delimited file that contains the ID of each  This is a tab-delimited file that contains the ID of each
94  genome followed by a description string.  genome followed by a description string.
95    
96  =item Block.tbl, InterGenic_Block.tbl  =item block.tbl, intergenic_block.tbl
97    
98  These are tab-delimited files that associate a gene name  These are tab-delimited files that associate a gene name
99  with each block. The InterGenic file is optional.  with each block. The InterGenic file is optional.
100    
101  =item Region.tbl, InterGenic_Region.tbl  =item region.tbl, intergenic_region.tbl
102    
103  These are tab-delimited files that describe each region  These are tab-delimited files that describe each region
104  of a block. The InterGenic file is optional.  of a block. The InterGenic file is optional.
# Line 86  Line 107 
107    
108  The format of each file is given below.  The format of each file is given below.
109    
110  =head3 Genome.tbl  =head3 genome.tbl
111    
112  The Genome file is copied almost unmodified to the  The Genome file is copied almost unmodified to the
113  load file for the B<Genome> entity. Each record  load file for the B<Genome> entity. Each record
# Line 104  Line 125 
125  A text description of the genome (usually the species name with  A text description of the genome (usually the species name with
126  a strain ID).  a strain ID).
127    
128    =item groupName
129    
130    The name of the group to which the genome belongs.
131    
132  =back  =back
133    
134  =head3 Block.tbl, InterGenic_Block.tbl  =head3 block.tbl, intergenic_block.tbl
135    
136  These files produce most of the data found in the B<GroupBlock>  These files produce most of the data found in the B<GroupBlock>
137  entity. Each record represents a single block. Blocks either  entity. Each record represents a single block. Blocks either
# Line 129  Line 154 
154    
155  =back  =back
156    
157  =head3 Region.tbl, InterGenic_Region.tbl  =head3 region.tbl, intergenic_region.tbl
158    
159  These files describe the regions in each block. They are  These files describe the regions in each block. They are
160  used to derive the relationships between genomes and  used to derive the relationships between genomes and
# Line 280  Line 305 
305  my $TRAILER = 999999999;  my $TRAILER = 999999999;
306    
307  # Parse the command line.  # Parse the command line.
308  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = StandardSetup(['SimBlocks'],
309                                                    load => 'yes',                                                  { load => ['yes', "'no' to suppress loading the database"],
310                                                    generate => 'yes'},                                                    generate => ['yes', "'no' to suppress generating the load files"],
311                                                      createDB => [0, 'drop and create the database before loading']
312                                                    },
313                                                    "directoryName",
314                                                   @ARGV);                                                   @ARGV);
315  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
316  my $inDirectoryTree = $FIG_Config::simBlastData;  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
317  if ($arguments[0]) {  # Check to see if we need to create the database.
318      $inDirectoryTree = Cwd::abs_path($arguments[0]);  if ($options->{createDB}) {
319  }      Trace("Creating database.") if T(2);
320  # Set up tracing.      DBKernel::CreateDB($FIG_Config::simBlocksDB);
321  my $traceLevel = $options->{trace};  }
 TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log");  
322  # Get the output directory.  # Get the output directory.
323  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
324  # Insure that it exists.  # Insure that it exists.
325  if (! -d $outDirectory) {  if (! -d $outDirectory) {
326      Trace("Creating output directory $outDirectory.") if T(2);      Trace("Creating output directory $outDirectory.") if T(2);
327      mkpath($outDirectory);      mkpath($outDirectory);
328  } else {  } elsif ($options->{generate} eq 'yes') {
329      # Here we have an output directory already. Clear any      # Here we have an output directory already and are going to generate new
330      # leftover data from previous runs.      # load files. Clear any leftover data from previous runs.
331      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
332      my $numFiles = @files;      my $numFiles = @files;
333      if ($numFiles > 0) {      if ($numFiles > 0) {
# Line 311  Line 338 
338  # Create an error counter and a directory counter.  # Create an error counter and a directory counter.
339  my $errorCount = 0;  my $errorCount = 0;
340  my $dirCount = 0;  my $dirCount = 0;
341    # Get a similarity block object. Note that we've waited until the database
342    # was already created before doing this. The tables, however, may not yet
343    # exist, so we can't do very much with it.
344    my $simBlocks = SimBlocks->new();
345    # Determine if this is a load-only run.
346    my $loadOnly = ($options->{generate} eq 'no');
347    # Create the loader objects for the tables. These are global to the whole
348    # script.
349    my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
350    my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
351    my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
352    my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
353    my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
354    my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
355    my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
356    my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
357    # This hash helps us to clean up when we're done.
358    my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
359                       GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
360                       ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
361                       HasInstanceOf => $HasInstanceOfLoad,
362                       IncludesRegion => $IncludesRegionLoad};
363  # Check to see if we should generate the output files.  # Check to see if we should generate the output files.
364  if ($options->{generate} eq 'no') {  if ($loadOnly) {
365      # Here we are to use existing output files.      # Here we are to use existing output files.
366      Trace("Existing database load files will be used.") if T(2);      Trace("Existing database load files will be used.") if T(2);
367  } else {  } else {
# Line 322  Line 371 
371          Confess("Input directory \"$inDirectoryTree\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
372      }      }
373      # Loop through the subdirectories.      # Loop through the subdirectories.
374      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
375          # Verify that this is a directory. If it's ".", we use          # Verify that this is a directory.
376          # the top-level directory.          my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
377          my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :          if (-d $inDirectory) {
                                                     "$inDirectoryTree/$inputDirectory");  
         if (($inputDirectory !~ /\../) && -d $inDirectory) {  
378              # Here we have a directory to process. Check for a genome              # Here we have a directory to process. Check for a genome
379              # file.              # file.
380              my $genomeFileName = "$inDirectory/Genome.tbl";              my $genomeFileName = "$inDirectory/genome.tbl";
381              if (! -e $genomeFileName) {              if (! -e $genomeFileName) {
382                  Trace("$genomeFileName not found. Directory skipped.") if T(1);                  Trace("$genomeFileName not found. Directory skipped.") if T(1);
383              } else {              } else {
384                  # Now we can process the directory and accumulate the error                  # Now we can process the directory and accumulate the error
385                  # count.                  # count.
386                  $errorCount += ProcessDirectory($inDirectory, $outDirectory);                  $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
387                  $dirCount++;                  $dirCount++;
388              }              }
389          }          }
390      }      }
391      Trace("Load files generated from $dirCount directories.") if T(2);      Trace("Load files generated from $dirCount directories.") if T(2);
392        # Now finish up the loading.
393        my $stats = Stats->new();
394        for my $table (keys %{$loaderList}) {
395            Trace("Finishing $table.") if T(2);
396            my $tableThing = $loaderList->{$table};
397            my $newStats = $tableThing->Finish();
398            $stats->Accumulate($newStats);
399        }
400        Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
401  }  }
402  # Check for errors.  # Check for errors.
403  if ($errorCount > 0) {  if ($errorCount > 0) {
# Line 351  Line 407 
407      if ($options->{load} eq 'yes') {      if ($options->{load} eq 'yes') {
408          # Here we have no outstanding errors and the user wants us to load          # Here we have no outstanding errors and the user wants us to load
409          # the database. First, we create a similarity block object.          # the database. First, we create a similarity block object.
         my $simBlocks = SimBlocks->new();  
410          # Use it to load the database. Note we specify that the tables are to be          # Use it to load the database. Note we specify that the tables are to be
411          # dropped and rebuilt.          # dropped and rebuilt.
412          $simBlocks->LoadTables($outDirectory, 1);          $simBlocks->LoadTables($outDirectory, 1);
# Line 361  Line 416 
416    
417  # Process a single input directory.  # Process a single input directory.
418  sub ProcessDirectory {  sub ProcessDirectory {
419      my ($inDirectory, $outDirectory) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
420      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
421      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output
422      # and add them to the genome list.      # and add the genomes to the genome list.
423      my %genomes = ();      my %genomes = ();
424      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");  
425      # Count the genomes read and errors found.      # Count the genomes read and errors found.
426      my $genomeCount = 0;      my $genomeCount = 0;
427      my $errorCount = 0;      my $errorCount = 0;
428      # Loop through the input.      # Loop through the input.
429      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
430          # Echo the genome record to the output.          # Echo the genome record to the output.
431          print GENOMESOUT $genomeData;          my @fields = Tracer::ParseRecord($genomeData);
432            $GenomeLoad->Put(@fields, $groupName);
433          # Extract the genome ID.          # Extract the genome ID.
434          my ($genomeID) = Tracer::ParseRecord($genomeData);          my $genomeID = $fields[0];
435          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
436          # contig information for the genome is found, we change the value          # contig information for the genome is found, we change the value
437          # to 1. When we're all done with the regions, we can check the          # to 1. When we're all done with the regions, we can check the
# Line 386  Line 441 
441          $genomeCount++;          $genomeCount++;
442      }      }
443      Trace("$genomeCount genomes found.") if T(2);      Trace("$genomeCount genomes found.") if T(2);
444      # Close the files.      # Close the genome file.
445      close GENOMESIN;      close GENOMESIN;
     close GENOMESOUT;  
446      # Create the contig hash used to associate contigs to their parent      # Create the contig hash used to associate contigs to their parent
447      # genomes.      # genomes.
448      my %contigs = ();      my %contigs = ();
449      # Now we begin to read the Block and Region files in parallel. Both      # Now we begin to read the Block and Region files in parallel. Both
450      # are sorted by block ID, so all processing for this section of the      # are sorted by block ID, so all processing for this section of the
451      # script is done a block at a time. The first task is to      # script is done a block at a time. First, we determine which file
452      # open the output files.      # sets we'll be processing. There are at most two: regular and
453      Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");      # intergenic.
     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");  
     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");  
     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");  
     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");  
     # Determine which file sets we'll be processing.  
454      my @fileSets = ();      my @fileSets = ();
455      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "intergenic_");
456      for my $prefix (@prefixes) {      for my $prefix (@prefixes) {
457          if (-e "$inDirectory/${prefix}Block.tbl") {          if (-e "$inDirectory/${prefix}block.tbl") {
458              push @fileSets, $prefix;              push @fileSets, $prefix;
459          }          }
460      }      }
# Line 415  Line 464 
464      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
465      # Loop through the useful file sets.      # Loop through the useful file sets.
466      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
467          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}block.tbl");
468          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}region.tbl");
469          Trace("Processing ${fileSet}Blocks.") if T(2);          Trace("Processing ${fileSet}Blocks.") if T(2);
470          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
471          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
# Line 500  Line 549 
549                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
550                  $errorCount++;                  $errorCount++;
551              } else {              } else {
552                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
553                  # Write the block record.                  # Write the block record. Note that the block ID is prefixed by the group name to
554                    # make it unique.
555                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
556                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
557                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern});
558                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
559                  # the content strings for each region.                  # the content strings for each region.
560                  my @positions = SimBlocks::ParsePattern($blockData{pattern});                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
# Line 522  Line 572 
572                      # Get the region's location data.                      # Get the region's location data.
573                      my $location = BasicLocation->new($region);                      my $location = BasicLocation->new($region);
574                      # Write this region to the output files.                      # Write this region to the output files.
575                      print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,                      $RegionLoad->Put($region, $location->Contig, $location->Dir,
576                                            $location->Right, $location->Length,                                            $location->Right, $location->Length,
577                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                       $regionPeg{$region}, $location->Left, $content);
578                      print CONTAINSOUT join("\t", $location->Contig, $region,                      $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
579                                             $location->Length, $location->Left) . "\n";                                               $location->Left);
580                      print INCLUDESOUT join("\t", $blockID, $region);                      $IncludesRegionLoad->Put("$groupName:$blockID", $region);
581                  }                  }
582                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
583                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
584                      print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";                      $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
585                  }                  }
586                  # Count this block's regions.                  # Count this block's regions.
587                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 542  Line 592 
592          close REGIONSIN;          close REGIONSIN;
593      }      }
594      # Close the output files.      # Close the output files.
     close REGIONSOUT;  
     close BLOCKSOUT;  
     close INSTANCESOUT;  
595      # All the block data has been written. Tell the user what we found.      # All the block data has been written. Tell the user what we found.
596      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
597      # The next task is to write the genome/contig data. This is provided by the      # The next task is to write the genome/contig data. This is provided by the
598      # "%contigs" hash. First, we need to open the files.      # "%contigs" hash.
599      my $contigsCount = 0;      my $contigsCount = 0;
     Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");  
     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");  
600      for my $contigID (keys %contigs) {      for my $contigID (keys %contigs) {
601          print CONTIGSOUT "$contigID\n";          $ContigLoad->Put("$contigID");
602          print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";          $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
603          $contigsCount++;          $contigsCount++;
604      }      }
605      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3