Diff of /Sprout/SimLoad.pl


revision 1.4 (Wed Jan 11 19:39:20 2006 UTC) vs. revision 1.11 (Thu Dec 6 14:58:03 2007 UTC)
# Line 4 (v.1.4) / Line 4 (v.1.11)
  use CGI;
  use Tracer;
  use Genome;
+ use ERDBLoad;
+ use DBKernel;
  use SimBlocks;
  use File::Path;
  use BasicLocation;
# Line 18 (v.1.4) / Line 20 (v.1.11)
  database (B<load>).

  The script takes a single parameter-- a directory name.
- The default directory name is taken from the config file
- parameter C<$fig_config::SimBlastData>. The output files
- will be produced in the similarity block data directory
- C<$fig_config::SimBlockData>, which will be created if
- it does not exist. The input directory and all its
+ The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
+ The input files should be in subdirectories called
+ C<Block> under the subdirectories of the input directory.
+ The subdirectory names themselves are considered the name
+ of the close-strain set. So, for example
+
+     Data/CloseStrainSets/Vibrio/Block
+
+ would be presumed to contain the Vibrio strains.
+
+ The output files will be produced in the similarity block
+ data directory C<$fig_config::SimBlockData>, which will be
+ created if it does not exist. The input directory and all its
  subdirectories will be processed for input files.

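As a rough illustration of the layout described above, here is a minimal sketch (not part of either revision) of how a caller might walk the close-strain-set tree; the directory and file names follow the code later in this revision, and the top-level path is hypothetical:

    # Sketch only: enumerate group subdirectories the way the main loop does.
    # The real script uses Tracer::OpenDir and hands each group to ProcessDirectory.
    my $inDirectoryTree = "Data/CloseStrainSets";     # hypothetical top-level directory
    opendir(my $dh, $inDirectoryTree) or die "Cannot open $inDirectoryTree: $!";
    for my $group (grep { !/^\./ } readdir($dh)) {
        # The subdirectory name doubles as the close-strain-set (group) name.
        my $blockDir = "$inDirectoryTree/$group/Blocks";
        next unless -d $blockDir;                 # skip entries with no block data
        next unless -e "$blockDir/genome.tbl";    # the genome list is required
        print "Group $group: block data in $blockDir\n";
    }
    closedir($dh);
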
  In addition to the directory name, the following
# Line 52 (v.1.4) / Line 62 (v.1.11)
  C<yes> to generate output files from input files, else
  C<no>. The default is C<yes>.

+ =item createDB
+
+ If specified, the database will be dropped and
+ re-created before loading.
+
  =back

  For example, the following command line will process the
- input files in the C</Users/fig/BlastData> directory and
- run at a trace level of 3.
+ input files in the C</Users/fig/BlastData> directory tree
+ and run at a trace level of 3.

- C<< SimLoad -trace=3 /Users/fig/BlastData >>
+     SimLoad -trace=3 /Users/fig/BlastData

  The following command line converts the input files in
  the default directory into load files but does not load
  the database and runs at a trace level of 2.

- C<< SimLoad -load=no >>
+     SimLoad -load=no

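Combining the options above, a run that rebuilds the database, regenerates the load files, and traces at level 3 might look like the following (a hypothetical example; the "createDB" flag is assumed to follow the same -name=value form as the examples above):

    SimLoad -trace=3 -createDB=1 /Users/fig/BlastData
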
  =head2 Input Directory

- The following files must exist in each input directory.
+ The following files must exist in each C<Block> directory
+ under the input directory.

  =over 4

- =item Genome.tbl
+ =item genome.tbl

  This is a tab-delimited file that contains the ID of each
  genome followed by a description string.

- =item Block.tbl, InterGenic_Block.tbl
+ =item block.tbl, intergenic_block.tbl

  These are tab-delimited files that associate a gene name
  with each block. The InterGenic file is optional.

- =item Region.tbl, InterGenic_Region.tbl
+ =item region.tbl, intergenic_region.tbl

  These are tab-delimited files that describe each region
  of a block. The InterGenic file is optional.
# Line 91 (v.1.4) / Line 107 (v.1.11)

  The format of each file is given below.

- =head3 Genome.tbl
+ =head3 genome.tbl

  The Genome file is copied almost unmodified to the
  load file for the B<Genome> entity. Each record
# Line 109 (v.1.4) / Line 125 (v.1.11)
  A text description of the genome (usually the species name with
  a strain ID).

+ =item groupName
+
+ The name of the group to which the genome belongs.
+
  =back

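To make the genome.tbl layout concrete, here is a minimal sketch (not part of either revision) of how one record is consumed: the file supplies the ID and description, while the group name comes from the set's subdirectory. The tab split stands in for Tracer::ParseRecord, and the sample values are hypothetical:

    # Sketch only: one genome.tbl record is a genome ID, a tab, and a description.
    my $groupName  = "Vibrio";                            # from the subdirectory name
    my $genomeData = "123.45\tVibrio sp. (hypothetical strain X1)";
    my @fields     = split /\t/, $genomeData;             # the script uses Tracer::ParseRecord
    my $genomeID   = $fields[0];                          # first field is the genome ID
    # In the script this becomes a Genome load record via $GenomeLoad->Put(@fields, $groupName).
    print "Genome $genomeID ($fields[1]) belongs to group $groupName\n";
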
- =head3 Block.tbl, InterGenic_Block.tbl
+ =head3 block.tbl, intergenic_block.tbl

  These files produce most of the data found in the B<GroupBlock>
  entity. Each record represents a single block. Blocks either
# Line 134 (v.1.4) / Line 154 (v.1.11)

  =back

- =head3 Region.tbl, InterGenic_Region.tbl
+ =head3 region.tbl, intergenic_region.tbl

  These files describe the regions in each block. They are
  used to derive the relationships between genomes and
# Line 285 (v.1.4) / Line 305 (v.1.11)
  my $TRAILER = 999999999;

  # Parse the command line.
- my ($options, @arguments) = Tracer::StandardSetup(['SimBlocks'],
-                                                   { load => 'yes',
-                                                     generate => 'yes'},
+ my ($options, @arguments) = StandardSetup(['SimBlocks'],
+                                           { load => ['yes', "'no' to suppress loading the database"],
+                                             generate => ['yes', "'no' to suppress generating the load files"],
+                                             createDB => [0, 'drop and create the database before loading']
+                                           },
+                                           "directoryName",
                                            @ARGV);
  # Extract the directory name from the argument array.
- my $inDirectoryTree = $FIG_Config::simBlastData;
- if ($arguments[0]) {
-     $inDirectoryTree = Cwd::abs_path($arguments[0]);
+ my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
+ # Check to see if we need to create the database.
+ if ($options->{createDB}) {
+     my $dbname = SimBlocks::DBName();
+     Trace("Creating database $dbname.") if T(2);
+     DBKernel::CreateDB($dbname);
  }
  # Get the output directory.
  my $outDirectory = $FIG_Config::simBlocksData;
+ # Default it if it isn't present.
+ if (! $outDirectory) {
+     $outDirectory = "$FIG_Config::fig/BlockData";
+ }
  # Insure that it exists.
  if (! -d $outDirectory) {
      Trace("Creating output directory $outDirectory.") if T(2);
# Line 313 (v.1.4) / Line 343 (v.1.11)
  # Create an error counter and a directory counter.
  my $errorCount = 0;
  my $dirCount = 0;
+ # Get a similarity block object. Note that we've waited until the database
+ # was already created before doing this. The tables, however, may not yet
+ # exist, so we can't do very much with it.
+ my $simBlocks = SimBlocks->new();
+ # Determine if this is a load-only run.
+ my $loadOnly = ($options->{generate} eq 'no');
+ # Create the loader objects for the tables. These are global to the whole
+ # script.
+ my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
+ my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
+ my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
+ my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
+ my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
+ my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
+ my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
+ my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
+ # This hash helps us to clean up when we're done.
+ my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
+                    GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
+                    ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
+                    HasInstanceOf => $HasInstanceOfLoad,
+                    IncludesRegion => $IncludesRegionLoad};
  # Check to see if we should generate the output files.
- if ($options->{generate} eq 'no') {
+ if ($loadOnly) {
      # Here we are to use existing output files.
      Trace("Existing database load files will be used.") if T(2);
  } else {
# Line 324 (v.1.4) / Line 376 (v.1.11)
          Confess("Input directory \"$inDirectoryTree\" not found.");
      }
      # Loop through the subdirectories.
-     for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {
-         # Verify that this is a directory. If it's ".", we use
-         # the top-level directory.
-         my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :
-                                                     "$inDirectoryTree/$inputDirectory");
-         if (($inputDirectory !~ /\../) && -d $inDirectory) {
+     for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
+         # Verify that this is a directory.
+         my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
+         if (-d $inDirectory) {
              # Here we have a directory to process. Check for a genome
              # file.
-             my $genomeFileName = "$inDirectory/Genome.tbl";
+             my $genomeFileName = "$inDirectory/genome.tbl";
              if (! -e $genomeFileName) {
                  Trace("$genomeFileName not found. Directory skipped.") if T(1);
              } else {
                  # Now we can process the directory and accumulate the error
                  # count.
-                 $errorCount += ProcessDirectory($inDirectory, $outDirectory);
+                 $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
                  $dirCount++;
              }
          }
      }
      Trace("Load files generated from $dirCount directories.") if T(2);
+     # Now finish up the loading.
+     my $stats = Stats->new();
+     for my $table (keys %{$loaderList}) {
+         Trace("Finishing $table.") if T(2);
+         my $tableThing = $loaderList->{$table};
+         my $newStats = $tableThing->Finish();
+         $stats->Accumulate($newStats);
+     }
+     Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
  }
  # Check for errors.
  if ($errorCount > 0) {
# Line 353 (v.1.4) / Line 412 (v.1.11)
      if ($options->{load} eq 'yes') {
          # Here we have no outstanding errors and the user wants us to load
          # the database. First, we create a similarity block object.
-         my $simBlocks = SimBlocks->new();
          # Use it to load the database. Note we specify that the tables are to be
          # dropped and rebuilt.
          $simBlocks->LoadTables($outDirectory, 1);
# Line 363 (v.1.4) / Line 421 (v.1.11)

  # Process a single input directory.
  sub ProcessDirectory {
-     my ($inDirectory, $outDirectory) = @_;
+     my ($inDirectory, $outDirectory, $groupName) = @_;
      Trace("Processing directory $inDirectory.") if T(2);
-     # Our first task is to copy the genome data to the output directory
-     # and add them to the genome list.
+     # Our first task is to copy the genome data to the output
+     # and add the genomes to the genome list.
      my %genomes = ();
-     Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
-     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
+     Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
      # Count the genomes read and errors found.
      my $genomeCount = 0;
      my $errorCount = 0;
      # Loop through the input.
      while (my $genomeData = <GENOMESIN>) {
          # Echo the genome record to the output.
-         print GENOMESOUT $genomeData;
+         my @fields = Tracer::ParseRecord($genomeData);
+         $GenomeLoad->Put(@fields, $groupName);
          # Extract the genome ID.
-         my ($genomeID) = Tracer::ParseRecord($genomeData);
+         my $genomeID = $fields[0];
          # Store it in the genomes hash. We start with a value of 0. If
          # contig information for the genome is found, we change the value
          # to 1. When we're all done with the regions, we can check the
# Line 388 (v.1.4) / Line 446 (v.1.11)
          $genomeCount++;
      }
      Trace("$genomeCount genomes found.") if T(2);
-     # Close the files.
+     # Close the genome file.
      close GENOMESIN;
-     close GENOMESOUT;
      # Create the contig hash used to associate contigs to their parent
      # genomes.
      my %contigs = ();
      # Now we begin to read the Block and Region files in parallel. Both
      # are sorted by block ID, so all processing for this section of the
-     # script is done a block at a time. The first task is to
-     # open the output files.
-     Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
-     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
-     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
-     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
-     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
-     # Determine which file sets we'll be processing.
+     # script is done a block at a time. First, we determine which file
+     # sets we'll be processing. There are at most two: regular and
+     # intergenic.
      my @fileSets = ();
-     my @prefixes = ("", "InterGenic_");
+     my @prefixes = ("", "intergenic_");
      for my $prefix (@prefixes) {
-         if (-e "$inDirectory/${prefix}Block.tbl") {
+         if (-e "$inDirectory/${prefix}block.tbl") {
              push @fileSets, $prefix;
          }
      }
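For a Block directory that contains both the regular and the intergenic file set, the loop above leaves both prefixes in @fileSets; a small sketch (not part of either revision) of how those prefixes turn into file names:

    # Sketch only: both block.tbl and intergenic_block.tbl are present,
    # so both prefixes are kept and each pass reads a different file pair.
    my @fileSets = ("", "intergenic_");
    for my $fileSet (@fileSets) {
        print "Would read ${fileSet}block.tbl and ${fileSet}region.tbl\n";
    }
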
# Line 417 (v.1.4) / Line 469 (v.1.11)
      my ($blocksCount, $regionsCount) = (0, 0);
      # Loop through the useful file sets.
      for my $fileSet (@fileSets) {
-         Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
-         Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
+         Open(\*BLOCKSIN, "<$inDirectory/${fileSet}block.tbl");
+         Open(\*REGIONSIN, "<$inDirectory/${fileSet}region.tbl");
          Trace("Processing ${fileSet}Blocks.") if T(2);
          # The outer loop processes blocks. This is accomplished by reading
          # through the block file. We prime the loop by reading the first
# Line 503 (v.1.4) / Line 555 (v.1.11)
                  $errorCount++;
              } else {
                  Trace("$regionCounter regions found in block $blockID.") if T(4);
-                 # Write the block record.
+                 # Write the block record. Note that the block ID is prefixed by the group name to
+                 # make it unique.
                  my $variance = $blockData{"snip-count"} / $blockData{len};
-                 print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},
-                                      $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
+                 $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
+                                      $blockData{"snip-count"}, $variance, $blockData{pattern});
                  # Find all the variance points in the block pattern. We'll use them to create
                  # the content strings for each region.
                  my @positions = SimBlocks::ParsePattern($blockData{pattern});
# Line 524 (v.1.4) / Line 577 (v.1.11)
                      # Get the region's location data.
                      my $location = BasicLocation->new($region);
                      # Write this region to the output files.
-                     print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
-                                           $location->Right, $location->Length,
-                                           $regionPeg{$region}, $location->Left, $content) . "\n";
-                     print CONTAINSOUT join("\t", $location->Contig, $region,
-                                            $location->Length, $location->Left) . "\n";
-                     print INCLUDESOUT join("\t", $blockID, $region) . "\n";
+                     $RegionLoad->Put($region, $location->Contig, $location->Dir,
+                                      $location->Right, $location->Length,
+                                      $regionPeg{$region}, $location->Left, $content);
+                     $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
+                                              $location->Left);
+                     $IncludesRegionLoad->Put("$groupName:$blockID", $region);
                  }
                  # Finally, we need to connect this block to the genomes in which it occurs.
                  for my $genomeID (keys %genomesFound) {
-                     print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";
+                     $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
                  }
                  # Count this block's regions.
                  $regionsCount += $regionCounter;
# Line 544 (v.1.4) / Line 597 (v.1.11)
          close REGIONSIN;
      }
      # Close the output files.
-     close REGIONSOUT;
-     close BLOCKSOUT;
-     close INSTANCESOUT;
      # All the block data has been written. Tell the user what we found.
      Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
      # The next task is to write the genome/contig data. This is provided by the
-     # "%contigs" hash. First, we need to open the files.
+     # "%contigs" hash.
      my $contigsCount = 0;
-     Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
-     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
      for my $contigID (keys %contigs) {
-         print CONTIGSOUT "$contigID\n";
-         print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
+         $ContigLoad->Put("$contigID");
+         $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
          $contigsCount++;
      }
      Trace("$contigsCount contigs found.") if T(2);
-     # Close the output files.
-     close CONTIGSOUT;
-     close CONSISTSOUT;
      # Now warn the user about all the genomes that didn't have blocks.
      for my $genomeID (keys %genomes) {
          if (! $genomes{$genomeID}) {
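Taken together, the changes in this revision replace the per-table .dtx output handles with ERDBLoad objects. A minimal sketch of that lifecycle as this script uses it (constructor arguments and return values are inferred from the calls in the diff, and the surrounding variables are assumed to be set up as above):

    # Sketch only: the new / Put / Finish pattern used throughout this revision.
    my $simBlocks  = SimBlocks->new();
    my $loadOnly   = 0;                     # a true value corresponds to -generate=no
    my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
    # One Put call per output record, replacing a printed tab-delimited line.
    $GenomeLoad->Put($genomeID, $description, $groupName);
    # Finish closes out the load file and returns statistics, which the script
    # accumulates across all the loaders before reporting them.
    my $stats = Stats->new();
    $stats->Accumulate($GenomeLoad->Finish());
    Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
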

Legend:
    -  removed from v.1.4 (a "-"/"+" pair marks a changed line)
    +  added in v.1.11
       unprefixed lines are unchanged context