[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Wed Jan 11 19:39:20 2006 UTC revision 1.7, Wed Feb 1 03:18:41 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use DBKernel;
8  use SimBlocks;  use SimBlocks;
9  use File::Path;  use File::Path;
10  use BasicLocation;  use BasicLocation;
# Line 18  Line 19 
19  database (B<load>).  database (B<load>).
20    
21  The script takes a single parameter-- a directory name.  The script takes a single parameter-- a directory name.
22  The default directory name is taken from the config file  The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
23  parameter C<$fig_config::SimBlastData>. The output files  The input files should be in subdirectories called
24  will be produced in the similarity block data directory  C<Block> under the subdirectories of the input directory.
25  C<$fig_config::SimBlockData>, which will be created if  The subdirectory names themselves are considered the name
26  it does not exist. The input directory and all its  of the close-strain set. So, for example
27    
28        Data/CloseStrainSets/Vibrio/Block
29    
30    would be presumed to contain the Vibrio strains.
31    
32    The output files will be produced in the similarity block
33    data directory C<$fig_config::SimBlockData>, which will be
34    created if it does not exist. The input directory and all its
35  subdirectories will be processed for input files.  subdirectories will be processed for input files.
36    
37  In addition to the directory name, the following  In addition to the directory name, the following
# Line 52  Line 61 
61  C<yes> to generate output files from input files, else  C<yes> to generate output files from input files, else
62  C<no>. The default is C<yes>.  C<no>. The default is C<yes>.
63    
64    =item createDB
65    
66    If specified, the database will be dropped and
67    re-created before loading.
68    
69  =back  =back
70    
71  For example, the following command line will process the  For example, the following command line will process the
72  input files in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory tree
73  run at a trace level of 3.  and run at a trace level of 3.
74    
75  C<< SimLoad -trace=3 /Users/fig/BlastData >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
76    
# Line 68  Line 82 
82    
83  =head2 Input Directory  =head2 Input Directory
84    
85  The following files must exist in each input directory.  The following files must exist in each C<Block> directory
86    under the input directory.
87    
88  =over 4  =over 4
89    
# Line 285  Line 300 
300  my $TRAILER = 999999999;  my $TRAILER = 999999999;
301    
302  # Parse the command line.  # Parse the command line.
303  my ($options, @arguments) = Tracer::StandardSetup(['SimBlocks'],  my ($options, @arguments) = StandardSetup(['SimBlocks'],
304                                                    { load => 'yes',                                                  { load => ['yes', "'no' to suppress loading the database"],
305                                                      generate => 'yes'},                                                    generate => ['yes', "'no' to suppress generating the load files"],
306                                                      createDB => [0, 'drop and create the database before loading']
307                                                    },
308                                                    "directoryName",
309                                                   @ARGV);                                                   @ARGV);
310  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
311  my $inDirectoryTree = $FIG_Config::simBlastData;  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
312  if ($arguments[0]) {  # Check to see if we need to create the database.
313      $inDirectoryTree = Cwd::abs_path($arguments[0]);  if ($options->{createDB}) {
314        Trace("Creating database.") if T(2);
315        DBKernel::CreateDB($FIG_Config::simBlocksDB);
316  }  }
317  # Get the output directory.  # Get the output directory.
318  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
# Line 324  Line 344 
344          Confess("Input directory \"$inDirectoryTree\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
345      }      }
346      # Loop through the subdirectories.      # Loop through the subdirectories.
347      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
348          # Verify that this is a directory. If it's ".", we use          # Verify that this is a directory.
349          # the top-level directory.          my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
350          my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :          if (-d $inDirectory) {
                                                     "$inDirectoryTree/$inputDirectory");  
         if (($inputDirectory !~ /\../) && -d $inDirectory) {  
351              # Here we have a directory to process. Check for a genome              # Here we have a directory to process. Check for a genome
352              # file.              # file.
353              my $genomeFileName = "$inDirectory/Genome.tbl";              my $genomeFileName = "$inDirectory/Genome.tbl";
# Line 338  Line 356 
356              } else {              } else {
357                  # Now we can process the directory and accumulate the error                  # Now we can process the directory and accumulate the error
358                  # count.                  # count.
359                  $errorCount += ProcessDirectory($inDirectory, $outDirectory);                  $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
360                  $dirCount++;                  $dirCount++;
361              }              }
362          }          }
# Line 363  Line 381 
381    
382  # Process a single input directory.  # Process a single input directory.
383  sub ProcessDirectory {  sub ProcessDirectory {
384      my ($inDirectory, $outDirectory) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
385      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
386      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
387      # and add them to the genome list.      # and add the genomes to the genome list.
388      my %genomes = ();      my %genomes = ();
389      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
390      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
# Line 376  Line 394 
394      # Loop through the input.      # Loop through the input.
395      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
396          # Echo the genome record to the output.          # Echo the genome record to the output.
397          print GENOMESOUT $genomeData;          my @fields = Tracer::ParseRecord($genomeData);
398            print GENOMESOUT join("\t", @fields, $groupName). "\n";
399          # Extract the genome ID.          # Extract the genome ID.
400          my ($genomeID) = Tracer::ParseRecord($genomeData);          my $genomeID = $fields[0];
401          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
402          # contig information for the genome is found, we change the value          # contig information for the genome is found, we change the value
403          # to 1. When we're all done with the regions, we can check the          # to 1. When we're all done with the regions, we can check the
# Line 503  Line 522 
522                  $errorCount++;                  $errorCount++;
523              } else {              } else {
524                  Trace("$regionCounter regions found in block $blockID.") if T(4);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
525                  # Write the block record.                  # Write the block record. Note that the block ID is prefixed by the group name to
526                    # make it unique.
527                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
528                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},
529                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
530                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
531                  # the content strings for each region.                  # the content strings for each region.
# Line 529  Line 549 
549                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
550                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
551                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
552                      print INCLUDESOUT join("\t", $blockID, $region) . "\n";                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";
553                  }                  }
554                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
555                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
556                      print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";
557                  }                  }
558                  # Count this block's regions.                  # Count this block's regions.
559                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3