4 |
use CGI; |
use CGI; |
5 |
use Tracer; |
use Tracer; |
6 |
use Genome; |
use Genome; |
7 |
|
use DBKernel; |
8 |
use SimBlocks; |
use SimBlocks; |
9 |
use File::Path; |
use File::Path; |
10 |
use BasicLocation; |
use BasicLocation; |
19 |
database (B<load>). |
database (B<load>). |
20 |
|
|
21 |
The script takes a single parameter-- a directory name. |
The script takes a single parameter-- a directory name. |
22 |
The default directory name is taken from the config file |
The default directory name is C<"$FIG_Config::data/CloseStrainSets">. |
23 |
parameter C<$fig_config::SimBlastData>. The output files |
The input files should be in subdirectories called |
24 |
will be produced in the similarity block data directory |
C<Block> under the subdirectories of the input directory. |
25 |
C<$fig_config::SimBlockData>, which will be created if |
The subdirectory names themselves are considered the name |
26 |
it does not exist. The input directory and all its |
of the close-strain set. So, for example |
27 |
|
|
28 |
|
Data/CloseStrainSets/Vibrio/Block |
29 |
|
|
30 |
|
would be presumed to contain the Vibrio strains. |
31 |
|
|
32 |
|
The output files will be produced in the similarity block |
33 |
|
data directory C<$fig_config::SimBlockData>, which will be |
34 |
|
created if it does not exist. The input directory and all its |
35 |
subdirectories will be processed for input files. |
subdirectories will be processed for input files. |
36 |
|
|
37 |
In addition to the directory name, the following |
In addition to the directory name, the following |
61 |
C<yes> to generate output files from input files, else |
C<yes> to generate output files from input files, else |
62 |
C<no>. The default is C<yes>. |
C<no>. The default is C<yes>. |
63 |
|
|
64 |
|
=item createDB |
65 |
|
|
66 |
|
If specified, the database will be dropped and |
67 |
|
re-created before loading. |
68 |
|
|
69 |
=back |
=back |
70 |
|
|
71 |
For example, the following command line will process the |
For example, the following command line will process the |
72 |
input files in the C</Users/fig/BlastData> directory and |
input files in the C</Users/fig/BlastData> directory tree |
73 |
run at a trace level of 3. |
and run at a trace level of 3. |
74 |
|
|
75 |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
76 |
|
|
82 |
|
|
83 |
=head2 Input Directory |
=head2 Input Directory |
84 |
|
|
85 |
The following files must exist in each input directory. |
The following files must exist in each C<Block> directory |
86 |
|
under the input directory. |
87 |
|
|
88 |
=over 4 |
=over 4 |
89 |
|
|
300 |
my $TRAILER = 999999999; |
my $TRAILER = 999999999; |
301 |
|
|
302 |
# Parse the command line. |
# Parse the command line. |
303 |
my ($options, @arguments) = Tracer::StandardSetup(['SimBlocks'], |
my ($options, @arguments) = StandardSetup(['SimBlocks'], |
304 |
{ load => 'yes', |
{ load => ['yes', "'no' to suppress loading the database"], |
305 |
generate => 'yes'}, |
generate => ['yes', "'no' to suppress generating the load files"], |
306 |
|
createDB => [0, 'drop and create the database before loading'] |
307 |
|
}, |
308 |
|
"directoryName", |
309 |
@ARGV); |
@ARGV); |
310 |
# Extract the directory name from the argument array. |
# Extract the directory name from the argument array. |
311 |
my $inDirectoryTree = $FIG_Config::simBlastData; |
my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets"); |
312 |
if ($arguments[0]) { |
# Check to see if we need to create the database. |
313 |
$inDirectoryTree = Cwd::abs_path($arguments[0]); |
if ($options->{createDB}) { |
314 |
|
Trace("Creating database.") if T(2); |
315 |
|
DBKernel::CreateDB($FIG_Config::simBlocksDB); |
316 |
} |
} |
317 |
# Get the output directory. |
# Get the output directory. |
318 |
my $outDirectory = $FIG_Config::simBlocksData; |
my $outDirectory = $FIG_Config::simBlocksData; |
344 |
Confess("Input directory \"$inDirectoryTree\" not found."); |
Confess("Input directory \"$inDirectoryTree\" not found."); |
345 |
} |
} |
346 |
# Loop through the subdirectories. |
# Loop through the subdirectories. |
347 |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) { |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) { |
348 |
# Verify that this is a directory. If it's ".", we use |
# Verify that this is a directory. |
349 |
# the top-level directory. |
my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks"; |
350 |
my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree : |
if (-d $inDirectory) { |
|
"$inDirectoryTree/$inputDirectory"); |
|
|
if (($inputDirectory !~ /\../) && -d $inDirectory) { |
|
351 |
# Here we have a directory to process. Check for a genome |
# Here we have a directory to process. Check for a genome |
352 |
# file. |
# file. |
353 |
my $genomeFileName = "$inDirectory/Genome.tbl"; |
my $genomeFileName = "$inDirectory/Genome.tbl"; |
356 |
} else { |
} else { |
357 |
# Now we can process the directory and accumulate the error |
# Now we can process the directory and accumulate the error |
358 |
# count. |
# count. |
359 |
$errorCount += ProcessDirectory($inDirectory, $outDirectory); |
$errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory); |
360 |
$dirCount++; |
$dirCount++; |
361 |
} |
} |
362 |
} |
} |
381 |
|
|
382 |
# Process a single input directory. |
# Process a single input directory. |
383 |
sub ProcessDirectory { |
sub ProcessDirectory { |
384 |
my ($inDirectory, $outDirectory) = @_; |
my ($inDirectory, $outDirectory, $groupName) = @_; |
385 |
Trace("Processing directory $inDirectory.") if T(2); |
Trace("Processing directory $inDirectory.") if T(2); |
386 |
# Our first task is to copy the genome data to the output directory |
# Our first task is to copy the genome data to the output directory |
387 |
# and add them to the genome list. |
# and add the genomes to the genome list. |
388 |
my %genomes = (); |
my %genomes = (); |
389 |
Open(\*GENOMESIN, "<$inDirectory/Genome.tbl"); |
Open(\*GENOMESIN, "<$inDirectory/Genome.tbl"); |
390 |
Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx"); |
Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx"); |
394 |
# Loop through the input. |
# Loop through the input. |
395 |
while (my $genomeData = <GENOMESIN>) { |
while (my $genomeData = <GENOMESIN>) { |
396 |
# Echo the genome record to the output. |
# Echo the genome record to the output. |
397 |
print GENOMESOUT $genomeData; |
my @fields = Tracer::ParseRecord($genomeData); |
398 |
|
print GENOMESOUT join("\t", @fields, $groupName). "\n"; |
399 |
# Extract the genome ID. |
# Extract the genome ID. |
400 |
my ($genomeID) = Tracer::ParseRecord($genomeData); |
my $genomeID = $fields[0]; |
401 |
# Store it in the genomes hash. We start with a value of 0. If |
# Store it in the genomes hash. We start with a value of 0. If |
402 |
# contig information for the genome is found, we change the value |
# contig information for the genome is found, we change the value |
403 |
# to 1. When we're all done with the regions, we can check the |
# to 1. When we're all done with the regions, we can check the |
522 |
$errorCount++; |
$errorCount++; |
523 |
} else { |
} else { |
524 |
Trace("$regionCounter regions found in block $blockID.") if T(4); |
Trace("$regionCounter regions found in block $blockID.") if T(4); |
525 |
# Write the block record. |
# Write the block record. Note that the block ID is prefixed by the group name to |
526 |
|
# make it unique. |
527 |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
528 |
print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len}, |
print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len}, |
529 |
$blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n"; |
$blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n"; |
530 |
# Find all the variance points in the block pattern. We'll use them to create |
# Find all the variance points in the block pattern. We'll use them to create |
531 |
# the content strings for each region. |
# the content strings for each region. |
549 |
$regionPeg{$region}, $location->Left, $content) . "\n"; |
$regionPeg{$region}, $location->Left, $content) . "\n"; |
550 |
print CONTAINSOUT join("\t", $location->Contig, $region, |
print CONTAINSOUT join("\t", $location->Contig, $region, |
551 |
$location->Length, $location->Left) . "\n"; |
$location->Length, $location->Left) . "\n"; |
552 |
print INCLUDESOUT join("\t", $blockID, $region) . "\n"; |
print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n"; |
553 |
} |
} |
554 |
# Finally, we need to connect this block to the genomes in which it occurs. |
# Finally, we need to connect this block to the genomes in which it occurs. |
555 |
for my $genomeID (keys %genomesFound) { |
for my $genomeID (keys %genomesFound) { |
556 |
print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n"; |
print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n"; |
557 |
} |
} |
558 |
# Count this block's regions. |
# Count this block's regions. |
559 |
$regionsCount += $regionCounter; |
$regionsCount += $regionCounter; |