4 |
use CGI; |
use CGI; |
5 |
use Tracer; |
use Tracer; |
6 |
use Genome; |
use Genome; |
7 |
|
use DBKernel; |
8 |
use SimBlocks; |
use SimBlocks; |
9 |
use File::Path; |
use File::Path; |
10 |
use BasicLocation; |
use BasicLocation; |
19 |
database (B<load>). |
database (B<load>). |
20 |
|
|
21 |
The script takes a single parameter-- a directory name. |
The script takes a single parameter-- a directory name. |
22 |
The default directory name is taken from the config file |
The default directory name is C<"$FIG_Config::data/CloseStrainSets">. |
23 |
parameter C<$fig_config::SimBlastData>. The output files |
The input files should be in subdirectories called |
24 |
will be produced in the similarity block data directory |
C<Block> under the subdirectories of the input directory. |
25 |
C<$fig_config::SimBlockData>, which will be created if |
The subdirectory names themselves are considered the name |
26 |
it does not exist. The input directory and all its |
of the close-strain set. So, for example |
27 |
|
|
28 |
|
Data/CloseStrainSets/Vibrio/Block |
29 |
|
|
30 |
|
would be presumed to contain the Vibrio strains. |
31 |
|
|
32 |
|
The output files will be produced in the similarity block |
33 |
|
data directory C<$fig_config::SimBlockData>, which will be |
34 |
|
created if it does not exist. The input directory and all its |
35 |
subdirectories will be processed for input files. |
subdirectories will be processed for input files. |
36 |
|
|
37 |
In addition to the directory name, the following |
In addition to the directory name, the following |
42 |
=item trace |
=item trace |
43 |
|
|
44 |
Trace level for output messages. A higher number means more |
Trace level for output messages. A higher number means more |
45 |
messages. The default is C<2>. Trace messages are sent to |
messages. The default is C<3>. Trace messages are sent to |
46 |
the file C<trace.log> in the B<$FIG_Config::tmp> |
the file C<trace.log> in the B<$FIG_Config::tmp> |
47 |
directory. |
directory. |
48 |
|
|
49 |
|
=item sql |
50 |
|
|
51 |
|
If specified, SQL activity will be traced at the specified |
52 |
|
trace level. |
53 |
|
|
54 |
=item load |
=item load |
55 |
|
|
56 |
C<yes> to load the data into the database, else C<no>. |
C<yes> to load the data into the database, else C<no>. |
61 |
C<yes> to generate output files from input files, else |
C<yes> to generate output files from input files, else |
62 |
C<no>. The default is C<yes>. |
C<no>. The default is C<yes>. |
63 |
|
|
64 |
|
=item createDB |
65 |
|
|
66 |
|
If specified, the database will be dropped and |
67 |
|
re-created before loading. |
68 |
|
|
69 |
=back |
=back |
70 |
|
|
71 |
For example, the following command line will process the |
For example, the following command line will process the |
72 |
input files in the C</Users/fig/BlastData> directory and |
input files in the C</Users/fig/BlastData> directory tree |
73 |
run at a trace level of 3. |
and run at a trace level of 3. |
74 |
|
|
75 |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
76 |
|
|
82 |
|
|
83 |
=head2 Input Directory |
=head2 Input Directory |
84 |
|
|
85 |
The following files must exist in each input directory. |
The following files must exist in each C<Block> directory |
86 |
|
under the input directory. |
87 |
|
|
88 |
=over 4 |
=over 4 |
89 |
|
|
300 |
my $TRAILER = 999999999; |
my $TRAILER = 999999999; |
301 |
|
|
302 |
# Parse the command line. |
# Parse the command line. |
303 |
my ($options, @arguments) = Tracer::ParseCommand({ trace => 1, |
my ($options, @arguments) = StandardSetup(['SimBlocks'], |
304 |
load => 'yes', |
{ load => ['yes', "'no' to suppress loading the database"], |
305 |
generate => 'yes'}, |
generate => ['yes', "'no' to suppress generating the load files"], |
306 |
|
createDB => [0, 'drop and create the database before loading'] |
307 |
|
}, |
308 |
|
"directoryName", |
309 |
@ARGV); |
@ARGV); |
310 |
# Extract the directory name from the argument array. |
# Extract the directory name from the argument array. |
311 |
my $inDirectoryTree = $FIG_Config::simBlastData; |
my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets"); |
312 |
if ($arguments[0]) { |
# Check to see if we need to create the database. |
313 |
$inDirectoryTree = Cwd::abs_path($arguments[0]); |
if ($options->{createDB}) { |
314 |
} |
Trace("Creating database.") if T(2); |
315 |
# Set up tracing. |
DBKernel::CreateDB($FIG_Config::simBlocksDB); |
316 |
my $traceLevel = $options->{trace}; |
} |
|
TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log"); |
|
317 |
# Get the output directory. |
# Get the output directory. |
318 |
my $outDirectory = $FIG_Config::simBlocksData; |
my $outDirectory = $FIG_Config::simBlocksData; |
319 |
# Insure that it exists. |
# Insure that it exists. |
320 |
if (! -d $outDirectory) { |
if (! -d $outDirectory) { |
321 |
Trace("Creating output directory $outDirectory.") if T(2); |
Trace("Creating output directory $outDirectory.") if T(2); |
322 |
mkpath($outDirectory); |
mkpath($outDirectory); |
323 |
} else { |
} elsif ($options->{generate} eq 'yes') { |
324 |
# Here we have an output directory already. Clear any |
# Here we have an output directory already and are going to generate new |
325 |
# leftover data from previous runs. |
# load files. Clear any leftover data from previous runs. |
326 |
my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory); |
my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory); |
327 |
my $numFiles = @files; |
my $numFiles = @files; |
328 |
if ($numFiles > 0) { |
if ($numFiles > 0) { |
344 |
Confess("Input directory \"$inDirectoryTree\" not found."); |
Confess("Input directory \"$inDirectoryTree\" not found."); |
345 |
} |
} |
346 |
# Loop through the subdirectories. |
# Loop through the subdirectories. |
347 |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) { |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) { |
348 |
# Verify that this is a directory. If it's ".", we use |
# Verify that this is a directory. |
349 |
# the top-level directory. |
my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks"; |
350 |
my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree : |
if (-d $inDirectory) { |
|
"$inDirectoryTree/$inputDirectory"); |
|
|
if (($inputDirectory !~ /\../) && -d $inDirectory) { |
|
351 |
# Here we have a directory to process. Check for a genome |
# Here we have a directory to process. Check for a genome |
352 |
# file. |
# file. |
353 |
my $genomeFileName = "$inDirectory/Genome.tbl"; |
my $genomeFileName = "$inDirectory/Genome.tbl"; |
356 |
} else { |
} else { |
357 |
# Now we can process the directory and accumulate the error |
# Now we can process the directory and accumulate the error |
358 |
# count. |
# count. |
359 |
$errorCount += ProcessDirectory($inDirectory, $outDirectory); |
$errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory); |
360 |
$dirCount++; |
$dirCount++; |
361 |
} |
} |
362 |
} |
} |
381 |
|
|
382 |
# Process a single input directory. |
# Process a single input directory. |
383 |
sub ProcessDirectory { |
sub ProcessDirectory { |
384 |
my ($inDirectory, $outDirectory) = @_; |
my ($inDirectory, $outDirectory, $groupName) = @_; |
385 |
Trace("Processing directory $inDirectory.") if T(2); |
Trace("Processing directory $inDirectory.") if T(2); |
386 |
# Our first task is to copy the genome data to the output directory |
# Our first task is to copy the genome data to the output directory |
387 |
# and add them to the genome list. |
# and add the genomes to the genome list. |
388 |
my %genomes = (); |
my %genomes = (); |
389 |
Open(\*GENOMESIN, "<$inDirectory/Genome.tbl"); |
Open(\*GENOMESIN, "<$inDirectory/Genome.tbl"); |
390 |
Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx"); |
Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx"); |
394 |
# Loop through the input. |
# Loop through the input. |
395 |
while (my $genomeData = <GENOMESIN>) { |
while (my $genomeData = <GENOMESIN>) { |
396 |
# Echo the genome record to the output. |
# Echo the genome record to the output. |
397 |
print GENOMESOUT $genomeData; |
my @fields = Tracer::ParseRecord($genomeData); |
398 |
|
print GENOMESOUT join("\t", @fields, $groupName). "\n"; |
399 |
# Extract the genome ID. |
# Extract the genome ID. |
400 |
my ($genomeID) = Tracer::ParseRecord($genomeData); |
my $genomeID = $fields[0]; |
401 |
# Store it in the genomes hash. We start with a value of 0. If |
# Store it in the genomes hash. We start with a value of 0. If |
402 |
# contig information for the genome is found, we change the value |
# contig information for the genome is found, we change the value |
403 |
# to 1. When we're all done with the regions, we can check the |
# to 1. When we're all done with the regions, we can check the |
521 |
Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0); |
Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0); |
522 |
$errorCount++; |
$errorCount++; |
523 |
} else { |
} else { |
524 |
Trace("$regionCounter regions found in block $blockID.") if T(2); |
Trace("$regionCounter regions found in block $blockID.") if T(4); |
525 |
# Write the block record. |
# Write the block record. Note that the block ID is prefixed by the group name to |
526 |
|
# make it unique. |
527 |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
528 |
print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len}, |
print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len}, |
529 |
$blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n"; |
$blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n"; |
530 |
# Find all the variance points in the block pattern. We'll use them to create |
# Find all the variance points in the block pattern. We'll use them to create |
531 |
# the content strings for each region. |
# the content strings for each region. |
549 |
$regionPeg{$region}, $location->Left, $content) . "\n"; |
$regionPeg{$region}, $location->Left, $content) . "\n"; |
550 |
print CONTAINSOUT join("\t", $location->Contig, $region, |
print CONTAINSOUT join("\t", $location->Contig, $region, |
551 |
$location->Length, $location->Left) . "\n"; |
$location->Length, $location->Left) . "\n"; |
552 |
print INCLUDESOUT join("\t", $blockID, $region); |
print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n"; |
553 |
} |
} |
554 |
# Finally, we need to connect this block to the genomes in which it occurs. |
# Finally, we need to connect this block to the genomes in which it occurs. |
555 |
for my $genomeID (keys %genomesFound) { |
for my $genomeID (keys %genomesFound) { |
556 |
print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n"; |
print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n"; |
557 |
} |
} |
558 |
# Count this block's regions. |
# Count this block's regions. |
559 |
$regionsCount += $regionCounter; |
$regionsCount += $regionCounter; |
580 |
$contigsCount++; |
$contigsCount++; |
581 |
} |
} |
582 |
Trace("$contigsCount contigs found.") if T(2); |
Trace("$contigsCount contigs found.") if T(2); |
583 |
|
# Close the output files. |
584 |
|
close CONTIGSOUT; |
585 |
|
close CONSISTSOUT; |
586 |
# Now warn the user about all the genomes that didn't have blocks. |
# Now warn the user about all the genomes that didn't have blocks. |
587 |
for my $genomeID (keys %genomes) { |
for my $genomeID (keys %genomes) { |
588 |
if (! $genomes{$genomeID}) { |
if (! $genomes{$genomeID}) { |