4 |
use CGI; |
use CGI; |
5 |
use Tracer; |
use Tracer; |
6 |
use Genome; |
use Genome; |
7 |
|
use ERDBLoad; |
8 |
|
use DBKernel; |
9 |
use SimBlocks; |
use SimBlocks; |
10 |
use File::Path; |
use File::Path; |
11 |
use BasicLocation; |
use BasicLocation; |
20 |
database (B<load>). |
database (B<load>). |
21 |
|
|
22 |
The script takes a single parameter-- a directory name. |
The script takes a single parameter-- a directory name. |
23 |
The default directory name is taken from the config file |
The default directory name is C<"$FIG_Config::data/CloseStrainSets">. |
24 |
parameter C<$fig_config::SimBlastData>. The output files |
The input files should be in subdirectories called |
25 |
will be produced in the similarity block data directory |
C<Block> under the subdirectories of the input directory. |
26 |
C<$fig_config::SimBlockData>, which will be created if |
The subdirectory names themselves are considered the name |
27 |
it does not exist. The input directory and all its |
of the close-strain set. So, for example |
28 |
|
|
29 |
|
Data/CloseStrainSets/Vibrio/Block |
30 |
|
|
31 |
|
would be presumed to contain the Vibrio strains. |
32 |
|
|
33 |
|
The output files will be produced in the similarity block |
34 |
|
data directory C<$fig_config::SimBlockData>, which will be |
35 |
|
created if it does not exist. The input directory and all its |
36 |
subdirectories will be processed for input files. |
subdirectories will be processed for input files. |
37 |
|
|
38 |
In addition to the directory name, the following |
In addition to the directory name, the following |
43 |
=item trace |
=item trace |
44 |
|
|
45 |
Trace level for output messages. A higher number means more |
Trace level for output messages. A higher number means more |
46 |
messages. The default is C<2>. Trace messages are sent to |
messages. The default is C<3>. Trace messages are sent to |
47 |
the file C<trace.log> in the B<$FIG_Config::tmp> |
the file C<trace.log> in the B<$FIG_Config::tmp> |
48 |
directory. |
directory. |
49 |
|
|
50 |
|
=item sql |
51 |
|
|
52 |
|
If specified, SQL activity will be traced at the specified |
53 |
|
trace level. |
54 |
|
|
55 |
=item load |
=item load |
56 |
|
|
57 |
C<yes> to load the data into the database, else C<no>. |
C<yes> to load the data into the database, else C<no>. |
62 |
C<yes> to generate output files from input files, else |
C<yes> to generate output files from input files, else |
63 |
C<no>. The default is C<yes>. |
C<no>. The default is C<yes>. |
64 |
|
|
65 |
|
=item createDB |
66 |
|
|
67 |
|
If specified, the database will be dropped and |
68 |
|
re-created before loading. |
69 |
|
|
70 |
=back |
=back |
71 |
|
|
72 |
For example, the following command line will process the |
For example, the following command line will process the |
73 |
input files in the C</Users/fig/BlastData> directory and |
input files in the C</Users/fig/BlastData> directory tree |
74 |
run at a trace level of 3. |
and run at a trace level of 3. |
75 |
|
|
76 |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
C<< SimLoad -trace=3 /Users/fig/BlastData >> |
77 |
|
|
83 |
|
|
84 |
=head2 Input Directory |
=head2 Input Directory |
85 |
|
|
86 |
The following files must exist in each input directory. |
The following files must exist in each C<Block> directory |
87 |
|
under the input directory. |
88 |
|
|
89 |
=over 4 |
=over 4 |
90 |
|
|
91 |
=item Genome.tbl |
=item genome.tbl |
92 |
|
|
93 |
This is a tab-delimited file that contains the ID of each |
This is a tab-delimited file that contains the ID of each |
94 |
genome followed by a description string. |
genome followed by a description string. |
95 |
|
|
96 |
=item Block.tbl, InterGenic_Block.tbl |
=item block.tbl, intergenic_block.tbl |
97 |
|
|
98 |
These are tab-delimited files that associate a gene name |
These are tab-delimited files that associate a gene name |
99 |
with each block. The InterGenic file is optional. |
with each block. The InterGenic file is optional. |
100 |
|
|
101 |
=item Region.tbl, InterGenic_Region.tbl |
=item region.tbl, intergenic_region.tbl |
102 |
|
|
103 |
These are tab-delimited files that describe each region |
These are tab-delimited files that describe each region |
104 |
of a block. The InterGenic file is optional. |
of a block. The InterGenic file is optional. |
107 |
|
|
108 |
The format of each file is given below. |
The format of each file is given below. |
109 |
|
|
110 |
=head3 Genome.tbl |
=head3 genome.tbl |
111 |
|
|
112 |
The Genome file is copied almost unmodified to the |
The Genome file is copied almost unmodified to the |
113 |
load file for the B<Genome> entity. Each record |
load file for the B<Genome> entity. Each record |
125 |
A text description of the genome (usually the species name with |
A text description of the genome (usually the species name with |
126 |
a strain ID). |
a strain ID). |
127 |
|
|
128 |
|
=item groupName |
129 |
|
|
130 |
|
The name of the group to which the genome belongs. |
131 |
|
|
132 |
=back |
=back |
133 |
|
|
134 |
=head3 Block.tbl, InterGenic_Block.tbl |
=head3 block.tbl, intergenic_block.tbl |
135 |
|
|
136 |
These files produce most of the data found in the B<GroupBlock> |
These files produce most of the data found in the B<GroupBlock> |
137 |
entity. Each record represents a single block. Blocks either |
entity. Each record represents a single block. Blocks either |
154 |
|
|
155 |
=back |
=back |
156 |
|
|
157 |
=head3 Region.tbl, InterGenic_Region.tbl |
=head3 region.tbl, intergenic_region.tbl |
158 |
|
|
159 |
These files describe the regions in each block. They are |
These files describe the regions in each block. They are |
160 |
used to derive the relationships between genomes and |
used to derive the relationships between genomes and |
305 |
my $TRAILER = 999999999; |
my $TRAILER = 999999999; |
306 |
|
|
307 |
# Parse the command line. |
# Parse the command line. |
308 |
my ($options, @arguments) = Tracer::ParseCommand({ trace => 1, |
my ($options, @arguments) = StandardSetup(['SimBlocks'], |
309 |
load => 'yes', |
{ load => ['yes', "'no' to suppress loading the database"], |
310 |
generate => 'yes'}, |
generate => ['yes', "'no' to suppress generating the load files"], |
311 |
|
createDB => [0, 'drop and create the database before loading'] |
312 |
|
}, |
313 |
|
"directoryName", |
314 |
@ARGV); |
@ARGV); |
315 |
# Extract the directory name from the argument array. |
# Extract the directory name from the argument array. |
316 |
my $inDirectoryTree = $FIG_Config::simBlastData; |
my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets"); |
317 |
if ($arguments[0]) { |
# Check to see if we need to create the database. |
318 |
$inDirectoryTree = Cwd::abs_path($arguments[0]); |
if ($options->{createDB}) { |
319 |
} |
my $dbname = SimBlocks::DBName(); |
320 |
# Set up tracing. |
Trace("Creating database $dbname.") if T(2); |
321 |
my $traceLevel = $options->{trace}; |
DBKernel::CreateDB($dbname); |
322 |
TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log"); |
} |
323 |
# Get the output directory. |
# Get the output directory. |
324 |
my $outDirectory = $FIG_Config::simBlocksData; |
my $outDirectory = $FIG_Config::simBlocksData; |
325 |
|
# Default it if it isn't present. |
326 |
|
if (! $outDirectory) { |
327 |
|
$outDirectory = "$FIG_Config::fig/BlockData"; |
328 |
|
} |
329 |
# Insure that it exists. |
# Insure that it exists. |
330 |
if (! -d $outDirectory) { |
if (! -d $outDirectory) { |
331 |
Trace("Creating output directory $outDirectory.") if T(2); |
Trace("Creating output directory $outDirectory.") if T(2); |
332 |
mkpath($outDirectory); |
mkpath($outDirectory); |
333 |
} else { |
} elsif ($options->{generate} eq 'yes') { |
334 |
# Here we have an output directory already. Clear any |
# Here we have an output directory already and are going to generate new |
335 |
# leftover data from previous runs. |
# load files. Clear any leftover data from previous runs. |
336 |
my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory); |
my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory); |
337 |
my $numFiles = @files; |
my $numFiles = @files; |
338 |
if ($numFiles > 0) { |
if ($numFiles > 0) { |
343 |
# Create an error counter and a directory counter. |
# Create an error counter and a directory counter. |
344 |
my $errorCount = 0; |
my $errorCount = 0; |
345 |
my $dirCount = 0; |
my $dirCount = 0; |
346 |
|
# Get a similarity block object. Note that we've waited until the database |
347 |
|
# was already created before doing this. The tables, however, may not yet |
348 |
|
# exist, so we can't do very much with it. |
349 |
|
my $simBlocks = SimBlocks->new(); |
350 |
|
# Determine if this is a load-only run. |
351 |
|
my $loadOnly = ($options->{generate} eq 'no'); |
352 |
|
# Create the loader objects for the tables. These are global to the whole |
353 |
|
# script. |
354 |
|
my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly); |
355 |
|
my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly); |
356 |
|
my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly); |
357 |
|
my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly); |
358 |
|
my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly); |
359 |
|
my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly); |
360 |
|
my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly); |
361 |
|
my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly); |
362 |
|
# This hash helps us to clean up when we're done. |
363 |
|
my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad, |
364 |
|
GroupBlock => $GroupBlockLoad, Region => $RegionLoad, |
365 |
|
ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad, |
366 |
|
HasInstanceOf => $HasInstanceOfLoad, |
367 |
|
IncludesRegion => $IncludesRegionLoad}; |
368 |
# Check to see if we should generate the output files. |
# Check to see if we should generate the output files. |
369 |
if ($options->{generate} eq 'no') { |
if ($loadOnly) { |
370 |
# Here we are to use existing output files. |
# Here we are to use existing output files. |
371 |
Trace("Existing database load files will be used.") if T(2); |
Trace("Existing database load files will be used.") if T(2); |
372 |
} else { |
} else { |
376 |
Confess("Input directory \"$inDirectoryTree\" not found."); |
Confess("Input directory \"$inDirectoryTree\" not found."); |
377 |
} |
} |
378 |
# Loop through the subdirectories. |
# Loop through the subdirectories. |
379 |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) { |
for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) { |
380 |
# Verify that this is a directory. If it's ".", we use |
# Verify that this is a directory. |
381 |
# the top-level directory. |
my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks"; |
382 |
my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree : |
if (-d $inDirectory) { |
|
"$inDirectoryTree/$inputDirectory"); |
|
|
if (($inputDirectory !~ /\../) && -d $inDirectory) { |
|
383 |
# Here we have a directory to process. Check for a genome |
# Here we have a directory to process. Check for a genome |
384 |
# file. |
# file. |
385 |
my $genomeFileName = "$inDirectory/Genome.tbl"; |
my $genomeFileName = "$inDirectory/genome.tbl"; |
386 |
if (! -e $genomeFileName) { |
if (! -e $genomeFileName) { |
387 |
Trace("$genomeFileName not found. Directory skipped.") if T(1); |
Trace("$genomeFileName not found. Directory skipped.") if T(1); |
388 |
} else { |
} else { |
389 |
# Now we can process the directory and accumulate the error |
# Now we can process the directory and accumulate the error |
390 |
# count. |
# count. |
391 |
$errorCount += ProcessDirectory($inDirectory, $outDirectory); |
$errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory); |
392 |
$dirCount++; |
$dirCount++; |
393 |
} |
} |
394 |
} |
} |
395 |
} |
} |
396 |
Trace("Load files generated from $dirCount directories.") if T(2); |
Trace("Load files generated from $dirCount directories.") if T(2); |
397 |
|
# Now finish up the loading. |
398 |
|
my $stats = Stats->new(); |
399 |
|
for my $table (keys %{$loaderList}) { |
400 |
|
Trace("Finishing $table.") if T(2); |
401 |
|
my $tableThing = $loaderList->{$table}; |
402 |
|
my $newStats = $tableThing->Finish(); |
403 |
|
$stats->Accumulate($newStats); |
404 |
|
} |
405 |
|
Trace("Statistics for generate.\n" . $stats->Show()) if T(2); |
406 |
} |
} |
407 |
# Check for errors. |
# Check for errors. |
408 |
if ($errorCount > 0) { |
if ($errorCount > 0) { |
412 |
if ($options->{load} eq 'yes') { |
if ($options->{load} eq 'yes') { |
413 |
# Here we have no outstanding errors and the user wants us to load |
# Here we have no outstanding errors and the user wants us to load |
414 |
# the database. First, we create a similarity block object. |
# the database. First, we create a similarity block object. |
|
my $simBlocks = SimBlocks->new(); |
|
415 |
# Use it to load the database. Note we specify that the tables are to be |
# Use it to load the database. Note we specify that the tables are to be |
416 |
# dropped and rebuilt. |
# dropped and rebuilt. |
417 |
$simBlocks->LoadTables($outDirectory, 1); |
$simBlocks->LoadTables($outDirectory, 1); |
421 |
|
|
422 |
# Process a single input directory. |
# Process a single input directory. |
423 |
sub ProcessDirectory { |
sub ProcessDirectory { |
424 |
my ($inDirectory, $outDirectory) = @_; |
my ($inDirectory, $outDirectory, $groupName) = @_; |
425 |
Trace("Processing directory $inDirectory.") if T(2); |
Trace("Processing directory $inDirectory.") if T(2); |
426 |
# Our first task is to copy the genome data to the output directory |
# Our first task is to copy the genome data to the output |
427 |
# and add them to the genome list. |
# and add the genomes to the genome list. |
428 |
my %genomes = (); |
my %genomes = (); |
429 |
Open(\*GENOMESIN, "<$inDirectory/Genome.tbl"); |
Open(\*GENOMESIN, "<$inDirectory/genome.tbl"); |
|
Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx"); |
|
430 |
# Count the genomes read and errors found. |
# Count the genomes read and errors found. |
431 |
my $genomeCount = 0; |
my $genomeCount = 0; |
432 |
my $errorCount = 0; |
my $errorCount = 0; |
433 |
# Loop through the input. |
# Loop through the input. |
434 |
while (my $genomeData = <GENOMESIN>) { |
while (my $genomeData = <GENOMESIN>) { |
435 |
# Echo the genome record to the output. |
# Echo the genome record to the output. |
436 |
print GENOMESOUT $genomeData; |
my @fields = Tracer::ParseRecord($genomeData); |
437 |
|
$GenomeLoad->Put(@fields, $groupName); |
438 |
# Extract the genome ID. |
# Extract the genome ID. |
439 |
my ($genomeID) = Tracer::ParseRecord($genomeData); |
my $genomeID = $fields[0]; |
440 |
# Store it in the genomes hash. We start with a value of 0. If |
# Store it in the genomes hash. We start with a value of 0. If |
441 |
# contig information for the genome is found, we change the value |
# contig information for the genome is found, we change the value |
442 |
# to 1. When we're all done with the regions, we can check the |
# to 1. When we're all done with the regions, we can check the |
446 |
$genomeCount++; |
$genomeCount++; |
447 |
} |
} |
448 |
Trace("$genomeCount genomes found.") if T(2); |
Trace("$genomeCount genomes found.") if T(2); |
449 |
# Close the files. |
# Close the genome file. |
450 |
close GENOMESIN; |
close GENOMESIN; |
|
close GENOMESOUT; |
|
451 |
# Create the contig hash used to associate contigs to their parent |
# Create the contig hash used to associate contigs to their parent |
452 |
# genomes. |
# genomes. |
453 |
my %contigs = (); |
my %contigs = (); |
454 |
# Now we begin to read the Block and Region files in parallel. Both |
# Now we begin to read the Block and Region files in parallel. Both |
455 |
# are sorted by block ID, so all processing for this section of the |
# are sorted by block ID, so all processing for this section of the |
456 |
# script is done a block at a time. The first task is to |
# script is done a block at a time. First, we determine which file |
457 |
# open the output files. |
# sets we'll be processing. There are at most two: regular and |
458 |
Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx"); |
# intergenic. |
|
Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx"); |
|
|
Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx"); |
|
|
Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx"); |
|
|
Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx"); |
|
|
# Determine which file sets we'll be processing. |
|
459 |
my @fileSets = (); |
my @fileSets = (); |
460 |
my @prefixes = ("", "InterGenic_"); |
my @prefixes = ("", "intergenic_"); |
461 |
for my $prefix (@prefixes) { |
for my $prefix (@prefixes) { |
462 |
if (-e "$inDirectory/${prefix}Block.tbl") { |
if (-e "$inDirectory/${prefix}block.tbl") { |
463 |
push @fileSets, $prefix; |
push @fileSets, $prefix; |
464 |
} |
} |
465 |
} |
} |
469 |
my ($blocksCount, $regionsCount) = (0, 0); |
my ($blocksCount, $regionsCount) = (0, 0); |
470 |
# Loop through the useful file sets. |
# Loop through the useful file sets. |
471 |
for my $fileSet (@fileSets) { |
for my $fileSet (@fileSets) { |
472 |
Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl"); |
Open(\*BLOCKSIN, "<$inDirectory/${fileSet}block.tbl"); |
473 |
Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl"); |
Open(\*REGIONSIN, "<$inDirectory/${fileSet}region.tbl"); |
474 |
Trace("Processing ${fileSet}Blocks.") if T(2); |
Trace("Processing ${fileSet}Blocks.") if T(2); |
475 |
# The outer loop processes blocks. This is accomplished by reading |
# The outer loop processes blocks. This is accomplished by reading |
476 |
# through the block file. We prime the loop by reading the first |
# through the block file. We prime the loop by reading the first |
554 |
Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0); |
Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0); |
555 |
$errorCount++; |
$errorCount++; |
556 |
} else { |
} else { |
557 |
Trace("$regionCounter regions found in block $blockID.") if T(2); |
Trace("$regionCounter regions found in block $blockID.") if T(4); |
558 |
# Write the block record. |
# Write the block record. Note that the block ID is prefixed by the group name to |
559 |
|
# make it unique. |
560 |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
my $variance = $blockData{"snip-count"} / $blockData{len}; |
561 |
print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len}, |
$GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len}, |
562 |
$blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n"; |
$blockData{"snip-count"}, $variance, $blockData{pattern}); |
563 |
# Find all the variance points in the block pattern. We'll use them to create |
# Find all the variance points in the block pattern. We'll use them to create |
564 |
# the content strings for each region. |
# the content strings for each region. |
565 |
my @positions = SimBlocks::ParsePattern($blockData{pattern}); |
my @positions = SimBlocks::ParsePattern($blockData{pattern}); |
577 |
# Get the region's location data. |
# Get the region's location data. |
578 |
my $location = BasicLocation->new($region); |
my $location = BasicLocation->new($region); |
579 |
# Write this region to the output files. |
# Write this region to the output files. |
580 |
print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir, |
$RegionLoad->Put($region, $location->Contig, $location->Dir, |
581 |
$location->Right, $location->Length, |
$location->Right, $location->Length, |
582 |
$regionPeg{$region}, $location->Left, $content) . "\n"; |
$regionPeg{$region}, $location->Left, $content); |
583 |
print CONTAINSOUT join("\t", $location->Contig, $region, |
$ContainsRegionLoad->Put($location->Contig, $region, $location->Length, |
584 |
$location->Length, $location->Left) . "\n"; |
$location->Left); |
585 |
print INCLUDESOUT join("\t", $blockID, $region); |
$IncludesRegionLoad->Put("$groupName:$blockID", $region); |
586 |
} |
} |
587 |
# Finally, we need to connect this block to the genomes in which it occurs. |
# Finally, we need to connect this block to the genomes in which it occurs. |
588 |
for my $genomeID (keys %genomesFound) { |
for my $genomeID (keys %genomesFound) { |
589 |
print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n"; |
$HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID"); |
590 |
} |
} |
591 |
# Count this block's regions. |
# Count this block's regions. |
592 |
$regionsCount += $regionCounter; |
$regionsCount += $regionCounter; |
597 |
close REGIONSIN; |
close REGIONSIN; |
598 |
} |
} |
599 |
# Close the output files. |
# Close the output files. |
|
close REGIONSOUT; |
|
|
close BLOCKSOUT; |
|
|
close INSTANCESOUT; |
|
600 |
# All the block data has been written. Tell the user what we found. |
# All the block data has been written. Tell the user what we found. |
601 |
Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2); |
Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2); |
602 |
# The next task is to write the genome/contig data. This is provided by the |
# The next task is to write the genome/contig data. This is provided by the |
603 |
# "%contigs" hash. First, we need to open the files. |
# "%contigs" hash. |
604 |
my $contigsCount = 0; |
my $contigsCount = 0; |
|
Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx"); |
|
|
Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx"); |
|
605 |
for my $contigID (keys %contigs) { |
for my $contigID (keys %contigs) { |
606 |
print CONTIGSOUT "$contigID\n"; |
$ContigLoad->Put("$contigID"); |
607 |
print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n"; |
$ConsistsOfLoad->Put($contigs{$contigID}, $contigID); |
608 |
$contigsCount++; |
$contigsCount++; |
609 |
} |
} |
610 |
Trace("$contigsCount contigs found.") if T(2); |
Trace("$contigsCount contigs found.") if T(2); |