[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Annotation of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     use strict;
4 :     use CGI;
5 :     use Tracer;
6 :     use Genome;
7 : parrello 1.9 use ERDBLoad;
8 : parrello 1.7 use DBKernel;
9 : parrello 1.1 use SimBlocks;
10 :     use File::Path;
11 : parrello 1.2 use BasicLocation;
12 : parrello 1.3 use Cwd;
13 : parrello 1.1
14 :     =head1 Similarity Block Loader
15 :    
16 :     This script loads the similarity block database from
17 : parrello 1.3 the input files. The load process involves two steps:
18 :     converting the input files into C<dtx> load files
19 :     (B<generate>), and loading the C<dtx> files into the
20 :     database (B<load>).
21 : parrello 1.1
22 : parrello 1.3 The script takes a single parameter-- a directory name.
23 : parrello 1.7 The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
24 :     The input files should be in subdirectories called
25 :     C<Block> under the subdirectories of the input directory.
26 :     The subdirectory names themselves are considered the name
27 :     of the close-strain set. So, for example
28 :    
29 :     Data/CloseStrainSets/Vibrio/Block
30 :    
31 :     would be presumed to contain the Vibrio strains.
32 :    
33 :     The output files will be produced in the similarity block
34 :     data directory C<$fig_config::SimBlockData>, which will be
35 :     created if it does not exist. The input directory and all its
36 : parrello 1.3 subdirectories will be processed for input files.
37 : parrello 1.1
38 :     In addition to the directory name, the following
39 :     option parameters are supported.
40 :    
41 :     =over 4
42 :    
43 :     =item trace
44 :    
45 :     Trace level for output messages. A higher number means more
46 : parrello 1.4 messages. The default is C<3>. Trace messages are sent to
47 : parrello 1.3 the file C<trace.log> in the B<$FIG_Config::tmp>
48 :     directory.
49 : parrello 1.1
50 : parrello 1.4 =item sql
51 :    
52 :     If specified, SQL activity will be traced at the specified
53 :     trace level.
54 :    
55 : parrello 1.1 =item load
56 :    
57 :     C<yes> to load the data into the database, else C<no>.
58 :     The default is C<yes>.
59 :    
60 :     =item generate
61 :    
62 :     C<yes> to generate output files from input files, else
63 :     C<no>. The default is C<yes>.
64 :    
65 : parrello 1.6 =item createDB
66 :    
67 :     If specified, the database will be dropped and
68 :     re-created before loading.
69 :    
70 : parrello 1.1 =back
71 :    
72 :     For example, the following command line will process the
73 : parrello 1.7 input files in the C</Users/fig/BlastData> directory tree
74 :     and run at a trace level of 3.
75 : parrello 1.1
76 : parrello 1.3 C<< SimLoad -trace=3 /Users/fig/BlastData >>
77 :    
78 :     The following command line converts the input files in
79 :     the default directory into load files but does not load
80 :     the database and runs at a trace level of 2.
81 : parrello 1.1
82 : parrello 1.3 C<< SimLoad -load=no >>
83 : parrello 1.1
84 :     =head2 Input Directory
85 :    
86 : parrello 1.7 The following files must exist in each C<Block> directory
87 :     under the input directory.
88 : parrello 1.1
89 :     =over 4
90 :    
91 : parrello 1.8 =item genome.tbl
92 : parrello 1.1
93 :     This is a tab-delimited file that contains the ID of each
94 :     genome followed by a description string.
95 :    
96 : parrello 1.8 =item block.tbl, intergenic_block.tbl
97 : parrello 1.1
98 :     These are tab-delimited files that associate a gene name
99 :     with each block. The InterGenic file is optional.
100 :    
101 : parrello 1.8 =item region.tbl, intergenic_region.tbl
102 : parrello 1.1
103 :     These are tab-delimited files that describe each region
104 :     of a block. The InterGenic file is optional.
105 :    
106 :     =back
107 :    
108 :     The format of each file is given below.
109 :    
110 : parrello 1.8 =head3 genome.tbl
111 : parrello 1.1
112 :     The Genome file is copied almost unmodified to the
113 :     load file for the B<Genome> entity. Each record
114 :     represents a single genome. It has the following
115 :     fields.
116 :    
117 :     =over 4
118 :    
119 :     =item genomeID
120 :    
121 :     The ID of the genome.
122 :    
123 :     =item description
124 :    
125 :     A text description of the genome (usually the species name with
126 :     a strain ID).
127 :    
128 : parrello 1.9 =item groupName
129 : parrello 1.8
130 :     The name of the group to which the genome belongs.
131 :    
132 : parrello 1.1 =back
133 :    
134 : parrello 1.8 =head3 block.tbl, intergenic_block.tbl
135 : parrello 1.1
136 :     These files produce most of the data found in the B<GroupBlock>
137 :     entity. Each record represents a single block. Blocks either
138 :     correspond to genes or to inter-genic regions. Both types
139 :     of blocks may appear in multiple locations in multiple
140 :     contigs. The files should be sorted by block ID.
141 :    
142 :     =over 4
143 :    
144 :     =item blockID
145 :    
146 :     The unique ID of the block. This ID is also used in the
147 :     C<Region.tbl> file.
148 :    
149 :     =item blockName
150 :    
151 :     The name of the block. For a gene block, this is the gene
152 :     name. For an inter-genic block, it is a name computed
153 :     from the names of the genes that are commonly nearby.
154 :    
155 :     =back
156 :    
157 : parrello 1.8 =head3 region.tbl, intergenic_region.tbl
158 : parrello 1.1
159 :     These files describe the regions in each block. They are
160 :     used to derive the relationships between genomes and
161 :     contigs (B<ConsistsOf>), the contigs themselves
162 :     (B<Contig>), the relationships between blocks and
163 :     contigs (B<ContainsRegionIn>), and the derived
164 :     relationship between genomes and blocks
165 :     (B<HasInstanceOf>). The files must be sorted by block
166 :     ID, and each record in a file represents a single
167 :     region in a contig. Each region belongs to a
168 :     single block. Note that the C<Region.tbl> file contains
169 :     the regions for the C<Block.tbl> file, and the
170 :     C<InterGenic_Region.tbl> file contains the regions for
171 :     the C<InterGenic_Block.tbl> file. No mixing is allowed.
172 :    
173 :     =over 4
174 :    
175 :     =item regionPEG
176 :    
177 :     PEG ID for this region. If the region is in an
178 :     inter-genic block, this field will be composed of
179 :     the IDs for the neighboring genes.
180 :    
181 :     =item genomeID
182 :    
183 :     ID of the relevant genome.
184 :    
185 :     =item contigID
186 :    
187 :     ID of the contig containing this region. This is a standard contig
188 :     ID that does not include the genome ID. It will be converted to
189 :     a Sprout-style contig ID (which includes the genome data) before
190 :     it is written to the output files.
191 :    
192 :     =item begin
193 :    
194 :     The start point of the region. For a forward region this is the
195 :     left endpoint; for a reverse region it is the right endpoint. It
196 :     is a 1-based offset (which is consistent with Sprout usage), and
197 :     the identified location is inside the region.
198 :    
199 :     =item end
200 :    
201 :     The end point of the region. For a forward region this is the
202 :     right endpoint; for a reverse region it is the left endpoint. It
203 :     is a 1-based offset (which is consistent with Sprout usage), and
204 :     the identified location is inside the region.
205 :    
206 :     =item blockID
207 :    
208 :     The ID of the block containing this region.
209 :    
210 :     =item snippet
211 :    
212 :     A DNA snippet representing the contents of the region. The region
213 :     may be shorter than the block length. If that is the case, the
214 :     snippet will contain insertion characters (C<->). So, while it
215 :     is not the case that every region in a block must be the same
216 :     length, all of the snippets for a block must be the same length.
217 :     The snippets will be in alignment form. In other words, if the
218 :     region is reversed, the nucleotide letters will be the complement
219 :     in transcription order. (For example, if positions 20 through 25
220 :     of contig B<XXX> are C<AGCCTT>, then the snippet for C<XXX_25_20>
221 :     will be C<AAGGCT>.)
222 :    
223 :     =back
224 :    
225 :     =head2 Output File Notes
226 :    
227 :     =over 4
228 :    
229 :     =item Genome.dtx
230 :    
231 :     This file is a direct copy of the C<Genome.tbl> file; however, we
232 :     also use it to create a hash of genome IDs (C<%genomes>). The hash
233 :     is useful when validating the C<Region.tbl> file.
234 :    
235 :     =item Contig.dtx
236 :    
237 :     This file contains nothing but contig IDs. As contigs are
238 :     discovered from the C<Region.tbl> file their IDs are put
239 :     into the C<%contigs> hash. This hash maps contig IDs to
240 :     their parent genome IDs. When processing is complete,
241 :     this file is generated from the hash.
242 :    
243 :     =item GroupBlock.dtx
244 :    
245 :     This file describes the blocks. As records come in from
246 :     C<Region.tbl>, we build a hash called C<%blockData> that
247 :     contains our latest estimate of all the C<GroupBlock.dtx>
248 :     columns for the current block (with the exception of
249 :     B<variance>, which is computed by dividing the B<snip-count>
250 :     by the length (B<len>).
251 :    
252 :     =item ConsistsOf.dtx
253 :    
254 :     This file maps genomes to contigs, and is generated from
255 :     the C<%contigs> hash built while reading the C<Region.tbl>
256 :     file.
257 :    
258 :     =item HasInstanceOf.dtx
259 :    
260 :     This file lists the genomes containing each block. The
261 :     C<Region.tbl> file is sorted by block. While inside a
262 :     block's section of the file, we use a hash called
263 :     C<%genomesFound> that contains the ID of every genome
264 :     found for the block. When we finish with a block,
265 :     we run through the C<%genomesFound> hash to produce
266 :     the block's B<HasInstanceOf> data.
267 :    
268 :     =item Region.dtx
269 :    
270 :     This file describes the contig regions in the blocks.
271 :     As the C<Region.tbl> file is read in, we build a
272 :     hash called C<%regionMap> that maps a region's
273 :     SEED-style location string to the DNA content.
274 :     When we finish with a block, the DNA content is
275 :     converted into an alignment by comparing it to
276 :     the block's pattern in C<%blockData>. (Essentially,
277 :     we only include the region's content for the
278 :     positions that vary between regions in the block.)
279 :     From this and the region string itself, we have
280 :     enough data to create the B<Region>
281 :     data.
282 :    
283 :     =item IncludesRegion.dtx
284 :    
285 :     This file maps group blocks to regions. The first column
286 :     is the block ID and the second column is the SEED-style
287 :     region string for the target region. This file is built
288 :     in parallel with C<Region.dtx>. It will have one record
289 :     for each region.
290 :    
291 :     =item ContainsRegion.dtx
292 :    
293 :     This file maps contigs to regions. The first column is
294 :     the contig ID and the second column is the SEED-style
295 :     location string for the region. It contains two redundant
296 :     columns used for sorting-- the region length (column 3)
297 :     and the left-most region position (column 4). This
298 :     file is built in parallel with C<Region.dtx>. It will
299 :     have one record for each region.
300 :    
301 :     =cut
302 :    
303 :     # Create a huge number we can use for an end-of-file
304 :     # indicator in the block ID.
305 :     my $TRAILER = 999999999;
306 :    
307 :     # Parse the command line.
308 : parrello 1.5 my ($options, @arguments) = StandardSetup(['SimBlocks'],
309 :     { load => ['yes', "'no' to suppress loading the database"],
310 : parrello 1.6 generate => ['yes', "'no' to suppress generating the load files"],
311 :     createDB => [0, 'drop and create the database before loading']
312 : parrello 1.5 },
313 :     "directoryName",
314 : parrello 1.1 @ARGV);
315 :     # Extract the directory name from the argument array.
316 : parrello 1.7 my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
317 :     # Check to see if we need to create the database.
318 :     if ($options->{createDB}) {
319 :     Trace("Creating database.") if T(2);
320 :     DBKernel::CreateDB($FIG_Config::simBlocksDB);
321 : parrello 1.1 }
322 :     # Get the output directory.
323 :     my $outDirectory = $FIG_Config::simBlocksData;
324 :     # Insure that it exists.
325 :     if (! -d $outDirectory) {
326 : parrello 1.3 Trace("Creating output directory $outDirectory.") if T(2);
327 : parrello 1.1 mkpath($outDirectory);
328 : parrello 1.4 } elsif ($options->{generate} eq 'yes') {
329 :     # Here we have an output directory already and are going to generate new
330 :     # load files. Clear any leftover data from previous runs.
331 : parrello 1.3 my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
332 :     my $numFiles = @files;
333 :     if ($numFiles > 0) {
334 :     Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
335 :     unlink map { "$outDirectory/$_" } @files;
336 :     }
337 : parrello 1.1 }
338 : parrello 1.3 # Create an error counter and a directory counter.
339 : parrello 1.1 my $errorCount = 0;
340 : parrello 1.3 my $dirCount = 0;
341 : parrello 1.9 # Get a similarity block object. Note that we've waited until the database
342 :     # was already created before doing this. The tables, however, may not yet
343 :     # exist, so we can't do very much with it.
344 :     my $simBlocks = SimBlocks->new();
345 :     # Determine if this is a load-only run.
346 :     my $loadOnly = ($options->{generate} eq 'no');
347 :     # Create the loader objects for the tables. These are global to the whole
348 :     # script.
349 :     my $GenomeLoad = ERDBLoad->new($simBlocks, 'Genome', $outDirectory, $loadOnly);
350 :     my $ContigLoad = ERDBLoad->new($simBlocks, 'Contig', $outDirectory, $loadOnly);
351 :     my $GroupBlockLoad = ERDBLoad->new($simBlocks, 'GroupBlock', $outDirectory, $loadOnly);
352 :     my $RegionLoad = ERDBLoad->new($simBlocks, 'Region', $outDirectory, $loadOnly);
353 :     my $ConsistsOfLoad = ERDBLoad->new($simBlocks, 'ConsistsOf', $outDirectory, $loadOnly);
354 :     my $ContainsRegionLoad = ERDBLoad->new($simBlocks, 'ContainsRegion', $outDirectory, $loadOnly);
355 :     my $HasInstanceOfLoad = ERDBLoad->new($simBlocks, 'HasInstanceOf', $outDirectory, $loadOnly);
356 :     my $IncludesRegionLoad = ERDBLoad->new($simBlocks, 'IncludesRegion', $outDirectory, $loadOnly);
357 :     # This hash helps us to clean up when we're done.
358 :     my $loaderList = { Genome => $GenomeLoad, Contig => $ContigLoad,
359 :     GroupBlock => $GroupBlockLoad, Region => $RegionLoad,
360 :     ConsistsOf => $ConsistsOfLoad, ContainsRegion => $ContainsRegionLoad,
361 :     HasInstanceOf => $HasInstanceOfLoad,
362 :     IncludesRegion => $IncludesRegionLoad};
363 : parrello 1.1 # Check to see if we should generate the output files.
364 : parrello 1.9 if ($loadOnly) {
365 : parrello 1.1 # Here we are to use existing output files.
366 : parrello 1.3 Trace("Existing database load files will be used.") if T(2);
367 : parrello 1.1 } else {
368 :     # Here we need to produce new output files.
369 :     # Verify that the input directory exists.
370 : parrello 1.3 if (! -d $inDirectoryTree) {
371 :     Confess("Input directory \"$inDirectoryTree\" not found.");
372 : parrello 1.1 }
373 : parrello 1.3 # Loop through the subdirectories.
374 : parrello 1.7 for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
375 :     # Verify that this is a directory.
376 :     my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
377 :     if (-d $inDirectory) {
378 : parrello 1.3 # Here we have a directory to process. Check for a genome
379 :     # file.
380 : parrello 1.8 my $genomeFileName = "$inDirectory/genome.tbl";
381 : parrello 1.3 if (! -e $genomeFileName) {
382 :     Trace("$genomeFileName not found. Directory skipped.") if T(1);
383 :     } else {
384 :     # Now we can process the directory and accumulate the error
385 :     # count.
386 : parrello 1.7 $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
387 : parrello 1.3 $dirCount++;
388 :     }
389 :     }
390 :     }
391 :     Trace("Load files generated from $dirCount directories.") if T(2);
392 : parrello 1.9 # Now finish up the loading.
393 :     my $stats = Stats->new();
394 :     for my $table (keys %{$loaderList}) {
395 :     Trace("Finishing $table.") if T(2);
396 :     my $tableThing = $loaderList->{$table};
397 :     my $newStats = $tableThing->Finish();
398 :     $stats->Accumulate($newStats);
399 :     }
400 :     Trace("Statistics for generate.\n" . $stats->Show()) if T(2);
401 : parrello 1.3 }
402 :     # Check for errors.
403 :     if ($errorCount > 0) {
404 :     Trace("$errorCount errors found in input files.") if T(0);
405 :     } else {
406 :     # No errors, so it's okay to load the database.
407 :     if ($options->{load} eq 'yes') {
408 :     # Here we have no outstanding errors and the user wants us to load
409 :     # the database. First, we create a similarity block object.
410 :     # Use it to load the database. Note we specify that the tables are to be
411 :     # dropped and rebuilt.
412 :     $simBlocks->LoadTables($outDirectory, 1);
413 :     Trace("Database loaded.") if T(2);
414 :     }
415 :     }
416 :    
417 :     # Process a single input directory.
418 :     sub ProcessDirectory {
419 : parrello 1.7 my ($inDirectory, $outDirectory, $groupName) = @_;
420 : parrello 1.3 Trace("Processing directory $inDirectory.") if T(2);
421 : parrello 1.9 # Our first task is to copy the genome data to the output
422 : parrello 1.7 # and add the genomes to the genome list.
423 : parrello 1.1 my %genomes = ();
424 : parrello 1.8 Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
425 : parrello 1.3 # Count the genomes read and errors found.
426 : parrello 1.1 my $genomeCount = 0;
427 : parrello 1.3 my $errorCount = 0;
428 : parrello 1.1 # Loop through the input.
429 :     while (my $genomeData = <GENOMESIN>) {
430 :     # Echo the genome record to the output.
431 : parrello 1.7 my @fields = Tracer::ParseRecord($genomeData);
432 : parrello 1.9 $GenomeLoad->Put(@fields, $groupName);
433 : parrello 1.1 # Extract the genome ID.
434 : parrello 1.7 my $genomeID = $fields[0];
435 : parrello 1.1 # Store it in the genomes hash. We start with a value of 0. If
436 :     # contig information for the genome is found, we change the value
437 :     # to 1. When we're all done with the regions, we can check the
438 :     # hash to insure all the genomes were represented in the input.
439 :     $genomes{$genomeID} = 0;
440 :     # Count this genome.
441 :     $genomeCount++;
442 :     }
443 : parrello 1.3 Trace("$genomeCount genomes found.") if T(2);
444 : parrello 1.9 # Close the genome file.
445 : parrello 1.1 close GENOMESIN;
446 :     # Create the contig hash used to associate contigs to their parent
447 :     # genomes.
448 :     my %contigs = ();
449 :     # Now we begin to read the Block and Region files in parallel. Both
450 :     # are sorted by block ID, so all processing for this section of the
451 : parrello 1.9 # script is done a block at a time. First, we determine which file
452 :     # sets we'll be processing. There are at most two: regular and
453 :     # intergenic.
454 : parrello 1.1 my @fileSets = ();
455 : parrello 1.8 my @prefixes = ("", "intergenic_");
456 : parrello 1.1 for my $prefix (@prefixes) {
457 : parrello 1.8 if (-e "$inDirectory/${prefix}block.tbl") {
458 : parrello 1.1 push @fileSets, $prefix;
459 :     }
460 :     }
461 : parrello 1.3 # Set up the duplicate-region check.
462 :     my %allRegions = ();
463 : parrello 1.1 # Set up some counters.
464 :     my ($blocksCount, $regionsCount) = (0, 0);
465 :     # Loop through the useful file sets.
466 :     for my $fileSet (@fileSets) {
467 : parrello 1.8 Open(\*BLOCKSIN, "<$inDirectory/${fileSet}block.tbl");
468 :     Open(\*REGIONSIN, "<$inDirectory/${fileSet}region.tbl");
469 : parrello 1.3 Trace("Processing ${fileSet}Blocks.") if T(2);
470 : parrello 1.1 # The outer loop processes blocks. This is accomplished by reading
471 :     # through the block file. We prime the loop by reading the first
472 :     # region record. This is because we finish processing a block when
473 :     # the first record of the next block is found in the region file.
474 :     my %regionRecord = GetRegionRecord();
475 :     $regionsCount++;
476 :     while (my $blockRecord = <BLOCKSIN>) {
477 :     $blocksCount++;
478 :     # Parse the block record.
479 :     my ($blockID, $blockName, $pegID) = Tracer::ParseRecord($blockRecord);
480 :     # Create the block data for this block.
481 :     my %blockData = ( id => $blockID, description => $blockName );
482 :     # Initialize the tracking hashes. "genomesFound" tracks the
483 : parrello 1.2 # genomes whose contigs are represented by the block,
484 : parrello 1.1 # "regionMap" maps each region to its contents, and
485 :     # "regionPeg" maps each region to its PEG (if any).
486 :     my %genomesFound = ();
487 :     my %regionMap = ();
488 :     my %regionPeg = ();
489 :     # Count the number of regions found in this block.
490 :     my $regionCounter = 0;
491 :     # Loop through the regions in the block. Because of the way
492 :     # "GetRegionRecord" works, the "blockID" field will have an
493 :     # impossibly high value if we've hit end-of-file in the
494 :     # region input file.
495 :     while ($regionRecord{blockID} <= $blockID) {
496 :     # If this region's block ID is invalid, complain
497 :     # and skip it.
498 :     if ($regionRecord{blockID} != $blockID) {
499 : parrello 1.3 Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
500 : parrello 1.1 $errorCount++;
501 :     } else {
502 :     # Here both files are in sync, which is good. The next step is
503 :     # to connect with the Genome and the Contig.
504 :     my $genomeID = $regionRecord{genomeID};
505 :     my $contigID = "$genomeID:$regionRecord{contigID}";
506 :     if (! exists $genomes{$genomeID}) {
507 : parrello 1.3 Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
508 : parrello 1.1 $errorCount++;
509 :     } else {
510 :     # Denote this genome has an instance of this block.
511 :     $genomesFound{$genomeID} = 1;
512 :     # Denote this genome has occurred in the region file.
513 :     $genomes{$genomeID} = 1;
514 :     # Connect the contig to the genome.
515 :     $contigs{$contigID} = $genomeID;
516 :     # Now we need to process the snippet. First, we create a
517 :     # region string.
518 :     my $regionString = "${contigID}_$regionRecord{begin}_$regionRecord{end}";
519 :     # Next, we stuff the snippet and PEG in the region's hash entries.
520 :     my $snippet = $regionRecord{snippet};
521 :     $regionMap{$regionString} = $snippet;
522 :     $regionPeg{$regionString} = $regionRecord{peg};
523 :     # Check to see if this is the block's first snippet.
524 :     if (! exists $blockData{pattern}) {
525 :     # Here it is, so store the snippet as the pattern.
526 :     $blockData{pattern} = $snippet;
527 :     $blockData{"snip-count"} = 0;
528 :     $blockData{len} = length $snippet;
529 :     } elsif ($blockData{len} != length $snippet) {
530 :     # Here it is not the first, but the lengths do not match.
531 : parrello 1.3 Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
532 : parrello 1.1 $errorCount++;
533 :     } else {
534 :     # Here everything is legitimate, so we merge the new
535 :     # snippet into the pattern.
536 :     ($blockData{pattern}, $blockData{"snip-count"}) =
537 :     SimBlocks::MergeDNA($blockData{pattern}, $snippet);
538 :     }
539 :     }
540 :     # Count this region.
541 :     $regionCounter++;
542 :     }
543 :     # Get the next region record.
544 :     %regionRecord = GetRegionRecord();
545 :     }
546 :     # We have now processed all the regions in the block. Insure we found at least
547 :     # one.
548 :     if (! $regionCounter) {
549 : parrello 1.3 Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
550 : parrello 1.1 $errorCount++;
551 :     } else {
552 : parrello 1.4 Trace("$regionCounter regions found in block $blockID.") if T(4);
553 : parrello 1.7 # Write the block record. Note that the block ID is prefixed by the group name to
554 :     # make it unique.
555 : parrello 1.1 my $variance = $blockData{"snip-count"} / $blockData{len};
556 : parrello 1.9 $GroupBlockLoad->Put("$groupName:$blockID", $blockData{description}, $blockData{len},
557 :     $blockData{"snip-count"}, $variance, $blockData{pattern});
558 : parrello 1.1 # Find all the variance points in the block pattern. We'll use them to create
559 :     # the content strings for each region.
560 :     my @positions = SimBlocks::ParsePattern($blockData{pattern});
561 :     # Loop through the regions, writing them out to the region output file.
562 :     for my $region (keys %regionMap) {
563 : parrello 1.3 if (length($region) > 80) {
564 :     Trace("Invalid region key \"$region\".") if T(1);
565 :     }
566 : parrello 1.1 # Get the region's snips.
567 :     my $source = $regionMap{$region};
568 :     my $content = "";
569 :     for my $pos (@positions) {
570 :     $content .= substr $source, $pos, 1;
571 :     }
572 :     # Get the region's location data.
573 : parrello 1.2 my $location = BasicLocation->new($region);
574 : parrello 1.1 # Write this region to the output files.
575 : parrello 1.9 $RegionLoad->Put($region, $location->Contig, $location->Dir,
576 :     $location->Right, $location->Length,
577 :     $regionPeg{$region}, $location->Left, $content);
578 :     $ContainsRegionLoad->Put($location->Contig, $region, $location->Length,
579 :     $location->Left);
580 :     $IncludesRegionLoad->Put("$groupName:$blockID", $region);
581 : parrello 1.1 }
582 :     # Finally, we need to connect this block to the genomes in which it occurs.
583 :     for my $genomeID (keys %genomesFound) {
584 : parrello 1.9 $HasInstanceOfLoad->Put($genomeID, "$groupName:$blockID");
585 : parrello 1.1 }
586 :     # Count this block's regions.
587 :     $regionsCount += $regionCounter;
588 :     }
589 :     }
590 :     # Close the input files.
591 :     close BLOCKSIN;
592 :     close REGIONSIN;
593 :     }
594 :     # Close the output files.
595 :     # All the block data has been written. Tell the user what we found.
596 : parrello 1.3 Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
597 : parrello 1.1 # The next task is to write the genome/contig data. This is provided by the
598 : parrello 1.9 # "%contigs" hash.
599 : parrello 1.1 my $contigsCount = 0;
600 :     for my $contigID (keys %contigs) {
601 : parrello 1.9 $ContigLoad->Put("$contigID");
602 :     $ConsistsOfLoad->Put($contigs{$contigID}, $contigID);
603 : parrello 1.1 $contigsCount++;
604 :     }
605 : parrello 1.3 Trace("$contigsCount contigs found.") if T(2);
606 : parrello 1.1 # Now warn the user about all the genomes that didn't have blocks.
607 :     for my $genomeID (keys %genomes) {
608 :     if (! $genomes{$genomeID}) {
609 : parrello 1.3 Trace("Genome $genomeID did not have any regions.") if T(1);
610 : parrello 1.1 $errorCount++;
611 :     }
612 :     }
613 : parrello 1.3 return $errorCount;
614 : parrello 1.1 }
615 :     # Tell the user we're done.
616 : parrello 1.3 Trace("Processing complete.") if T(0);
617 : parrello 1.1
618 :     # Read a region record from the file and parse it into a hash
619 :     # for return to the caller. If we reach end-of-file, the
620 :     # hash returned will have $TRAILER in the blockID field.
621 :     sub GetRegionRecord {
622 :     # Create the return hash.
623 :     my %retVal = ();
624 :     # Read the record.
625 :     my $regionData = <REGIONSIN>;
626 :     # Check for end-of-file.
627 :     if (!defined $regionData) {
628 :     # Here we have end-of-file, so stuff in a trailer
629 :     # value for the block ID.
630 :     $retVal{blockID} = $TRAILER;
631 :     } else {
632 :     # Here we have a real record.
633 :     ($retVal{peg}, $retVal{genomeID}, $retVal{contigID},
634 :     $retVal{begin}, $retVal{end}, $retVal{blockID},
635 :     $retVal{snippet}) = Tracer::ParseRecord($regionData);
636 :     }
637 :     # Return the hash created.
638 :     return %retVal;
639 :     }
640 :    
641 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3