[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Annotation of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     use strict;
4 :     use CGI;
5 :     use Tracer;
6 :     use Genome;
7 :     use SimBlocks;
8 :     use File::Path;
9 : parrello 1.2 use BasicLocation;
10 : parrello 1.3 use Cwd;
11 : parrello 1.1
12 :     =head1 Similarity Block Loader
13 :    
14 :     This script loads the similarity block database from
15 : parrello 1.3 the input files. The load process involves two steps:
16 :     converting the input files into C<dtx> load files
17 :     (B<generate>), and loading the C<dtx> files into the
18 :     database (B<load>).
19 : parrello 1.1
20 : parrello 1.3 The script takes a single parameter-- a directory name.
21 : parrello 1.1 The default directory name is taken from the config file
22 :     parameter C<$fig_config::SimBlastData>. The output files
23 :     will be produced in the similarity block data directory
24 :     C<$fig_config::SimBlockData>, which will be created if
25 : parrello 1.3 it does not exist. The input directory and all its
26 :     subdirectories will be processed for input files.
27 : parrello 1.1
28 :     In addition to the directory name, the following
29 :     option parameters are supported.
30 :    
31 :     =over 4
32 :    
33 :     =item trace
34 :    
35 :     Trace level for output messages. A higher number means more
36 : parrello 1.4 messages. The default is C<3>. Trace messages are sent to
37 : parrello 1.3 the file C<trace.log> in the B<$FIG_Config::tmp>
38 :     directory.
39 : parrello 1.1
40 : parrello 1.4 =item sql
41 :    
42 :     If specified, SQL activity will be traced at the specified
43 :     trace level.
44 :    
45 : parrello 1.1 =item load
46 :    
47 :     C<yes> to load the data into the database, else C<no>.
48 :     The default is C<yes>.
49 :    
50 :     =item generate
51 :    
52 :     C<yes> to generate output files from input files, else
53 :     C<no>. The default is C<yes>.
54 :    
55 : parrello 1.6 =item createDB
56 :    
57 :     If specified, the database will be dropped and
58 :     re-created before loading.
59 :    
60 : parrello 1.1 =back
61 :    
62 :     For example, the following command line will process the
63 : parrello 1.3 input files in the C</Users/fig/BlastData> directory and
64 : parrello 1.1 run at a trace level of 3.
65 :    
66 : parrello 1.3 C<< SimLoad -trace=3 /Users/fig/BlastData >>
67 :    
68 :     The following command line converts the input files in
69 :     the default directory into load files but does not load
70 :     the database and runs at a trace level of 2.
71 : parrello 1.1
72 : parrello 1.3 C<< SimLoad -load=no >>
73 : parrello 1.1
74 :     =head2 Input Directory
75 :    
76 : parrello 1.3 The following files must exist in each input directory.
77 : parrello 1.1
78 :     =over 4
79 :    
80 :     =item Genome.tbl
81 :    
82 :     This is a tab-delimited file that contains the ID of each
83 :     genome followed by a description string.
84 :    
85 :     =item Block.tbl, InterGenic_Block.tbl
86 :    
87 :     These are tab-delimited files that associate a gene name
88 :     with each block. The InterGenic file is optional.
89 :    
90 :     =item Region.tbl, InterGenic_Region.tbl
91 :    
92 :     These are tab-delimited files that describe each region
93 :     of a block. The InterGenic file is optional.
94 :    
95 :     =back
96 :    
97 :     The format of each file is given below.
98 :    
99 :     =head3 Genome.tbl
100 :    
101 :     The Genome file is copied almost unmodified to the
102 :     load file for the B<Genome> entity. Each record
103 :     represents a single genome. It has the following
104 :     fields.
105 :    
106 :     =over 4
107 :    
108 :     =item genomeID
109 :    
110 :     The ID of the genome.
111 :    
112 :     =item description
113 :    
114 :     A text description of the genome (usually the species name with
115 :     a strain ID).
116 :    
117 :     =back
118 :    
119 :     =head3 Block.tbl, InterGenic_Block.tbl
120 :    
121 :     These files produce most of the data found in the B<GroupBlock>
122 :     entity. Each record represents a single block. Blocks either
123 :     correspond to genes or to inter-genic regions. Both types
124 :     of blocks may appear in multiple locations in multiple
125 :     contigs. The files should be sorted by block ID.
126 :    
127 :     =over 4
128 :    
129 :     =item blockID
130 :    
131 :     The unique ID of the block. This ID is also used in the
132 :     C<Region.tbl> file.
133 :    
134 :     =item blockName
135 :    
136 :     The name of the block. For a gene block, this is the gene
137 :     name. For an inter-genic block, it is a name computed
138 :     from the names of the genes that are commonly nearby.
139 :    
140 :     =back
141 :    
142 :     =head3 Region.tbl, InterGenic_Region.tbl
143 :    
144 :     These files describe the regions in each block. They are
145 :     used to derive the relationships between genomes and
146 :     contigs (B<ConsistsOf>), the contigs themselves
147 :     (B<Contig>), the relationships between blocks and
148 :     contigs (B<ContainsRegionIn>), and the derived
149 :     relationship between genomes and blocks
150 :     (B<HasInstanceOf>). The files must be sorted by block
151 :     ID, and each record in a file represents a single
152 :     region in a contig. Each region belongs to a
153 :     single block. Note that the C<Region.tbl> file contains
154 :     the regions for the C<Block.tbl> file, and the
155 :     C<InterGenic_Region.tbl> file contains the regions for
156 :     the C<InterGenic_Block.tbl> file. No mixing is allowed.
157 :    
158 :     =over 4
159 :    
160 :     =item regionPEG
161 :    
162 :     PEG ID for this region. If the region is in an
163 :     inter-genic block, this field will be composed of
164 :     the IDs for the neighboring genes.
165 :    
166 :     =item genomeID
167 :    
168 :     ID of the relevant genome.
169 :    
170 :     =item contigID
171 :    
172 :     ID of the contig containing this region. This is a standard contig
173 :     ID that does not include the genome ID. It will be converted to
174 :     a Sprout-style contig ID (which includes the genome data) before
175 :     it is written to the output files.
176 :    
177 :     =item begin
178 :    
179 :     The start point of the region. For a forward region this is the
180 :     left endpoint; for a reverse region it is the right endpoint. It
181 :     is a 1-based offset (which is consistent with Sprout usage), and
182 :     the identified location is inside the region.
183 :    
184 :     =item end
185 :    
186 :     The end point of the region. For a forward region this is the
187 :     right endpoint; for a reverse region it is the left endpoint. It
188 :     is a 1-based offset (which is consistent with Sprout usage), and
189 :     the identified location is inside the region.
190 :    
191 :     =item blockID
192 :    
193 :     The ID of the block containing this region.
194 :    
195 :     =item snippet
196 :    
197 :     A DNA snippet representing the contents of the region. The region
198 :     may be shorter than the block length. If that is the case, the
199 :     snippet will contain insertion characters (C<->). So, while it
200 :     is not the case that every region in a block must be the same
201 :     length, all of the snippets for a block must be the same length.
202 :     The snippets will be in alignment form. In other words, if the
203 :     region is reversed, the nucleotide letters will be the complement
204 :     in transcription order. (For example, if positions 20 through 25
205 :     of contig B<XXX> are C<AGCCTT>, then the snippet for C<XXX_25_20>
206 :     will be C<AAGGCT>.)
207 :    
208 :     =back
209 :    
210 :     =head2 Output File Notes
211 :    
212 :     =over 4
213 :    
214 :     =item Genome.dtx
215 :    
216 :     This file is a direct copy of the C<Genome.tbl> file; however, we
217 :     also use it to create a hash of genome IDs (C<%genomes>). The hash
218 :     is useful when validating the C<Region.tbl> file.
219 :    
220 :     =item Contig.dtx
221 :    
222 :     This file contains nothing but contig IDs. As contigs are
223 :     discovered from the C<Region.tbl> file their IDs are put
224 :     into the C<%contigs> hash. This hash maps contig IDs to
225 :     their parent genome IDs. When processing is complete,
226 :     this file is generated from the hash.
227 :    
228 :     =item GroupBlock.dtx
229 :    
230 :     This file describes the blocks. As records come in from
231 :     C<Region.tbl>, we build a hash called C<%blockData> that
232 :     contains our latest estimate of all the C<GroupBlock.dtx>
233 :     columns for the current block (with the exception of
234 :     B<variance>, which is computed by dividing the B<snip-count>
235 :     by the length (B<len>).
236 :    
237 :     =item ConsistsOf.dtx
238 :    
239 :     This file maps genomes to contigs, and is generated from
240 :     the C<%contigs> hash built while reading the C<Region.tbl>
241 :     file.
242 :    
243 :     =item HasInstanceOf.dtx
244 :    
245 :     This file lists the genomes containing each block. The
246 :     C<Region.tbl> file is sorted by block. While inside a
247 :     block's section of the file, we use a hash called
248 :     C<%genomesFound> that contains the ID of every genome
249 :     found for the block. When we finish with a block,
250 :     we run through the C<%genomesFound> hash to produce
251 :     the block's B<HasInstanceOf> data.
252 :    
253 :     =item Region.dtx
254 :    
255 :     This file describes the contig regions in the blocks.
256 :     As the C<Region.tbl> file is read in, we build a
257 :     hash called C<%regionMap> that maps a region's
258 :     SEED-style location string to the DNA content.
259 :     When we finish with a block, the DNA content is
260 :     converted into an alignment by comparing it to
261 :     the block's pattern in C<%blockData>. (Essentially,
262 :     we only include the region's content for the
263 :     positions that vary between regions in the block.)
264 :     From this and the region string itself, we have
265 :     enough data to create the B<Region>
266 :     data.
267 :    
268 :     =item IncludesRegion.dtx
269 :    
270 :     This file maps group blocks to regions. The first column
271 :     is the block ID and the second column is the SEED-style
272 :     region string for the target region. This file is built
273 :     in parallel with C<Region.dtx>. It will have one record
274 :     for each region.
275 :    
276 :     =item ContainsRegion.dtx
277 :    
278 :     This file maps contigs to regions. The first column is
279 :     the contig ID and the second column is the SEED-style
280 :     location string for the region. It contains two redundant
281 :     columns used for sorting-- the region length (column 3)
282 :     and the left-most region position (column 4). This
283 :     file is built in parallel with C<Region.dtx>. It will
284 :     have one record for each region.
285 :    
286 :     =cut
287 :    
288 :     # Create a huge number we can use for an end-of-file
289 :     # indicator in the block ID.
290 :     my $TRAILER = 999999999;
291 :    
292 :     # Parse the command line.
293 : parrello 1.5 my ($options, @arguments) = StandardSetup(['SimBlocks'],
294 :     { load => ['yes', "'no' to suppress loading the database"],
295 : parrello 1.6 generate => ['yes', "'no' to suppress generating the load files"],
296 :     createDB => [0, 'drop and create the database before loading']
297 : parrello 1.5 },
298 :     "directoryName",
299 : parrello 1.1 @ARGV);
300 :     # Extract the directory name from the argument array.
301 : parrello 1.3 my $inDirectoryTree = $FIG_Config::simBlastData;
302 : parrello 1.1 if ($arguments[0]) {
303 : parrello 1.3 $inDirectoryTree = Cwd::abs_path($arguments[0]);
304 : parrello 1.1 }
305 :     # Get the output directory.
306 :     my $outDirectory = $FIG_Config::simBlocksData;
307 :     # Insure that it exists.
308 :     if (! -d $outDirectory) {
309 : parrello 1.3 Trace("Creating output directory $outDirectory.") if T(2);
310 : parrello 1.1 mkpath($outDirectory);
311 : parrello 1.4 } elsif ($options->{generate} eq 'yes') {
312 :     # Here we have an output directory already and are going to generate new
313 :     # load files. Clear any leftover data from previous runs.
314 : parrello 1.3 my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
315 :     my $numFiles = @files;
316 :     if ($numFiles > 0) {
317 :     Trace("Deleting $numFiles old dtx files from $outDirectory.") if T(2);
318 :     unlink map { "$outDirectory/$_" } @files;
319 :     }
320 : parrello 1.1 }
321 : parrello 1.3 # Create an error counter and a directory counter.
322 : parrello 1.1 my $errorCount = 0;
323 : parrello 1.3 my $dirCount = 0;
324 : parrello 1.1 # Check to see if we should generate the output files.
325 :     if ($options->{generate} eq 'no') {
326 :     # Here we are to use existing output files.
327 : parrello 1.3 Trace("Existing database load files will be used.") if T(2);
328 : parrello 1.1 } else {
329 :     # Here we need to produce new output files.
330 :     # Verify that the input directory exists.
331 : parrello 1.3 if (! -d $inDirectoryTree) {
332 :     Confess("Input directory \"$inDirectoryTree\" not found.");
333 : parrello 1.1 }
334 : parrello 1.3 # Loop through the subdirectories.
335 :     for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {
336 :     # Verify that this is a directory. If it's ".", we use
337 :     # the top-level directory.
338 :     my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :
339 :     "$inDirectoryTree/$inputDirectory");
340 :     if (($inputDirectory !~ /\../) && -d $inDirectory) {
341 :     # Here we have a directory to process. Check for a genome
342 :     # file.
343 :     my $genomeFileName = "$inDirectory/Genome.tbl";
344 :     if (! -e $genomeFileName) {
345 :     Trace("$genomeFileName not found. Directory skipped.") if T(1);
346 :     } else {
347 :     # Now we can process the directory and accumulate the error
348 :     # count.
349 :     $errorCount += ProcessDirectory($inDirectory, $outDirectory);
350 :     $dirCount++;
351 :     }
352 :     }
353 :     }
354 :     Trace("Load files generated from $dirCount directories.") if T(2);
355 :     }
356 :     # Check for errors.
357 :     if ($errorCount > 0) {
358 :     Trace("$errorCount errors found in input files.") if T(0);
359 :     } else {
360 :     # No errors, so it's okay to load the database.
361 :     if ($options->{load} eq 'yes') {
362 :     # Here we have no outstanding errors and the user wants us to load
363 :     # the database. First, we create a similarity block object.
364 :     my $simBlocks = SimBlocks->new();
365 :     # Use it to load the database. Note we specify that the tables are to be
366 :     # dropped and rebuilt.
367 :     $simBlocks->LoadTables($outDirectory, 1);
368 :     Trace("Database loaded.") if T(2);
369 :     }
370 :     }
371 :    
372 :     # Process a single input directory.
373 :     sub ProcessDirectory {
374 :     my ($inDirectory, $outDirectory) = @_;
375 :     Trace("Processing directory $inDirectory.") if T(2);
376 : parrello 1.1 # Our first task is to copy the genome data to the output directory
377 : parrello 1.3 # and add them to the genome list.
378 : parrello 1.1 my %genomes = ();
379 : parrello 1.3 Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");
380 :     Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
381 :     # Count the genomes read and errors found.
382 : parrello 1.1 my $genomeCount = 0;
383 : parrello 1.3 my $errorCount = 0;
384 : parrello 1.1 # Loop through the input.
385 :     while (my $genomeData = <GENOMESIN>) {
386 :     # Echo the genome record to the output.
387 :     print GENOMESOUT $genomeData;
388 :     # Extract the genome ID.
389 :     my ($genomeID) = Tracer::ParseRecord($genomeData);
390 :     # Store it in the genomes hash. We start with a value of 0. If
391 :     # contig information for the genome is found, we change the value
392 :     # to 1. When we're all done with the regions, we can check the
393 :     # hash to insure all the genomes were represented in the input.
394 :     $genomes{$genomeID} = 0;
395 :     # Count this genome.
396 :     $genomeCount++;
397 :     }
398 : parrello 1.3 Trace("$genomeCount genomes found.") if T(2);
399 : parrello 1.1 # Close the files.
400 :     close GENOMESIN;
401 :     close GENOMESOUT;
402 :     # Create the contig hash used to associate contigs to their parent
403 :     # genomes.
404 :     my %contigs = ();
405 :     # Now we begin to read the Block and Region files in parallel. Both
406 :     # are sorted by block ID, so all processing for this section of the
407 :     # script is done a block at a time. The first task is to
408 :     # open the output files.
409 : parrello 1.3 Open(\*BLOCKSOUT, ">>$outDirectory/GroupBlock.dtx");
410 :     Open(\*REGIONSOUT, ">>$outDirectory/Region.dtx");
411 :     Open(\*INSTANCESOUT, ">>$outDirectory/HasInstanceOf.dtx");
412 :     Open(\*CONTAINSOUT, ">>$outDirectory/ContainsRegion.dtx");
413 :     Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
414 : parrello 1.1 # Determine which file sets we'll be processing.
415 :     my @fileSets = ();
416 :     my @prefixes = ("", "InterGenic_");
417 :     for my $prefix (@prefixes) {
418 :     if (-e "$inDirectory/${prefix}Block.tbl") {
419 :     push @fileSets, $prefix;
420 :     }
421 :     }
422 : parrello 1.3 # Set up the duplicate-region check.
423 :     my %allRegions = ();
424 : parrello 1.1 # Set up some counters.
425 :     my ($blocksCount, $regionsCount) = (0, 0);
426 :     # Loop through the useful file sets.
427 :     for my $fileSet (@fileSets) {
428 : parrello 1.3 Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");
429 :     Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");
430 :     Trace("Processing ${fileSet}Blocks.") if T(2);
431 : parrello 1.1 # The outer loop processes blocks. This is accomplished by reading
432 :     # through the block file. We prime the loop by reading the first
433 :     # region record. This is because we finish processing a block when
434 :     # the first record of the next block is found in the region file.
435 :     my %regionRecord = GetRegionRecord();
436 :     $regionsCount++;
437 :     while (my $blockRecord = <BLOCKSIN>) {
438 :     $blocksCount++;
439 :     # Parse the block record.
440 :     my ($blockID, $blockName, $pegID) = Tracer::ParseRecord($blockRecord);
441 :     # Create the block data for this block.
442 :     my %blockData = ( id => $blockID, description => $blockName );
443 :     # Initialize the tracking hashes. "genomesFound" tracks the
444 : parrello 1.2 # genomes whose contigs are represented by the block,
445 : parrello 1.1 # "regionMap" maps each region to its contents, and
446 :     # "regionPeg" maps each region to its PEG (if any).
447 :     my %genomesFound = ();
448 :     my %regionMap = ();
449 :     my %regionPeg = ();
450 :     # Count the number of regions found in this block.
451 :     my $regionCounter = 0;
452 :     # Loop through the regions in the block. Because of the way
453 :     # "GetRegionRecord" works, the "blockID" field will have an
454 :     # impossibly high value if we've hit end-of-file in the
455 :     # region input file.
456 :     while ($regionRecord{blockID} <= $blockID) {
457 :     # If this region's block ID is invalid, complain
458 :     # and skip it.
459 :     if ($regionRecord{blockID} != $blockID) {
460 : parrello 1.3 Trace("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.") if T(0);
461 : parrello 1.1 $errorCount++;
462 :     } else {
463 :     # Here both files are in sync, which is good. The next step is
464 :     # to connect with the Genome and the Contig.
465 :     my $genomeID = $regionRecord{genomeID};
466 :     my $contigID = "$genomeID:$regionRecord{contigID}";
467 :     if (! exists $genomes{$genomeID}) {
468 : parrello 1.3 Trace("Genome $genomeID in region record $regionsCount not found in genome input file.") if T(0);
469 : parrello 1.1 $errorCount++;
470 :     } else {
471 :     # Denote this genome has an instance of this block.
472 :     $genomesFound{$genomeID} = 1;
473 :     # Denote this genome has occurred in the region file.
474 :     $genomes{$genomeID} = 1;
475 :     # Connect the contig to the genome.
476 :     $contigs{$contigID} = $genomeID;
477 :     # Now we need to process the snippet. First, we create a
478 :     # region string.
479 :     my $regionString = "${contigID}_$regionRecord{begin}_$regionRecord{end}";
480 :     # Next, we stuff the snippet and PEG in the region's hash entries.
481 :     my $snippet = $regionRecord{snippet};
482 :     $regionMap{$regionString} = $snippet;
483 :     $regionPeg{$regionString} = $regionRecord{peg};
484 :     # Check to see if this is the block's first snippet.
485 :     if (! exists $blockData{pattern}) {
486 :     # Here it is, so store the snippet as the pattern.
487 :     $blockData{pattern} = $snippet;
488 :     $blockData{"snip-count"} = 0;
489 :     $blockData{len} = length $snippet;
490 :     } elsif ($blockData{len} != length $snippet) {
491 :     # Here it is not the first, but the lengths do not match.
492 : parrello 1.3 Trace("Snippet for region record $regionsCount does not match block length $blockData{len}.") if T(0);
493 : parrello 1.1 $errorCount++;
494 :     } else {
495 :     # Here everything is legitimate, so we merge the new
496 :     # snippet into the pattern.
497 :     ($blockData{pattern}, $blockData{"snip-count"}) =
498 :     SimBlocks::MergeDNA($blockData{pattern}, $snippet);
499 :     }
500 :     }
501 :     # Count this region.
502 :     $regionCounter++;
503 :     }
504 :     # Get the next region record.
505 :     %regionRecord = GetRegionRecord();
506 :     }
507 :     # We have now processed all the regions in the block. Insure we found at least
508 :     # one.
509 :     if (! $regionCounter) {
510 : parrello 1.3 Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
511 : parrello 1.1 $errorCount++;
512 :     } else {
513 : parrello 1.4 Trace("$regionCounter regions found in block $blockID.") if T(4);
514 : parrello 1.1 # Write the block record.
515 :     my $variance = $blockData{"snip-count"} / $blockData{len};
516 :     print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},
517 :     $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
518 :     # Find all the variance points in the block pattern. We'll use them to create
519 :     # the content strings for each region.
520 :     my @positions = SimBlocks::ParsePattern($blockData{pattern});
521 :     # Loop through the regions, writing them out to the region output file.
522 :     for my $region (keys %regionMap) {
523 : parrello 1.3 if (length($region) > 80) {
524 :     Trace("Invalid region key \"$region\".") if T(1);
525 :     }
526 : parrello 1.1 # Get the region's snips.
527 :     my $source = $regionMap{$region};
528 :     my $content = "";
529 :     for my $pos (@positions) {
530 :     $content .= substr $source, $pos, 1;
531 :     }
532 :     # Get the region's location data.
533 : parrello 1.2 my $location = BasicLocation->new($region);
534 : parrello 1.1 # Write this region to the output files.
535 :     print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
536 : parrello 1.3 $location->Right, $location->Length,
537 :     $regionPeg{$region}, $location->Left, $content) . "\n";
538 : parrello 1.1 print CONTAINSOUT join("\t", $location->Contig, $region,
539 :     $location->Length, $location->Left) . "\n";
540 : parrello 1.4 print INCLUDESOUT join("\t", $blockID, $region) . "\n";
541 : parrello 1.1 }
542 :     # Finally, we need to connect this block to the genomes in which it occurs.
543 :     for my $genomeID (keys %genomesFound) {
544 :     print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";
545 :     }
546 :     # Count this block's regions.
547 :     $regionsCount += $regionCounter;
548 :     }
549 :     }
550 :     # Close the input files.
551 :     close BLOCKSIN;
552 :     close REGIONSIN;
553 :     }
554 :     # Close the output files.
555 :     close REGIONSOUT;
556 :     close BLOCKSOUT;
557 :     close INSTANCESOUT;
558 :     # All the block data has been written. Tell the user what we found.
559 : parrello 1.3 Trace("$blocksCount blocks processed, $regionsCount regions processed.") if T(2);
560 : parrello 1.1 # The next task is to write the genome/contig data. This is provided by the
561 :     # "%contigs" hash. First, we need to open the files.
562 :     my $contigsCount = 0;
563 : parrello 1.3 Open(\*CONTIGSOUT, ">>$outDirectory/Contig.dtx");
564 :     Open(\*CONSISTSOUT, ">>$outDirectory/ConsistsOf.dtx");
565 : parrello 1.1 for my $contigID (keys %contigs) {
566 :     print CONTIGSOUT "$contigID\n";
567 :     print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
568 :     $contigsCount++;
569 :     }
570 : parrello 1.3 Trace("$contigsCount contigs found.") if T(2);
571 : parrello 1.4 # Close the output files.
572 :     close CONTIGSOUT;
573 :     close CONSISTSOUT;
574 : parrello 1.1 # Now warn the user about all the genomes that didn't have blocks.
575 :     for my $genomeID (keys %genomes) {
576 :     if (! $genomes{$genomeID}) {
577 : parrello 1.3 Trace("Genome $genomeID did not have any regions.") if T(1);
578 : parrello 1.1 $errorCount++;
579 :     }
580 :     }
581 : parrello 1.3 return $errorCount;
582 : parrello 1.1 }
583 :     # Tell the user we're done.
584 : parrello 1.3 Trace("Processing complete.") if T(0);
585 : parrello 1.1
586 :     # Read a region record from the file and parse it into a hash
587 :     # for return to the caller. If we reach end-of-file, the
588 :     # hash returned will have $TRAILER in the blockID field.
589 :     sub GetRegionRecord {
590 :     # Create the return hash.
591 :     my %retVal = ();
592 :     # Read the record.
593 :     my $regionData = <REGIONSIN>;
594 :     # Check for end-of-file.
595 :     if (!defined $regionData) {
596 :     # Here we have end-of-file, so stuff in a trailer
597 :     # value for the block ID.
598 :     $retVal{blockID} = $TRAILER;
599 :     } else {
600 :     # Here we have a real record.
601 :     ($retVal{peg}, $retVal{genomeID}, $retVal{contigID},
602 :     $retVal{begin}, $retVal{end}, $retVal{blockID},
603 :     $retVal{snippet}) = Tracer::ParseRecord($regionData);
604 :     }
605 :     # Return the hash created.
606 :     return %retVal;
607 :     }
608 :    
609 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3