[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Annotation of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     use strict;
4 :     use CGI;
5 :     use Tracer;
6 :     use Genome;
7 :     use SimBlocks;
8 :     use File::Path;
9 : parrello 1.2 use BasicLocation;
10 : parrello 1.1
11 :     =head1 Similarity Block Loader
12 :    
13 :     This script loads the similarity block database from
14 :     the input files.
15 :    
16 :     The script takes a single parameter-- the directory name.
17 :     The default directory name is taken from the config file
18 :     parameter C<$FIG_Config::simBlastData>. The output files
19 :     will be produced in the similarity block data directory
20 :     C<$FIG_Config::simBlocksData>, which will be created if
21 :     it does not exist.
22 :    
23 :     In addition to the directory name, the following
24 :     optional parameters are supported.
25 :    
26 :     =over 4
27 :    
28 :     =item trace
29 :    
30 :     Trace level for output messages. A higher number means more
31 :     messages. The default is C<1>.
32 :    
33 :     =item logFile
34 :    
35 :     Name of a file to be used for trace messages, or
36 :     C<*> to send them to the standard output. If a
37 :     file is specified, the output directory must
38 :     already exist. The default is C<*>.
39 :    
40 :     =item load
41 :    
42 :     C<yes> to load the data into the database, else C<no>.
43 :     The default is C<yes>.
44 :    
45 :     =item generate
46 :    
47 :     C<yes> to generate output files from input files, else
48 :     C<no>. The default is C<yes>.
49 :    
50 :     =back
51 :    
52 :     For example, the following command line will process the
53 :     C<Fragment.sort> file in the C</Users/fig/BlastData> directory and
54 :     run at a trace level of 3.
55 :    
56 :     C<< SimLoad /Users/fig/BlastData -trace=3 >>
57 :    
58 :     Please note that spaces in the file and directory names must
59 :     be escaped as C<\b> to prevent them from being parsed as
60 :     argument delimiters.
61 :    
62 :     =head2 Input Directory
63 :    
64 :     The following files must exist in the input directory.
65 :    
66 :     =over 4
67 :    
68 :     =item Genome.tbl
69 :    
70 :     This is a tab-delimited file that contains the ID of each
71 :     genome followed by a description string.
72 :    
73 :     =item Block.tbl, InterGenic_Block.tbl
74 :    
75 :     These are tab-delimited files that associate a gene name
76 :     with each block. The InterGenic file is optional.
77 :    
78 :     =item Region.tbl, InterGenic_Region.tbl
79 :    
80 :     These are tab-delimited files that describe each region
81 :     of a block. The InterGenic file is optional.
82 :    
83 :     =back
84 :    
85 :     The format of each file is given below.
86 :    
87 :     =head3 Genome.tbl
88 :    
89 :     The Genome file is copied almost unmodified to the
90 :     load file for the B<Genome> entity. Each record
91 :     represents a single genome. It has the following
92 :     fields.
93 :    
94 :     =over 4
95 :    
96 :     =item genomeID
97 :    
98 :     The ID of the genome.
99 :    
100 :     =item description
101 :    
102 :     A text description of the genome (usually the species name with
103 :     a strain ID).
104 :    
105 :     =back
106 :    
107 :     =head3 Block.tbl, InterGenic_Block.tbl
108 :    
109 :     These files produce most of the data found in the B<GroupBlock>
110 :     entity. Each record represents a single block. Blocks either
111 :     correspond to genes or to inter-genic regions. Both types
112 :     of blocks may appear in multiple locations in multiple
113 :     contigs. The files should be sorted by block ID.
114 :    
115 :     =over 4
116 :    
117 :     =item blockID
118 :    
119 :     The unique ID of the block. This ID is also used in the
120 :     C<Region.tbl> file.
121 :    
122 :     =item blockName
123 :    
124 :     The name of the block. For a gene block, this is the gene
125 :     name. For an inter-genic block, it is a name computed
126 :     from the names of the genes that are commonly nearby.
127 :    
128 :     =back
129 :    
130 :     =head3 Region.tbl, InterGenic_Region.tbl
131 :    
132 :     These files describe the regions in each block. They are
133 :     used to derive the relationships between genomes and
134 :     contigs (B<ConsistsOf>), the contigs themselves
135 :     (B<Contig>), the relationships between blocks and
136 :     contigs (B<ContainsRegionIn>), and the derived
137 :     relationship between genomes and blocks
138 :     (B<HasInstanceOf>). The files must be sorted by block
139 :     ID, and each record in a file represents a single
140 :     region in a contig. Each region belongs to a
141 :     single block. Note that the C<Region.tbl> file contains
142 :     the regions for the C<Block.tbl> file, and the
143 :     C<InterGenic_Region.tbl> file contains the regions for
144 :     the C<InterGenic_Block.tbl> file. No mixing is allowed.
145 :    
146 :     =over 4
147 :    
148 :     =item regionPEG
149 :    
150 :     PEG ID for this region. If the region is in an
151 :     inter-genic block, this field will be composed of
152 :     the IDs for the neighboring genes.
153 :    
154 :     =item genomeID
155 :    
156 :     ID of the relevant genome.
157 :    
158 :     =item contigID
159 :    
160 :     ID of the contig containing this region. This is a standard contig
161 :     ID that does not include the genome ID. It will be converted to
162 :     a Sprout-style contig ID (which includes the genome data) before
163 :     it is written to the output files.
164 :    
165 :     =item begin
166 :    
167 :     The start point of the region. For a forward region this is the
168 :     left endpoint; for a reverse region it is the right endpoint. It
169 :     is a 1-based offset (which is consistent with Sprout usage), and
170 :     the identified location is inside the region.
171 :    
172 :     =item end
173 :    
174 :     The end point of the region. For a forward region this is the
175 :     right endpoint; for a reverse region it is the left endpoint. It
176 :     is a 1-based offset (which is consistent with Sprout usage), and
177 :     the identified location is inside the region.
178 :    
179 :     =item blockID
180 :    
181 :     The ID of the block containing this region.
182 :    
183 :     =item snippet
184 :    
185 :     A DNA snippet representing the contents of the region. The region
186 :     may be shorter than the block length. If that is the case, the
187 :     snippet will contain insertion characters (C<->). So, while it
188 :     is not the case that every region in a block must be the same
189 :     length, all of the snippets for a block must be the same length.
190 :     The snippets will be in alignment form. In other words, if the
191 :     region is reversed, the nucleotide letters will be the complement
192 :     in transcription order. (For example, if positions 20 through 25
193 :     of contig B<XXX> are C<AGCCTT>, then the snippet for C<XXX_25_20>
194 :     will be C<AAGGCT>.)
195 :    
196 :     =back
197 :    
198 :     =head2 Output File Notes
199 :    
200 :     =over 4
201 :    
202 :     =item Genome.dtx
203 :    
204 :     This file is a direct copy of the C<Genome.tbl> file; however, we
205 :     also use it to create a hash of genome IDs (C<%genomes>). The hash
206 :     is useful when validating the C<Region.tbl> file.
207 :    
208 :     =item Contig.dtx
209 :    
210 :     This file contains nothing but contig IDs. As contigs are
211 :     discovered from the C<Region.tbl> file their IDs are put
212 :     into the C<%contigs> hash. This hash maps contig IDs to
213 :     their parent genome IDs. When processing is complete,
214 :     this file is generated from the hash.
215 :    
216 :     =item GroupBlock.dtx
217 :    
218 :     This file describes the blocks. As records come in from
219 :     C<Region.tbl>, we build a hash called C<%blockData> that
220 :     contains our latest estimate of all the C<GroupBlock.dtx>
221 :     columns for the current block (with the exception of
222 :     B<variance>, which is computed by dividing the B<snip-count>
223 :     by the length (B<len>)).
224 :    
225 :     =item ConsistsOf.dtx
226 :    
227 :     This file maps genomes to contigs, and is generated from
228 :     the C<%contigs> hash built while reading the C<Region.tbl>
229 :     file.
230 :    
231 :     =item HasInstanceOf.dtx
232 :    
233 :     This file lists the genomes containing each block. The
234 :     C<Region.tbl> file is sorted by block. While inside a
235 :     block's section of the file, we use a hash called
236 :     C<%genomesFound> that contains the ID of every genome
237 :     found for the block. When we finish with a block,
238 :     we run through the C<%genomesFound> hash to produce
239 :     the block's B<HasInstanceOf> data.
240 :    
241 :     =item Region.dtx
242 :    
243 :     This file describes the contig regions in the blocks.
244 :     As the C<Region.tbl> file is read in, we build a
245 :     hash called C<%regionMap> that maps a region's
246 :     SEED-style location string to the DNA content.
247 :     When we finish with a block, the DNA content is
248 :     converted into an alignment by comparing it to
249 :     the block's pattern in C<%blockData>. (Essentially,
250 :     we only include the region's content for the
251 :     positions that vary between regions in the block.)
252 :     From this and the region string itself, we have
253 :     enough data to create the B<Region>
254 :     data.
255 :    
256 :     =item IncludesRegion.dtx
257 :    
258 :     This file maps group blocks to regions. The first column
259 :     is the block ID and the second column is the SEED-style
260 :     region string for the target region. This file is built
261 :     in parallel with C<Region.dtx>. It will have one record
262 :     for each region.
263 :    
264 :     =item ContainsRegion.dtx
265 :    
266 :     This file maps contigs to regions. The first column is
267 :     the contig ID and the second column is the SEED-style
268 :     location string for the region. It contains two redundant
269 :     columns used for sorting-- the region length (column 3)
270 :     and the left-most region position (column 4). This
271 :     file is built in parallel with C<Region.dtx>. It will
272 :     have one record for each region.
273 :    
274 :     =cut
275 :    
276 :     # Create a huge number we can use for an end-of-file
277 :     # indicator in the block ID.
278 :     my $TRAILER = 999999999;
279 :    
280 :     # Parse the command line.
281 :     my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,
282 :     logFile => "*",
283 :     load => 'yes',
284 :     generate => 'yes'},
285 :     @ARGV);
286 :     # Extract the directory name from the argument array.
287 :     my $inDirectory = $FIG_Config::simBlastData;
288 :     if ($arguments[0]) {
289 :     $inDirectory = Cwd::abs_path($arguments[0]);
290 :     }
291 :     # Set up tracing.
292 :     my $traceLevel = $options->{trace};
293 :     my $traceOption = $options->{logFile};
294 :     my $TracingToFile = ($traceOption ne "*");
295 :     my $traceDestination = ($TracingToFile ? ">$traceOption" : "TEXT");
296 :     TSetup("$traceLevel Tracer", $traceDestination);
297 :     # Get the output directory.
298 :     my $outDirectory = $FIG_Config::simBlocksData;
299 :     # Insure that it exists.
300 :     if (! -d $outDirectory) {
301 :     Output("Creating output directory $outDirectory.");
302 :     mkpath($outDirectory);
303 :     }
304 :     # Create an error counter.
305 :     my $errorCount = 0;
306 :     # Check to see if we should generate the output files.
307 :     if ($options->{generate} eq 'no') {
308 :     # Here we are to use existing output files.
309 :     Output("Existing database load files will be used.");
310 :     } else {
311 :     # Here we need to produce new output files.
312 :     # Verify that the input directory exists.
313 :     if (! -d $inDirectory) {
314 :     Confess("Input directory \"$inDirectory\" not found.");
315 :     }
316 :     # Our first task is to copy the genome data to the output directory
317 :     # and build the genome list.
318 :     my %genomes = ();
319 :     (open GENOMESIN, "<$inDirectory/Genome.tbl") ||
320 :     Confess("Could not open Genome input file in $inDirectory.");
321 :     (open GENOMESOUT, ">$outDirectory/Genome.dtx") ||
322 :     Confess("Could not open Genome output file in $outDirectory.");
323 : parrello 1.2 # Count the genomes read.
324 : parrello 1.1 my $genomeCount = 0;
325 :     # Loop through the input.
326 :     while (my $genomeData = <GENOMESIN>) {
327 :     # Echo the genome record to the output.
328 :     print GENOMESOUT $genomeData;
329 :     # Extract the genome ID.
330 :     my ($genomeID) = Tracer::ParseRecord($genomeData);
331 :     # Store it in the genomes hash. We start with a value of 0. If
332 :     # contig information for the genome is found, we change the value
333 :     # to 1. When we're all done with the regions, we can check the
334 :     # hash to insure all the genomes were represented in the input.
335 :     $genomes{$genomeID} = 0;
336 :     # Count this genome.
337 :     $genomeCount++;
338 :     }
339 :     Output("$genomeCount genomes found.");
340 :     # Close the files.
341 :     close GENOMESIN;
342 :     close GENOMESOUT;
343 :     # Create the contig hash used to associate contigs to their parent
344 :     # genomes.
345 :     my %contigs = ();
346 :     # Now we begin to read the Block and Region files in parallel. Both
347 :     # are sorted by block ID, so all processing for this section of the
348 :     # script is done a block at a time. The first task is to
349 :     # open the output files.
350 :     (open BLOCKSOUT, ">$outDirectory/GroupBlock.dtx") ||
351 :     Confess("Could not open block output file in $outDirectory.");
352 :     (open REGIONSOUT, ">$outDirectory/Region.dtx") ||
353 :     Confess("Could not open region output file in $outDirectory.");
354 :     (open INSTANCESOUT, ">$outDirectory/HasInstanceOf.dtx") ||
355 :     Confess("Could not open instance output file in $outDirectory.");
356 :     (open CONTAINSOUT, ">$outDirectory/ContainsRegion.dtx") ||
357 :     Confess("Could not open contig-region output file in $outDirectory.");
358 :     (open INCLUDESOUT, ">$outDirectory/IncludesRegion.dtx") ||
359 :     Confess("Could not open block-region output file in $outDirectory.");
360 :     # Determine which file sets we'll be processing.
361 :     my @fileSets = ();
362 :     my @prefixes = ("", "InterGenic_");
363 :     for my $prefix (@prefixes) {
364 :     if (-e "$inDirectory/${prefix}Block.tbl") {
365 :     push @fileSets, $prefix;
366 :     }
367 :     }
368 :     # Set up some counters.
369 :     my ($blocksCount, $regionsCount) = (0, 0);
370 :     # Loop through the useful file sets.
371 :     for my $fileSet (@fileSets) {
372 :     (open BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl") ||
373 :     Confess("Could not open block input file in $inDirectory.");
374 :     (open REGIONSIN, "<$inDirectory/${fileSet}Region.tbl") ||
375 :     Confess("Could not open region input file in $inDirectory.");
376 :     Output("Processing ${fileSet}Blocks.");
377 :     # The outer loop processes blocks. This is accomplished by reading
378 :     # through the block file. We prime the loop by reading the first
379 :     # region record. This is because we finish processing a block when
380 :     # the first record of the next block is found in the region file.
381 :     my %regionRecord = GetRegionRecord();
382 :     $regionsCount++;
383 :     while (my $blockRecord = <BLOCKSIN>) {
384 :     $blocksCount++;
385 :     # Parse the block record.
386 :     my ($blockID, $blockName, $pegID) = Tracer::ParseRecord($blockRecord);
387 :     # Create the block data for this block.
388 :     my %blockData = ( id => $blockID, description => $blockName );
389 :     # Initialize the tracking hashes. "genomesFound" tracks the
390 : parrello 1.2 # genomes whose contigs are represented by the block,
391 : parrello 1.1 # "regionMap" maps each region to its contents, and
392 :     # "regionPeg" maps each region to its PEG (if any).
393 :     my %genomesFound = ();
394 :     my %regionMap = ();
395 :     my %regionPeg = ();
396 :     # Count the number of regions found in this block.
397 :     my $regionCounter = 0;
398 :     # Loop through the regions in the block. Because of the way
399 :     # "GetRegionRecord" works, the "blockID" field will have an
400 :     # impossibly high value if we've hit end-of-file in the
401 :     # region input file.
402 :     while ($regionRecord{blockID} <= $blockID) {
403 :     # If this region's block ID is invalid, complain
404 :     # and skip it.
405 :     if ($regionRecord{blockID} != $blockID) {
406 :     Output("Block $regionRecord{blockID} in region record $regionsCount not found in block input file at record $blocksCount.");
407 :     $errorCount++;
408 :     } else {
409 :     # Here both files are in sync, which is good. The next step is
410 :     # to connect with the Genome and the Contig.
411 :     my $genomeID = $regionRecord{genomeID};
412 :     my $contigID = "$genomeID:$regionRecord{contigID}";
413 :     if (! exists $genomes{$genomeID}) {
414 :     Output("Genome $genomeID in region record $regionsCount not found in genome input file.");
415 :     $errorCount++;
416 :     } else {
417 :     # Denote this genome has an instance of this block.
418 :     $genomesFound{$genomeID} = 1;
419 :     # Denote this genome has occurred in the region file.
420 :     $genomes{$genomeID} = 1;
421 :     # Connect the contig to the genome.
422 :     $contigs{$contigID} = $genomeID;
423 :     # Now we need to process the snippet. First, we create a
424 :     # region string.
425 :     my $regionString = "${contigID}_$regionRecord{begin}_$regionRecord{end}";
426 :     # Next, we stuff the snippet and PEG in the region's hash entries.
427 :     my $snippet = $regionRecord{snippet};
428 :     $regionMap{$regionString} = $snippet;
429 :     $regionPeg{$regionString} = $regionRecord{peg};
430 :     # Check to see if this is the block's first snippet.
431 :     if (! exists $blockData{pattern}) {
432 :     # Here it is, so store the snippet as the pattern.
433 :     $blockData{pattern} = $snippet;
434 :     $blockData{"snip-count"} = 0;
435 :     $blockData{len} = length $snippet;
436 :     } elsif ($blockData{len} != length $snippet) {
437 :     # Here it is not the first, but the lengths do not match.
438 :     Output("Snippet for region record $regionsCount does not match block length $blockData{len}.");
439 :     $errorCount++;
440 :     } else {
441 :     # Here everything is legitimate, so we merge the new
442 :     # snippet into the pattern.
443 :     ($blockData{pattern}, $blockData{"snip-count"}) =
444 :     SimBlocks::MergeDNA($blockData{pattern}, $snippet);
445 :     }
446 :     }
447 :     # Count this region.
448 :     $regionCounter++;
449 :     }
450 :     # Get the next region record.
451 :     %regionRecord = GetRegionRecord();
452 :     }
453 :     # We have now processed all the regions in the block. Insure we found at least
454 :     # one.
455 :     if (! $regionCounter) {
456 :     Output("No regions found for block $blockID at $blocksCount in block input file.");
457 :     $errorCount++;
458 :     } else {
459 :     Trace("$regionCounter regions found in block $blockID.") if T(2);
460 :     # Write the block record.
461 :     my $variance = $blockData{"snip-count"} / $blockData{len};
462 :     print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},
463 :     $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
464 :     # Find all the variance points in the block pattern. We'll use them to create
465 :     # the content strings for each region.
466 :     my @positions = SimBlocks::ParsePattern($blockData{pattern});
467 :     # Loop through the regions, writing them out to the region output file.
468 :     for my $region (keys %regionMap) {
469 :     # Get the region's snips.
470 :     my $source = $regionMap{$region};
471 :     my $content = "";
472 :     for my $pos (@positions) {
473 :     $content .= substr $source, $pos, 1;
474 :     }
475 :     # Get the region's location data.
476 : parrello 1.2 my $location = BasicLocation->new($region);
477 : parrello 1.1 # Write this region to the output files.
478 :     print REGIONSOUT join("\t", $region, $location->Contig, $location->Dir,
479 :     $location->Length, $regionPeg{$region},
480 :     $location->Left, $content) . "\n";
481 :     print CONTAINSOUT join("\t", $location->Contig, $region,
482 :     $location->Length, $location->Left) . "\n";
483 :     print INCLUDESOUT join("\t", $blockID, $region);
484 :     }
485 :     # Finally, we need to connect this block to the genomes in which it occurs.
486 :     for my $genomeID (keys %genomesFound) {
487 :     print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";
488 :     }
489 :     # Count this block's regions.
490 :     $regionsCount += $regionCounter;
491 :     }
492 :     }
493 :     # Close the input files.
494 :     close BLOCKSIN;
495 :     close REGIONSIN;
496 :     }
497 :     # Close the output files.
498 :     close REGIONSOUT;
499 :     close BLOCKSOUT;
500 :     close INSTANCESOUT;
501 :     # All the block data has been written. Tell the user what we found.
502 :     Output("$blocksCount blocks processed, $regionsCount regions processed.");
503 :     # The next task is to write the genome/contig data. This is provided by the
504 :     # "%contigs" hash. First, we need to open the files.
505 :     my $contigsCount = 0;
506 :     (open CONTIGSOUT, ">$outDirectory/Contig.dtx") ||
507 :     Confess("Could not open contig output file in $outDirectory.");
508 :     (open CONSISTSOUT, ">$outDirectory/ConsistsOf.dtx") ||
509 :     Confess("Could not open consists output file in $outDirectory.");
510 :     for my $contigID (keys %contigs) {
511 :     print CONTIGSOUT "$contigID\n";
512 :     print CONSISTSOUT join("\t", $contigs{$contigID}, $contigID) . "\n";
513 :     $contigsCount++;
514 :     }
515 :     Output("$contigsCount contigs found.");
516 :     # Now warn the user about all the genomes that didn't have blocks.
517 :     for my $genomeID (keys %genomes) {
518 :     if (! $genomes{$genomeID}) {
519 :     Output("Genome $genomeID did not have any regions.");
520 :     $errorCount++;
521 :     }
522 :     }
523 :     }
524 :     # Check for errors.
525 :     if ($errorCount > 0) {
526 :     Output("$errorCount errors found in input files.");
527 :     } else {
528 :     # No errors, so it's okay to load the database.
529 :     if ($options->{load} eq 'yes') {
530 :     # Here we have no outstanding errors and the user wants us to load
531 :     # the database. First, we create a similarity block object.
532 :     my $simBlocks = SimBlocks->new();
533 :     # Use it to load the database. Note we specify that the tables are to be
534 :     # dropped and rebuilt.
535 :     $simBlocks->LoadTables($outDirectory, 1);
536 :     Output("Database loaded.");
537 :     }
538 :     }
539 :     # Tell the user we're done.
540 :     Output("Processing complete.");
541 :    
542 :     # Read a region record from the file and parse it into a hash
543 :     # for return to the caller. If we reach end-of-file, the
544 :     # hash returned will have $TRAILER in the blockID field.
545 :     sub GetRegionRecord {
546 :     # Create the return hash.
547 :     my %retVal = ();
548 :     # Read the record.
549 :     my $regionData = <REGIONSIN>;
550 :     # Check for end-of-file.
551 :     if (!defined $regionData) {
552 :     # Here we have end-of-file, so stuff in a trailer
553 :     # value for the block ID.
554 :     $retVal{blockID} = $TRAILER;
555 :     } else {
556 :     # Here we have a real record.
557 :     ($retVal{peg}, $retVal{genomeID}, $retVal{contigID},
558 :     $retVal{begin}, $retVal{end}, $retVal{blockID},
559 :     $retVal{snippet}) = Tracer::ParseRecord($regionData);
560 :     }
561 :     # Return the hash created.
562 :     return %retVal;
563 :     }
564 :    
565 :     # Write an output line. The output line will be traced at level 0. If
566 :     # $TracingToFile is set, it will be echoed to the standard output.
567 :     sub Output {
568 :     my ($message) = @_;
569 :     Trace($message) if T(0);
570 :     if ($TracingToFile) {
571 :     print "$message\n";
572 :     }
573 :     }
574 :    
575 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3