[Bio] / Sprout / SimLoad.pl Repository:
ViewVC logotype

Diff of /Sprout/SimLoad.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Jan 6 20:35:01 2006 UTC revision 1.8, Wed Feb 1 03:28:11 2006 UTC
# Line 4  Line 4 
4  use CGI;  use CGI;
5  use Tracer;  use Tracer;
6  use Genome;  use Genome;
7    use DBKernel;
8  use SimBlocks;  use SimBlocks;
9  use File::Path;  use File::Path;
10  use BasicLocation;  use BasicLocation;
# Line 18  Line 19 
19  database (B<load>).  database (B<load>).
20    
21  The script takes a single parameter-- a directory name.  The script takes a single parameter-- a directory name.
22  The default directory name is taken from the config file  The default directory name is C<"$FIG_Config::data/CloseStrainSets">.
23  parameter C<$fig_config::SimBlastData>. The output files  The input files should be in subdirectories called
24  will be produced in the similarity block data directory  C<Block> under the subdirectories of the input directory.
25  C<$fig_config::SimBlockData>, which will be created if  The subdirectory names themselves are considered the name
26  it does not exist. The input directory and all its  of the close-strain set. So, for example
27    
28        Data/CloseStrainSets/Vibrio/Block
29    
30    would be presumed to contain the Vibrio strains.
31    
32    The output files will be produced in the similarity block
33    data directory C<$fig_config::SimBlockData>, which will be
34    created if it does not exist. The input directory and all its
35  subdirectories will be processed for input files.  subdirectories will be processed for input files.
36    
37  In addition to the directory name, the following  In addition to the directory name, the following
# Line 33  Line 42 
42  =item trace  =item trace
43    
44  Trace level for output messages. A higher number means more  Trace level for output messages. A higher number means more
45  messages. The default is C<2>. Trace messages are sent to  messages. The default is C<3>. Trace messages are sent to
46  the file C<trace.log> in the B<$FIG_Config::tmp>  the file C<trace.log> in the B<$FIG_Config::tmp>
47  directory.  directory.
48    
49    =item sql
50    
51    If specified, SQL activity will be traced at the specified
52    trace level.
53    
54  =item load  =item load
55    
56  C<yes> to load the data into the database, else C<no>.  C<yes> to load the data into the database, else C<no>.
# Line 47  Line 61 
61  C<yes> to generate output files from input files, else  C<yes> to generate output files from input files, else
62  C<no>. The default is C<yes>.  C<no>. The default is C<yes>.
63    
64    =item createDB
65    
66    If specified, the database will be dropped and
67    re-created before loading.
68    
69  =back  =back
70    
71  For example, the following command line will process the  For example, the following command line will process the
72  input files in the C</Users/fig/BlastData> directory and  input files in the C</Users/fig/BlastData> directory tree
73  run at a trace level of 3.  and run at a trace level of 3.
74    
75  C<< SimLoad -trace=3 /Users/fig/BlastData >>  C<< SimLoad -trace=3 /Users/fig/BlastData >>
76    
# Line 63  Line 82 
82    
83  =head2 Input Directory  =head2 Input Directory
84    
85  The following files must exist in each input directory.  The following files must exist in each C<Block> directory
86    under the input directory.
87    
88  =over 4  =over 4
89    
90  =item Genome.tbl  =item genome.tbl
91    
92  This is a tab-delimited file that contains the ID of each  This is a tab-delimited file that contains the ID of each
93  genome followed by a description string.  genome followed by a description string.
94    
95  =item Block.tbl, InterGenic_Block.tbl  =item block.tbl, intergenic_block.tbl
96    
97  These are tab-delimited files that associate a gene name  These are tab-delimited files that associate a gene name
98  with each block. The InterGenic file is optional.  with each block. The InterGenic file is optional.
99    
100  =item Region.tbl, InterGenic_Region.tbl  =item region.tbl, intergenic_region.tbl
101    
102  These are tab-delimited files that describe each region  These are tab-delimited files that describe each region
103  of a block. The InterGenic file is optional.  of a block. The InterGenic file is optional.
# Line 86  Line 106 
106    
107  The format of each file is given below.  The format of each file is given below.
108    
109  =head3 Genome.tbl  =head3 genome.tbl
110    
111  The Genome file is copied almost unmodified to the  The Genome file is copied almost unmodified to the
112  load file for the B<Genome> entity. Each record  load file for the B<Genome> entity. Each record
# Line 104  Line 124 
124  A text description of the genome (usually the species name with  A text description of the genome (usually the species name with
125  a strain ID).  a strain ID).
126    
127    =teim groupName
128    
129    The name of the group to which the genome belongs.
130    
131  =back  =back
132    
133  =head3 Block.tbl, InterGenic_Block.tbl  =head3 block.tbl, intergenic_block.tbl
134    
135  These files produce most of the data found in the B<GroupBlock>  These files produce most of the data found in the B<GroupBlock>
136  entity. Each record represents a single block. Blocks either  entity. Each record represents a single block. Blocks either
# Line 129  Line 153 
153    
154  =back  =back
155    
156  =head3 Region.tbl, InterGenic_Region.tbl  =head3 region.tbl, intergenic_region.tbl
157    
158  These files describe the regions in each block. They are  These files describe the regions in each block. They are
159  used to derive the relationships between genomes and  used to derive the relationships between genomes and
# Line 280  Line 304 
304  my $TRAILER = 999999999;  my $TRAILER = 999999999;
305    
306  # Parse the command line.  # Parse the command line.
307  my ($options, @arguments) = Tracer::ParseCommand({ trace => 1,  my ($options, @arguments) = StandardSetup(['SimBlocks'],
308                                                    load => 'yes',                                                  { load => ['yes', "'no' to suppress loading the database"],
309                                                    generate => 'yes'},                                                    generate => ['yes', "'no' to suppress generating the load files"],
310                                                      createDB => [0, 'drop and create the database before loading']
311                                                    },
312                                                    "directoryName",
313                                                   @ARGV);                                                   @ARGV);
314  # Extract the directory name from the argument array.  # Extract the directory name from the argument array.
315  my $inDirectoryTree = $FIG_Config::simBlastData;  my $inDirectoryTree = ($arguments[0] ? Cwd::abs_path($arguments[0]) : "$FIG_Config::data/CloseStrainSets");
316  if ($arguments[0]) {  # Check to see if we need to create the database.
317      $inDirectoryTree = Cwd::abs_path($arguments[0]);  if ($options->{createDB}) {
318  }      Trace("Creating database.") if T(2);
319  # Set up tracing.      DBKernel::CreateDB($FIG_Config::simBlocksDB);
320  my $traceLevel = $options->{trace};  }
 TSetup("$traceLevel Tracer ERDB SimBlocks DBKernel SQL", "+>$FIG_Config::temp/trace.log");  
321  # Get the output directory.  # Get the output directory.
322  my $outDirectory = $FIG_Config::simBlocksData;  my $outDirectory = $FIG_Config::simBlocksData;
323  # Insure that it exists.  # Insure that it exists.
324  if (! -d $outDirectory) {  if (! -d $outDirectory) {
325      Trace("Creating output directory $outDirectory.") if T(2);      Trace("Creating output directory $outDirectory.") if T(2);
326      mkpath($outDirectory);      mkpath($outDirectory);
327  } else {  } elsif ($options->{generate} eq 'yes') {
328      # Here we have an output directory already. Clear any      # Here we have an output directory already and are going to generate new
329      # leftover data from previous runs.      # load files. Clear any leftover data from previous runs.
330      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);      my @files = grep { $_ =~ /.dtx$/ } Tracer::OpenDir($outDirectory);
331      my $numFiles = @files;      my $numFiles = @files;
332      if ($numFiles > 0) {      if ($numFiles > 0) {
# Line 322  Line 348 
348          Confess("Input directory \"$inDirectoryTree\" not found.");          Confess("Input directory \"$inDirectoryTree\" not found.");
349      }      }
350      # Loop through the subdirectories.      # Loop through the subdirectories.
351      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 0)) {      for my $inputDirectory (Tracer::OpenDir($inDirectoryTree, 1)) {
352          # Verify that this is a directory. If it's ".", we use          # Verify that this is a directory.
353          # the top-level directory.          my $inDirectory = "$inDirectoryTree/$inputDirectory/Blocks";
354          my $inDirectory = ($inputDirectory eq "." ? $inDirectoryTree :          if (-d $inDirectory) {
                                                     "$inDirectoryTree/$inputDirectory");  
         if (($inputDirectory !~ /\../) && -d $inDirectory) {  
355              # Here we have a directory to process. Check for a genome              # Here we have a directory to process. Check for a genome
356              # file.              # file.
357              my $genomeFileName = "$inDirectory/Genome.tbl";              my $genomeFileName = "$inDirectory/genome.tbl";
358              if (! -e $genomeFileName) {              if (! -e $genomeFileName) {
359                  Trace("$genomeFileName not found. Directory skipped.") if T(1);                  Trace("$genomeFileName not found. Directory skipped.") if T(1);
360              } else {              } else {
361                  # Now we can process the directory and accumulate the error                  # Now we can process the directory and accumulate the error
362                  # count.                  # count.
363                  $errorCount += ProcessDirectory($inDirectory, $outDirectory);                  $errorCount += ProcessDirectory($inDirectory, $outDirectory, $inputDirectory);
364                  $dirCount++;                  $dirCount++;
365              }              }
366          }          }
# Line 361  Line 385 
385    
386  # Process a single input directory.  # Process a single input directory.
387  sub ProcessDirectory {  sub ProcessDirectory {
388      my ($inDirectory, $outDirectory) = @_;      my ($inDirectory, $outDirectory, $groupName) = @_;
389      Trace("Processing directory $inDirectory.") if T(2);      Trace("Processing directory $inDirectory.") if T(2);
390      # Our first task is to copy the genome data to the output directory      # Our first task is to copy the genome data to the output directory
391      # and add them to the genome list.      # and add the genomes to the genome list.
392      my %genomes = ();      my %genomes = ();
393      Open(\*GENOMESIN, "<$inDirectory/Genome.tbl");      Open(\*GENOMESIN, "<$inDirectory/genome.tbl");
394      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");      Open(\*GENOMESOUT, ">>$outDirectory/Genome.dtx");
395      # Count the genomes read and errors found.      # Count the genomes read and errors found.
396      my $genomeCount = 0;      my $genomeCount = 0;
# Line 374  Line 398 
398      # Loop through the input.      # Loop through the input.
399      while (my $genomeData = <GENOMESIN>) {      while (my $genomeData = <GENOMESIN>) {
400          # Echo the genome record to the output.          # Echo the genome record to the output.
401          print GENOMESOUT $genomeData;          my @fields = Tracer::ParseRecord($genomeData);
402            print GENOMESOUT join("\t", @fields, $groupName). "\n";
403          # Extract the genome ID.          # Extract the genome ID.
404          my ($genomeID) = Tracer::ParseRecord($genomeData);          my $genomeID = $fields[0];
405          # Store it in the genomes hash. We start with a value of 0. If          # Store it in the genomes hash. We start with a value of 0. If
406          # contig information for the genome is found, we change the value          # contig information for the genome is found, we change the value
407          # to 1. When we're all done with the regions, we can check the          # to 1. When we're all done with the regions, we can check the
# Line 403  Line 428 
428      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");      Open(\*INCLUDESOUT, ">>$outDirectory/IncludesRegion.dtx");
429      # Determine which file sets we'll be processing.      # Determine which file sets we'll be processing.
430      my @fileSets = ();      my @fileSets = ();
431      my @prefixes = ("", "InterGenic_");      my @prefixes = ("", "intergenic_");
432      for my $prefix (@prefixes) {      for my $prefix (@prefixes) {
433          if (-e "$inDirectory/${prefix}Block.tbl") {          if (-e "$inDirectory/${prefix}block.tbl") {
434              push @fileSets, $prefix;              push @fileSets, $prefix;
435          }          }
436      }      }
# Line 415  Line 440 
440      my ($blocksCount, $regionsCount) = (0, 0);      my ($blocksCount, $regionsCount) = (0, 0);
441      # Loop through the useful file sets.      # Loop through the useful file sets.
442      for my $fileSet (@fileSets) {      for my $fileSet (@fileSets) {
443          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}Block.tbl");          Open(\*BLOCKSIN, "<$inDirectory/${fileSet}block.tbl");
444          Open(\*REGIONSIN, "<$inDirectory/${fileSet}Region.tbl");          Open(\*REGIONSIN, "<$inDirectory/${fileSet}region.tbl");
445          Trace("Processing ${fileSet}Blocks.") if T(2);          Trace("Processing ${fileSet}Blocks.") if T(2);
446          # The outer loop processes blocks. This is accomplished by reading          # The outer loop processes blocks. This is accomplished by reading
447          # through the block file. We prime the loop by reading the first          # through the block file. We prime the loop by reading the first
# Line 500  Line 525 
525                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);                  Trace("No regions found for block $blockID at $blocksCount in block input file.") if T(0);
526                  $errorCount++;                  $errorCount++;
527              } else {              } else {
528                  Trace("$regionCounter regions found in block $blockID.") if T(2);                  Trace("$regionCounter regions found in block $blockID.") if T(4);
529                  # Write the block record.                  # Write the block record. Note that the block ID is prefixed by the group name to
530                    # make it unique.
531                  my $variance = $blockData{"snip-count"} / $blockData{len};                  my $variance = $blockData{"snip-count"} / $blockData{len};
532                  print BLOCKSOUT join("\t", $blockID, $blockData{description}, $blockData{len},                  print BLOCKSOUT join("\t", "$groupName:$blockID", $blockData{description}, $blockData{len},
533                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";                                       $blockData{"snip-count"}, $variance, $blockData{pattern}) . "\n";
534                  # Find all the variance points in the block pattern. We'll use them to create                  # Find all the variance points in the block pattern. We'll use them to create
535                  # the content strings for each region.                  # the content strings for each region.
# Line 527  Line 553 
553                                            $regionPeg{$region}, $location->Left, $content) . "\n";                                            $regionPeg{$region}, $location->Left, $content) . "\n";
554                      print CONTAINSOUT join("\t", $location->Contig, $region,                      print CONTAINSOUT join("\t", $location->Contig, $region,
555                                             $location->Length, $location->Left) . "\n";                                             $location->Length, $location->Left) . "\n";
556                      print INCLUDESOUT join("\t", $blockID, $region);                      print INCLUDESOUT join("\t", "$groupName:$blockID", $region) . "\n";
557                  }                  }
558                  # Finally, we need to connect this block to the genomes in which it occurs.                  # Finally, we need to connect this block to the genomes in which it occurs.
559                  for my $genomeID (keys %genomesFound) {                  for my $genomeID (keys %genomesFound) {
560                      print INSTANCESOUT join("\t", $genomeID, $blockID) . "\n";                      print INSTANCESOUT join("\t", $genomeID, "$groupName:$blockID") . "\n";
561                  }                  }
562                  # Count this block's regions.                  # Count this block's regions.
563                  $regionsCount += $regionCounter;                  $regionsCount += $regionCounter;
# Line 558  Line 584 
584          $contigsCount++;          $contigsCount++;
585      }      }
586      Trace("$contigsCount contigs found.") if T(2);      Trace("$contigsCount contigs found.") if T(2);
587        # Close the output files.
588        close CONTIGSOUT;
589        close CONSISTSOUT;
590      # Now warn the user about all the genomes that didn't have blocks.      # Now warn the user about all the genomes that didn't have blocks.
591      for my $genomeID (keys %genomes) {      for my $genomeID (keys %genomes) {
592          if (! $genomes{$genomeID}) {          if (! $genomes{$genomeID}) {

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3