[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.25 - (download) (as text) (annotate)
Mon Aug 1 22:02:22 2005 UTC (14 years, 8 months ago) by overbeek
Branch: MAIN
Changes since 1.24: +4 -0 lines
update evidence code writeup

use FIG;
use Tracer;
use strict;

my $fig = new FIG;

=pod

=head1 load_attributes

load_attributes is used to reload the attributes file. You can run it with the following command line prompts
-v be verbose and -vv be very verbose. This may be superceded by the trace commands?
-links include the links as attributes. At the moment only pubmed ID's are loaded as links.
-keep keeps the temporary files. Usually the temporary files are deleted, but this will keep (at least some of) them
-noglobal normally attributes in $FIG::Config::global are also processed. This will ignore those

load_attributes begins by deleting the database tables for ALL attributes, and then reloads the data. This is NOT selective.

We then process through each of the genome directories according to $fig->genomes() and look for attributes in each directory. These are written to a temporary file and then loaded.



=cut

my ($verbose, $flush, $links, $keep, $doglobal)=(0,0,0,0,1);
while (@ARGV)
{
	my $try=shift(@ARGV);
	if ($try eq "-v") {$verbose=1}
	elsif ($try eq "-vv") {$verbose=2}
	elsif ($try eq "-links") {$links=1}
	elsif ($try eq "-keep") {$keep=1}
	elsif ($try eq "-noglobal") {$doglobal=0}
}


Trace("Deleting and Recreating attribute table.") if T(2);
my $dbf = $fig->db_handle;
$dbf->drop_table( tbl => "attribute" );
$dbf->create_table( tbl => 'attribute', flds => "fid varchar(64), tag varchar(64), val text, url text");

my @tlogs; # we are going to store any transaction_logs we encounter here, and then process them at the end

# This is where we are going to store all the attributes data, and then we will load it all at once
foreach my $genome ($fig->genomes)
#foreach my $genome (qw[158879.1 9986.1 83333.1])  # this is for testing so we only load 2 genomes!
{
   # if we are keeping the file, this will be incremented so that we don't overwrite each genome
   my $filecount=1;
   while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {$filecount++}
   
   my $attributesFH; 
   open($attributesFH, ">$FIG_Config::temp/load_attributes.$$.$genome.$filecount") 
              || die "can't open $FIG_Config::temp/load_attributes.$$.$genome.$filecount for writing";
   my %kv;
#<<<<<<< load_attributes.pl
   #Trace("Processing $genome.") if T(3); # how do we turn this off from the command line?
#=======
   #Trace("Processing $genome.") if T(3);
#>>>>>>> 1.24
   # I have rewritten this to allow the following things:
   # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
   # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
    
   my $dir = "$FIG_Config::organisms/$genome/Attributes";
   # note that this grep ignores emacs editing files and . and ..
   map 
   {
    $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::organisms/$genome/Attributes/$_" : 
            &parse_file_to_temp("$FIG_Config::organisms/$genome/Attributes/$_", $attributesFH);
   } 
   grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,$dir));
    
   # Now find the other attributes files
   # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
   
   my $fattdir="$FIG_Config::organisms/$genome/Features";
   opendir(FEATURES, $fattdir) || die "Can't open dir $fattdir";
   foreach my $dir (grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(FEATURES))
   {
     &links_file("$fattdir/$dir/$dir.links", $attributesFH) if ($links && -e "$fattdir/$dir/$dir.links");
     map 
     {
      $_ eq "transaction_log" ? push @tlogs, "$fattdir/$dir/Attributes/$_" :
            &parse_file_to_temp("$fattdir/$dir/Attributes/$_", $attributesFH);
     } 
     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,"$fattdir/$dir/Attributes"));
   }
   
   close($attributesFH);
   #remove the file if it has zero size. No need in continuing
   if (!-s "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount"); next}
   
   # finally load all the attributes
   my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/load_attributes.$$.$genome.$filecount" );
   if ($verbose) {print STDERR "Got $result for ", $fig->genus_species($genome), "\n"}
   if (!$keep) {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount")}
}

# now we need to load the global attributes files
if ($doglobal) 
{ 
 if (opendir(DIR, "$FIG_Config::global/Attributes/")) 
 {
  my $globalFH;
  open($globalFH, ">$FIG_Config::temp/global_attributes") || die "Can't open $FIG_Config::temp/global_attributes for writing";
  map 
  {
   $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::global/Attributes/$_" :
   $_ eq "attribute_keys" ? 1 :
         &parse_file_to_temp("$FIG_Config::global/Attributes/$_", $globalFH);
  }
  grep {$_ !~ /^\./ && $_ !~ /^\#/} readdir(DIR);
  close $globalFH;
 }
 my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/global_attributes" ) if (-e "$FIG_Config::temp/global_attributes");
 if ($verbose) {print STDERR "Got $result for $FIG_Config::temp/global_attributes\n"}
 if (!$keep) {unlink("$FIG_Config::temp/global_attributes")}
}
else {print STDERR "$FIG_Config::global/Attributes/ was not parsed\n"}
	   

# finally parse the transaction_log files
&parse_transaction_logs(\@tlogs) if (scalar(@tlogs));


Trace("Creating index.") if T(2);
$dbf->create_index( idx  => "attribute_fid_ix",
		   tbl  => "attribute",
		   type => "btree",
		   flds => "fid" 
		 );
Trace("Attributes loaded.") if T(2);
exit(0);


=head3 links_file()

Read the links and write them to the output filehandle provided. Requires two arguments - the links file and the filehandle where they should be written to

=cut

sub links_file {
   # we are going to parse the links into a temporary file, and then read them
   # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases 
   # and I am not sure why they are in links.
   # I am just going to keep the pubmed links for now
   # however, I am going to parse out any pubmed link that may be for the genome article.
   # this will be done by removing any article with some large number of hits
   my ($links_file, $write_to)=@_;
   return unless (-e $links_file);
   
   open (IN, $links_file) || die "Can't open $links_file";
   my $output;
   while (<IN>)
   {
     next unless (/pubmed/i);
     chomp;
     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
     unless ($1 && $2 && $3) {print STDERR "Error parsing\n>>>$_<<<\n"; next}
     my ($peg, $url, $val)=($1, $2, $3);
     $val =~ s/pubmed\s+//i;
     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
   }
   # only output if we want to keep it
   if ($output)
   {
      foreach my $key (keys %$output)
      {
	 next if (scalar @{$output->{$key}} > 100);
	 print $write_to @{$output->{$key}};
      }
   }
}



=head2 parse_file_to_temp()

This method takes two arguments, the name of a file to read and a filehandle to write to. The file is opened, comments and blank lines are ignored, a couple of tests are applied, and the data is written to the filehandle.

=cut

sub parse_file_to_temp {
 my ($from, $to)=@_;
 return unless ($from);
 unless ($to) {open ($to, ">-")} #open $to to STDOUT

 if ($verbose == 2) {print STDERR "Parsing from $from to $to\n"}
 open (IN, "$from") || die "can't open $from for reading";
 while (<IN>) 
 {
  chomp;
  s/\r/\n/;
  next if (/^\s*\#/); # ignore comments
  next if (/^\s*$/); # ignore blanks or whitespace only lines

  # there is a problem with periods. They can not be escaped or else the dbh load will fail. We unescape them here
  s/\\\./\./g;
  my @line=split /\t/;
  unless ($line[0]) {print STDERR "No ID at line in $from at:\n$_"; next}
  unless ($line[1]) {print STDERR "No key at line in $from at:\n$_"; next}
  if (length($line[1]) > 64) {print STDERR "Key is longer than 64 characters in $from at:\n$_"; next}
  if ($#line ==2) {$line[3]=''}
  unless ($#line == 3) {print STDERR "Lines in $from have more than 4 columns. You are only allowed feature, key, value, and url\n"; next}
  # clean the key
  $line[1] =  $fig->clean_attribute_key($line[1]);
  unless (defined $line[3]) {$line[3] = ""}
  print $to (join "\t", @line) . "\n";
 }
}

=head2 parse_transaction_logs()

This method takes a reference to an array of paths to transactions_logs and will read and process them

=cut

sub parse_transaction_logs {
 my $logs=shift;
 return unless $logs;
 foreach my $l (@$logs) {
  if ($verbose) {print STDERR "Parsing transaction log $l\n"}
  $fig->read_attribute_transaction_log($l);
 }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3