[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (download) (as text) (annotate)
Wed Jan 4 16:28:16 2006 UTC (14 years, 2 months ago) by overbeek
Branch: MAIN
Changes since 1.31: +4 -4 lines
RAE: ignore files ending ~

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use FIG;
use Tracer;
use strict;

my $fig = new FIG;

=pod

=head1 load_attributes

load_attributes is used to reload the attributes file. You can run it with the following command line prompts
-v be verbose and -vv be very verbose. This may be superceded by the trace commands?
-links include the links as attributes. At the moment only pubmed ID's are loaded as links.
-keep keeps the temporary files. Usually the temporary files are deleted, but this will keep (at least some of) them
-noglobal normally attributes in $FIG::Config::global are also processed. This will ignore those
-force will remove non [a-zA-Z0-9_] characters from key names before adding the key


load_attributes begins by deleting the database tables for ALL attributes, and then reloads the data. This is NOT selective.

We then process through each of the genome directories according to $fig->genomes() and look for attributes in each directory. These are written to a temporary file and then loaded.

Note that key names can only contain the characters matched by the \w method (i.e. [a-zA-Z0-9_])

=cut

my ($verbose, $flush, $links, $keep, $doglobal, $force)=(0,0,0,0,1,0);
while (@ARGV)
{
	my $try=shift(@ARGV);
	if ($try eq "-v") {$verbose=1}
	elsif ($try eq "-vv") {$verbose=2}
	elsif ($try eq "-links") {$links=1}
	elsif ($try eq "-keep") {$keep=1}
	elsif ($try eq "-noglobal") {$doglobal=0}
	elsif ($try eq "-force") {$force=1}
}

Trace("Deleting and Recreating attribute table.") if T(2);
my $dbf = $fig->db_handle;
$dbf->drop_table( tbl => "attribute" );
$dbf->create_table( tbl => 'attribute', flds => "genome varchar(64), ftype varchar(64), id varchar(64), tag varchar(64), val text, url text");
$dbf->drop_table( tbl => "attribute_metadata" );
$dbf->create_table( tbl => 'attribute_metadata', flds => "attrkey varchar(64), metakey varchar(64), metaval text");

my @tlogs; # we are going to store any transaction_logs we encounter here, and then process them at the end
my @akeys; # we are going to store any attributes metadata we encounter here, and then process them at the end

# This is where we are going to store all the attributes data, and then we will load it all at once
foreach my $genome ($fig->genomes)
#foreach my $genome (qw[158879.1 9986.1 83333.1])  # this is for testing so we only load 2 genomes!
{
   # if we are keeping the file, this will be incremented so that we don't overwrite each genome
   my $filecount=1;
   while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {$filecount++}
   
   my $attributesFH; 
   open($attributesFH, ">$FIG_Config::temp/load_attributes.$$.$genome.$filecount") 
              || die "can't open $FIG_Config::temp/load_attributes.$$.$genome.$filecount for writing";
   my %kv;
   # I have rewritten this to allow the following things:
   # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
   # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
    
   my $dir = "$FIG_Config::organisms/$genome/Attributes";
   # note that this grep ignores emacs editing files and . and ..
   map 
   {
    $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::organisms/$genome/Attributes/$_" : 
    ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys, "$FIG_Config::organisms/$genome/Attributes/$_" :
            &parse_file_to_temp("$FIG_Config::organisms/$genome/Attributes/$_", $attributesFH);
   } 
   grep { $_ !~ /\~$/ && $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,$dir));
    
   # Now find the other attributes files
   # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
   
   my $fattdir="$FIG_Config::organisms/$genome/Features";
   opendir(FEATURES, $fattdir) || die "Can't open dir $fattdir";
   foreach my $dir (grep {$_ !~ /\~$/ && $_ !~ /^\./ && $_ !~ /^\#/ } readdir(FEATURES))
   {
     &links_file("$fattdir/$dir/$dir.links", $attributesFH) if ($links && -e "$fattdir/$dir/$dir.links");
     map 
     {
      $_ eq "transaction_log" ? push @tlogs, "$fattdir/$dir/Attributes/$_" :
      ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys,"$fattdir/$dir/Attributes/$_" :
            &parse_file_to_temp("$fattdir/$dir/Attributes/$_", $attributesFH);
     } 
     grep {$_ !~ /\~$/ && $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,"$fattdir/$dir/Attributes"));
   }
   
   close($attributesFH);
   #remove the file if it has zero size. No need in continuing
   if (!-s "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount"); next}
   
   # finally load all the attributes
   my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/load_attributes.$$.$genome.$filecount" );
   if ($verbose && $result != -1) {print STDERR "Got $result for ", $fig->genus_species($genome), " ($genome) while trying to load.\n"}
   if (!$keep) {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount")}
}

# now we need to load the global attributes files
if ($doglobal) 
{ 
 if (opendir(DIR, "$FIG_Config::global/Attributes/")) 
 {
  my $globalFH;
  open($globalFH, ">$FIG_Config::temp/global_attributes") || die "Can't open $FIG_Config::temp/global_attributes for writing";
  map 
  {
   $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::global/Attributes/$_" :
   ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys,"$FIG_Config::global/Attributes/$_" :
         &parse_file_to_temp("$FIG_Config::global/Attributes/$_", $globalFH);
  }
  grep {$_ !~ /\~$/ && $_ !~ /^\./ && $_ !~ /^\#/} readdir(DIR);
  close $globalFH;
 }
 my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/global_attributes" ) if (-e "$FIG_Config::temp/global_attributes");
 if ($verbose) {print STDERR "Got $result for $FIG_Config::temp/global_attributes\n"}
 if (!$keep) {unlink("$FIG_Config::temp/global_attributes")}
}
else {print STDERR "$FIG_Config::global/Attributes/ was not parsed\n"}
	   

# finally parse the transaction_log files and attributes_metadata
&parse_transaction_logs(\@tlogs) if (scalar(@tlogs));
&parse_attributes_metadata(\@akeys) if (scalar(@akeys));


Trace("Creating index.") if T(2);
# rob messing with indexes
# fields are now : genome ftype id key val url
$dbf->create_index( idx  => "attribute_genome_ix", tbl  => "attribute", type => "btree", flds => "id,genome,ftype");
$dbf->create_index( idx  => "attribute_genome_ftype_ix", tbl  => "attribute", type => "btree", flds => "genome, ftype");
$dbf->create_index( idx  => "attribute_key_ix", tbl  => "attribute", type => "btree", flds => "tag" );
#$dbf->create_index( idx  => "attribute_val_ix", tbl  => "attribute", type => "btree", flds => "val");
#$dbf->create_index( idx  => "attribute_metadata_ix", tbl  => "attribute_metadata", type => "btree", flds => "attrkey, metakey, metaval");
$dbf->create_index( idx  => "attribute_metadata_ix", tbl  => "attribute_metadata", type => "btree", flds => "attrkey, metakey");


Trace("Attributes loaded.") if T(2);
exit(0);


=head3 links_file()

Read the links and write them to the output filehandle provided. Requires two arguments - the links file and the filehandle where they should be written to

=cut

sub links_file {
   # we are going to parse the links into a temporary file, and then read them
   # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases 
   # and I am not sure why they are in links.
   # I am just going to keep the pubmed links for now
   # however, I am going to parse out any pubmed link that may be for the genome article.
   # this will be done by removing any article with some large number of hits
   my ($links_file, $write_to)=@_;
   return unless (-e $links_file);
   
   open (IN, $links_file) || die "Can't open $links_file";
   my $output;
   while (<IN>)
   {
     next unless (/pubmed/i);
     chomp;
     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
     unless ($1 && $2 && $3) {print STDERR "Error parsing\n>>>$_<<<\n"; next}
     my ($peg, $url, $val)=($1, $2, $3);
     $val =~ s/pubmed\s+//i;
     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
   }
   # only output if we want to keep it
   if ($output)
   {
      foreach my $key (keys %$output)
      {
	 next if (scalar @{$output->{$key}} > 100);
	 print $write_to @{$output->{$key}};
      }
   }
}



=head2 parse_file_to_temp()

This method takes two arguments, the name of a file to read and a filehandle to write to. The file is opened, comments and blank lines are ignored, a couple of tests are applied, and the data is written to the filehandle.

=cut

sub parse_file_to_temp {
 my ($from, $to)=@_;
 return unless ($from);
 unless ($to) {open ($to, ">-")} #open $to to STDOUT

 if ($verbose == 2) {print STDERR "Parsing from $from to $to\n"}
 open (IN, "$from") || die "can't open $from for reading";
 while (<IN>) 
 {
  if (/\r.*\r/) {print STDERR "The file $from appears to have multiple \\r delimters. Please remove these, as this file has been skipped\n"; next}
  s/\r//;# catch lines that end \r\n
  chomp;
  next if (/^\s*\#/); # ignore comments
  next if (/^\s*$/); # ignore blanks or whitespace only lines

  # there is a problem with periods. They can not be escaped or else the dbh load will fail. We unescape them here
  s/\\\./\./g;
  my @line=split /\t/;
  unless ($line[0]) {if ($verbose) {print STDERR "No ID at line in $from at:\n$_\n"} next}
  unless ($line[1]) {if ($verbose) {print STDERR "No key at line in $from at:\n$_\n"} next}
  unless ($line[2]) {if ($verbose) {print STDERR "No value at line in $from at:\n$_\n"} next} # can you have a key without a value?
  if (length($line[1]) > 64) {print STDERR "Key is longer than 64 characters in $from at:\n$_\n"; next}
  if ($#line ==2) {$line[3]=''}
  unless ($#line == 3) {print STDERR "Line \n$_\n in $from have more than 4 columns. You are only allowed feature, key, value, and url\n"; next}
  # clean the key
  if ($line[1] =~ /\W/ && !$force) 
  {
   print STDERR "the key: $line[1]  from $from has characters that are not [a-zA-Z0-9] and _. Please correct this or use the force option to clean the key\n";
   next;
  }
  elsif ($line[1] =~ /\W/ && $force) 
  {
   $line[1] =  $fig->clean_attribute_key($line[1]);
  }
  
  # replace the first element in  the line with the split feature as appropriate
  splice(@line, 0, 1, $fig->split_attribute_oid($line[0]));
  unless (defined $line[3]) {$line[3] = ""}
  print $to (join "\t", @line) . "\n";
 }
}

=head2 parse_transaction_logs()

This method takes a reference to an array of paths to transactions_logs and will read and process them

=cut

sub parse_transaction_logs {
 my $logs=shift;
 return unless $logs;
 foreach my $l (@$logs) 
 {
  if ($verbose) {print STDERR "Parsing transaction log $l\n"}
  $fig->read_attribute_transaction_log($l);
 }
}


=head2 parse_attributes_metadata()

This method takes a reference to an array of attributes metadata files and loads them into the database. It will also rename attribute_keys to attribute_metadata to be consistent and hopefully clearer.

=cut

sub parse_attributes_metadata {
 my $akeys=shift;
 return unless ($akeys);
 
 # first we are going to see if we need to rename or append any files
 my %attributekeys;
 foreach my $ak (@$akeys) 
 {
  # rename attribute_keys to attribute_metadata by appending to a file in case there is more data there.
  if ($ak =~ /attribute_keys$/) 
  {
   my $location=$fig->update_attributes_metadata($ak);
   $attributekeys{$location}=1;
  }
  else
  {
   $attributekeys{$ak}=1;
  }
 }
 
 foreach my $ak (keys %attributekeys) 
 {
  if ($verbose) {print STDERR "Parsing attribute metadata $ak\n"}
  open(IN, $ak) || die "Can't open $ak";
  while (<IN>) 
  {
   next if (/^\s*\#/);
   chomp;
   my @line=split /\t/;
   # here we pass in the attribute key (line[0]) and a reference to an array with metakey and key info
   $fig->key_info($line[0], {$line[1]=>$line[2]}, 1);
  }
 }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3