[Bio] / FigKernelScripts / gather_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gather_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (14 years, 4 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.7: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#__perl__

use FIG;
use Tracer;
use strict;

my $fig = new FIG;

=pod

=head1 Gather attributes

Look through the sources for all the atttributes file, find the attributes, and print them out. This can then be piped into sort -u to clean up dups.

This script takes two optional parameters:
-f <filename> will put the output to the file named. Otherwise output will be to STDOUT
-d is a boolean. If present the files will be "deleted" once used. Actually, they will be moved to FIG_Config::temp/Attributes/deleted_attributes/
-t is for testing and it will only run through a few genomes

=cut 

my ($file, $delete,$testing)=(">-", 0,0);
while (@ARGV) {
 my $t=shift @ARGV;
 if ($t eq "-f") {$file=shift @ARGV}
 elsif ($t eq "-d") {$delete = "$FIG_Config::temp/Attributes/deleted_attributes/"}
 elsif ($t eq "-t") {$testing=1}
}

my $attributesFH;  my @tlogs;
open($attributesFH, $file) || die "can't open $file for writing";

foreach my $genome ($fig->genomes)
{
   if ($testing) {next unless ($genome eq 158879.1 || $genome eq 9986.1 || $genome eq 83333.1)}
   # if we are keeping the file, this will be incremented so that we don't overwrite each genome
   
   #Trace("Processing $genome.") if T(3);
   # I have rewritten this to allow the following things:
   # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
   # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
    
   my $dir = "$FIG_Config::organisms/$genome/Attributes";
   $delete && `mkdir -p $delete/$genome/Attributes/`;
   # note that this grep ignores emacs editing files and . and ..
   my @move;
   map 
   {
    my $f=$_; # what in this block is setting $_? Is it the &parse_file_to_temp ?
    $f eq "transaction_log" ? push @tlogs, "$FIG_Config::organisms/$genome/Attributes/$f" : 
            &parse_file_to_temp("$FIG_Config::organisms/$genome/Attributes/$f", $attributesFH);
    $delete && `mv $FIG_Config::organisms/$genome/Attributes/$f $delete/$genome/Attributes/`;
   } 
   grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,$dir));

   # Now find the other attributes files
   # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
   
   my $fattdir="$FIG_Config::organisms/$genome/Features";
   
   opendir(FEATURES, $fattdir) || die "Can't open dir $fattdir";
   foreach my $dir (grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(FEATURES))
   {
     $delete && `mkdir -p $delete/$genome/Features/$dir/Attributes`; # if we are going to "delete" the files, make the place to move them to.
     &links_file("$fattdir/$dir/$dir.links", $attributesFH) if (-e "$fattdir/$dir/$dir.links");
     map 
     {
      my $f=$_;
      $f eq "transaction_log" ? push @tlogs, "$fattdir/$dir/Attributes/$f" :
            &parse_file_to_temp("$fattdir/$dir/Attributes/$f", $attributesFH);
      $delete && `mv $fattdir/$dir/Attributes/$f $delete/$genome/Features/$dir/Attributes/$f`;
     } 
     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,"$fattdir/$dir/Attributes"));
   }
}

# now we need to load the global attributes files
$delete && (`mkdir -p $delete/Global/Attributes/`);
if (opendir(DIR, "$FIG_Config::global/Attributes/")) 
{
 map 
 {
  $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::global/Attributes/$_" :
  $_ eq "attribute_keys" ? 1 :
        &parse_file_to_temp("$FIG_Config::global/Attributes/$_", $attributesFH);
 $delete && `mv $FIG_Config::global/Attributes/$_ $delete/Global/Attributes/`;
 }
 grep {$_ !~ /^\./ && $_ !~ /^\#/} readdir(DIR);
}
	 

# finally parse the transaction_log files
&parse_transaction_logs(\@tlogs) if (scalar(@tlogs));


exit(0);


=head3 links_file()

Read the links and write them to the output filehandle provided. Requires two arguments - the links file and the filehandle where they should be written to

=cut

sub links_file {
   # we are going to parse the links into a temporary file, and then read them
   # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases 
   # and I am not sure why they are in links.
   # I am just going to keep the pubmed links for now
   # however, I am going to parse out any pubmed link that may be for the genome article.
   # this will be done by removing any article with some large number of hits
   my ($links_file, $write_to)=@_;
   return unless (-e $links_file);
   
   open (IN, $links_file) || die "Can't open $links_file";
   my $output;
   while (<IN>)
   {
     next unless (/pubmed/i);
     chomp;
     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
     unless ($1 && $2 && $3) {print STDERR "Error parsing\n>>>$_<<<\n"; next}
     my ($peg, $url, $val)=($1, $2, $3);
     $val =~ s/pubmed\s+//i;
     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
   }
   # only output if we want to keep it
   if ($output)
   {
      foreach my $key (keys %$output)
      {
	 next if (scalar @{$output->{$key}} > 100);
	 print $write_to @{$output->{$key}};
      }
   }
}



=head2 parse_file_to_temp()

This method takes two arguments, the name of a file to read and a filehandle to write to. The file is opened, comments and blank lines are ignored, a couple of tests are applied, and the data is written to the filehandle.

=cut

sub parse_file_to_temp {
 my ($from, $to)=@_;
 return unless ($from);
 unless ($to) {open ($to, ">-")} #open $to to STDOUT

 open (IN, "$from") || return 0;
 while (<IN>) 
 {
  chomp;
  next if (/^\s*\#/); # ignore comments
  next if (/^\s*$/); # ignore blanks or whitespace only lines

  # there is a problem with periods. They can not be escaped or else the dbh load will fail. We unescape them here
  s/\\\./\./g;
  my @line=split /\t/;
  unless ($line[0]) {print STDERR "No ID at line in $from at:\n$_"; next}
  unless ($line[1]) {print STDERR "No key at line in $from at:\n$_"; next}
  #clean the key
  $line[1] =  $fig->clean_attribute_key($line[1]);
  if (length($line[1]) > 64) {print STDERR "Key is longer than 64 characters in $from at:\n$_"; next}
  unless (defined $line[3]) {$line[3] = ""}
  print $to (join "\t", @line) . "\n";
 }
}

=head2 parse_transaction_logs()

This method takes a reference to an array of paths to transactions_logs and will read and process them

=cut

sub parse_transaction_logs {
 my $logs=shift;
 return unless $logs;
 foreach my $l (@$logs) {
  $fig->read_attribute_transaction_log($l);
 }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3