[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (download) (as text) (annotate)
Wed Apr 13 00:03:52 2005 UTC (14 years, 11 months ago) by redwards
Branch: MAIN
Changes since 1.12: +1 -0 lines
fixed load_attributes. Hopefully

use FIG;
my $fig = new FIG;

# there are several hidden utilities here. If you call the file:
# Filename		Output
# load_attributes 	nothing
# -v		information about those organisms that we add information about
# -vv		information about all organisms
# -flush	remove any "assigned attributes" file, so that only the base attributes are loaded.
# -link		take the links from proteins and store them as attributes
# -keep 	don't unlink the files. Be sure to unlink them yourself!

my ($verbose, $flush, $links, $keep)=(0,0,0,0);
while (@ARGV)
{
 my $try=shift(@ARGV);
 if ($try eq "-v") {$verbose=1}
 if ($try eq "-vv") {$verbose=2}
 if ($try eq "-flush") {$flush=1}
 if ($try eq "-link") {$link=1}
 if ($try eq "-keep") {$keep=1}
}



my $dbf = $fig->{_dbf};
$dbf->drop_table( tbl => "attribute" );
$dbf->create_table( tbl => 'attribute',
		    flds => "fid varchar(64), tag varchar(64), val text, url text"
		    );

my(%kv,$genome);
foreach $genome ($fig->genomes)
{
   # I have rewritten this to allow the following things:
   	# 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
	# 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
	# 3. in 2, above, we don't know what * is. It should minimally be pegs and tRNA's but there is no reason it is not
	#	something else.

    undef %kv;
    my $stderr; # error messages that show progress, but I only want to print if we have something interesting to say
    # The files that we will read for this genome are stored in @allfiles.
    my @allfiles;
    
    # Find the genome attributes files.
    $stderr .= "Reading from ". $fig->genus_species($genome). " ($genome)\n";
    
    my $dir = "$FIG_Config::organisms/$genome/Attributes";
    if (opendir(ATTR,$dir))
    {
       my @files = grep { $_ !~ /^\./ } readdir(ATTR);
       $stderr .= "\tFound ". (scalar @files) . " genome attributes\n";
       closedir(ATTR);
      
       if ($flush) { 
         if (-e "$dir/assigned_attributes") {`rm -f $dir/assigned_attributes`}
       }
       # here is an alternate way of making assigned_attributes at the end of the list if the file exists
       # my speculation is that this is so these overwrite all other attributes, si?
       push @allfiles,  map  { $_ = "$dir/$_"} grep { $_ ne "assigned_attributes" } @files;
       push @allfiles, "$dir/assigned_attributes" if (-e "$dir/assigned_attributes"); 
    }
    
    # Now find the other attributes files
    # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
   
    opendir(FEATURES, "$FIG_Config::organisms/$genome/Features") || die "Can't open dir $FIG_Config::organisms/$genome/Features/";
    foreach my $dir (readdir(FEATURES)) {
     next if ($dir =~ /^\./);
     if ($flush) {
       if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes") {
	 `rm -f $FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes`;
       }
     } 
     if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes" && opendir(ATTR, "$FIG_Config::organisms/$genome/Features/$dir/Attributes"))
     {
 	my @files = grep { $_ !~ /^\./ } readdir(ATTR);
	closedir(ATTR);
        next unless (scalar @files); # you could have an empty attributes dir, no problem
	$stderr .= "\tFound ". (scalar @files) . " attributes files in $dir\n";
        push @allfiles,  map  { $_ = "$FIG_Config::organisms/$genome/Features/$dir/Attributes/$_" } grep { $_ ne "assigned_attributes" } @files;
	push @allfiles, "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes" 
	    if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes");
     }
     if ($link && -e "$FIG_Config::organisms/$genome/Features/$dir/$dir.links")
     {
       # we are going to parse the links into a temporary file, and then read them
       # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases 
       # and I am not sure why they are in links.
       # I am just going to keep the pubmed links for now

       # however, I am going to parse out any pubmed link that may be for the genome article.
       # this will be done by removing any article with some large number of hits
       open (IN, "$FIG_Config::organisms/$genome/Features/$dir/$dir.links") || die "Can't open $FIG_Config::organisms/$genome/Features/$dir/$dir.links";
       my $output;
       while (<IN>)
       {
         next unless (/pubmed/i);
	 chomp;
	 m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
	 unless ($1 && $2 && $3)
	 {
	   print STDERR "Error parsing\n>>>$_<<<\n";
	   next;
	 }
	 my ($peg, $url, $val)=($1, $2, $3);
	 $val =~ s/pubmed\s+//i;
	 push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
       }
       # only output if we want to keep it
       if ($output)
       {
         open (OUT, ">>$FIG_Config::temp/linkstmp$$") || die "Can't open $FIG_Config::temp/linkstmp$$ for writing";
         foreach my $key (keys %$output)
	 {
	   next if (scalar @{$output->{$key}} > 100);
	   print OUT @{$output->{$key}};
	 }
	 close OUT;
       }
       
     }
    }

    push @allfiles, "$FIG_Config::temp/linkstmp$$" if (-e "$FIG_Config::temp/linkstmp$$");

    # now read each of the files and save the data
    foreach my $file (@allfiles) 
    {
         if ((-s "$file") && open(TMPATTR,"<$file"))
	 {
             while (defined($_ = <TMPATTR>))
             {
	         chomp;
		 # allow comments
		 next if (/^\s*\#/);
		 my ($id, $tag, $val, $url)=split /\t/; # we can allow spaces in the elements
		 if ($id && $tag)
		 {
		     $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);
		     if ($val)
		     {
		         push @{$kv{"$id\t$tag"}}, "$val\t$url\n";
		     }
	             else
		     {
			 delete $kv{"$id\t$tag"};
		     }
		 }
		 else
		 {
		     print STDERR "There was an error parsing $_ from $file\n";
		     next;
		 }
	     }
	     close(TMPATTR);
	 }
    }
    
    # delete the temp file that has the links data in it
    unlink("$FIG_Config::temp/linkstmp$$");
    
    if ($verbose == 1 && scalar keys %kv) 
    {
     print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
    }
    elsif ($verbose == 2) {
     print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
    }

    if (open(TMPATTR,">$FIG_Config::temp/load_attributes_tmp$$"))
    {
	    my($pegK,$k,$v);
	    foreach $pegK (sort keys(%kv))
	    {
		($peg,$k) = split(/\t/,$pegK);
		foreach my $l (@{$kv{$pegK}})
		{
		   $l =~ s/\\././g; # there is a problem with load_table seeing \.\) and not knowing the end of the record !?!
		   print TMPATTR "$peg\t$k\t$l";
		}
	    }
	    close(TMPATTR);
	    my $result = $dbf->load_table( tbl => "attribute",
			      file => "$FIG_Config::temp/load_attributes_tmp$$" );
            if ($verbose == 2) {print STDERR "Got $result for ", $fig->genus_species($genome), "\n"}
            if (!$keep) 
	     {
	      unlink("$FIG_Config::temp/load_attributes_tmp$$");
	    }
    }
    else
    {
	    die "$FIG_Config::temp/tmp$$";
    }
}
$dbf->create_index( idx  => "attribute_fid_ix",
		    tbl  => "attribute",
		    type => "btree",
		    flds => "fid" 
		  );

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3