[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (download) (as text) (annotate)
Fri Jul 15 03:23:54 2005 UTC (14 years, 8 months ago) by redwards
Branch: MAIN
Changes since 1.15: +19 -0 lines
updating protein_info.cgi

use FIG;
use Tracer;

my $fig = new FIG;

=pod

=head1 load_attributes

load_attributes is used to reload the attributes file. You can run it with the following command line prompts
-v be verbose and -vv be very verbose. This may be superceded by the trace commands?
-flush


load_attributes begins by deleting the database tables for ALL attributes, and then reloads the data. This is NOT selective.

We then process through each of the genome directories according to $fig->genomes() and look for attributes in 




=cut


# there are several hidden utilities here. If you call the file:
# Filename		Output
# load_attributes 	nothing
# -v		information about those organisms that we add information about
# -vv		information about all organisms
# -flush	remove any "assigned attributes" file, so that only the base attributes are loaded.
# -link		take the links from proteins and store them as attributes
# -keep 	don't unlink the files. Be sure to unlink them yourself!

Trace("Parsing command line for load_attributes.") if T(2);
my ($verbose, $flush, $links, $keep)=(0,0,0,0);
while (@ARGV)
{
	my $try=shift(@ARGV);
	if ($try eq "-v") {$verbose=1}
	if ($try eq "-vv") {$verbose=2}
	if ($try eq "-flush") {$flush=1}
	if ($try eq "-link") {$link=1}
	if ($try eq "-keep") {$keep=1}
}


Trace("Recreating attribute table.") if T(2);
my $dbf = $fig->db_handle;
$dbf->drop_table( tbl => "attribute" );
$dbf->create_table( tbl => 'attribute',
		    flds => "fid varchar(64), tag varchar(64), val text, url text"
		    );

Trace("Processing genomes.") if T(2);
my(%kv,$genome);
foreach $genome ($fig->genomes)
 {
	Trace("Processing $genome.") if T(3);
	# I have rewritten this to allow the following things:
   	# 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
	# 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
	# 3. in 2, above, we don't know what * is. It should minimally be pegs and tRNA's but there is no reason it is not
	#	something else.

    undef %kv;
    my $stderr; # error messages that show progress, but I only want to print if we have something interesting to say
    # The files that we will read for this genome are stored in @allfiles.
    my @allfiles;
    
    # Find the genome attributes files.
    $stderr .= "Reading from ". $fig->genus_species($genome). " ($genome)\n";
    
    my $dir = "$FIG_Config::organisms/$genome/Attributes";
    if (opendir(ATTR,$dir))
    {
       my @files = grep { $_ !~ /^\./ } readdir(ATTR);
       $stderr .= "\tFound ". (scalar @files) . " genome attributes\n";
       closedir(ATTR);
      
       if ($flush) { 
			if (-e "$dir/assigned_attributes") {
				`rm -f $dir/assigned_attributes`
			}
		}
		# here is an alternate way of making assigned_attributes at the end of the list if the file exists
		# my speculation is that this is so these overwrite all other attributes, si?
		push @allfiles,  map  { $_ = "$dir/$_"} grep { $_ ne "assigned_attributes" } @files;
		push @allfiles, "$dir/assigned_attributes" if (-e "$dir/assigned_attributes"); 
    }
    
    # Now find the other attributes files
    # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
   
    opendir(FEATURES, "$FIG_Config::organisms/$genome/Features") || die "Can't open dir $FIG_Config::organisms/$genome/Features/";
    foreach my $dir (readdir(FEATURES)) {
		next if ($dir =~ /^\./);
		if ($flush) {
			if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes") {
				`rm -f $FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes`;
			}
		} 
		if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes" && opendir(ATTR, "$FIG_Config::organisms/$genome/Features/$dir/Attributes"))
		{
			my @files = grep { $_ !~ /^\./ } readdir(ATTR);
			closedir(ATTR);
				next unless (scalar @files); # you could have an empty attributes dir, no problem
			$stderr .= "\tFound ". (scalar @files) . " attributes files in $dir\n";
				push @allfiles,  map  { $_ = "$FIG_Config::organisms/$genome/Features/$dir/Attributes/$_" } grep { $_ ne "assigned_attributes" } @files;
			push @allfiles, "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes" 
				if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes");
		}
		if ($link && -e "$FIG_Config::organisms/$genome/Features/$dir/$dir.links")
		{
			# we are going to parse the links into a temporary file, and then read them
			# at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases 
			# and I am not sure why they are in links.
			# I am just going to keep the pubmed links for now
			# however, I am going to parse out any pubmed link that may be for the genome article.
			# this will be done by removing any article with some large number of hits
			open (IN, "$FIG_Config::organisms/$genome/Features/$dir/$dir.links") || die "Can't open $FIG_Config::organisms/$genome/Features/$dir/$dir.links";
			my $output;
			while (<IN>)
			{
				next unless (/pubmed/i);
				chomp;
				m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
				unless ($1 && $2 && $3)
				{
				  print STDERR "Error parsing\n>>>$_<<<\n";
				  next;
				}
				my ($peg, $url, $val)=($1, $2, $3);
				$val =~ s/pubmed\s+//i;
				push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
			}
			# only output if we want to keep it
			if ($output)
			{
				open (OUT, ">>$FIG_Config::temp/linkstmp$$") || die "Can't open $FIG_Config::temp/linkstmp$$ for writing";
				foreach my $key (keys %$output)
				{
					next if (scalar @{$output->{$key}} > 100);
					print OUT @{$output->{$key}};
				}
				close OUT;
			}       
		}
    }

    push @allfiles, "$FIG_Config::temp/linkstmp$$" if (-e "$FIG_Config::temp/linkstmp$$");

    # now read each of the files and save the data
    foreach my $file (@allfiles) 
    {
        if ((-s "$file") && open(TMPATTR,"<$file"))
		{
            while (defined($_ = <TMPATTR>))
			{
				chomp;
				# allow comments
				next if (/^\s*\#/);
				my ($id, $tag, $val, $url)=split /\t/; # we can allow spaces in the elements
				if ($id && $tag)
				{
					$tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);
					if ($val)
					{
						push @{$kv{"$id\t$tag"}}, "$val\t$url\n";
					}
					else
					{
						delete $kv{"$id\t$tag"};
					}
				}
				else
				{
					print STDERR "There was an error parsing $_ from $file\n";
					next;
				}
			}
			close(TMPATTR);
		}
    }
    
    # delete the temp file that has the links data in it
    unlink("$FIG_Config::temp/linkstmp$$");
    
    if ($verbose == 1 && scalar keys %kv) 
    {
		print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
    }
    elsif ($verbose == 2) {
		print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
    }

    if (open(TMPATTR,"| sort -u > $FIG_Config::temp/load_attributes_tmp$$"))
    {
	    my($pegK,$k,$v);
	    foreach $pegK (sort keys(%kv))
	    {
			($peg,$k) = split(/\t/,$pegK);
			foreach my $l (@{$kv{$pegK}})
			{
				$l =~ s/\\././g; # there is a problem with load_table seeing \.\) and not knowing the end of the record !?!
				print TMPATTR "$peg\t$k\t$l";
			}
	    }
	    close(TMPATTR);
	    my $result = $dbf->load_table( tbl => "attribute",
			      file => "$FIG_Config::temp/load_attributes_tmp$$" );
	    if ($verbose == 2) {
		print STDERR "Got $result for ", $fig->genus_species($genome), "\n";
	    }
	    if (!$keep) 
	    {
		unlink("$FIG_Config::temp/load_attributes_tmp$$");
	    }
    }
    else
    {
	    die "$FIG_Config::temp/tmp$$";
    }
}
Trace("Creating index.") if T(2);
$dbf->create_index( idx  => "attribute_fid_ix",
		    tbl  => "attribute",
		    type => "btree",
		    flds => "fid" 
		  );
Trace("Attributes loaded.") if T(2);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3