[Bio] / Sprout / SaplingFunctionCheck.pl Repository:
ViewVC logotype

View of /Sprout/SaplingFunctionCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Mon Oct 27 18:10:27 2014 UTC (4 years, 3 months ago) by parrello
Branch: MAIN
New method to fix functional assignments.

#!/usr/bin/perl -w

=head1 Sapling Function Update Script

This script reads the assigned function files in a SEED organism directory
and verifies that the Sapling features have the correct functions.
Any functions that do not match will be updated.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be written directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item host

Alternate database host, if the database is located somewhere other than the
default. This is necessary on some Sapling machines to ensure we get a writable
copy of the database.

=item port

Alternate MySQL port (used for debugging).

=item orgDir

The organism directory to use. The default is $FIG_Config::organisms.

=item dbName

Alternate database name.

=back

=cut

use strict;
use Tracer;
use Sapling;
use Stats;

use SaplingFunctionLoader;

# Main script body.
#
# For every genome in the Sapling database, read the genome's
# "assigned_functions" file from the SEED organism directory, compare each
# feature's function against the value stored in Sapling, and pass any
# mismatches to SaplingFunctionLoader::Load so the database is updated.
# Run statistics are accumulated in $stats and traced at the end.

# Get the command-line options and parameters. The script takes no positional
# parameters, so the parameter help string is empty.
my ($options, @parameters) = StandardSetup([qw(SaplingDataLoader) ],
                                           { host => ["", "alternate database host machine"],
                                             dbName => ["", "alternate database name"],
                                             port => ["", "alternate database port"],
                                             orgDir => [$FIG_Config::organisms, "SEED organism directory"] },
                                           "",
                                           @ARGV);
# Create the statistics object.
my $stats = Stats->new();
# Insure we catch errors. Any failure inside the block is reported below and
# the statistics gathered so far are still displayed.
eval {
    # Get the Sapling database.
    my $sap = Sapling->new(dbhost => $options->{host}, dbName => $options->{dbName}, port => $options->{port});
    # Get the organism directory.
    my $orgDir = $options->{orgDir};
    # Get the genome list. This maps each genome ID to its scientific name.
    my %genomes = map { $_->[0] => $_->[1] } $sap->GetAll('Genome', '', [], 'id scientific-name');
    # Loop through the genomes.
    for my $genome (sort keys %genomes) {
        $stats->Add(genomes => 1);
        # Check for the functions file.
        my $funFileName = "$orgDir/$genome/assigned_functions";
        if (! -f $funFileName) {
            $stats->Add(genomeNotInSeed => 1);
            Trace("Genome $genome - $genomes{$genome}: no assigned functions found.") if T(1);
        } else {
            Trace("Processing $genome: $genomes{$genome}.") if T(2);
            # Read in the assigned functions. A later line for the same feature
            # ID overwrites an earlier one.
            Trace("Reading $funFileName.") if T(3);
            my %functions;
            my $fh = Tracer::Open(undef, "<$funFileName", "Could not open assigned functions for $genome");
            while (! eof $fh) {
                my ($fid, $fun) = Tracer::GetLine($fh);
                $stats->Add(functionLineIn => 1);
                # Skip blank lines and lines without a feature ID so they do
                # not create a bogus hash entry.
                next if ! defined $fid || $fid eq '';
                # A line with no function column is treated as an empty function
                # so the comparison below does not operate on undef.
                $functions{$fid} = (defined $fun ? $fun : '');
            }
            # We are done with the file; release the handle before the
            # database work starts.
            close $fh;
            # Get the functions in the database.
            Trace("Polling functions in Sapling.") if T(3);
            my %sapFuns = map {$_->[0] => $_->[1] } $sap->GetAll('Feature', 'Feature(id) LIKE ?', ["fig|$genome.%"],
                    "id function");
            $stats->Add(sapFunctionIn => scalar(keys %sapFuns));
            # Compare the function lists. Only features already present in
            # Sapling are examined; features that exist only in the SEED file
            # are ignored.
            Trace("Comparing functions.") if T(3);
            my %updates;
            for my $fid (keys %sapFuns) {
                if (! exists $functions{$fid}) {
                    $stats->Add(seedFunctionNotFound => 1);
                } elsif ($functions{$fid} ne $sapFuns{$fid}) {
                    $stats->Add(seedFunctionChanged => 1);
                    $updates{$fid} = [$functions{$fid}, 'seed'];
                } else {
                    $stats->Add(seedFunctionMatches => 1);
                }
            }
            my $found = scalar(keys %updates);
            if ($found) {
                # We have updates to apply.
                Trace("Processing updates to $genome.") if T(3);
                my $newStats = SaplingFunctionLoader::Load($sap, \%updates);
                # Roll the loader's statistics into the run totals so the
                # final report reflects the updates that were made.
                # NOTE(review): assumes Load returns a Stats object, as the
                # variable name suggests -- confirm against SaplingFunctionLoader.
                $stats->Accumulate($newStats) if ref $newStats;
            }
        }
    }
    Trace("Processing complete.") if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3