[Bio] / Sprout / TagCount.pl Repository:
ViewVC logotype

View of /Sprout/TagCount.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Sat May 10 17:35:57 2008 UTC (11 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2008_06_18, rast_rel_2008_06_16, rast_rel_2008_08_07, rast_rel_2008_07_21, mgrast_rel_2008_0806
Changes since 1.3: +4 -2 lines
Added ability to specify an alternate database and a counter for features changed.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 NMPDR Keyword Analysis

This script reads through the Feature table and converts the keywords
to stems. Only keywords made up entirely of letters will be converted.
A hash will be created that contains a count of the number of features
having each stem. This is loaded into an index table used to give the
user a list of alternate words after a search.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Trace messages are written directly to the standard
output as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=item dbname

Database name. Because this script modifies the database, it is sometimes useful
to run it on something other than the main database instance. The default is
to use the main instance.

=back

=cut

use strict;
use Tracer;
use Sprout;
use Cwd;
use File::Copy;
use File::Path;

# Get the command-line options and parameters. Each entry in the option table
# maps an option name to a [default value, help text] pair.
#
# The "max" option is consumed further down when the LIMIT clause for the
# feature query is built, so it is declared here to give it a default and a
# help string. The default of 0 means "no limit", which matches the behavior
# when the option is absent.
# NOTE(review): presumably StandardSetup only accepts options that appear in
# this table -- confirm against Tracer.
my ($options, @parameters) = StandardSetup([qw(ERDB) ],
                                           {
                                              dbname => [$FIG_Config::sproutDB, "name of the sprout database to use"],
                                              max => [0, "maximum number of features to process (0 for no limit)"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information. It is filled in once the
# eval below has finished and is included in the completion text message.
my $rtype;
# Ensure we catch errors. All of the database work runs inside this eval so
# that a failure is traced and reported instead of killing the script outright.
eval {
    # Get a SPROUT object.
    my $sprout = Sprout->new($options->{dbname});
    # Compute a LIMIT clause for the maximum number of features to process.
    # The value is interpolated directly into the query filter, so we insist
    # that it be a plain unsigned integer before using it.
    my $limit = "";
    my $max = $options->{max};
    if ($max) {
        if ($max !~ /^\d+$/) {
            die "Invalid value \"$max\" for the max option: it must be an unsigned integer.";
        }
        $limit = "LIMIT $max";
    }
    # Query all the features.
    Trace("Retrieving features.") if T(2);
    my $features = $sprout->Get(['Feature'], "ORDER BY Feature(id) $limit", []);
    # These are counters we use to display progress.
    my $count = 0;
    my $stemCount = 0;
    my $updateCount = 0;
    # The keywords will be stored in this hash. It maps each stem to the
    # number of features in which the stem appears at least once.
    my %keystems = ();
    # Loop through the features.
    while (my $feature = $features->Fetch()) {
        # Count this feature.
        $count++;
        # Get the feature ID and keywords.
        my ($id, $keywords) = $feature->Values(['Feature(id)', 'Feature(keywords)']);
        # Tell the user our progress.
        if ($count % 1000 == 0 && T(3)) {
            Trace("$count features processed. Current ID is $id. $stemCount keystems found. $updateCount updates.");
        }
        # Break up the keyword list into individual words. Splitting on a
        # single-space string (rather than /\s+/) discards leading white space,
        # so a keyword string that begins with a blank does not produce an
        # empty pseudo-keyword. A feature with no keyword value at all is
        # treated as having an empty list.
        my @keywords = (defined $keywords ? split(' ', $keywords) : ());
        my $wordCount = scalar @keywords;
        Trace("$wordCount keywords found for $id.") if T(4);
        # This hash is used to remember which stems have already been computed for this feature.
        my %localStems = ();
        # This flag will be TRUE if we need to update the feature table entry.
        my $changed = 0;
        # This will be the new keyword list.
        my @newKeywords = ();
        # Loop through the words.
        for my $keyword (@keywords) {
            # Compute the stem for this keyword.
            my $keystem = $sprout->Stem($keyword);
            # Is this word stemmable?
            if (! defined $keystem) {
                # No, so add the original word to the new keyword list.
                push @newKeywords, $keyword;
            } else {
                # Yes, so add the stem to the new keyword list.
                push @newKeywords, $keystem;
                # Is it different from the original word?
                if ($keystem ne $keyword) {
                    # Yes, we have a change.
                    $changed = 1;
                }
                # Is it new for this feature? Each stem is counted at most
                # once per feature, no matter how often it occurs.
                if (! exists $localStems{$keystem}) {
                    # Yes. Count it as a new stem if we have never seen it in
                    # any feature, then bump its feature count.
                    $stemCount++ if ! exists $keystems{$keystem};
                    $keystems{$keystem}++;
                    # Make sure we don't count it again for this feature.
                    $localStems{$keystem} = 1;
                }
            }
        }
        # Check for a change.
        if ($changed) {
            # Yes, update the feature table with the new keywords.
            $updateCount++;
            $sprout->UpdateEntity(Feature => $id, { keywords => join(' ', @newKeywords) });
        }
    }
    # Tell the user what we did.
    Trace("$count features processed. $stemCount keystems found, $updateCount updates.") if T(2);
    # Now we create the keyword table.
    $sprout->CreateTable(Keyword => 1, $stemCount);
    # This is for progress display.
    $count = 0;
    for my $keystem (sort keys %keystems) {
        # Insert this keyword into the table.
        $sprout->InsertObject(Keyword => { id => $keystem, count => $keystems{$keystem} });
        # Count it.
        $count++;
        if ($count % 1000 == 0 && T(3)) {
            Trace("$count of $stemCount keystems inserted.");
        }
    }
    Trace("Processing complete.") if T(2);
};
# Save the error text immediately. $@ is a global, and any eval executed
# while we are tracing could reset it before we get to display it.
my $error = $@;
if ($error) {
    Trace("Script failed with error: $error") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# If a phone number was supplied, send a text message with the outcome.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "NMPDR Tag Counter terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3