[Bio] / Sprout / TagCount.pl Repository:
ViewVC logotype

View of /Sprout/TagCount.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Sep 9 21:02:10 2008 UTC (11 years, 3 months ago) by parrello
Branch: MAIN
CVS Tags: rast_release_2008_09_29, rast_2008_0924, rast_rel_2008_09_30, rast_rel_2008_10_29, mgrast_rel_2008_0923, mgrast_rel_2008_0924, rast_rel_2009_02_05, mgrast_rel_2008_0625, rast_rel_2008_12_18, mgrast_rel_2008_1110_v2, rast_rel_2008_10_09, mgrast_rel_2008_0919, mgrast_rel_2008_1110, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2009_03_26, rast_rel_2008_11_24
Changes since 1.4: +58 -81 lines
Changes for v24 database.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 NMPDR Keyword Analysis

This script reads through the Feature table and converts the keywords
to stems. An analysis of the stems is displayed in table form when it completes.
This script is for testing only. It will stop working in version 24.
The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Trace output is written directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=item dbname

Database name. The default is to use the main instance.

=item limit

Maximum number of features to process per genome. This allows
testing the facility without generating millions of results.

=back

=cut

use strict;
use Tracer;
use Sprout;
use Cwd;
use File::Copy;
use File::Path;
use BioWords;

# Describe the options that are specific to this script. Each entry maps an
# option name to a two-element list: the default value followed by the help
# text shown for that option.
my %scriptOptions = (
    limit  => ["", "if specified, the maximum number of features to process per genome"],
    dbname => [$FIG_Config::sproutDB, "name of the sprout database to use"],
    phone  => ["", "phone number (international format) to call when load finishes"],
);
# Parse the command line. The result is a hash reference of option values
# followed by any remaining positional parameters.
# NOTE(review): the first argument is presumably the list of tracing
# categories and the empty string the positional-parameter description --
# confirm against StandardSetup's documentation.
my ($options, @parameters) = StandardSetup([qw(BioWords)], \%scriptOptions, "", @ARGV);
# Set a variable to contain return type information. It is filled in once the
# main processing below has either completed or failed.
my $rtype;
# Ensure we catch errors. The block ends with a true value so that success can
# be detected from the eval's own return value; $@ alone is not a reliable
# indicator because it can be reset while the failed block is being unwound.
my $succeeded = eval {
    # Get a SPROUT object.
    my $sprout = Sprout->new($options->{dbname}, { xmlFileName => "$FIG_Config::sproutData/OldSproutDBD.xml" });
    # Compute a LIMIT clause for the maximum number of features to process
    # per genome. An empty or zero limit means "no limit".
    my $limit = "";
    if ($options->{limit}) {
        # The value is spliced directly into the query filter string below,
        # so refuse anything that is not a plain string of digits.
        if ($options->{limit} !~ /^\d+\z/) {
            die "Invalid limit \"$options->{limit}\": a positive whole number is required.\n";
        }
        $limit = "LIMIT $options->{limit}";
    }
    # Get access to the stemmer.
    my $biowords = BioWords->new();
    # These are counters we use to display progress: the total number of
    # features seen and the number of distinct stems found.
    my $count = 0;
    my $stemCount = 0;
    # The keyword stems will be stored in this hash. Each stem maps to the
    # number of times it was encountered.
    my %keystems = ();
    # Get all the genomes.
    my @genomes = sort $sprout->Genomes();
    for my $genome (@genomes) {
        Trace("Retrieving features for $genome.") if T(3);
        # Query this genome's features, subject to the optional limit.
        my $features = $sprout->Get(['HasFeature', 'Feature'],
                                    "HasFeature(from-link) = ? $limit", [$genome]);
        # Per-genome counters: features processed and real words found.
        my ($myCount, $wordsCount) = (0, 0);
        # Loop through the features.
        while (my $feature = $features->Fetch()) {
            # Count this feature.
            $myCount++;
            # Get the feature ID and keywords.
            my ($id, $keywords) = $feature->Values(['Feature(id)', 'Feature(keywords)']);
            # Process the keyword list and extract the real words (we need to count them).
            # The grep is evaluated in scalar context, so it yields the number of matches.
            my $wordCount = grep { $biowords->IsWord($_) } $biowords->AnalyzeSearchExpression($keywords);
            Trace("$wordCount keywords found for $id.") if T(4);
            $wordsCount += $wordCount;
            # Extract the stems and count them. A stem whose count was zero
            # (or absent) before the increment is new, so it also bumps the
            # distinct-stem counter.
            for my $stem ($biowords->StemList()) {
                $stemCount++ if ! $keystems{$stem}++;
            }
            # Tell the user our progress.
            if ($myCount % 1000 == 0 && T(3)) {
                Trace("$myCount features processed. Current ID is $id. $stemCount keystems found.");
            }
        }
        # Count this genome's features.
        Trace("$myCount features and $wordsCount words found for $genome.") if T(3);
        $count += $myCount;
    }
    # Tell the user what we did.
    Trace("$count features processed. $stemCount keystems found.") if T(2);
    # Now we display our results.
    Trace("Retrieving word list.") if T(2);
    my $words = $biowords->WordList();
    Trace("Word list found.") if T(2);
    for my $word (@{$words}) {
        # Get the data and trace it.
        my ($stem, $phonex) = $biowords->StemLookup($word);
        # The lookup may not yield both values, and a stem may never have
        # been counted above; substitute harmless defaults so the report
        # line is produced without "uninitialized value" warnings.
        $stem = "" if ! defined $stem;
        $phonex = "" if ! defined $phonex;
        my $uses = (exists $keystems{$stem} ? $keystems{$stem} : 0);
        Trace("$word = $stem, $phonex, $uses") if T(0);
    }
    Trace("Processing complete.") if T(2);
    # Signal success to the enclosing eval.
    1;
};
if (! $succeeded) {
    # Capture the error text immediately, before anything else can alter $@.
    my $error = $@ || "unknown error";
    Trace("Script failed with error: $error") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# If a phone number was supplied on the command line, send it a text message
# describing how the run ended, then trace whether the message went out.
my $phone = $options->{phone};
if ($phone) {
    my $msgID = Tracer::SendSMS($phone, "NMPDR Tag Counter terminated with $rtype.");
    # A true message ID means the send was accepted; anything else is a failure.
    my $outcome = ($msgID ? "Phone message sent with ID $msgID." : "Phone message not sent.");
    Trace($outcome) if T(2);
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3