[Bio] / Sprout / TagCount.pl Repository:
ViewVC logotype

Annotation of /Sprout/TagCount.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     =head1 NMPDR Keyword Analysis
21 :    
22 : parrello 1.2 This script reads through the Feature table and converts the keywords
23 : parrello 1.5 to stems. Analysis of the stems is displayed in table form when it completes.
24 :     The currently-supported command-line options are as follows. This script
25 :     is for testing only. It will stop working in version 24.
26 : parrello 1.1
27 :     =over 4
28 :    
29 :     =item user
30 :    
31 :     Name suffix to be used for log files. If omitted, the PID is used.
32 :    
33 :     =item trace
34 :    
35 :     Numeric trace level. A higher trace level causes more messages to appear. The
36 :     default trace level is 2. Tracing will be directed to the standard output
37 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
38 :     where I<User> is the value of the B<user> option above.
39 :    
40 :     =item sql
41 :    
42 :     If specified, turns on tracing of SQL activity.
43 :    
44 :     =item background
45 :    
46 :     Save the standard and error output to files. The files will be created
47 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
48 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
49 :     B<user> option above.
50 :    
51 :     =item h
52 :    
53 :     Display this command's parameters and options.
54 :    
55 :     =item phone
56 :    
57 :     Phone number to message when the script is complete.
58 :    
59 : parrello 1.3 =item dbname
60 :    
61 : parrello 1.5 Database name. The default is to use the main instance.
62 :    
63 :     =item limit
64 :    
65 :     Maximum number of features to process per genome. This allows
66 :     testing the facility without generating millions of results.
67 : parrello 1.3
68 : parrello 1.1 =back
69 :    
70 :     =cut
71 :    
72 :     use strict;
73 :     use Tracer;
74 : parrello 1.3 use Sprout;
75 : parrello 1.1 use Cwd;
76 :     use File::Copy;
77 :     use File::Path;
78 : parrello 1.5 use BioWords;
79 : parrello 1.1
# Parse the command-line options and positional parameters. This script
# defines no positional parameters, so @parameters is not used below.
my ($options, @parameters) = StandardSetup([qw(BioWords)],
    {
        limit  => ["", "if specified, the maximum number of features to process per genome"],
        dbname => [$FIG_Config::sproutDB, "name of the sprout database to use"],
        phone  => ["", "phone number (international format) to call when load finishes"],
    },
    "",
    @ARGV);
# Describes the outcome ("error" / "no error") in the optional phone message.
my $rtype;
# Run the whole analysis inside an eval so that a failure is traced and the
# phone notification at the bottom of the script is still sent.
eval {
    # Compute a LIMIT clause for the maximum number of features to process
    # per genome. The value is spliced directly into the filter text passed
    # to the database, so anything other than a plain non-negative integer is
    # rejected. An empty value or 0 means "no limit".
    my $limit = "";
    if ($options->{limit}) {
        if ($options->{limit} !~ /\A\d+\z/) {
            die "Invalid limit \"$options->{limit}\": must be a non-negative integer.";
        }
        $limit = "LIMIT $options->{limit}";
    }
    # Get a SPROUT object.
    my $sprout = Sprout->new($options->{dbname},
                             { xmlFileName => "$FIG_Config::sproutData/OldSproutDBD.xml" });
    # Get access to the stemmer.
    my $biowords = BioWords->new();
    # Progress counters: total features seen and number of distinct stems.
    my $count = 0;
    my $stemCount = 0;
    # Maps each keyword stem to the number of times it was returned by the
    # stemmer across all processed features.
    my %keystems = ();
    # Process the genomes in sorted order.
    my @genomes = sort $sprout->Genomes();
    for my $genome (@genomes) {
        Trace("Retrieving features for $genome.") if T(3);
        my $features = $sprout->Get(['HasFeature', 'Feature'],
                                    "HasFeature(from-link) = ? $limit", [$genome]);
        # Per-genome counters: features processed and real words found.
        my ($myCount, $wordsCount) = (0, 0);
        # Loop through the features.
        while (my $feature = $features->Fetch()) {
            # Count this feature.
            $myCount++;
            # Get the feature ID and keywords.
            my ($id, $keywords) = $feature->Values(['Feature(id)', 'Feature(keywords)']);
            # Analyze the keyword list and count the entries the stemmer
            # considers real words. The grep is evaluated in scalar context,
            # so it yields the number of matches.
            my $wordCount = grep { $biowords->IsWord($_) } $biowords->AnalyzeSearchExpression($keywords);
            Trace("$wordCount keywords found for $id.") if T(4);
            $wordsCount += $wordCount;
            # Extract the stems and count them. A stem seen for the first
            # time also bumps the distinct-stem counter.
            my @stems = $biowords->StemList();
            for my $stem (@stems) {
                $stemCount++ if ! exists $keystems{$stem};
                $keystems{$stem}++;
            }
            # Tell the user our progress every 1000 features.
            if ($myCount % 1000 == 0 && T(3)) {
                Trace("$myCount features processed. Current ID is $id. $stemCount keystems found.");
            }
        }
        # Add this genome's features to the grand total.
        Trace("$myCount features and $wordsCount words found for $genome.") if T(3);
        $count += $myCount;
    }
    # Tell the user what we did.
    Trace("$count features processed. $stemCount keystems found.") if T(2);
    # Now we display our results.
    Trace("Retrieving word list.") if T(2);
    my $words = $biowords->WordList();
    Trace("Word list found.") if T(2);
    for my $word (@{$words}) {
        # Get the data and trace it.
        my ($stem, $phonex) = $biowords->StemLookup($word);
        # A stem that was never counted is reported as 0 instead of
        # interpolating an undefined value (which warns under -w).
        # NOTE(review): whether WordList can return such words depends on
        # BioWords internals -- confirm.
        my $occurrences = (exists $keystems{$stem} ? $keystems{$stem} : 0);
        Trace("$word = $stem, $phonex, $occurrences") if T(0);
    }
    Trace("Processing complete.") if T(2);
};
# Copy the error immediately: $@ is a global, and any eval executed inside
# the tracing calls below would overwrite it before it is reported.
my $error = $@;
if ($error) {
    Trace("Script failed with error: $error") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# If a phone number was supplied, report the outcome by text message.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "NMPDR Tag Counter terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3