[Bio] / Sprout / FCupdate.pl Repository:
ViewVC logotype

Annotation of /Sprout/FCupdate.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use Stats;
23 :     use ERDB;
24 :     use Sapling;
25 :     use SeedUtils;
26 :     use FC;
27 :    
28 :     =head1 FCupdate Script
29 :    
30 :     =head2 Introduction
31 :    
32 :     FCupdate [options]
33 :    
34 :     Functional Coupling Update Script
35 :    
36 :     This script examines the 1000 highest-scoring pair sets in the Sapling database
37 :     to identify genomes that should have their coupling data updated. It will then
38 :     look for a C<pchs.raw> file for each identified genome. If one is found, it will
39 :     be used to add new pairs and pair sets to the database. Existing pair sets may
40 :     also get split or deleted. This script should be run periodically to insure that
41 :     the coupling data is up to date.
42 :    
43 :     =head2 Command-Line Options
44 :    
45 :     =over 4
46 :    
47 :     =item incomplete
48 :    
49 :     If specified, then incomplete genomes will be included when looking for new
50 :     genomes to process; otherwise, only complete genomes will be processed.
51 :    
52 :     =item trace
53 :    
54 :     Specifies the tracing level. The higher the tracing level, the more messages
55 :     will appear in the trace log. Use E to specify emergency tracing.
56 :    
57 :     =item genomes
58 :    
59 :     If this parameter is specified, then instead of looking for new genomes, the
60 :     specific genomes listed will be processed. The genomes should be specified as a
61 :     comma-delimited list.
62 :    
63 :     =item sampleSize
64 :    
65 :     Rather than examine all of the pair sets, the highest-scoring sets are scanned
66 :     to look for unrepresented genomes. This parameter specifies the number of pair
67 :     sets to scan; the default is C<1000>.
68 :    
69 :     =item user
70 :    
71 :     Name suffix to be used for log files. If omitted, the PID is used.
72 :    
73 :     =item sql
74 :    
75 :     If specified, turns on tracing of SQL activity.
76 :    
77 :     =item background
78 :    
79 :     Save the standard and error output to files. The files will be created
80 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
81 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
82 :     B<user> option above.
83 :    
84 : parrello 1.2 =item dbName
85 :    
86 :     SQL name of the target database. If not specified, the default name is used.
87 :     This option allows you to specify a backup or alternate database that can
88 :     be updated without compromising the main database.
89 :    
90 :     =item dbhost
91 :    
92 :     Name of the MySQL database host. If not specified, the default host is used.
93 :     This option is required when the default host is restricted to read-only
94 :     database access.
95 :    
96 : parrello 1.1 =item help
97 :    
98 :     Display this command's parameters and options.
99 :    
100 :     =item warn
101 :    
102 :     Create an event in the RSS feed when an error occurs.
103 :    
104 :     =item phone
105 :    
106 :     Phone number to message when the script is complete.
107 :    
108 :     =back
109 :    
110 :     =cut
111 :    
112 :     # Get the command-line options and parameters.
113 :     my ($options, @parameters) = StandardSetup([qw(FC Sapling) ],
114 :     {
115 : parrello 1.2 dbName => ["", "if specified, the SQL name of the target database"],
116 :     dbhost => ["", "if specified, the name of the target database"],
117 : parrello 1.1 incomplete => ["", "if specified, incomplete genomes will be processed"],
118 :     trace => ["2", "tracing level"],
119 :     genomes => ["", "specific genomes to process"],
120 :     sampleSize => ["1000", "number of pair sets to sample when looking for new genomes"],
121 :     phone => ["", "phone number (international format) to call when load finishes"]
122 :     },
123 :     "",
124 :     @ARGV);
125 :     # Set a variable to contain return type information.
126 :     my $rtype;
127 :     # Create the statistics object.
128 :     my $stats = Stats->new();
129 :     my $myStartTime = time();
130 :     # Insure we catch errors.
131 :     eval {
132 :     # Create the SAPLING object.
133 : parrello 1.2 my $sap = ERDB::GetDatabase('Sapling', undef, %$options);
134 : parrello 1.1 # We'll put our list of genomes to process in here.
135 :     my @genomes;
136 :     # Check for genomes specified on the command line.
137 :     if ($options->{genomes}) {
138 :     # If they're there, we extract themdirectly.
139 :     @genomes = split /\s*,\s*/, $options->{genomes};
140 :     } else {
141 :     # No command-line genomes, so we figure out the best ones by sampling
142 :     # pair sets.
143 :     @genomes = ComputeGenomes($sap, $options->{sampleSize},
144 :     $options->{incomplete});
145 :     }
146 :     Trace(scalar(@genomes) . " genomes selected for re-coupling.") if T(3);
147 :     # Loop through the genomes found.
148 :     for my $genome (@genomes) {
149 :     # Look for the PCH.RAW file in the organism directory.
150 :     my $pchFile = "$FIG_Config::organisms/$genome/pchs.raw";
151 :     if (-f $pchFile) {
152 :     # We found one, so run the genome.
153 :     Trace("Processing $genome PCH file $pchFile.") if T(2);
154 :     FC::extend_pairs_and_pair_sets($sap, $pchFile, $stats);
155 :     $stats->Add(genomes => 1);
156 :     } else {
157 :     # None exists, so issue a warning.
158 :     Trace("PCH file $pchFile not found for $genome.") if T(1);
159 :     $stats->Add(missingPCHfile => 1);
160 :     }
161 :     }
162 :     };
163 :     if ($@) {
164 :     Trace("Script failed with error: $@") if T(0);
165 :     $rtype = "error";
166 :     } else {
167 :     Trace("Script complete.") if T(2);
168 :     $rtype = "no error";
169 :     }
170 :     # Display the run statistics.
171 :     $stats->Add(duration => (time() - $myStartTime));
172 :     Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
173 :     if ($options->{phone}) {
174 :     my $msgID = Tracer::SendSMS($options->{phone}, "FCupdate terminated with $rtype.");
175 :     if ($msgID) {
176 :     Trace("Phone message sent with ID $msgID.") if T(2);
177 :     } else {
178 :     Trace("Phone message not sent.") if T(2);
179 :     }
180 :     }
181 :    
182 :     =head3 ComputeGenomes
183 :    
184 :     my @genomes = ComputeGenomes($sap, $sampleSize, $incomplete);
185 :    
186 :     Compute a list of genomes whose functional coupling data should be
187 :     updated. A sampling of pair sets is examined, and the return list will
188 :     contain any genomes not present in any of the pair sets.
189 :    
190 :     =over 4
191 :    
192 :     =item sap
193 :    
194 :     L<Sapling> object to use for accessing the database.
195 :    
196 :     =item sampleSize
197 :    
198 :     Number of pair sets to sample. The specified number of pair sets with the
199 :     highest scores will be examined.
200 :    
201 :     =item incomplete
202 :    
203 :     If TRUE, then both complete and incomplete genomes will be returned; otherwise,
204 :     only complete genomes will be returned.
205 :    
206 :     =item RETURN
207 :    
208 :     Returns a list of the IDs of genomes whose functional coupling data probably
209 :     needs to be updated.
210 :    
211 :     =back
212 :    
213 :     =cut
214 :    
215 :     sub ComputeGenomes {
216 :     # Get the parameters.
217 :     my ($sap, $sampleSize, $incomplete) = @_;
218 :     # Validate the sample size.
219 :     unless ($sampleSize =~ /^\d+$/) {
220 :     Confess("Invalid sampleSize parameter.");
221 :     }
222 :     # We want to get a hash of all the genome IDs in the system. Initially, every
223 :     # genome ID will map to 1. If a genome ID is found in a pair set, we map it
224 :     # to 0. At the end, all genome IDs that still map to 1 are returned.
225 :     # First, we filter out non-prokaryotes.
226 :     my $filter = "Genome(prokaryotic) = ?";
227 :     my @parms = (1);
228 :     # Check to see if incomplete genomes are allowed. If not, we add the complete-flag
229 :     # to the filter.
230 :     if (! $incomplete) {
231 :     $filter .= " AND Genome(complete) = ?";
232 :     push @parms, 1;
233 :     }
234 :     # Create the genome ID hash.
235 :     my %genomes = map { $_ => 1 } $sap->GetFlat("Genome", $filter, \@parms, "id");
236 :     # Ask for the highest-scoring pair sets.
237 :     my @sets = $sap->GetFlat("PairSet", "ORDER BY PairSet(score) DESC LIMIT $sampleSize", [], "id");
238 :     # Loop through the sets.
239 :     for my $set (@sets) {
240 :     Trace("Processing pair set #$set for genome search.") if T(3);
241 :     # This will count the pairings.
242 :     my $count = 0;
243 :     # Get all the pairs in this set.
244 :     my $query = $sap->Get("IsDeterminedBy", "IsDeterminedBy(from-link) = ?",
245 :     [$set]);
246 :     while (my $pairData = $query->Fetch()) {
247 :     # Count this pair.
248 :     $count++;
249 :     # Record the genomes for the pegs in the pair. The pegs can be found
250 :     # separated by a colon in the pairing ID.
251 :     for my $peg (split /:/, $pairData->PrimaryValue('to-link')) {
252 :     $genomes{genome_of($peg)} = 0;
253 :     }
254 :     }
255 :     }
256 :     # Find the unrepresented genomes.
257 :     my @retVal = grep { $genomes{$_} } sort keys %genomes;
258 :     # Return them.
259 :     return @retVal;
260 :     }
261 :    
262 :    
263 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3