[Bio] / Sprout / FCupdate.pl Repository:
ViewVC logotype

Annotation of /Sprout/FCupdate.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use Stats;
23 :     use ERDB;
24 :     use Sapling;
25 :     use SeedUtils;
26 :     use FC;
27 :    
28 :     =head1 FCupdate Script
29 :    
30 :     =head2 Introduction
31 :    
32 :     FCupdate [options]
33 :    
34 :     Functional Coupling Update Script
35 :    
36 :     This script examines the 1000 highest-scoring pair sets in the Sapling database
37 :     to identify genomes that should have their coupling data updated. It will then
38 :     look for a C<pchs.raw> file for each identified genome. If one is found, it will
39 :     be used to add new pairs and pair sets to the database. Existing pair sets may
40 :     also get split or deleted. This script should be run periodically to insure that
41 :     the coupling data is up to date.
42 :    
43 :     =head2 Command-Line Options
44 :    
45 :     =over 4
46 :    
47 :     =item incomplete
48 :    
49 :     If specified, then incomplete genomes will be included when looking for new
50 :     genomes to process; otherwise, only complete genomes will be processed.
51 :    
52 :     =item trace
53 :    
54 :     Specifies the tracing level. The higher the tracing level, the more messages
55 :     will appear in the trace log. Use E to specify emergency tracing.
56 :    
57 :     =item genomes
58 :    
59 :     If this parameter is specified, then instead of looking for new genomes, the
60 :     specific genomes listed will be processed. The genomes should be specified as a
61 :     comma-delimited list.
62 :    
63 :     =item sampleSize
64 :    
65 :     Rather than examine all of the pair sets, the highest-scoring sets are scanned
66 :     to look for unrepresented genomes. This parameter specifies the number of pair
67 :     sets to scan; the default is C<1000>.
68 :    
69 :     =item user
70 :    
71 :     Name suffix to be used for log files. If omitted, the PID is used.
72 :    
73 :     =item sql
74 :    
75 :     If specified, turns on tracing of SQL activity.
76 :    
77 :     =item background
78 :    
79 :     Save the standard and error output to files. The files will be created
80 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
81 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
82 :     B<user> option above.
83 :    
84 :     =item help
85 :    
86 :     Display this command's parameters and options.
87 :    
88 :     =item warn
89 :    
90 :     Create an event in the RSS feed when an error occurs.
91 :    
92 :     =item phone
93 :    
94 :     Phone number to message when the script is complete.
95 :    
96 :     =back
97 :    
98 :     =cut
99 :    
100 :     # Get the command-line options and parameters.
101 :     my ($options, @parameters) = StandardSetup([qw(FC Sapling) ],
102 :     {
103 :     incomplete => ["", "if specified, incomplete genomes will be processed"],
104 :     trace => ["2", "tracing level"],
105 :     genomes => ["", "specific genomes to process"],
106 :     sampleSize => ["1000", "number of pair sets to sample when looking for new genomes"],
107 :     phone => ["", "phone number (international format) to call when load finishes"]
108 :     },
109 :     "",
110 :     @ARGV);
111 :     # Set a variable to contain return type information.
112 :     my $rtype;
113 :     # Create the statistics object.
114 :     my $stats = Stats->new();
115 :     my $myStartTime = time();
116 :     # Insure we catch errors.
117 :     eval {
118 :     # Create the SAPLING object.
119 :     my $sap = ERDB::GetDatabase('Sapling');
120 :     # We'll put our list of genomes to process in here.
121 :     my @genomes;
122 :     # Check for genomes specified on the command line.
123 :     if ($options->{genomes}) {
124 :     # If they're there, we extract themdirectly.
125 :     @genomes = split /\s*,\s*/, $options->{genomes};
126 :     } else {
127 :     # No command-line genomes, so we figure out the best ones by sampling
128 :     # pair sets.
129 :     @genomes = ComputeGenomes($sap, $options->{sampleSize},
130 :     $options->{incomplete});
131 :     }
132 :     Trace(scalar(@genomes) . " genomes selected for re-coupling.") if T(3);
133 :     # Loop through the genomes found.
134 :     for my $genome (@genomes) {
135 :     # Look for the PCH.RAW file in the organism directory.
136 :     my $pchFile = "$FIG_Config::organisms/$genome/pchs.raw";
137 :     if (-f $pchFile) {
138 :     # We found one, so run the genome.
139 :     Trace("Processing $genome PCH file $pchFile.") if T(2);
140 :     FC::extend_pairs_and_pair_sets($sap, $pchFile, $stats);
141 :     $stats->Add(genomes => 1);
142 :     } else {
143 :     # None exists, so issue a warning.
144 :     Trace("PCH file $pchFile not found for $genome.") if T(1);
145 :     $stats->Add(missingPCHfile => 1);
146 :     }
147 :     }
148 :     };
149 :     if ($@) {
150 :     Trace("Script failed with error: $@") if T(0);
151 :     $rtype = "error";
152 :     } else {
153 :     Trace("Script complete.") if T(2);
154 :     $rtype = "no error";
155 :     }
156 :     # Display the run statistics.
157 :     $stats->Add(duration => (time() - $myStartTime));
158 :     Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
159 :     if ($options->{phone}) {
160 :     my $msgID = Tracer::SendSMS($options->{phone}, "FCupdate terminated with $rtype.");
161 :     if ($msgID) {
162 :     Trace("Phone message sent with ID $msgID.") if T(2);
163 :     } else {
164 :     Trace("Phone message not sent.") if T(2);
165 :     }
166 :     }
167 :    
168 :     =head3 ComputeGenomes
169 :    
170 :     my @genomes = ComputeGenomes($sap, $sampleSize, $incomplete);
171 :    
172 :     Compute a list of genomes whose functional coupling data should be
173 :     updated. A sampling of pair sets is examined, and the return list will
174 :     contain any genomes not present in any of the pair sets.
175 :    
176 :     =over 4
177 :    
178 :     =item sap
179 :    
180 :     L<Sapling> object to use for accessing the database.
181 :    
182 :     =item sampleSize
183 :    
184 :     Number of pair sets to sample. The specified number of pair sets with the
185 :     highest scores will be examined.
186 :    
187 :     =item incomplete
188 :    
189 :     If TRUE, then both complete and incomplete genomes will be returned; otherwise,
190 :     only complete genomes will be returned.
191 :    
192 :     =item RETURN
193 :    
194 :     Returns a list of the IDs of genomes whose functional coupling data probably
195 :     needs to be updated.
196 :    
197 :     =back
198 :    
199 :     =cut
200 :    
201 :     sub ComputeGenomes {
202 :     # Get the parameters.
203 :     my ($sap, $sampleSize, $incomplete) = @_;
204 :     # Validate the sample size.
205 :     unless ($sampleSize =~ /^\d+$/) {
206 :     Confess("Invalid sampleSize parameter.");
207 :     }
208 :     # We want to get a hash of all the genome IDs in the system. Initially, every
209 :     # genome ID will map to 1. If a genome ID is found in a pair set, we map it
210 :     # to 0. At the end, all genome IDs that still map to 1 are returned.
211 :     # First, we filter out non-prokaryotes.
212 :     my $filter = "Genome(prokaryotic) = ?";
213 :     my @parms = (1);
214 :     # Check to see if incomplete genomes are allowed. If not, we add the complete-flag
215 :     # to the filter.
216 :     if (! $incomplete) {
217 :     $filter .= " AND Genome(complete) = ?";
218 :     push @parms, 1;
219 :     }
220 :     # Create the genome ID hash.
221 :     my %genomes = map { $_ => 1 } $sap->GetFlat("Genome", $filter, \@parms, "id");
222 :     # Ask for the highest-scoring pair sets.
223 :     my @sets = $sap->GetFlat("PairSet", "ORDER BY PairSet(score) DESC LIMIT $sampleSize", [], "id");
224 :     # Loop through the sets.
225 :     for my $set (@sets) {
226 :     Trace("Processing pair set #$set for genome search.") if T(3);
227 :     # This will count the pairings.
228 :     my $count = 0;
229 :     # Get all the pairs in this set.
230 :     my $query = $sap->Get("IsDeterminedBy", "IsDeterminedBy(from-link) = ?",
231 :     [$set]);
232 :     while (my $pairData = $query->Fetch()) {
233 :     # Count this pair.
234 :     $count++;
235 :     # Record the genomes for the pegs in the pair. The pegs can be found
236 :     # separated by a colon in the pairing ID.
237 :     for my $peg (split /:/, $pairData->PrimaryValue('to-link')) {
238 :     $genomes{genome_of($peg)} = 0;
239 :     }
240 :     }
241 :     }
242 :     # Find the unrepresented genomes.
243 :     my @retVal = grep { $genomes{$_} } sort keys %genomes;
244 :     # Return them.
245 :     return @retVal;
246 :     }
247 :    
248 :    
249 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3