[Bio] / Sprout / BadIdFinder.pl Repository:
ViewVC logotype

Annotation of /Sprout/BadIdFinder.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use FIG;
23 :     use Stats;
24 :    
25 :    
26 :     =head1 BadIdFinder Script
27 :    
28 :     BadIdFinder [options]
29 :    
30 :     Find errors in corresponding IDs
31 :    
32 :     =head2 Introduction
33 :    
34 :     This script runs through the corresponding_ids IsAlsoFoundIn relationship of the
35 :     %FIG{SproutDatabase}% looking for IDs that appear in ID sets for different
36 :     genomes. If any such IDs are found, they are output as errors.
37 :    
38 :     =head2 Command-Line Options
39 :    
40 :     =over 4
41 :    
42 :     =item output
43 :    
44 :     Name of the output directory. The output directory will contain three files
45 :     produced by this script-- C<contigs.tbl>, C<multiples.tbl>, and
46 :     C<incompatibles.tbl>. Each file is a standard tab-delimited file containing
47 :     identifiers followed by a comma-delimited list of file set numbers. The presence
48 :     of an identifier in a file indicates it has the type of problem indicated
49 :     by the file name: I<incompatible> indicates the identifier maps to FIG IDs
50 :     in multiple genomes, I<contig> indicates the identifier is a contig ID, and
51 :     I<multiple> indicates the identifier maps to multiple FIG IDs in the same
52 :     genome. The default output is to the sapling data directory if it exists
53 :     and the FIG temporary directory otherwise.
54 :    
55 :     =item limit
56 :    
57 :     Stop after the specified number of identifiers has been processed. This is a utility
58 :     parameter for testing purposes.
59 :    
60 :     =item trace
61 :    
62 :     Specifies the tracing level. The higher the tracing level, the more messages
63 :     will appear in the trace log. Use E to specify emergency tracing.
64 :    
65 :     =item user
66 :    
67 :     Name suffix to be used for log files. If omitted, the PID is used.
68 :    
69 :     =item sql
70 :    
71 :     If specified, turns on tracing of SQL activity.
72 :    
73 :     =item background
74 :    
75 :     Save the standard and error output to files. The files will be created
76 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
77 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
78 :     B<user> option above.
79 :    
80 :     =item help
81 :    
82 :     Display this command's parameters and options.
83 :    
84 :     =item warn
85 :    
86 :     Create an event in the RSS feed when an error occurs.
87 :    
88 :     =item phone
89 :    
90 :     Phone number to message when the script is complete.
91 :    
92 :     =back
93 :    
94 :     =cut
95 :    
96 :     # Get the command-line options and parameters.
97 :     my ($options, @parameters) = StandardSetup([qw(FIG) ],
98 :     {
99 :     output => [$FIG_Config::saplingData || $FIG_Config::temp, "output directory name"],
100 :     trace => ["2", "tracing level"],
101 :     phone => ["", "phone number (international format) to call when load finishes"]
102 :     },
103 :     "",
104 :     @ARGV);
105 :     # Set a variable to contain return type information.
106 :     my $rtype;
107 :     # Insure we catch errors.
108 :     eval {
109 :     Trace("Connecting to databases.") if T(2);
110 :     # Create the FIG object.
111 :     my $fig = FIG->new();
112 :     # Create a statistics object.
113 :     my $stats = Stats->new();
114 :     # Get access to the database.
115 :     my $dbh = $fig->db_handle();
116 :     # Do demand-driven flow control. Otherwise it will take forever to start.
117 :     $dbh->set_demand_driven(1);
118 :     # Because we're doing the demand-driven thing, this is a two-pass algorithm.
119 :     # The first pass runs through the whole table in protein order. We'll
120 :     # accumulate the proteins of interest in this output file.
121 :     my $tempFileName = "$FIG_Config::temp/BadIdFinderQ$$.tbl";
122 :     my $oh = Open(undef, ">$tempFileName");
123 :     # We want to find all proteins that are in more than one identifier set. This
124 :     # variable tracks the currently-active ID.
125 :     my $thisID;
126 :     # This is a hash of the sets for the current ID. Each set ID is mapped to a
127 :     # file number.
128 :     my %setIDs;
129 :     # Create the initial query.
130 :     my $sth = $dbh->prepare_command("SELECT protein_id, set_id, file_num FROM id_correspondence ORDER BY protein_id");
131 :     $sth->execute() || Confess("Database error: " . $sth->errstr);
132 :     Trace("Starting sort loop.") if T(2);
133 :     # Loop through the results.
134 :     while (my $rowPtr = $sth->fetch()) {
135 :     # Extract the results.
136 :     my ($protein, $set, $file) = @$rowPtr;
137 :     # Accumulate our progress.
138 :     Trace($stats->Ask('relationships') . " relationship records processed.")
139 :     if $stats->Check(relationships => 10000) && T(3);
140 :     # Is this a new identifier?
141 :     if ($protein ne $thisID) {
142 :     # Yes it is. Count it.
143 :     $stats->Add(identifiers => 1);
144 :     # Check the previous identifier and output it if we need to revisit it.
145 :     my $flag = OutputCheck($oh, $thisID, \%setIDs);
146 :     $stats->Add(possibleDuplicate => $flag);
147 :     # Clear the hash.
148 :     %setIDs = ();
149 :     # Save the identifier.
150 :     $thisID = $protein;
151 :     }
152 :     # Save this identifier's set information.
153 :     $setIDs{$set} = $file;
154 :     }
155 :     # Check the final identifier.
156 :     my $flag = OutputCheck($oh, $thisID, \%setIDs);
157 :     $stats->Add(possibleDuplicate => $flag);
158 :     # Clear the statement handle.
159 :     undef $sth;
160 :     Trace("Sort loop completed.") if T(2);
161 :     # Close the output file and re-open it for input.
162 :     close $oh;
163 :     my $ih = Open(undef, "<$tempFileName");
164 :     # Format the SELECT command for the next loop.
165 :     my $command = "SELECT i.protein_id FROM id_correspondence i " .
166 :     "WHERE i.set_id = ? AND i.type = 1";
167 :     $sth = $dbh->prepare_command($command);
168 :     # Open the output files.
169 :     my %files;
170 :     for my $fileType (qw(incompatibles contigs multiples)) {
171 :     $files{$fileType} = Open(undef, ">$options->{output}/$fileType.tbl");
172 :     }
173 :     # Loop through the identifiers queued for processing.
174 :     Trace("Starting second pass loop.") if T(2);
175 :     while (! eof $ih) {
176 :     # Get this identifier.
177 :     my ($id, $sets, $files) = Tracer::GetLine($ih);
178 :     # Accumulate our progress.
179 :     Trace($stats->Ask('records') . " possibility records processed.")
180 :     if $stats->Check(records => 1000) && T(3);
181 :     # Is this a contig ID?
182 :     if ($id =~ /^[A-Z][A-Z]_\d+$/) {
183 :     Trace("Identifier $id from file $files is a contig ID.") if T(3);
184 :     $stats->Add(contigs => 1);
185 :     Tracer::PutLine($files{contigs}, [$id, $files]);
186 :     } else {
187 :     # We'll track the genomes containing the FIG IDs in here.
188 :     my %genomes;
189 :     # Get the FIG IDs for the sets in the incoming list.
190 :     for my $set (split /,/, $sets) {
191 :     $stats->Add(setQuery => 1);
192 :     # Ask for this set's FIG IDs. There is generally only one. Sometimes there are two.
193 :     $sth->execute($set) || Confess("Error in possibility query: " . $sth->errstr);
194 :     my $rows = $sth->fetchall_arrayref();
195 :     # Did we find anything?
196 :     if (! @$rows) {
197 :     # No, this is a seedless set.
198 :     $stats->Add(seedless => 1);
199 :     } else {
200 :     # Yes! Get the FIG IDs and count their genomes.
201 :     for my $row (@$rows) {
202 :     $stats->Add(figIDs => 1);
203 :     my $fid = $row->[0];
204 :     my $genome = FIG::genome_of($fid);
205 :     $genomes{$genome} = 1;
206 :     }
207 :     }
208 :     }
209 :     # Now, if we have more than one genome in this set, it's a problem.
210 :     if (scalar(keys %genomes) > 1) {
211 :     Trace("Identifier $id is in incompatible sets $sets from files $files.") if T(1);
212 :     $stats->Add(incompatibles => 1);
213 :     Tracer::PutLine($files{incompatibles}, [$id, $files]);
214 :     } else {
215 :     my ($genome) = keys %genomes;
216 :     Trace("Identifier $id is in multiple sets $sets for the genome $genome.") if T(3);
217 :     $stats->Add(multiples => 1);
218 :     Tracer::PutLine($files{multiples}, [$id, $files]);
219 :     }
220 :     }
221 :     }
222 :     # Close the output files.
223 :     for my $fileType (keys %files) {
224 :     close $files{$fileType};
225 :     }
226 :     # Display the statistics.
227 :     Trace("Statistics from run:\n" . $stats->Show()) if T(2);
228 :     };
229 :     if ($@) {
230 :     Trace("Script failed with error: $@") if T(0);
231 :     $rtype = "error";
232 :     } else {
233 :     Trace("Script complete.") if T(2);
234 :     $rtype = "no error";
235 :     }
236 :     if ($options->{phone}) {
237 :     my $msgID = Tracer::SendSMS($options->{phone}, "BadIdFinder terminated with $rtype.");
238 :     if ($msgID) {
239 :     Trace("Phone message sent with ID $msgID.") if T(2);
240 :     } else {
241 :     Trace("Phone message not sent.") if T(2);
242 :     }
243 :     }
244 :    
245 :     =head3 OutputCheck
246 :    
247 :     my $flag = OutputCheck($oh, $thisID, \%setIDs);
248 :    
249 :     Check the specified hash of set IDs. If more than one set ID is present,
250 :     output the specified ID along with its set and file information.
251 :    
252 :     =over 4
253 :    
254 :     =item oh
255 :    
256 :     File handle for the output file.
257 :    
258 :     =item thisID
259 :    
260 :     Identifier of current interest.
261 :    
262 :     =item setIDs
263 :    
264 :     Reference to a hash mapping identifier set IDs to file numbers.
265 :    
266 :     =item RETURN
267 :    
268 :     Returns 1 if the identifier is output, else 0.
269 :    
270 :     =back
271 :    
272 :     =cut
273 :    
274 :     sub OutputCheck {
275 :     # Get the parameters.
276 :     my ($oh, $thisID, $setIDs) = @_;
277 :     # Declare the return variable.
278 :     my $retVal = 0;
279 :     # Get the set IDs.
280 :     my @sets = sort { $a <=> $b } keys %$setIDs;
281 :     # If there is more than one set, we have a possible duplicate.
282 :     if (@sets > 1) {
283 :     # Denote we're writing out a possible.
284 :     $retVal = 1;
285 :     # Compute the list of file numbers.
286 :     my %fileMap = map { $_ => 1 } values %$setIDs;
287 :     # Output the identifier, the list of sets, and the list of file numbers.
288 :     my $line = join("\t", $thisID,
289 :     join(",", @sets),
290 :     join(", ", keys %fileMap));
291 :     print $oh "$line\n";
292 :     }
293 :     # Return the activity indicator.
294 :     return $retVal;
295 :     }
296 :    
297 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3