[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Annotation of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use Stats;
23 :     use AliasAnalysis;
24 :     use FIGRules;
25 :    
26 :    
27 :     =head1 AliasCrunch Script
28 :    
29 :     =head2 Introduction
30 :    
31 :     AliasCrunch [options]
32 :    
33 :     This script finds all the aliases scattered throughout the SEED and puts them in
34 :     flat files organized by genome ID. The files are currently stored in the Sprout
35 :     data directory.
36 :    
37 :     The aliases come from three sources: (1) the peg.synonyms file, (2) the C<tbl>
38 :     files in the organism directories, and (3) the ID correspondence file. The three
39 :     sources have different methods for sorting the data, and the data in the
40 :     peg.synonyms file is considered secondary to the data in the other files because
41 :     it identifies proteins, not features.
42 :    
43 :     Each of the three sources has its own special rules. The C<tbl> files may
44 :     contain aliases in their natural form as well as the normalized form. The
45 :     ID correspondence file contains only natural forms, but the types are determined
46 :     by the position of the alias in each record. In addition, some of the RefSeq
47 :     IDs in the correspondence file are actually contig IDs and have to be deleted.
48 :     The C<peg.synonyms> file and the ID correspondence may have records which do
49 :     not correspond to any FIG IDs. These are ignored.
50 :    
51 :     The output alias files will be lexically sorted and have the following fields
52 :     on each line.
53 :    
54 :     =over 4
55 :    
56 :     =item 1
57 :    
58 :     %FIG{FIG ID}% of the relevant feature
59 :    
60 :     =item 2
61 :    
62 :     Alias identifier
63 :    
64 :     =item 3
65 :    
66 :     Alias type
67 :    
68 :     =item 4
69 :    
70 :     Confidence grade: C<A> for curated, C<B> for uploaded, C<C> for peg synonym
71 :    
72 :     =back
73 :    
74 :     =head2 Command-Line Options
75 :    
76 :     =over 4
77 :    
78 :     =item trace
79 :    
80 :     Specifies the tracing level. The higher the tracing level, the more messages
81 :     will appear in the trace log. Use E to specify emergency tracing.
82 :    
83 :     =item output
84 :    
85 :     Directory in which the output files should be placed. The output files will all
86 :     have names of the form C<alias.>I<genome>C<.tbl>, where I<genome> is the genome
87 :     ID, and will be sorted by %FIG{FIG ID}%.
88 :    
89 :     =item clear
90 :    
91 :     Clears any existing alias files from the directory before processing.
92 :    
93 :     =item user
94 :    
95 :     Name suffix to be used for log files. If omitted, the PID is used.
96 :    
97 :     =item sql
98 :    
99 :     If specified, turns on tracing of SQL activity.
100 :    
101 :     =item keepTemp
102 :    
103 :     If specified, the intermediate temporary files will not be deleted when the
104 :     script is finished using them.
105 :    
106 :     =item background
107 :    
108 :     Save the standard and error output to files. The files will be created
109 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
110 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
111 :     B<user> option above.
112 :    
113 :     =item help
114 :    
115 :     Display this command's parameters and options.
116 :    
117 :     =item warn
118 :    
119 :     Create an event in the RSS feed when an error occurs.
120 :    
121 :     =item phone
122 :    
123 :     Phone number to message when the script is complete.
124 :    
125 :     =back
126 :    
127 :     =cut
128 :    
129 :     # Get the command-line options and parameters.
130 :     my ($options, @parameters) = StandardSetup([qw() ],
131 :     {
132 :     trace => ["3", "tracing level"],
133 :     output => [$FIG_Config::sproutData, "output directory for alias files"],
134 :     clear => ["", "if specified, existing alias files will be erased"],
135 :     keepTemp => ["", "if specified, the intermediate temporary files will not be deleted"],
136 :     phone => ["", "phone number (international format) to call when load finishes"]
137 :     },
138 :     "",
139 :     @ARGV);
140 :     # Set a variable to contain return type information.
141 :     my $rtype;
142 :     # Create a statistics object.
143 :     my $stats = Stats->new();
144 :     # Compute the merge file name.
145 :     my $mfileName = "$FIG_Config::temp/mergeAC$$.tmp.tbl";
146 :     # This will contain a list of the temporary files to delete at end of run.
147 :     my @tempFiles = $mfileName;
148 :     # Insure we catch errors.
149 :     eval {
150 :     # Compute the output directory.
151 :     my $output = $options->{output};
152 :     if (! $output) {
153 :     Confess("No output directory specified.");
154 :     } elsif (! -d $output) {
155 :     Confess("Invalid output directory \"$output\".");
156 :     } else {
157 :     # Here we have a valid output directory. Do we need to clear it?
158 :     if ($options->{clear}) {
159 :     # Yes. Find and delete any existing files.
160 :     Trace("Clearing old alias files from $output directory.") if T(3);
161 :     for my $file (grep { $_ =~ /^alias\.[0-9.]+\.tbl$/ } OpenDir($output)) {
162 :     unlink "$output/$file";
163 :     $stats->Add(filesCleared => 1);
164 :     }
165 :     }
166 :     # Open the merge file. Each record of the merge file will contain (1) a
167 :     # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
168 :     # and (4) a feature ID. The merge file is then read back so that we can
169 :     # determine the list of features associated with each alias. Only the FIG IDs
170 :     # with the best confidence (A over B, B over C) for an alias will be kept.
171 :     Trace("Creating merge file $mfileName.") if T(2);
172 :     my $mergeH = Open(undef, "| sort -u >$mfileName");
173 :     # Now read in the three sources of data.
174 :     ReadCorrespondingIDs($mergeH, $stats);
175 :     ReadOrganismIDs($mergeH, $stats);
176 :     ReadSynonyms($mergeH, $stats);
177 :     # Close the merge file and reopen it for input.
178 :     close $mergeH;
179 :     Trace("Processing merge file results.") if T(2);
180 :     my $ih = Open(undef, "<$mfileName");
181 :     # This file will be used to sort the aliases by feature ID.
182 :     my $sortFileName = "$FIG_Config::temp/sortAC$$.tmp.tbl";
183 :     push @tempFiles, $sortFileName;
184 :     my $oh = Open(undef, "| sort >$sortFileName");
185 :     # Now we set up the data we'll be accumulating for each alias.
186 :     # "$prevAlias" is the current alias, "$prevConf" is its confidence.
187 :     # We emit a row if we encounter a new alias or an old alias at the
188 :     # same confidence level.
189 :     my ($prevAlias, $prevConf);
190 :     # Loop through the merge file.
191 :     while (! eof $ih) {
192 :     # Get this row of data.
193 :     my ($alias, $conf, $type, $fid) = Tracer::GetLine($ih);
194 :     Trace($stats->Ask('mergeFileRecords') . " merge file records read.") if $stats->Check(mergeFileRecords => 5000) && T(3);
195 :     # Should we emit this alias?
196 :     if ($alias ne $prevAlias) {
197 :     # Yes. This is a new alias.
198 :     $prevAlias = $alias;
199 :     $prevConf = $conf;
200 :     # Emit this alias.
201 :     WriteAlias($oh, $alias, $conf, $type, $fid);
202 :     } elsif ($conf eq $prevConf) {
203 :     # Yes. This is an old alias at the same confidence level.
204 :     WriteAlias($oh, $alias, $conf, $type, $fid);
205 :     }
206 :     }
207 :     # Close the sort file.
208 :     Trace("Closing sort file.") if T(2);
209 :     close $oh;
210 :     # This will contain the current genome ID.
211 :     my $currGenome;
212 :     # This will contain the current output file handle.
213 :     my $gh;
214 :     # Now read the sort file and split it up by genome ID.
215 :     $ih = Open(undef, "<$sortFileName");
216 :     while (! eof $ih) {
217 :     # Get the next record.
218 :     my ($fid, $alias, $type, $conf) = Tracer::GetLine($ih);
219 :     # Compute the genome ID.
220 :     my ($genomeID) = FIGRules::ParseFeatureID($fid);
221 :     # Is it new?
222 :     if ($genomeID ne $currGenome) {
223 :     # Yes. Close the old file and start a new one.
224 :     if (defined $gh) {
225 :     close $gh;
226 :     }
227 :     $gh = Open(undef, ">$options->{output}/alias.$genomeID.tbl");
228 :     Trace("Genome file for $genomeID created.") if T(3);
229 :     $currGenome = $genomeID;
230 :     }
231 :     # Write this record to the current output file.
232 :     Tracer::PutLine($gh, [$fid, $alias, $type, $conf]);
233 :     }
234 :     # Close the current genome output file.
235 :     close $gh;
236 :     }
237 :     };
238 :     if ($@) {
239 :     Trace("Script failed with error: $@") if T(0);
240 :     $rtype = "error";
241 :     } else {
242 :     Trace("Script complete.") if T(2);
243 :     $rtype = "no error";
244 :     }
245 :     # Delete the temporary files (if any).
246 :     for my $fileName (@tempFiles) {
247 :     if (-f $fileName) {
248 :     if ($options->{keepTemp}) {
249 :     Trace("Temporary file $fileName was not deleted.") if T(3);
250 :     } else {
251 :     Trace("Deleting temporary file $fileName.") if T(3);
252 :     unlink $fileName;
253 :     }
254 :     }
255 :     }
256 :     # Display the statistics.
257 :     Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
258 :     if ($options->{phone}) {
259 :     my $msgID = Tracer::SendSMS($options->{phone}, "AliasCrunch terminated with $rtype.");
260 :     if ($msgID) {
261 :     Trace("Phone message sent with ID $msgID.") if T(2);
262 :     } else {
263 :     Trace("Phone message not sent.") if T(2);
264 :     }
265 :     }
266 :    
267 :     =head2 Utility Methods
268 :    
269 :     =head3 ReadOrganismIDs
270 :    
271 :     ReadOrganismIDs($mergeH, $stats);
272 :    
273 :     Read all the data from the organism directories and output it to the
274 :     merge file. Organism directory aliases have medium confidence level
275 :     (C<B>).
276 :    
277 :     =over 4
278 :    
279 :     =item mergeH
280 :    
281 :     Open output handle for the merge file. Each record of the merge file should
282 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
283 :     alias type, and (4) a feature ID.
284 :    
285 :     =item stats
286 :    
287 :     Statistics object for tracking this operation.
288 :    
289 :     =back
290 :    
291 :     =cut
292 :    
293 :     sub ReadOrganismIDs {
294 :     # Get the parameters.
295 :     my ($mergeH, $stats) = @_;
296 :     # Loop through the organism directories.
297 :     for my $orgDir (grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
298 :     Trace("Processing $orgDir.") if T(3);
299 :     $stats->Add(orgDirGenomes => 1);
300 :     # We need to process all of this organism's TBL files.
301 :     my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";
302 :     for my $ftype (OpenDir($orgDirDir, 1)) {
303 :     my $tblFileName = "$orgDirDir/$ftype/tbl";
304 :     if (-s $tblFileName) {
305 :     Trace("Data found in $tblFileName.") if T(3);
306 :     # Read this TBL file.
307 :     $stats->Add(orgDirFiles => 1);
308 :     my $ih = Open(undef, "<$tblFileName");
309 :     while (! eof $ih) {
310 :     # Get the feature ID and its aliases.
311 :     my ($fid, undef, undef, undef, @aliases) = Tracer::GetLine($ih);
312 :     $stats->Add(orgDirFeatures => 1);
313 :     # Loop through the aliases.
314 :     for my $alias (@aliases) {
315 :     my $normalized;
316 :     # Determine the type.
317 :     my $aliasType = AliasAnalysis::TypeOf($alias);
318 :     $stats->Add(orgDirAll => 1);
319 :     # Is this a recognized type?
320 :     if ($aliasType) {
321 :     $stats->Add(orgDirNormal => 1);
322 :     # Yes. Write it normally.
323 :     WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
324 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
325 :     # No, but this is a natural locus tag.
326 :     $stats->Add(orgDirLocus => 1);
327 :     WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
328 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
329 :     # No, but this is a natural gene name.
330 :     $stats->Add(orgDirGene => 1);
331 :     WriteToMerge($mergeH, $normalized, B => 'GENE', $fid);
332 :     } elsif ($alias =~ /^\d+$/) {
333 :     # Here it's a naked number, which means it's a GI number
334 :     # of some sort. We only take these from the corresponding ID
335 :     # table.
336 :     $stats->Add(orgDirSkip => 1);
337 :     } elsif ($alias =~ /[:|]/) {
338 :     # Here it's an alias of an unknown type.
339 :     $stats->Add(orgDirUnknown => 1);
340 :     } else {
341 :     # Here it's a miscellaneous type.
342 :     WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);
343 :     }
344 :     }
345 :     }
346 :     }
347 :     }
348 :     }
349 :     Trace("Organism directories complete.") if T(2);
350 :     }
351 :    
352 :     =head3 ReadSynonyms
353 :    
354 :     ReadSynonyms($mergeH, $stats);
355 :    
356 :     Read all the data from the C<peg.synonyms> file and output it to the
357 :     merge file. Data from the PEG synonyms file has the lowest confidence
358 :     level (C<C>), because all IDs with the same protein sequence are
359 :     conflated.
360 :    
361 :     =over 4
362 :    
363 :     =item mergeH
364 :    
365 :     Open output handle for the merge file. Each record of the merge file should
366 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
367 :     alias type, and (4) a feature ID.
368 :    
369 :     =item stats
370 :    
371 :     Statistics object for tracking this operation.
372 :    
373 :     =back
374 :    
375 :     =cut
376 :    
377 :     sub ReadSynonyms {
378 :     # Get the parameters.
379 :     my ($mergeH, $stats) = @_;
380 :     # Open the peg.synonyms file.
381 :     my $synFileName = "$FIG_Config::global/peg.synonyms";
382 :     my $ih = Open(undef, "<$synFileName");
383 :     Trace("Processing $synFileName.") if T(2);
384 :     # Loop through the file.
385 :     while (! eof $ih) {
386 :     # Get this record.
387 :     my ($prot_id, $synonyms) = Tracer::GetLine($ih);
388 :     Trace($stats->Ask('proteins') . " protein synonym records read.") if $stats->Check(proteins => 5000) && T(3);
389 :     # Parse out the synonyms.
390 :     my @synonyms = split /;/, $synonyms;
391 :     # We'll save any FIG IDs in here.
392 :     my %figIDs;
393 :     # Other IDs go in here.
394 :     my @aliasTuples;
395 :     # Loop through the synonyms.
396 :     for my $synonym (@synonyms) {
397 :     # Strip off the length.
398 :     my ($alias) = split /,/, $synonym, 2;
399 :     # Convert NMPDR IDs to FIG IDs.
400 :     $alias =~ s/^nmpdr/fig/;
401 :     # Process according to the type.
402 :     if ($alias =~ /^fig/) {
403 :     $stats->Add(proteinFIG => 1);
404 :     $figIDs{$alias} = 1;
405 :     } else {
406 :     # Here we have an external ID. If it's of a recognized type, we'll
407 :     # keep it.
408 :     my $type = AliasAnalysis::TypeOf($alias);
409 :     if (! defined $type) {
410 :     # Not a recognized type, so ignore it.
411 :     $stats->Add(proteinSkip => 1);
412 :     } else {
413 :     # A recognized type, so keep it.
414 :     push @aliasTuples, [$alias, $type];
415 :     $stats->Add(proteinNormal => 1);
416 :     }
417 :     }
418 :     }
419 :     # Now we have all the IDs in place. If there are any FIG IDs in the
420 :     # bunch, write them to the merge file with all their aliases.
421 :     for my $fid (keys %figIDs) {
422 :     for my $aliasTuple (@aliasTuples) {
423 :     my ($alias, $type) = @$aliasTuple;
424 :     $stats->Add(proteinOut => 1);
425 :     WriteToMerge($mergeH, $alias, C => $type, $fid);
426 :     }
427 :     }
428 :     }
429 :     Trace("Protein synonyms complete.") if T(2);
430 :     }
431 :    
432 :     =head3 ReadCorrespondingIDs
433 :    
434 :     ReadCorrespondingIDs($mergeH, $stats);
435 :    
436 :     Read all the data from the corresponding ID file and output it to the
437 :     merge file. Corresponding IDs have the highest confidence level (C<A>).
438 :    
439 :     =over 4
440 :    
441 :     =item mergeH
442 :    
443 :     Open output handle for the merge file. Each record of the merge file should
444 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
445 :     alias type, and (4) a feature ID.
446 :    
447 :     =item stats
448 :    
449 :     Statistics object for tracking this operation.
450 :    
451 :     =back
452 :    
453 :     =cut
454 :    
455 :     sub ReadCorrespondingIDs {
456 :     # Get the parameters.
457 :     my ($mergeH, $stats) = @_;
458 :     # Open the corresponding-ID file.
459 :     my $ih = Open(undef, "<$FIG_Config::global/id_correspondence");
460 :     Trace("Processing corresponding IDs.") if T(2);
461 :     # Read the header record.
462 :     my ($type0, @types) = Tracer::GetLine($ih);
463 :     # Insure SEED is the first column.
464 :     Confess("Incorrect file format. SEED is not first.") if ($type0 ne 'SEED');
465 :     # Skip the flag record.
466 :     Tracer::GetLine($ih);
467 :     # Loop through the file.
468 :     while (! eof $ih) {
469 :     # Get this FID and its synonym lists.
470 :     my ($fidList, @others) = Tracer::GetLine($ih);
471 :     Trace($stats->Ask('correspondingIDs') . " corresponding ID records read.") if $stats->Check(correspondingIDs => 5000) && T(3);
472 :     if (! $fidList) {
473 :     # Skip this record if there are no FIG IDs in it.
474 :     $stats->Add(correspondingSkip => 1);
475 :     } else {
476 :     # Get the list of FIG IDs.
477 :     my @fids = split /\s*;\s*/, $fidList;
478 :     # Loop through the other aliases.
479 :     for (my $i = 0; $i <= $#others; $i++) {
480 :     # Get this alias type.
481 :     my $type = $types[$i];
482 :     # Loop through the alias list.
483 :     for my $alias (split /\s*;\s*/, $others[$i]) {
484 :     # Ignore this alias if it's a RefSeq contig.
485 :     if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
486 :     $stats->Add(correspondingContig => 1);
487 :     } else {
488 :     # We're okay, so normalize the alias.
489 :     my $normalized = AliasAnalysis::Normalize($type => $alias);
490 :     $stats->Add(correspondingNormal => 1);
491 :     # Write it out once for each FIG ID.
492 :     for my $fid (@fids) {
493 :     WriteToMerge($mergeH, $normalized, A => $type, $fid);
494 :     }
495 :     }
496 :     }
497 :     }
498 :     }
499 :     }
500 :     # Close the input file and the sort file.
501 :     close $ih;
502 :     Trace("Corresponding IDs complete.") if T(2);
503 :     }
504 :    
505 :     =head3 WriteAlias
506 :    
507 :     WriteAlias($oh, $alias, $conf, $type, $fid);
508 :    
509 :     Write an alias record to the sort file. The alias record
510 :     contains the alias ID, the alias type, its confidence level, and the
511 :     corresponding feature ID.
512 :    
513 :     =over 4
514 :    
515 :     =item oh
516 :    
517 :     Open handle for the output file.
518 :    
519 :     =item alias
520 :    
521 :     Alias identifier.
522 :    
523 :     =item conf
524 :    
525 :     Confidence grade for this alias: C<A> is best, C<F> is worst.
526 :    
527 :     =item type
528 :    
529 :     Type of alias (e.g. NCBI, CMR, RefSeq).
530 :    
531 :     =item fid
532 :    
533 :     %FIG{FIG ID} corresponding to the alias.
534 :    
535 :     =back
536 :    
537 :     =cut
538 :    
539 :     sub WriteAlias {
540 :     # Get the parameters.
541 :     my ($oh, $alias, $conf, $type, $fid) = @_;
542 :     # Compute the genome ID.
543 :     my $genomeID = FIGRules::ParseFeatureID($fid);
544 :     # Write this alias to the output file.
545 :     Tracer::PutLine($oh, [$fid, $alias, $type, $conf]);
546 :     $stats->Add(aliasOut => 1);
547 :     $stats->Add("aliasOut$type" => 1);
548 :     }
549 :    
550 :     =head3 WriteToMerge
551 :    
552 :     WriteToMerge($mergeH, $alias, $grade => $aliasType, $fid);
553 :    
554 :     Write an alias connection to the output merge file.
555 :    
556 :     =over 4
557 :    
558 :     =item mergeH
559 :    
560 :     Open output handle for the merge file.
561 :    
562 :     =item alias
563 :    
564 :     Alias identifier.
565 :    
566 :     =item conf
567 :    
568 :     Confidence grade for this alias: C<A> is best, C<F> is worst.
569 :    
570 :     =item type
571 :    
572 :     Type of alias (e.g. NCBI, CMR, RefSeq).
573 :    
574 :     =item fid
575 :    
576 :     %FIG{FIG ID} corresponding to the alias.
577 :    
578 :     =back
579 :    
580 :     =cut
581 :    
582 :     sub WriteToMerge {
583 :     # Get the parameters.
584 :     my ($mergeH, $alias, $grade, $aliasType, $fid) = @_;
585 :     # Write the merge file record.
586 :     Tracer::PutLine($mergeH, [$alias, $grade, $aliasType, $fid]);
587 :     $stats->Add(mergeOut => 1);
588 :     }
589 :    
590 :    
591 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3