[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Annotation of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use Stats;
23 :     use AliasAnalysis;
24 :     use FIGRules;
25 :    
26 :    
27 :     =head1 AliasCrunch Script
28 :    
29 :     =head2 Introduction
30 :    
31 :     AliasCrunch [options]
32 :    
33 :     This script finds all the aliases scattered throughout the SEED and puts them in
34 :     flat files organized by genome ID. The files are currently stored in the Sprout
35 :     data directory.
36 :    
37 :     The aliases come from three sources: (1) the peg.synonyms file, (2) the C<tbl>
38 :     files in the organism directories, and (3) the ID correspondence file. The three
39 :     sources have different methods for sorting the data, and the data in the
40 :     peg.synonyms file is considered secondary to the data in the other files because
41 :     it identifies proteins, not features.
42 :    
43 :     Each of the three sources has its own special rules. The C<tbl> files may
44 :     contain aliases in their natural form as well as the normalized form. The
45 :     ID correspondence file contains only natural forms, but the types are determined
46 :     by the position of the alias in each record. In addition, some of the RefSeq
47 :     IDs in the correspondence file are actually contig IDs and have to be deleted.
48 :     The C<peg.synonyms> file and the ID correspondence may have records which do
49 :     not correspond to any FIG IDs. These are ignored.
50 :    
51 :     The output alias files will be lexically sorted and have the following fields
52 :     on each line.
53 :    
54 :     =over 4
55 :    
56 :     =item 1
57 :    
58 :     %FIG{FIG ID}% of the relevant feature
59 :    
60 :     =item 2
61 :    
62 :     Alias identifier
63 :    
64 :     =item 3
65 :    
66 :     Alias type
67 :    
68 :     =item 4
69 :    
70 :     Confidence grade: C<A> for curated, C<B> for uploaded, C<C> for peg synonym
71 :    
72 :     =back
73 :    
74 :     =head2 Command-Line Options
75 :    
76 :     =over 4
77 :    
78 :     =item trace
79 :    
80 :     Specifies the tracing level. The higher the tracing level, the more messages
81 :     will appear in the trace log. Use E to specify emergency tracing.
82 :    
83 :     =item output
84 :    
85 :     Directory in which the output files should be placed. The output files will all
86 :     have names of the form C<alias.>I<genome>C<.tbl>, where I<genome> is the genome
87 :     ID, and will be sorted by %FIG{FIG ID}%.
88 :    
89 :     =item clear
90 :    
91 :     Clears any existing alias files from the directory before processing.
92 :    
93 :     =item user
94 :    
95 :     Name suffix to be used for log files. If omitted, the PID is used.
96 :    
97 :     =item sql
98 :    
99 :     If specified, turns on tracing of SQL activity.
100 :    
101 :     =item keepTemp
102 :    
103 :     If specified, the intermediate temporary files will not be deleted when the
104 :     script is finished using them.
105 :    
106 :     =item background
107 :    
108 :     Save the standard and error output to files. The files will be created
109 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
110 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
111 :     B<user> option above.
112 :    
113 :     =item help
114 :    
115 :     Display this command's parameters and options.
116 :    
117 :     =item warn
118 :    
119 :     Create an event in the RSS feed when an error occurs.
120 :    
121 :     =item phone
122 :    
123 :     Phone number to message when the script is complete.
124 :    
125 :     =back
126 :    
127 :     =cut
128 :    
# Main script body.
#
# Flow: (1) validate the output directory and optionally clear old alias
# files, (2) build the corresponding-ID file, (3) pour all three alias
# sources through "sort -u" into a merge file, (4) keep only the
# best-confidence rows for each alias and re-sort them by feature ID,
# (5) split the sorted rows into one "alias.<genome>.tbl" file per genome.
#
# NOTE: $stats is deliberately a file-scoped lexical; WriteAlias and
# WriteToMerge below read it directly instead of taking it as a parameter.

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw() ],
                                           {
                                              trace => ["3", "tracing level"],
                                              output => [$FIG_Config::sproutData, "output directory for alias files"],
                                              clear => ["", "if specified, existing alias files will be erased"],
                                              keepTemp => ["", "if specified, the intermediate temporary files will not be deleted"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Create a statistics object.
my $stats = Stats->new();
# Compute the merge file name.
my $mfileName = "$FIG_Config::temp/mergeAC$$.tmp.tbl";
# This will contain a list of the temporary files to delete at end of run.
my @tempFiles = ($mfileName);
# Insure we catch errors.
eval {
    # Compute the output directory.
    my $output = $options->{output};
    if (! $output) {
        Confess("No output directory specified.");
    } elsif (! -d $output) {
        Confess("Invalid output directory \"$output\".");
    } else {
        # Here we have a valid output directory. Do we need to clear it?
        if ($options->{clear}) {
            # Yes. Find and delete any existing files.
            Trace("Clearing old alias files from $output directory.") if T(3);
            for my $file (grep { $_ =~ /^alias\.[0-9.]+\.tbl$/ } OpenDir($output)) {
                # Only count files that were really removed.
                if (unlink "$output/$file") {
                    $stats->Add(filesCleared => 1);
                } else {
                    Trace("Could not delete $output/$file: $!") if T(1);
                }
            }
        }
        # Create the corresponding-ID file.
        CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
        # Open the merge file. Each record of the merge file will contain (1) a
        # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
        # and (4) a feature ID. The merge file is then read back so that we can
        # determine the list of features associated with each alias. Only the FIG IDs
        # with the best confidence (A over B, B over C) for an alias will be kept.
        Trace("Creating merge file $mfileName.") if T(2);
        my $mergeH = Open(undef, "| sort -u >$mfileName");
        # Now read in the three sources of data.
        ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
        ReadOrganismIDs($mergeH, $stats);
        ReadSynonyms($mergeH, $stats);
        # Close the merge file and reopen it for input. Closing a pipe waits for
        # the sort to finish and reports its exit status, so a failed sort is
        # caught here instead of silently producing a truncated merge file.
        close $mergeH
            or Confess("Error closing merge file sort pipe: " . ($! || "sort exit status $?"));
        Trace("Processing merge file results.") if T(2);
        my $ih = Open(undef, "<$mfileName");
        # This file will be used to sort the aliases by feature ID.
        my $sortFileName = "$FIG_Config::temp/sortAC$$.tmp.tbl";
        push @tempFiles, $sortFileName;
        my $oh = Open(undef, "| sort >$sortFileName");
        # Now we set up the data we'll be accumulating for each alias.
        # "$prevAlias" is the current alias, "$prevConf" is its confidence.
        # We emit a row if we encounter a new alias or an old alias at the
        # same confidence level. Because the merge file is sorted, the first
        # row for an alias always carries its best confidence grade.
        # Both start as empty strings (not undef) so the first comparison
        # does not raise "uninitialized value" warnings under -w.
        my ($prevAlias, $prevConf) = ('', '');
        # Loop through the merge file.
        while (! eof $ih) {
            # Get this row of data.
            my ($alias, $conf, $type, $fid) = Tracer::GetLine($ih);
            Trace($stats->Ask('mergeFileRecords') . " merge file records read.") if $stats->Check(mergeFileRecords => 5000) && T(3);
            # Should we emit this alias?
            if ($alias ne $prevAlias) {
                # Yes. This is a new alias.
                $prevAlias = $alias;
                $prevConf = $conf;
                # Emit this alias.
                WriteAlias($oh, $alias, $conf, $type, $fid);
            } elsif ($conf eq $prevConf) {
                # Yes. This is an old alias at the same confidence level.
                WriteAlias($oh, $alias, $conf, $type, $fid);
            }
        }
        close $ih;
        # Close the sort file, again checking that the sort itself succeeded.
        Trace("Closing sort file.") if T(2);
        close $oh
            or Confess("Error closing feature sort pipe: " . ($! || "sort exit status $?"));
        # This will contain the current genome ID. It starts as an empty
        # string so the first comparison is warning-free.
        my $currGenome = '';
        # This will contain the current output file handle.
        my $gh;
        # Now read the sort file and split it up by genome ID.
        $ih = Open(undef, "<$sortFileName");
        while (! eof $ih) {
            # Get the next record.
            my ($fid, $alias, $type, $conf) = Tracer::GetLine($ih);
            # Compute the genome ID.
            my ($genomeID) = FIGRules::ParseFeatureID($fid);
            # Is it new?
            if ($genomeID ne $currGenome) {
                # Yes. Close the old file and start a new one.
                if (defined $gh) {
                    close $gh;
                }
                $gh = Open(undef, ">$output/alias.$genomeID.tbl");
                Trace("Genome file for $genomeID created.") if T(3);
                $currGenome = $genomeID;
            }
            # Write this record to the current output file.
            Tracer::PutLine($gh, [$fid, $alias, $type, $conf]);
        }
        close $ih;
        # Close the current genome output file. There is none if no aliases
        # survived the merge, and closing an undefined handle would abort the
        # run with a misleading error.
        if (defined $gh) {
            close $gh;
        }
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Delete the temporary files (if any).
for my $fileName (@tempFiles) {
    if (-f $fileName) {
        if ($options->{keepTemp}) {
            Trace("Temporary file $fileName was not deleted.") if T(3);
        } else {
            Trace("Deleting temporary file $fileName.") if T(3);
            unlink $fileName;
        }
    }
}
# Display the statistics.
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "AliasCrunch terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}
268 :    
269 :     =head2 Utility Methods
270 :    
271 :     =head3 ReadOrganismIDs
272 :    
273 :     ReadOrganismIDs($mergeH, $stats);
274 :    
275 :     Read all the data from the organism directories and output it to the
276 :     merge file. Organism directory aliases have medium confidence level
277 :     (C<B>).
278 :    
279 :     =over 4
280 :    
281 :     =item mergeH
282 :    
283 :     Open output handle for the merge file. Each record of the merge file should
284 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
285 :     alias type, and (4) a feature ID.
286 :    
287 :     =item stats
288 :    
289 :     Statistics object for tracking this operation.
290 :    
291 :     =back
292 :    
293 :     =cut
294 :    
# Scan every organism directory (names of the form <digits>.<digits>) under
# $FIG_Config::organisms, read each non-empty Features/<type>/tbl file, and
# write one merge record per usable alias found on each feature line.
#
#   mergeH   open output handle for the merge file
#   stats    statistics object updated with the orgDir* counters
sub ReadOrganismIDs {
    my ($mergeH, $stats) = @_;
    GENOME: for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
        Trace("Processing $orgDir.") if T(3);
        $stats->Add(orgDirGenomes => 1);
        # All of this organism's TBL files live below its Features directory.
        my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";
        unless (-d $orgDirDir) {
            Trace("No feature directory found for $orgDir.") if T(1);
            $stats->Add(orgDirMissing => 1);
            next GENOME;
        }
        FTYPE: for my $ftype (OpenDir($orgDirDir, 1)) {
            my $tblFileName = "$orgDirDir/$ftype/tbl";
            # Missing or empty TBL files contribute nothing.
            next FTYPE unless -s $tblFileName;
            Trace("Data found in $tblFileName.") if T(3);
            $stats->Add(orgDirFiles => 1);
            my $tblH = Open(undef, "<$tblFileName");
            until (eof $tblH) {
                # Field 1 is the feature ID, field 2 is not used here, and
                # everything after that is an alias.
                my ($fid, undef, @aliases) = Tracer::GetLine($tblH);
                $stats->Add(orgDirFeatures => 1);
                for my $alias (@aliases) {
                    my $aliasType = AliasAnalysis::TypeOf($alias);
                    $stats->Add(orgDirAll => 1);
                    # Classify the alias. Each branch picks the counter to
                    # bump and, when the alias is worth keeping, the
                    # (alias, grade, type) triple for the merge file.
                    my ($counter, @mergeFields);
                    my $natural;
                    if ($aliasType) {
                        # A recognized, already-normalized alias.
                        ($counter, @mergeFields) = (orgDirNormal => $alias, B => $aliasType);
                    } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
                        # A specially-marked locus tag; keep only the tag.
                        ($counter, @mergeFields) = (orgDirLocus => $1, B => 'LocusTag');
                    } elsif ($natural = AliasAnalysis::IsNatural(LocusTag => $alias)) {
                        # A locus tag in its natural form.
                        ($counter, @mergeFields) = (orgDirLocus => $natural, B => 'LocusTag');
                    } elsif ($natural = AliasAnalysis::IsNatural(GENE => $alias)) {
                        # A gene name in its natural form.
                        ($counter, @mergeFields) = (orgDirGene => $natural, B => 'GENE');
                    } elsif ($alias =~ /^\d+$/) {
                        # A naked number is a GI number of some sort. Those
                        # are only taken from the corresponding ID table.
                        $counter = 'orgDirSkip';
                    } elsif ($alias =~ /^protein_id\|(.+)/) {
                        # A REFSEQ protein ID.
                        # NOTE(review): this is the only organism-directory
                        # alias written at grade C rather than B -- confirm
                        # that is intended.
                        ($counter, @mergeFields) = (orgDirProtein => $1, C => 'RefSeq');
                    } elsif ($alias =~ /[:|]/) {
                        # A typed alias whose type is not known; drop it.
                        $counter = 'orgDirUnknown';
                    } else {
                        # Anything else is kept as a miscellaneous alias.
                        ($counter, @mergeFields) = (orgDirMisc => $alias, B => 'Miscellaneous');
                    }
                    $stats->Add($counter => 1);
                    WriteToMerge($mergeH, @mergeFields, $fid) if @mergeFields;
                }
            }
        }
    }
    Trace("Organism directories complete.") if T(2);
}
369 :    
370 :     =head3 ReadSynonyms
371 :    
372 :     ReadSynonyms($mergeH, $stats);
373 :    
374 :     Read all the data from the C<peg.synonyms> file and output it to the
375 :     merge file. Data from the PEG synonyms file has the lowest confidence
376 :     level (C<C>), because all IDs with the same protein sequence are
377 :     conflated.
378 :    
379 :     =over 4
380 :    
381 :     =item mergeH
382 :    
383 :     Open output handle for the merge file. Each record of the merge file should
384 :     contain (1) a normalized alias, (2) the confidence grade C<C>, (3) an
385 :     alias type, and (4) a feature ID.
386 :    
387 :     =item stats
388 :    
389 :     Statistics object for tracking this operation.
390 :    
391 :     =back
392 :    
393 :     =cut
394 :    
# Read $FIG_Config::global/peg.synonyms and write a grade-C merge record for
# every (FIG ID, recognized external alias) pair that shares a synonym line.
# Lines that contain no FIG ID produce no output.
#
#   mergeH   open output handle for the merge file
#   stats    statistics object updated with the protein* counters
sub ReadSynonyms {
    my ($mergeH, $stats) = @_;
    my $synFileName = "$FIG_Config::global/peg.synonyms";
    my $synH = Open(undef, "<$synFileName");
    Trace("Processing $synFileName.") if T(2);
    until (eof $synH) {
        # The first field is the principal protein ID (not used below); the
        # second is a semicolon-delimited list of "id,length" synonyms.
        my ($prot_id, $synonyms) = Tracer::GetLine($synH);
        Trace($stats->Ask('proteins') . " protein synonym records read.") if $stats->Check(proteins => 5000) && T(3);
        # FIG IDs on this line (a hash, so duplicates collapse) and the
        # [alias, type] pairs for the recognized external IDs.
        my (%figIDs, @externals);
        SYNONYM: for my $synonym (split /;/, $synonyms) {
            # Strip off the length.
            my ($alias) = split /,/, $synonym, 2;
            # NMPDR IDs are treated as FIG IDs.
            $alias =~ s/^nmpdr/fig/;
            if ($alias =~ /^fig/) {
                $stats->Add(proteinFIG => 1);
                $figIDs{$alias} = 1;
                next SYNONYM;
            }
            # An external ID is kept only if its type is recognized.
            my $type = AliasAnalysis::TypeOf($alias);
            if (defined $type) {
                push @externals, [$alias, $type];
                $stats->Add(proteinNormal => 1);
            } else {
                $stats->Add(proteinSkip => 1);
            }
        }
        # Connect every FIG ID on the line to every external alias on it.
        for my $fid (keys %figIDs) {
            for my $external (@externals) {
                $stats->Add(proteinOut => 1);
                WriteToMerge($mergeH, $external->[0], C => $external->[1], $fid);
            }
        }
    }
    Trace("Protein synonyms complete.") if T(2);
}
449 :    
450 :     =head3 ReadCorrespondingIDs
451 :    
452 : parrello 1.2 ReadCorrespondingIDs($mergeH, $stats, $name);
453 : parrello 1.1
454 : parrello 1.2 Read all the data from the corresponding ID table and output it to the
455 : parrello 1.1 merge file. Corresponding IDs have the highest confidence level (C<A>).
456 :    
457 :     =over 4
458 :    
459 :     =item mergeH
460 :    
461 :     Open output handle for the merge file. Each record of the merge file should
462 :     contain (1) a normalized alias, (2) the confidence grade C<A>, (3) an
463 :     alias type, and (4) a feature ID.
464 :    
465 :     =item stats
466 :    
467 :     Statistics object for tracking this operation.
468 :    
469 : parrello 1.2 =item name
470 :    
471 :     Name of the corresponding ID file.
472 :    
473 : parrello 1.1 =back
474 :    
475 :     =cut
476 :    
# Read the corresponding-ID file and write a grade-A merge record for every
# (FIG ID, alias) pair found on each line.
#
#   mergeH   open output handle for the merge file
#   stats    statistics object updated with the corresponding* counters
#   name     name of the corresponding-ID file to read
#
# The first line of the file is a header naming the ID type held in each
# column; the first column must be SEED (the FIG IDs). Every field is a
# semicolon-delimited list of IDs. Confesses if SEED is not the first column.
sub ReadCorrespondingIDs {
    # Get the parameters.
    my ($mergeH, $stats, $name) = @_;
    # Open the corresponding-ID file.
    my $ih = Open(undef, "<$name");
    Trace("Processing corresponding IDs.") if T(2);
    # Read the header record.
    my ($type0, @types) = Tracer::GetLine($ih);
    # Insure SEED is the first column.
    Confess("Incorrect file format. SEED is not first.") if ($type0 ne 'SEED');
    # Skip the flag record.
    # NOTE(review): CreateCorrespondingIdFile in this file writes the header
    # immediately followed by data rows, so this skip appears to discard the
    # first ID group -- confirm whether the input still has a flag record.
    Tracer::GetLine($ih);
    # Loop through the file.
    while (! eof $ih) {
        # Get this FID and its synonym lists.
        my ($fidList, @others) = Tracer::GetLine($ih);
        Trace($stats->Ask('correspondingIDs') . " corresponding ID records read.") if $stats->Check(correspondingIDs => 5000) && T(3);
        if (! $fidList) {
            # Skip this record if there are no FIG IDs in it.
            $stats->Add(correspondingSkip => 1);
        } else {
            # Get the list of FIG IDs.
            my @fids = split /\s*;\s*/, $fidList;
            # Loop through the other aliases. Column $i of the record has
            # the type named in column $i of the header.
            for my $i (0 .. $#others) {
                # Get this alias type.
                my $type = $types[$i];
                # Loop through the alias list.
                for my $alias (split /\s*;\s*/, $others[$i]) {
                    # Ignore this alias if it's a RefSeq contig.
                    if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
                        $stats->Add(correspondingContig => 1);
                    } else {
                        # Check for a locus tag disguised as a CMR ID.
                        my $realType = $type;
                        if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
                            $realType = 'LocusTag';
                            $stats->Add(correspondingLocus => 1);
                        } else {
                            $stats->Add(correspondingNormal => 1);
                        }
                        # Normalize the alias using the corrected type.
                        # The previous code passed the column type here and
                        # below, so a disguised locus tag was counted as one
                        # but still written out as a CMR ID.
                        my $normalized = AliasAnalysis::Normalize($realType => $alias);
                        # Write it out once for each FIG ID.
                        for my $fid (@fids) {
                            WriteToMerge($mergeH, $normalized, A => $realType, $fid);
                        }
                    }
                }
            }
        }
    }
    # Close the input file. The merge handle belongs to the caller.
    close $ih;
    Trace("Corresponding IDs complete.") if T(2);
}
533 :    
534 : parrello 1.2 =head3 CreateCorrespondingIdFile
535 :    
536 :     CreateCorrespondingIdFile($stats, $name);
537 :    
538 :     Create a corresponding-ID file from the data in the SEED database. The
539 :     outgoing file will contain a header record with the ID types followed by
540 :     a record for each ID group. Within a group, the field for a given ID type
541 :     will contain a semicolon-delimited list of the IDs of that type in the
542 :     group.
543 :    
544 :     When the SEED database goes away this method will need to be replaced.
545 :    
546 :     =over 4
547 :    
548 :     =item stats
549 :    
550 :     Statistics object to use for tracking progress.
551 :    
552 :     =item name
553 :    
554 :     Name to give to the corresponding-ID file.
555 :    
556 :     =back
557 :    
558 :     =cut
559 :    
# Build the corresponding-ID file from the SEED database.
#
#   stats    statistics object used for progress tracing
#   name     name to give to the corresponding-ID file
#
# The output starts with a header record holding the ID type names (from
# id_correspondence_type, ordered by type id), followed by one record per
# ID set. In each record the field for a type is the "; "-joined list of
# that set's IDs of that type.
#
# The rows of id_correspondence must arrive grouped by set_id in ascending
# order; a set_id lower than the previous one triggers a Confess. Also
# Confesses if the SELECT fails or the output file cannot be closed.
sub CreateCorrespondingIdFile {
    # Get the parameters.
    my ($stats, $name) = @_;
    # Get the FIG database.
    require FIG;
    my $fig = FIG->new();
    my $dbh = $fig->db_handle();
    # Open the output file.
    my $oh = Open(undef, ">$name");
    Trace("Creating header for corresponding ID file.") if T(2);
    # Create the header record from the id types table. %types maps each
    # type id to its name; @typeList fixes the column order.
    my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
    my @typeList = sort keys %types;
    my @header = map { $types{$_} } @typeList;
    Trace("Header is " . join(" ", @header) . ".") if T(3);
    Tracer::PutLine($oh, \@header);
    # Now we loop through the id correspondence table, creating groups (sets). We use
    # an SQL statement for this.
    my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
    my $rc = $sth->execute();
    if (! $rc) {
        Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
    }
    # These variables contain the ID and content of the current group.
    # $content stays undefined until the first record arrives; it maps each
    # type id to the list of IDs of that type in the group.
    my ($set, $content);
    # Writes the current group as one output record, one field per type.
    my $writeGroup = sub {
        my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
        Tracer::PutLine($oh, \@typeStrings);
    };
    # Read until the statement runs dry. Ending the loop on the fetch itself
    # means no record is processed twice and an empty table is harmless.
    while (my $record = $sth->fetchrow_arrayref()) {
        my ($set_id, $protein_id, $type) = @$record;
        Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
        # Is this a new group?
        if (! defined $content || $set_id != $set) {
            if (defined $content) {
                # Yes. Write out the old group.
                $writeGroup->();
                # Check for an error in the sort.
                if ($set > $set_id) {
                    Confess("Invalid set order in id_correspondence table: $set to $set_id.");
                }
            }
            # Now start the new group.
            $set = $set_id;
            $content = { map { $_ => [] } @typeList };
        }
        # Put this ID in this group.
        push @{$content->{$type}}, $protein_id;
    }
    Trace("End of correspondence table found.") if T(3);
    # Flush the final group, if the table had any rows at all.
    $writeGroup->() if defined $content;
    # Close up the output file. Buffered write errors only surface here.
    Trace("Corresponding ID file created as $name.") if T(2);
    close $oh or Confess("Error closing corresponding ID file $name: $!");
}
624 :    
625 :    
626 : parrello 1.1 =head3 WriteAlias
627 :    
628 :     WriteAlias($oh, $alias, $conf, $type, $fid);
629 :    
630 :     Write an alias record to the sort file. The alias record
631 :     contains the alias ID, the alias type, its confidence level, and the
632 :     corresponding feature ID.
633 :    
634 :     =over 4
635 :    
636 :     =item oh
637 :    
638 :     Open handle for the output file.
639 :    
640 :     =item alias
641 :    
642 :     Alias identifier.
643 :    
644 :     =item conf
645 :    
646 :     Confidence grade for this alias: C<A> is best, C<C> is worst.
647 :    
648 :     =item type
649 :    
650 :     Type of alias (e.g. NCBI, CMR, RefSeq).
651 :    
652 :     =item fid
653 :    
654 :     %FIG{FIG ID}% corresponding to the alias.
655 :    
656 :     =back
657 :    
658 :     =cut
659 :    
# Write one alias record to the sort file and count it.
#
#   oh       open handle for the output (sort) file
#   alias    alias identifier
#   conf     confidence grade for this alias
#   type     type of alias (e.g. NCBI, CMR, RefSeq)
#   fid      FIG ID corresponding to the alias
#
# The record is written feature ID first (fid, alias, type, conf) so the
# sort groups the output by feature. Counts go to the file-level $stats
# object: "aliasOut" overall, plus one "aliasOut<type>" counter per type.
sub WriteAlias {
    # Get the parameters.
    my ($oh, $alias, $conf, $type, $fid) = @_;
    # Write this alias to the output file. (The genome ID that used to be
    # parsed from the feature ID here was never used, so the call is gone.)
    Tracer::PutLine($oh, [$fid, $alias, $type, $conf]);
    $stats->Add(aliasOut => 1);
    $stats->Add("aliasOut$type" => 1);
}
670 :    
671 :     =head3 WriteToMerge
672 :    
673 :     WriteToMerge($mergeH, $alias, $grade => $aliasType, $fid);
674 :    
675 :     Write an alias connection to the output merge file.
676 :    
677 :     =over 4
678 :    
679 :     =item mergeH
680 :    
681 :     Open output handle for the merge file.
682 :    
683 :     =item alias
684 :    
685 :     Alias identifier.
686 :    
687 :     =item conf
688 :    
689 :     Confidence grade for this alias: C<A> is best, C<C> is worst.
690 :    
691 :     =item type
692 :    
693 :     Type of alias (e.g. NCBI, CMR, RefSeq).
694 :    
695 :     =item fid
696 :    
697 :     %FIG{FIG ID}% corresponding to the alias.
698 :    
699 :     =back
700 :    
701 :     =cut
702 :    
# Append one alias connection to the merge file and count it.
#
#   mergeH      open output handle for the merge file
#   alias       alias identifier
#   grade       confidence grade for this alias
#   aliasType   type of alias (e.g. NCBI, CMR, RefSeq)
#   fid         FIG ID corresponding to the alias
#
# The record layout is (alias, grade, type, fid). The "mergeOut" counter is
# kept in the file-level $stats object.
sub WriteToMerge {
    my ($mergeH, $alias, $grade, $aliasType, $fid) = @_;
    # Assemble the record in merge-file column order, then emit it.
    my @mergeRecord = ($alias, $grade, $aliasType, $fid);
    Tracer::PutLine($mergeH, \@mergeRecord);
    $stats->Add(mergeOut => 1);
}
710 :    
711 :    
712 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3