[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Annotation of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use Stats;
23 :     use AliasAnalysis;
24 :     use FIGRules;
25 :    
26 :    
27 :     =head1 AliasCrunch Script
28 :    
29 :     =head2 Introduction
30 :    
31 :     AliasCrunch [options]
32 :    
33 :     This script finds all the aliases scattered throughout the SEED and puts them in
34 :     flat files organized by genome ID. The files are currently stored in the Sprout
35 :     data directory.
36 :    
37 :     The aliases come from three sources: (1) the peg.synonyms file, (2) the C<tbl>
38 :     files in the organism directories, and (3) the ID correspondence file. The three
39 :     sources have different methods for sorting the data, and the data in the
40 :     peg.synonyms file is considered secondary to the data in the other files because
41 :     it identifies proteins, not features.
42 :    
43 :     Each of the three sources has its own special rules. The C<tbl> files may
44 :     contain aliases in their natural form as well as the normalized form. The
45 :     ID correspondence file contains only natural forms, but the types are determined
46 :     by the position of the alias in each record. In addition, some of the RefSeq
47 :     IDs in the correspondence file are actually contig IDs and have to be deleted.
48 :     The C<peg.synonyms> file and the ID correspondence may have records which do
49 :     not correspond to any FIG IDs. These are ignored.
50 :    
51 :     The output alias files will be lexically sorted and have the following fields
52 :     on each line.
53 :    
54 :     =over 4
55 :    
56 :     =item 1
57 :    
58 :     %FIG{FIG ID}% of the relevant feature
59 :    
60 :     =item 2
61 :    
62 :     Alias identifier
63 :    
64 :     =item 3
65 :    
66 :     Alias type
67 :    
68 :     =item 4
69 :    
70 :     Confidence grade: C<A> for curated, C<B> for uploaded, C<C> for peg synonym
71 :    
72 :     =back
73 :    
74 :     =head2 Command-Line Options
75 :    
76 :     =over 4
77 :    
78 :     =item trace
79 :    
80 :     Specifies the tracing level. The higher the tracing level, the more messages
81 :     will appear in the trace log. Use E to specify emergency tracing.
82 :    
83 :     =item output
84 :    
85 :     Directory in which the output files should be placed. The output files will all
86 :     have names of the form C<alias.>I<genome>C<.tbl>, where I<genome> is the genome
87 :     ID, and will be sorted by %FIG{FIG ID}%.
88 :    
89 :     =item clear
90 :    
91 :     Clears any existing alias files from the directory before processing.
92 :    
93 :     =item user
94 :    
95 :     Name suffix to be used for log files. If omitted, the PID is used.
96 :    
97 :     =item sql
98 :    
99 :     If specified, turns on tracing of SQL activity.
100 :    
101 :     =item keepTemp
102 :    
103 :     If specified, the intermediate temporary files will not be deleted when the
104 :     script is finished using them.
105 :    
106 :     =item background
107 :    
108 :     Save the standard and error output to files. The files will be created
109 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
110 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
111 :     B<user> option above.
112 :    
113 :     =item help
114 :    
115 :     Display this command's parameters and options.
116 :    
117 :     =item warn
118 :    
119 :     Create an event in the RSS feed when an error occurs.
120 :    
121 :     =item phone
122 :    
123 :     Phone number to message when the script is complete.
124 :    
125 :     =back
126 :    
127 :     =cut
128 :    
129 :     # Get the command-line options and parameters.
130 :     my ($options, @parameters) = StandardSetup([qw() ],
131 :     {
132 :     trace => ["3", "tracing level"],
133 : parrello 1.9 output => ["$FIG_Config::saplingData/AliasData", "output directory for alias files"],
134 : parrello 1.1 clear => ["", "if specified, existing alias files will be erased"],
135 :     keepTemp => ["", "if specified, the intermediate temporary files will not be deleted"],
136 :     phone => ["", "phone number (international format) to call when load finishes"]
137 :     },
138 :     "",
139 :     @ARGV);
140 :     # Set a variable to contain return type information.
141 :     my $rtype;
142 :     # Create a statistics object.
143 :     my $stats = Stats->new();
144 :     # Compute the merge file name.
145 :     my $mfileName = "$FIG_Config::temp/mergeAC$$.tmp.tbl";
146 :     # This will contain a list of the temporary files to delete at end of run.
147 :     my @tempFiles = $mfileName;
148 :     # Insure we catch errors.
149 :     eval {
150 :     # Compute the output directory.
151 :     my $output = $options->{output};
152 :     if (! $output) {
153 :     Confess("No output directory specified.");
154 :     } elsif (! -d $output) {
155 :     Confess("Invalid output directory \"$output\".");
156 :     } else {
157 :     # Here we have a valid output directory. Do we need to clear it?
158 :     if ($options->{clear}) {
159 :     # Yes. Find and delete any existing files.
160 :     Trace("Clearing old alias files from $output directory.") if T(3);
161 :     for my $file (grep { $_ =~ /^alias\.[0-9.]+\.tbl$/ } OpenDir($output)) {
162 :     unlink "$output/$file";
163 :     $stats->Add(filesCleared => 1);
164 :     }
165 :     }
166 : parrello 1.2 # Create the corresponding-ID file.
167 :     CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
168 : parrello 1.1 # Open the merge file. Each record of the merge file will contain (1) a
169 :     # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
170 :     # and (4) a feature ID. The merge file is then read back so that we can
171 :     # determine the list of features associated with each alias. Only the FIG IDs
172 :     # with the best confidence (A over B, B over C) for an alias will be kept.
173 :     Trace("Creating merge file $mfileName.") if T(2);
174 :     my $mergeH = Open(undef, "| sort -u >$mfileName");
175 :     # Now read in the three sources of data.
176 : parrello 1.2 ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
177 : parrello 1.1 ReadOrganismIDs($mergeH, $stats);
178 :     ReadSynonyms($mergeH, $stats);
179 :     # Close the merge file and reopen it for input.
180 :     close $mergeH;
181 :     Trace("Processing merge file results.") if T(2);
182 :     my $ih = Open(undef, "<$mfileName");
183 :     # This file will be used to sort the aliases by feature ID.
184 :     my $sortFileName = "$FIG_Config::temp/sortAC$$.tmp.tbl";
185 :     push @tempFiles, $sortFileName;
186 :     my $oh = Open(undef, "| sort >$sortFileName");
187 :     # Now we set up the data we'll be accumulating for each alias.
188 :     # "$prevAlias" is the current alias, "$prevConf" is its confidence.
189 :     # We emit a row if we encounter a new alias or an old alias at the
190 :     # same confidence level.
191 :     my ($prevAlias, $prevConf);
192 :     # Loop through the merge file.
193 :     while (! eof $ih) {
194 :     # Get this row of data.
195 :     my ($alias, $conf, $type, $fid) = Tracer::GetLine($ih);
196 :     Trace($stats->Ask('mergeFileRecords') . " merge file records read.") if $stats->Check(mergeFileRecords => 5000) && T(3);
197 :     # Should we emit this alias?
198 :     if ($alias ne $prevAlias) {
199 :     # Yes. This is a new alias.
200 :     $prevAlias = $alias;
201 :     $prevConf = $conf;
202 :     # Emit this alias.
203 :     WriteAlias($oh, $alias, $conf, $type, $fid);
204 :     } elsif ($conf eq $prevConf) {
205 :     # Yes. This is an old alias at the same confidence level.
206 :     WriteAlias($oh, $alias, $conf, $type, $fid);
207 :     }
208 :     }
209 :     # Close the sort file.
210 :     Trace("Closing sort file.") if T(2);
211 :     close $oh;
212 :     # This will contain the current genome ID.
213 :     my $currGenome;
214 :     # This will contain the current output file handle.
215 :     my $gh;
216 :     # Now read the sort file and split it up by genome ID.
217 :     $ih = Open(undef, "<$sortFileName");
218 :     while (! eof $ih) {
219 :     # Get the next record.
220 :     my ($fid, $alias, $type, $conf) = Tracer::GetLine($ih);
221 :     # Compute the genome ID.
222 :     my ($genomeID) = FIGRules::ParseFeatureID($fid);
223 :     # Is it new?
224 :     if ($genomeID ne $currGenome) {
225 :     # Yes. Close the old file and start a new one.
226 :     if (defined $gh) {
227 :     close $gh;
228 :     }
229 :     $gh = Open(undef, ">$options->{output}/alias.$genomeID.tbl");
230 :     Trace("Genome file for $genomeID created.") if T(3);
231 :     $currGenome = $genomeID;
232 :     }
233 :     # Write this record to the current output file.
234 :     Tracer::PutLine($gh, [$fid, $alias, $type, $conf]);
235 :     }
236 :     # Close the current genome output file.
237 :     close $gh;
238 :     }
239 :     };
240 :     if ($@) {
241 :     Trace("Script failed with error: $@") if T(0);
242 :     $rtype = "error";
243 :     } else {
244 :     Trace("Script complete.") if T(2);
245 :     $rtype = "no error";
246 :     }
247 :     # Delete the temporary files (if any).
248 :     for my $fileName (@tempFiles) {
249 :     if (-f $fileName) {
250 :     if ($options->{keepTemp}) {
251 :     Trace("Temporary file $fileName was not deleted.") if T(3);
252 :     } else {
253 :     Trace("Deleting temporary file $fileName.") if T(3);
254 :     unlink $fileName;
255 :     }
256 :     }
257 :     }
258 :     # Display the statistics.
259 :     Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
260 :     if ($options->{phone}) {
261 :     my $msgID = Tracer::SendSMS($options->{phone}, "AliasCrunch terminated with $rtype.");
262 :     if ($msgID) {
263 :     Trace("Phone message sent with ID $msgID.") if T(2);
264 :     } else {
265 :     Trace("Phone message not sent.") if T(2);
266 :     }
267 :     }
268 :    
269 :     =head2 Utility Methods
270 :    
271 :     =head3 ReadOrganismIDs
272 :    
273 :     ReadOrganismIDs($mergeH, $stats);
274 :    
275 :     Read all the data from the organism directories and output it to the
276 :     merge file. Organism directory aliases have medium confidence level
277 :     (C<B>).
278 :    
279 :     =over 4
280 :    
281 :     =item mergeH
282 :    
283 :     Open output handle for the merge file. Each record of the merge file should
284 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
285 :     alias type, and (4) a feature ID.
286 :    
287 :     =item stats
288 :    
289 :     Statistics object for tracking this operation.
290 :    
291 :     =back
292 :    
293 :     =cut
294 :    
295 :     sub ReadOrganismIDs {
296 :     # Get the parameters.
297 :     my ($mergeH, $stats) = @_;
298 :     # Loop through the organism directories.
299 : parrello 1.3 for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
300 : parrello 1.1 Trace("Processing $orgDir.") if T(3);
301 :     $stats->Add(orgDirGenomes => 1);
302 :     # We need to process all of this organism's TBL files.
303 :     my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";
304 : parrello 1.8 if (! -d $orgDirDir) {
305 :     Trace("No feature directory found for $orgDir.") if T(1);
306 :     $stats->Add(orgDirMissing => 1);
307 :     } else {
308 :     for my $ftype (OpenDir($orgDirDir, 1)) {
309 :     my $tblFileName = "$orgDirDir/$ftype/tbl";
310 :     if (-s $tblFileName) {
311 :     Trace("Data found in $tblFileName.") if T(3);
312 :     # Read this TBL file.
313 :     $stats->Add(orgDirFiles => 1);
314 :     my $ih = Open(undef, "<$tblFileName");
315 :     while (! eof $ih) {
316 :     # Get the feature ID and its aliases.
317 :     my ($fid, undef, @aliases) = Tracer::GetLine($ih);
318 :     $stats->Add(orgDirFeatures => 1);
319 :     # Loop through the aliases.
320 :     for my $alias (@aliases) {
321 :     my $normalized;
322 :     # Determine the type.
323 :     my $aliasType = AliasAnalysis::TypeOf($alias);
324 :     $stats->Add(orgDirAll => 1);
325 :     # Is this a recognized type?
326 :     if ($aliasType) {
327 :     $stats->Add(orgDirNormal => 1);
328 :     # Yes. Write it normally.
329 :     WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
330 :     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
331 :     # No, but this is a specially-marked locus tag.
332 : parrello 1.10 $normalized = "LocusTag:$1";
333 : parrello 1.8 $stats->Add(orgDirLocus => 1);
334 :     WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
335 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
336 :     # No, but this is a natural locus tag.
337 :     $stats->Add(orgDirLocus => 1);
338 :     WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
339 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
340 :     # No, but this is a natural gene name.
341 :     $stats->Add(orgDirGene => 1);
342 :     WriteToMerge($mergeH, $normalized, B => 'GENE', $fid);
343 :     } elsif ($alias =~ /^\d+$/) {
344 :     # Here it's a naked number, which means it's a GI number
345 :     # of some sort. We only take these from the corresponding ID
346 :     # table.
347 :     $stats->Add(orgDirSkip => 1);
348 :     } elsif ($alias =~ /^protein_id\|(.+)/) {
349 :     # Here we have a REFSEQ protein ID.
350 :     $normalized = $1;
351 :     $stats->Add(orgDirProtein => 1);
352 :     WriteToMerge($mergeH, $normalized, C => 'RefSeq', $fid);
353 : parrello 1.10 } elsif ($alias =~ /^geneId\|(.+)/) {
354 :     # Here we have an alternate-form Gene ID.
355 :     $normalized = "GeneID:$1";
356 :     $stats->Add(orgDirGeneID => 1);
357 :     WriteToMerge($mergeH, $normalized, B => 'GeneID', $fid);
358 : parrello 1.8 } elsif ($alias =~ /[:|]/) {
359 :     # Here it's an alias of an unknown type.
360 :     $stats->Add(orgDirUnknown => 1);
361 :     } else {
362 :     # Here it's a miscellaneous type.
363 :     $stats->Add(orgDirMisc => 1);
364 :     WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);
365 :     }
366 : parrello 1.1 }
367 :     }
368 :     }
369 :     }
370 :     }
371 :     }
372 :     Trace("Organism directories complete.") if T(2);
373 :     }
374 :    
375 :     =head3 ReadSynonyms
376 :    
377 :     ReadSynonyms($mergeH, $stats);
378 :    
379 :     Read all the data from the C<peg.synonyms> file and output it to the
380 :     merge file. Data from the PEG synonyms file has the lowest confidence
381 :     level (C<C>), because all IDs with the same protein sequence are
382 :     conflated.
383 :    
384 :     =over 4
385 :    
386 :     =item mergeH
387 :    
388 :     Open output handle for the merge file. Each record of the merge file should
389 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
390 :     alias type, and (4) a feature ID.
391 :    
392 :     =item stats
393 :    
394 :     Statistics object for tracking this operation.
395 :    
396 :     =back
397 :    
398 :     =cut
399 :    
400 :     sub ReadSynonyms {
401 :     # Get the parameters.
402 :     my ($mergeH, $stats) = @_;
403 :     # Open the peg.synonyms file.
404 :     my $synFileName = "$FIG_Config::global/peg.synonyms";
405 :     my $ih = Open(undef, "<$synFileName");
406 :     Trace("Processing $synFileName.") if T(2);
407 :     # Loop through the file.
408 :     while (! eof $ih) {
409 :     # Get this record.
410 :     my ($prot_id, $synonyms) = Tracer::GetLine($ih);
411 :     Trace($stats->Ask('proteins') . " protein synonym records read.") if $stats->Check(proteins => 5000) && T(3);
412 :     # Parse out the synonyms.
413 :     my @synonyms = split /;/, $synonyms;
414 :     # We'll save any FIG IDs in here.
415 :     my %figIDs;
416 :     # Other IDs go in here.
417 :     my @aliasTuples;
418 :     # Loop through the synonyms.
419 :     for my $synonym (@synonyms) {
420 :     # Strip off the length.
421 :     my ($alias) = split /,/, $synonym, 2;
422 :     # Convert NMPDR IDs to FIG IDs.
423 :     $alias =~ s/^nmpdr/fig/;
424 :     # Process according to the type.
425 :     if ($alias =~ /^fig/) {
426 :     $stats->Add(proteinFIG => 1);
427 :     $figIDs{$alias} = 1;
428 :     } else {
429 :     # Here we have an external ID. If it's of a recognized type, we'll
430 :     # keep it.
431 :     my $type = AliasAnalysis::TypeOf($alias);
432 :     if (! defined $type) {
433 :     # Not a recognized type, so ignore it.
434 :     $stats->Add(proteinSkip => 1);
435 :     } else {
436 :     # A recognized type, so keep it.
437 :     push @aliasTuples, [$alias, $type];
438 :     $stats->Add(proteinNormal => 1);
439 :     }
440 :     }
441 :     }
442 :     # Now we have all the IDs in place. If there are any FIG IDs in the
443 :     # bunch, write them to the merge file with all their aliases.
444 :     for my $fid (keys %figIDs) {
445 :     for my $aliasTuple (@aliasTuples) {
446 :     my ($alias, $type) = @$aliasTuple;
447 :     $stats->Add(proteinOut => 1);
448 :     WriteToMerge($mergeH, $alias, C => $type, $fid);
449 :     }
450 :     }
451 :     }
452 :     Trace("Protein synonyms complete.") if T(2);
453 :     }
454 :    
455 :     =head3 ReadCorrespondingIDs
456 :    
457 : parrello 1.2 ReadCorrespondingIDs($mergeH, $stats, $name);
458 : parrello 1.1
459 : parrello 1.2 Read all the data from the corresponding ID table and output it to the
460 : parrello 1.1 merge file. Corresponding IDs have the highest confidence level (C<A>).
461 :    
462 :     =over 4
463 :    
464 :     =item mergeH
465 :    
466 :     Open output handle for the merge file. Each record of the merge file should
467 :     contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
468 :     alias type, and (4) a feature ID.
469 :    
470 :     =item stats
471 :    
472 :     Statistics object for tracking this operation.
473 :    
474 : parrello 1.2 =item name
475 :    
476 :     Name of the corresponding ID file.
477 :    
478 : parrello 1.1 =back
479 :    
480 :     =cut
481 :    
482 :     sub ReadCorrespondingIDs {
483 :     # Get the parameters.
484 : parrello 1.2 my ($mergeH, $stats, $name) = @_;
485 : parrello 1.1 # Open the corresponding-ID file.
486 : parrello 1.2 my $ih = Open(undef, "<$name");
487 : parrello 1.1 Trace("Processing corresponding IDs.") if T(2);
488 :     # Read the header record.
489 :     my ($type0, @types) = Tracer::GetLine($ih);
490 :     # Insure SEED is the first column.
491 :     Confess("Incorrect file format. SEED is not first.") if ($type0 ne 'SEED');
492 :     # Skip the flag record.
493 :     Tracer::GetLine($ih);
494 :     # Loop through the file.
495 :     while (! eof $ih) {
496 :     # Get this FID and its synonym lists.
497 :     my ($fidList, @others) = Tracer::GetLine($ih);
498 :     Trace($stats->Ask('correspondingIDs') . " corresponding ID records read.") if $stats->Check(correspondingIDs => 5000) && T(3);
499 :     if (! $fidList) {
500 :     # Skip this record if there are no FIG IDs in it.
501 :     $stats->Add(correspondingSkip => 1);
502 :     } else {
503 :     # Get the list of FIG IDs.
504 :     my @fids = split /\s*;\s*/, $fidList;
505 :     # Loop through the other aliases.
506 :     for (my $i = 0; $i <= $#others; $i++) {
507 :     # Get this alias type.
508 :     my $type = $types[$i];
509 :     # Loop through the alias list.
510 :     for my $alias (split /\s*;\s*/, $others[$i]) {
511 :     # Ignore this alias if it's a RefSeq contig.
512 :     if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
513 :     $stats->Add(correspondingContig => 1);
514 :     } else {
515 : parrello 1.3 # Check for a locus tag disguised as a CMR ID.
516 :     my $realType = $type;
517 :     if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
518 :     $realType = 'LocusTag';
519 :     $stats->Add(correspondingLocus => 1);
520 :     } else {
521 :     $stats->Add(correspondingNormal => 1);
522 :     }
523 :     # Normalize the alias.
524 : parrello 1.1 my $normalized = AliasAnalysis::Normalize($type => $alias);
525 :     # Write it out once for each FIG ID.
526 :     for my $fid (@fids) {
527 :     WriteToMerge($mergeH, $normalized, A => $type, $fid);
528 :     }
529 :     }
530 :     }
531 :     }
532 :     }
533 :     }
534 :     # Close the input file and the sort file.
535 :     close $ih;
536 :     Trace("Corresponding IDs complete.") if T(2);
537 :     }
538 :    
539 : parrello 1.2 =head3 CreateCorrespondingIdFile
540 :    
541 :     CreateCorrespondingIdFile($stats, $name);
542 :    
543 :     Create a corresponding-ID file from the data in the SEED database. The
544 :     outgoing file will contain a header record with the ID types followed by
545 :     a record for each ID group. Within a group, the field for a given ID type
546 :     will contain a semicolon-delimited list of the IDs of that type in the
547 :     group.
548 :    
549 :     When the SEED database goes away this method will need to be replaced.
550 :    
551 :     =over 4
552 :    
553 :     =item stats
554 :    
555 :     Statistics object to use for tracking progress.
556 :    
557 :     =item name
558 :    
559 :     Name to give to the corresponding-ID file.
560 :    
561 :     =back
562 :    
563 :     =cut
564 :    
565 :     sub CreateCorrespondingIdFile {
566 :     # Get the parameters.
567 :     my ($stats, $name) = @_;
568 :     # Get the FIG database.
569 :     require FIG;
570 :     my $fig = new FIG;
571 :     my $dbh = $fig->db_handle();
572 :     # Open the output file.
573 :     my $oh = Open(undef, ">$name");
574 :     Trace("Creating header for corresponding ID file.") if T(2);
575 :     # Create the header record from the id types table.
576 :     my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
577 :     my @typeList = sort keys %types;
578 :     my @header = map { $types{$_} } @typeList;
579 :     Trace("Header is " . join(" ", @header) . ".") if T(3);
580 :     Tracer::PutLine($oh, \@header);
581 :     # Now we loop through the id correspondence table, creating groups (sets). We use
582 :     # an SQL statement for this.
583 :     my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
584 :     my $rc = $sth->execute();
585 :     if (! $rc) {
586 :     Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
587 :     }
588 :     # These variables contain the ID and content of the current group.
589 :     my ($set, $content) = (-1, undef);
590 :     # These variables will hold the fields from the current record.
591 :     my ($set_id, $protein_id, $type);
592 :     # This flag will be set to TRUE when we're done.
593 :     my $done = 0;
594 :     while (! $done) {
595 :     # Get the next record.
596 :     my $record = $sth->fetchrow_arrayref();
597 :     if (! defined $record) {
598 :     # No record, so we're done.
599 :     Trace("End of correspondence table found.") if T(3);
600 :     $done = 1;
601 :     } else {
602 :     # A record found, so we get its data.
603 :     ($set_id, $protein_id, $type) = @$record;
604 :     Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
605 :     }
606 :     # Is this a new group?
607 :     if ($done || $set_id != $set) {
608 :     # Yes. If the old group has content, we write it out. Each field is
609 :     # formed by joining the IDs for that type into a string using semicolons.
610 :     if (defined $content) {
611 :     my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
612 :     Tracer::PutLine($oh, \@typeStrings);
613 :     }
614 :     # Check for an error in the sort.
615 :     if ($set > $set_id) {
616 :     Confess("Invalid set order in id_correspondence table: $set to $set_id.");
617 :     }
618 :     # Now start the new group.
619 :     $set = $set_id;
620 :     $content = { map { $_ => [] } @typeList };
621 :     }
622 :     # Put this ID in this group.
623 :     push @{$content->{$type}}, $protein_id;
624 :     }
625 :     # Close up the output file.
626 :     Trace("Corresponding ID file created as $name.") if T(2);
627 :     close $oh;
628 :     }
629 :    
630 :    
631 : parrello 1.1 =head3 WriteAlias
632 :    
633 :     WriteAlias($oh, $alias, $conf, $type, $fid);
634 :    
635 :     Write an alias record to the sort file. The alias record
636 :     contains the alias ID, the alias type, its confidence level, and the
637 :     corresponding feature ID.
638 :    
639 :     =over 4
640 :    
641 :     =item oh
642 :    
643 :     Open handle for the output file.
644 :    
645 :     =item alias
646 :    
647 :     Alias identifier.
648 :    
649 :     =item conf
650 :    
651 :     Confidence grade for this alias: C<A> is best, C<F> is worst.
652 :    
653 :     =item type
654 :    
655 :     Type of alias (e.g. NCBI, CMR, RefSeq).
656 :    
657 :     =item fid
658 :    
659 :     %FIG{FIG ID} corresponding to the alias.
660 :    
661 :     =back
662 :    
663 :     =cut
664 :    
665 :     sub WriteAlias {
666 :     # Get the parameters.
667 :     my ($oh, $alias, $conf, $type, $fid) = @_;
668 :     # Compute the genome ID.
669 :     my $genomeID = FIGRules::ParseFeatureID($fid);
670 :     # Write this alias to the output file.
671 :     Tracer::PutLine($oh, [$fid, $alias, $type, $conf]);
672 :     $stats->Add(aliasOut => 1);
673 :     $stats->Add("aliasOut$type" => 1);
674 :     }
675 :    
676 :     =head3 WriteToMerge
677 :    
678 :     WriteToMerge($mergeH, $alias, $grade => $aliasType, $fid);
679 :    
680 :     Write an alias connection to the output merge file.
681 :    
682 :     =over 4
683 :    
684 :     =item mergeH
685 :    
686 :     Open output handle for the merge file.
687 :    
688 :     =item alias
689 :    
690 :     Alias identifier.
691 :    
692 :     =item conf
693 :    
694 :     Confidence grade for this alias: C<A> is best, C<F> is worst.
695 :    
696 :     =item type
697 :    
698 :     Type of alias (e.g. NCBI, CMR, RefSeq).
699 :    
700 :     =item fid
701 :    
702 :     %FIG{FIG ID} corresponding to the alias.
703 :    
704 :     =back
705 :    
706 :     =cut
707 :    
708 :     sub WriteToMerge {
709 :     # Get the parameters.
710 :     my ($mergeH, $alias, $grade, $aliasType, $fid) = @_;
711 :     # Write the merge file record.
712 :     Tracer::PutLine($mergeH, [$alias, $grade, $aliasType, $fid]);
713 :     $stats->Add(mergeOut => 1);
714 :     }
715 :    
716 :    
717 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3