[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Diff of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Apr 2 01:28:49 2009 UTC revision 1.8, Wed Feb 2 15:43:10 2011 UTC
# Line 163  Line 163 
163                  $stats->Add(filesCleared => 1);                  $stats->Add(filesCleared => 1);
164              }              }
165          }          }
166            # Create the corresponding-ID file.
167            CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
168          # Open the merge file. Each record of the merge file will contain (1) a          # Open the merge file. Each record of the merge file will contain (1) a
169          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
170          # and (4) a feature ID. The merge file is then read back so that we can          # and (4) a feature ID. The merge file is then read back so that we can
# Line 171  Line 173 
173          Trace("Creating merge file $mfileName.") if T(2);          Trace("Creating merge file $mfileName.") if T(2);
174          my $mergeH = Open(undef, "| sort -u >$mfileName");          my $mergeH = Open(undef, "| sort -u >$mfileName");
175          # Now read in the three sources of data.          # Now read in the three sources of data.
176          ReadCorrespondingIDs($mergeH, $stats);          ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
177          ReadOrganismIDs($mergeH, $stats);          ReadOrganismIDs($mergeH, $stats);
178          ReadSynonyms($mergeH, $stats);          ReadSynonyms($mergeH, $stats);
179          # Close the merge file and reopen it for input.          # Close the merge file and reopen it for input.
# Line 294  Line 296 
296      # Get the parameters.      # Get the parameters.
297      my ($mergeH, $stats) = @_;      my ($mergeH, $stats) = @_;
298      # Loop through the organism directories.      # Loop through the organism directories.
299      for my $orgDir (grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {      for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
300          Trace("Processing $orgDir.") if T(3);          Trace("Processing $orgDir.") if T(3);
301          $stats->Add(orgDirGenomes => 1);          $stats->Add(orgDirGenomes => 1);
302          # We need to process all of this organism's TBL files.          # We need to process all of this organism's TBL files.
303          my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";          my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";
304            if (! -d $orgDirDir) {
305                Trace("No feature directory found for $orgDir.") if T(1);
306                $stats->Add(orgDirMissing => 1);
307            } else {
308          for my $ftype (OpenDir($orgDirDir, 1)) {          for my $ftype (OpenDir($orgDirDir, 1)) {
309              my $tblFileName = "$orgDirDir/$ftype/tbl";              my $tblFileName = "$orgDirDir/$ftype/tbl";
310              if (-s $tblFileName) {              if (-s $tblFileName) {
# Line 308  Line 314 
314                  my $ih = Open(undef, "<$tblFileName");                  my $ih = Open(undef, "<$tblFileName");
315                  while (! eof $ih) {                  while (! eof $ih) {
316                      # Get the feature ID and its aliases.                      # Get the feature ID and its aliases.
317                      my ($fid, undef, undef, undef, @aliases) = Tracer::GetLine($ih);                          my ($fid, undef, @aliases) = Tracer::GetLine($ih);
318                      $stats->Add(orgDirFeatures => 1);                      $stats->Add(orgDirFeatures => 1);
319                      # Loop through the aliases.                      # Loop through the aliases.
320                      for my $alias (@aliases) {                      for my $alias (@aliases) {
# Line 321  Line 327 
327                              $stats->Add(orgDirNormal => 1);                              $stats->Add(orgDirNormal => 1);
328                              # Yes. Write it normally.                              # Yes. Write it normally.
329                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
330                                } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
331                                    # No, but this is a specially-marked locus tag.
332                                    $normalized = $1;
333                                    $stats->Add(orgDirLocus => 1);
334                                    WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
335                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
336                              # No, but this is a natural locus tag.                              # No, but this is a natural locus tag.
337                              $stats->Add(orgDirLocus => 1);                              $stats->Add(orgDirLocus => 1);
# Line 334  Line 345 
345                              # of some sort. We only take these from the corresponding ID                              # of some sort. We only take these from the corresponding ID
346                              # table.                              # table.
347                              $stats->Add(orgDirSkip => 1);                              $stats->Add(orgDirSkip => 1);
348                                } elsif ($alias =~ /^protein_id\|(.+)/) {
349                                    # Here we have a REFSEQ protein ID.
350                                    $normalized = $1;
351                                    $stats->Add(orgDirProtein => 1);
352                                    WriteToMerge($mergeH, $normalized, C => 'RefSeq', $fid);
353                          } elsif ($alias =~ /[:|]/) {                          } elsif ($alias =~ /[:|]/) {
354                              # Here it's an alias of an unknown type.                              # Here it's an alias of an unknown type.
355                              $stats->Add(orgDirUnknown => 1);                              $stats->Add(orgDirUnknown => 1);
356                          } else {                          } else {
357                              # Here it's a miscellaneous type.                              # Here it's a miscellaneous type.
358                                    $stats->Add(orgDirMisc => 1);
359                              WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);                              WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);
360                          }                          }
361                      }                      }
# Line 346  Line 363 
363              }              }
364          }          }
365      }      }
366        }
367      Trace("Organism directories complete.") if T(2);      Trace("Organism directories complete.") if T(2);
368  }  }
369    
# Line 431  Line 449 
449    
450  =head3 ReadCorrespondingIDs  =head3 ReadCorrespondingIDs
451    
452      ReadCorrespondingIDs($mergeH, $stats);      ReadCorrespondingIDs($mergeH, $stats, $name);
453    
454  Read all the data from the corresponding ID file and output it to the  Read all the data from the corresponding ID table and output it to the
455  merge file. Corresponding IDs have the highest confidence level (C<A>).  merge file. Corresponding IDs have the highest confidence level (C<A>).
456    
457  =over 4  =over 4
# Line 448  Line 466 
466    
467  Statistics object for tracking this operation.  Statistics object for tracking this operation.
468    
469    =item name
470    
471    Name of the corresponding ID file.
472    
473  =back  =back
474    
475  =cut  =cut
476    
477  sub ReadCorrespondingIDs {  sub ReadCorrespondingIDs {
478      # Get the parameters.      # Get the parameters.
479      my ($mergeH, $stats) = @_;      my ($mergeH, $stats, $name) = @_;
480      # Open the corresponding-ID file.      # Open the corresponding-ID file.
481      my $ih = Open(undef, "<$FIG_Config::global/id_correspondence");      my $ih = Open(undef, "<$name");
482      Trace("Processing corresponding IDs.") if T(2);      Trace("Processing corresponding IDs.") if T(2);
483      # Read the header record.      # Read the header record.
484      my ($type0, @types) = Tracer::GetLine($ih);      my ($type0, @types) = Tracer::GetLine($ih);
# Line 485  Line 507 
507                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
508                          $stats->Add(correspondingContig => 1);                          $stats->Add(correspondingContig => 1);
509                      } else {                      } else {
510                          # We're okay, so normalize the alias.                          # Check for a locus tag disguised as a CMR ID.
511                          my $normalized = AliasAnalysis::Normalize($type => $alias);                          my $realType = $type;
512                            if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
513                                $realType = 'LocusTag';
514                                $stats->Add(correspondingLocus => 1);
515                            } else {
516                          $stats->Add(correspondingNormal => 1);                          $stats->Add(correspondingNormal => 1);
517                            }
518                            # Normalize the alias.
519                            my $normalized = AliasAnalysis::Normalize($type => $alias);
520                          # Write it out once for each FIG ID.                          # Write it out once for each FIG ID.
521                          for my $fid (@fids) {                          for my $fid (@fids) {
522                              WriteToMerge($mergeH, $normalized, A => $type, $fid);                              WriteToMerge($mergeH, $normalized, A => $type, $fid);
# Line 502  Line 531 
531      Trace("Corresponding IDs complete.") if T(2);      Trace("Corresponding IDs complete.") if T(2);
532  }  }
533    
534    =head3 CreateCorrespondingIdFile
535    
536        CreateCorrespondingIdFile($stats, $name);
537    
538    Create a corresponding-ID file from the data in the SEED database. The
539    outgoing file will contain a header record with the ID types followed by
540    a record for each ID group. Within a group, the field for a given ID type
541    will contain a semicolon-delimited list of the IDs of that type in the
542    group.
543    
544    When the SEED database goes away this method will need to be replaced.
545    
546    =over 4
547    
548    =item stats
549    
550    Statistics object to use for tracking progress.
551    
552    =item name
553    
554    Name to give to the corresponding-ID file.
555    
556    =back
557    
558    =cut
559    
560    sub CreateCorrespondingIdFile {
561        # Get the parameters.
562        my ($stats, $name) = @_;
563        # Get the FIG database.
564        require FIG;
565        my $fig = new FIG;
566        my $dbh = $fig->db_handle();
567        # Open the output file.
568        my $oh = Open(undef, ">$name");
569        Trace("Creating header for corresponding ID file.") if T(2);
570        # Create the header record from the id types table.
571        my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
572        my @typeList = sort keys %types;
573        my @header = map { $types{$_} } @typeList;
574        Trace("Header is " . join(" ", @header) . ".") if T(3);
575        Tracer::PutLine($oh, \@header);
576        # Now we loop through the id correspondence table, creating groups (sets). We use
577        # an SQL statement for this.
578        my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
579        my $rc = $sth->execute();
580        if (! $rc) {
581            Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
582        }
583        # These variables contain the ID and content of the current group.
584        my ($set, $content) = (-1, undef);
585        # These variables will hold the fields from the current record.
586        my ($set_id, $protein_id, $type);
587        # This flag will be set to TRUE when we're done.
588        my $done = 0;
589        while (! $done) {
590            # Get the next record.
591            my $record = $sth->fetchrow_arrayref();
592            if (! defined $record) {
593                # No record, so we're done.
594                Trace("End of correspondence table found.") if T(3);
595                $done = 1;
596            } else {
597                # A record found, so we get its data.
598                ($set_id, $protein_id, $type) = @$record;
599                Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
600            }
601            # Is this a new group?
602            if ($done || $set_id != $set) {
603                # Yes. If the old group has content, we write it out. Each field is
604                # formed by joining the IDs for that type into a string using semicolons.
605                if (defined $content) {
606                    my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
607                    Tracer::PutLine($oh, \@typeStrings);
608                }
609                # Check for an error in the sort.
610                if ($set > $set_id) {
611                    Confess("Invalid set order in id_correspondence table: $set to $set_id.");
612                }
613                # Now start the new group.
614                $set = $set_id;
615                $content = { map { $_ => [] } @typeList };
616            }
617            # Put this ID in this group.
618            push @{$content->{$type}}, $protein_id;
619        }
620        # Close up the output file.
621        Trace("Corresponding ID file created as $name.") if T(2);
622        close $oh;
623    }
624    
625    
626  =head3 WriteAlias  =head3 WriteAlias
627    
628      WriteAlias($oh, $alias, $conf, $type, $fid);      WriteAlias($oh, $alias, $conf, $type, $fid);

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3