[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Diff of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Apr 2 01:28:49 2009 UTC revision 1.7, Tue Dec 14 19:47:40 2010 UTC
# Line 163  Line 163 
163                  $stats->Add(filesCleared => 1);                  $stats->Add(filesCleared => 1);
164              }              }
165          }          }
166            # Create the corresponding-ID file.
167            CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
168          # Open the merge file. Each record of the merge file will contain (1) a          # Open the merge file. Each record of the merge file will contain (1) a
169          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
170          # and (4) a feature ID. The merge file is then read back so that we can          # and (4) a feature ID. The merge file is then read back so that we can
# Line 171  Line 173 
173          Trace("Creating merge file $mfileName.") if T(2);          Trace("Creating merge file $mfileName.") if T(2);
174          my $mergeH = Open(undef, "| sort -u >$mfileName");          my $mergeH = Open(undef, "| sort -u >$mfileName");
175          # Now read in the three sources of data.          # Now read in the three sources of data.
176          ReadCorrespondingIDs($mergeH, $stats);          ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
177          ReadOrganismIDs($mergeH, $stats);          ReadOrganismIDs($mergeH, $stats);
178          ReadSynonyms($mergeH, $stats);          ReadSynonyms($mergeH, $stats);
179          # Close the merge file and reopen it for input.          # Close the merge file and reopen it for input.
# Line 294  Line 296 
296      # Get the parameters.      # Get the parameters.
297      my ($mergeH, $stats) = @_;      my ($mergeH, $stats) = @_;
298      # Loop through the organism directories.      # Loop through the organism directories.
299      for my $orgDir (grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {      for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
300          Trace("Processing $orgDir.") if T(3);          Trace("Processing $orgDir.") if T(3);
301          $stats->Add(orgDirGenomes => 1);          $stats->Add(orgDirGenomes => 1);
302          # We need to process all of this organism's TBL files.          # We need to process all of this organism's TBL files.
# Line 308  Line 310 
310                  my $ih = Open(undef, "<$tblFileName");                  my $ih = Open(undef, "<$tblFileName");
311                  while (! eof $ih) {                  while (! eof $ih) {
312                      # Get the feature ID and its aliases.                      # Get the feature ID and its aliases.
313                      my ($fid, undef, undef, undef, @aliases) = Tracer::GetLine($ih);                      my ($fid, undef, @aliases) = Tracer::GetLine($ih);
314                      $stats->Add(orgDirFeatures => 1);                      $stats->Add(orgDirFeatures => 1);
315                      # Loop through the aliases.                      # Loop through the aliases.
316                      for my $alias (@aliases) {                      for my $alias (@aliases) {
# Line 321  Line 323 
323                              $stats->Add(orgDirNormal => 1);                              $stats->Add(orgDirNormal => 1);
324                              # Yes. Write it normally.                              # Yes. Write it normally.
325                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
326                            } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
327                                # No, but this is a specially-marked locus tag.
328                                $normalized = $1;
329                                $stats->Add(orgDirLocus => 1);
330                                WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
331                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
332                              # No, but this is a natural locus tag.                              # No, but this is a natural locus tag.
333                              $stats->Add(orgDirLocus => 1);                              $stats->Add(orgDirLocus => 1);
# Line 334  Line 341 
341                              # of some sort. We only take these from the corresponding ID                              # of some sort. We only take these from the corresponding ID
342                              # table.                              # table.
343                              $stats->Add(orgDirSkip => 1);                              $stats->Add(orgDirSkip => 1);
344                            } elsif ($alias =~ /^protein_id\|(.+)/) {
345                                # Here we have a REFSEQ protein ID.
346                                $normalized = $1;
347                                $stats->Add(orgDirProtein => 1);
348                                WriteToMerge($mergeH, $normalized, C => 'RefSeq', $fid);
349                          } elsif ($alias =~ /[:|]/) {                          } elsif ($alias =~ /[:|]/) {
350                              # Here it's an alias of an unknown type.                              # Here it's an alias of an unknown type.
351                              $stats->Add(orgDirUnknown => 1);                              $stats->Add(orgDirUnknown => 1);
352                          } else {                          } else {
353                              # Here it's a miscellaneous type.                              # Here it's a miscellaneous type.
354                                $stats->Add(orgDirMisc => 1);
355                              WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);                              WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);
356                          }                          }
357                      }                      }
# Line 431  Line 444 
444    
445  =head3 ReadCorrespondingIDs  =head3 ReadCorrespondingIDs
446    
447      ReadCorrespondingIDs($mergeH, $stats);      ReadCorrespondingIDs($mergeH, $stats, $name);
448    
449  Read all the data from the corresponding ID file and output it to the  Read all the data from the corresponding ID table and output it to the
450  merge file. Corresponding IDs have the highest confidence level (C<A>).  merge file. Corresponding IDs have the highest confidence level (C<A>).
451    
452  =over 4  =over 4
# Line 448  Line 461 
461    
462  Statistics object for tracking this operation.  Statistics object for tracking this operation.
463    
464    =item name
465    
466    Name of the corresponding ID file.
467    
468  =back  =back
469    
470  =cut  =cut
471    
472  sub ReadCorrespondingIDs {  sub ReadCorrespondingIDs {
473      # Get the parameters.      # Get the parameters.
474      my ($mergeH, $stats) = @_;      my ($mergeH, $stats, $name) = @_;
475      # Open the corresponding-ID file.      # Open the corresponding-ID file.
476      my $ih = Open(undef, "<$FIG_Config::global/id_correspondence");      my $ih = Open(undef, "<$name");
477      Trace("Processing corresponding IDs.") if T(2);      Trace("Processing corresponding IDs.") if T(2);
478      # Read the header record.      # Read the header record.
479      my ($type0, @types) = Tracer::GetLine($ih);      my ($type0, @types) = Tracer::GetLine($ih);
# Line 485  Line 502 
502                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
503                          $stats->Add(correspondingContig => 1);                          $stats->Add(correspondingContig => 1);
504                      } else {                      } else {
505                          # We're okay, so normalize the alias.                          # Check for a locus tag disguised as a CMR ID.
506                          my $normalized = AliasAnalysis::Normalize($type => $alias);                          my $realType = $type;
507                            if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
508                                $realType = 'LocusTag';
509                                $stats->Add(correspondingLocus => 1);
510                            } else {
511                          $stats->Add(correspondingNormal => 1);                          $stats->Add(correspondingNormal => 1);
512                            }
513                            # Normalize the alias.
514                            my $normalized = AliasAnalysis::Normalize($type => $alias);
515                          # Write it out once for each FIG ID.                          # Write it out once for each FIG ID.
516                          for my $fid (@fids) {                          for my $fid (@fids) {
517                              WriteToMerge($mergeH, $normalized, A => $type, $fid);                              WriteToMerge($mergeH, $normalized, A => $type, $fid);
# Line 502  Line 526 
526      Trace("Corresponding IDs complete.") if T(2);      Trace("Corresponding IDs complete.") if T(2);
527  }  }
528    
529    =head3 CreateCorrespondingIdFile
530    
531        CreateCorrespondingIdFile($stats, $name);
532    
533    Create a corresponding-ID file from the data in the SEED database. The
534    outgoing file will contain a header record with the ID types followed by
535    a record for each ID group. Within a group, the field for a given ID type
536    will contain a semicolon-delimited list of the IDs of that type in the
537    group.
538    
539    When the SEED database goes away this method will need to be replaced.
540    
541    =over 4
542    
543    =item stats
544    
545    Statistics object to use for tracking progress.
546    
547    =item name
548    
549    Name to give to the corresponding-ID file.
550    
551    =back
552    
553    =cut
554    
555    sub CreateCorrespondingIdFile {
556        # Get the parameters.
557        my ($stats, $name) = @_;
558        # Get the FIG database.
559        require FIG;
560        my $fig = new FIG;
561        my $dbh = $fig->db_handle();
562        # Open the output file.
563        my $oh = Open(undef, ">$name");
564        Trace("Creating header for corresponding ID file.") if T(2);
565        # Create the header record from the id types table.
566        my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
567        my @typeList = sort keys %types;
568        my @header = map { $types{$_} } @typeList;
569        Trace("Header is " . join(" ", @header) . ".") if T(3);
570        Tracer::PutLine($oh, \@header);
571        # Now we loop through the id correspondence table, creating groups (sets). We use
572        # an SQL statement for this.
573        my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
574        my $rc = $sth->execute();
575        if (! $rc) {
576            Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
577        }
578        # These variables contain the ID and content of the current group.
579        my ($set, $content) = (-1, undef);
580        # These variables will hold the fields from the current record.
581        my ($set_id, $protein_id, $type);
582        # This flag will be set to TRUE when we're done.
583        my $done = 0;
584        while (! $done) {
585            # Get the next record.
586            my $record = $sth->fetchrow_arrayref();
587            if (! defined $record) {
588                # No record, so we're done.
589                Trace("End of correspondence table found.") if T(3);
590                $done = 1;
591            } else {
592                # A record found, so we get its data.
593                ($set_id, $protein_id, $type) = @$record;
594                Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
595            }
596            # Is this a new group?
597            if ($done || $set_id != $set) {
598                # Yes. If the old group has content, we write it out. Each field is
599                # formed by joining the IDs for that type into a string using semicolons.
600                if (defined $content) {
601                    my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
602                    Tracer::PutLine($oh, \@typeStrings);
603                }
604                # Check for an error in the sort.
605                if ($set > $set_id) {
606                    Confess("Invalid set order in id_correspondence table: $set to $set_id.");
607                }
608                # Now start the new group.
609                $set = $set_id;
610                $content = { map { $_ => [] } @typeList };
611            }
612            # Put this ID in this group.
613            push @{$content->{$type}}, $protein_id;
614        }
615        # Close up the output file.
616        Trace("Corresponding ID file created as $name.") if T(2);
617        close $oh;
618    }
619    
620    
621  =head3 WriteAlias  =head3 WriteAlias
622    
623      WriteAlias($oh, $alias, $conf, $type, $fid);      WriteAlias($oh, $alias, $conf, $type, $fid);

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3