[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Diff of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Apr 2 01:28:49 2009 UTC revision 1.5, Fri Nov 5 16:51:17 2010 UTC
# Line 163  Line 163 
163                  $stats->Add(filesCleared => 1);                  $stats->Add(filesCleared => 1);
164              }              }
165          }          }
166            # Create the corresponding-ID file.
167            CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
168          # Open the merge file. Each record of the merge file will contain (1) a          # Open the merge file. Each record of the merge file will contain (1) a
169          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
170          # and (4) a feature ID. The merge file is then read back so that we can          # and (4) a feature ID. The merge file is then read back so that we can
# Line 171  Line 173 
173          Trace("Creating merge file $mfileName.") if T(2);          Trace("Creating merge file $mfileName.") if T(2);
174          my $mergeH = Open(undef, "| sort -u >$mfileName");          my $mergeH = Open(undef, "| sort -u >$mfileName");
175          # Now read in the three sources of data.          # Now read in the three sources of data.
176          ReadCorrespondingIDs($mergeH, $stats);          ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
177          ReadOrganismIDs($mergeH, $stats);          ReadOrganismIDs($mergeH, $stats);
178          ReadSynonyms($mergeH, $stats);          ReadSynonyms($mergeH, $stats);
179          # Close the merge file and reopen it for input.          # Close the merge file and reopen it for input.
# Line 294  Line 296 
296      # Get the parameters.      # Get the parameters.
297      my ($mergeH, $stats) = @_;      my ($mergeH, $stats) = @_;
298      # Loop through the organism directories.      # Loop through the organism directories.
299      for my $orgDir (grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {      for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
300          Trace("Processing $orgDir.") if T(3);          Trace("Processing $orgDir.") if T(3);
301          $stats->Add(orgDirGenomes => 1);          $stats->Add(orgDirGenomes => 1);
302          # We need to process all of this organism's TBL files.          # We need to process all of this organism's TBL files.
# Line 308  Line 310 
310                  my $ih = Open(undef, "<$tblFileName");                  my $ih = Open(undef, "<$tblFileName");
311                  while (! eof $ih) {                  while (! eof $ih) {
312                      # Get the feature ID and its aliases.                      # Get the feature ID and its aliases.
313                      my ($fid, undef, undef, undef, @aliases) = Tracer::GetLine($ih);                      my ($fid, undef, @aliases) = Tracer::GetLine($ih);
314                      $stats->Add(orgDirFeatures => 1);                      $stats->Add(orgDirFeatures => 1);
315                      # Loop through the aliases.                      # Loop through the aliases.
316                      for my $alias (@aliases) {                      for my $alias (@aliases) {
# Line 321  Line 323 
323                              $stats->Add(orgDirNormal => 1);                              $stats->Add(orgDirNormal => 1);
324                              # Yes. Write it normally.                              # Yes. Write it normally.
325                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);                              WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
326                            } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^locus\|(.+)/) {
327                                # No, but this is a specially-marked locus tag.
328                                $normalized = $1;
329                                $stats->Add(orgDirLocus => 1);
330                                WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
331                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {                          } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
332                              # No, but this is a natural locus tag.                              # No, but this is a natural locus tag.
333                              $stats->Add(orgDirLocus => 1);                              $stats->Add(orgDirLocus => 1);
# Line 334  Line 341 
341                              # of some sort. We only take these from the corresponding ID                              # of some sort. We only take these from the corresponding ID
342                              # table.                              # table.
343                              $stats->Add(orgDirSkip => 1);                              $stats->Add(orgDirSkip => 1);
344                            } elsif ($alias =~ /^protein_id\|(.+)/) {
345                                # Here we have a REFSEQ protein ID.
346                                $normalized = $1;
347                                $stats->Add(orgDirProtein => 1);
348                                WriteToMerge($mergeH, $normalized, C => 'RefSeq', $fid);
349                          } elsif ($alias =~ /[:|]/) {                          } elsif ($alias =~ /[:|]/) {
350                              # Here it's an alias of an unknown type.                              # Here it's an alias of an unknown type.
351                              $stats->Add(orgDirUnknown => 1);                              $stats->Add(orgDirUnknown => 1);
# Line 431  Line 443 
443    
444  =head3 ReadCorrespondingIDs  =head3 ReadCorrespondingIDs
445    
446      ReadCorrespondingIDs($mergeH, $stats);      ReadCorrespondingIDs($mergeH, $stats, $name);
447    
448  Read all the data from the corresponding ID file and output it to the  Read all the data from the corresponding ID table and output it to the
449  merge file. Corresponding IDs have the highest confidence level (C<A>).  merge file. Corresponding IDs have the highest confidence level (C<A>).
450    
451  =over 4  =over 4
# Line 448  Line 460 
460    
461  Statistics object for tracking this operation.  Statistics object for tracking this operation.
462    
463    =item name
464    
465    Name of the corresponding ID file.
466    
467  =back  =back
468    
469  =cut  =cut
470    
471  sub ReadCorrespondingIDs {  sub ReadCorrespondingIDs {
472      # Get the parameters.      # Get the parameters.
473      my ($mergeH, $stats) = @_;      my ($mergeH, $stats, $name) = @_;
474      # Open the corresponding-ID file.      # Open the corresponding-ID file.
475      my $ih = Open(undef, "<$FIG_Config::global/id_correspondence");      my $ih = Open(undef, "<$name");
476      Trace("Processing corresponding IDs.") if T(2);      Trace("Processing corresponding IDs.") if T(2);
477      # Read the header record.      # Read the header record.
478      my ($type0, @types) = Tracer::GetLine($ih);      my ($type0, @types) = Tracer::GetLine($ih);
# Line 485  Line 501 
501                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {                      if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
502                          $stats->Add(correspondingContig => 1);                          $stats->Add(correspondingContig => 1);
503                      } else {                      } else {
504                          # We're okay, so normalize the alias.                          # Check for a locus tag disguised as a CMR ID.
505                          my $normalized = AliasAnalysis::Normalize($type => $alias);                          my $realType = $type;
506                            if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
507                                $realType = 'LocusTag';
508                                $stats->Add(correspondingLocus => 1);
509                            } else {
510                          $stats->Add(correspondingNormal => 1);                          $stats->Add(correspondingNormal => 1);
511                            }
512                            # Normalize the alias.
513                            my $normalized = AliasAnalysis::Normalize($type => $alias);
514                          # Write it out once for each FIG ID.                          # Write it out once for each FIG ID.
515                          for my $fid (@fids) {                          for my $fid (@fids) {
516                              WriteToMerge($mergeH, $normalized, A => $type, $fid);                              WriteToMerge($mergeH, $normalized, A => $type, $fid);
# Line 502  Line 525 
525      Trace("Corresponding IDs complete.") if T(2);      Trace("Corresponding IDs complete.") if T(2);
526  }  }
527    
528    =head3 CreateCorrespondingIdFile
529    
530        CreateCorrespondingIdFile($stats, $name);
531    
532    Create a corresponding-ID file from the data in the SEED database. The
533    outgoing file will contain a header record with the ID types followed by
534    a record for each ID group. Within a group, the field for a given ID type
535    will contain a semicolon-delimited list of the IDs of that type in the
536    group.
537    
538    When the SEED database goes away this method will need to be replaced.
539    
540    =over 4
541    
542    =item stats
543    
544    Statistics object to use for tracking progress.
545    
546    =item name
547    
548    Name to give to the corresponding-ID file.
549    
550    =back
551    
552    =cut
553    
554    sub CreateCorrespondingIdFile {
555        # Get the parameters.
556        my ($stats, $name) = @_;
557        # Get the FIG database.
558        require FIG;
559        my $fig = new FIG;
560        my $dbh = $fig->db_handle();
561        # Open the output file.
562        my $oh = Open(undef, ">$name");
563        Trace("Creating header for corresponding ID file.") if T(2);
564        # Create the header record from the id types table.
565        my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
566        my @typeList = sort keys %types;
567        my @header = map { $types{$_} } @typeList;
568        Trace("Header is " . join(" ", @header) . ".") if T(3);
569        Tracer::PutLine($oh, \@header);
570        # Now we loop through the id correspondence table, creating groups (sets). We use
571        # an SQL statement for this.
572        my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
573        my $rc = $sth->execute();
574        if (! $rc) {
575            Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
576        }
577        # These variables contain the ID and content of the current group.
578        my ($set, $content) = (-1, undef);
579        # These variables will hold the fields from the current record.
580        my ($set_id, $protein_id, $type);
581        # This flag will be set to TRUE when we're done.
582        my $done = 0;
583        while (! $done) {
584            # Get the next record.
585            my $record = $sth->fetchrow_arrayref();
586            if (! defined $record) {
587                # No record, so we're done.
588                Trace("End of correspondence table found.") if T(3);
589                $done = 1;
590            } else {
591                # A record found, so we get its data.
592                ($set_id, $protein_id, $type) = @$record;
593                Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
594            }
595            # Is this a new group?
596            if ($done || $set_id != $set) {
597                # Yes. If the old group has content, we write it out. Each field is
598                # formed by joining the IDs for that type into a string using semicolons.
599                if (defined $content) {
600                    my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
601                    Tracer::PutLine($oh, \@typeStrings);
602                }
603                # Check for an error in the sort.
604                if ($set > $set_id) {
605                    Confess("Invalid set order in id_correspondence table: $set to $set_id.");
606                }
607                # Now start the new group.
608                $set = $set_id;
609                $content = { map { $_ => [] } @typeList };
610            }
611            # Put this ID in this group.
612            push @{$content->{$type}}, $protein_id;
613        }
614        # Close up the output file.
615        Trace("Corresponding ID file created as $name.") if T(2);
616        close $oh;
617    }
618    
619    
620  =head3 WriteAlias  =head3 WriteAlias
621    
622      WriteAlias($oh, $alias, $conf, $type, $fid);      WriteAlias($oh, $alias, $conf, $type, $fid);

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3