[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

Diff of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Thu Apr 2 01:28:49 2009 UTC revision 1.2, Wed Aug 19 17:07:54 2009 UTC
# Line 163  Line 163 
163                  $stats->Add(filesCleared => 1);                  $stats->Add(filesCleared => 1);
164              }              }
165          }          }
166            # Create the corresponding-ID file.
167            CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
168          # Open the merge file. Each record of the merge file will contain (1) a          # Open the merge file. Each record of the merge file will contain (1) a
169          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,          # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
170          # and (4) a feature ID. The merge file is then read back so that we can          # and (4) a feature ID. The merge file is then read back so that we can
# Line 171  Line 173 
173          Trace("Creating merge file $mfileName.") if T(2);          Trace("Creating merge file $mfileName.") if T(2);
174          my $mergeH = Open(undef, "| sort -u >$mfileName");          my $mergeH = Open(undef, "| sort -u >$mfileName");
175          # Now read in the three sources of data.          # Now read in the three sources of data.
176          ReadCorrespondingIDs($mergeH, $stats);          ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
177          ReadOrganismIDs($mergeH, $stats);          ReadOrganismIDs($mergeH, $stats);
178          ReadSynonyms($mergeH, $stats);          ReadSynonyms($mergeH, $stats);
179          # Close the merge file and reopen it for input.          # Close the merge file and reopen it for input.
# Line 431  Line 433 
433    
434  =head3 ReadCorrespondingIDs  =head3 ReadCorrespondingIDs
435    
436      ReadCorrespondingIDs($mergeH, $stats);      ReadCorrespondingIDs($mergeH, $stats, $name);
437    
438  Read all the data from the corresponding ID file and output it to the  Read all the data from the corresponding ID table and output it to the
439  merge file. Corresponding IDs have the highest confidence level (C<A>).  merge file. Corresponding IDs have the highest confidence level (C<A>).
440    
441  =over 4  =over 4
# Line 448  Line 450 
450    
451  Statistics object for tracking this operation.  Statistics object for tracking this operation.
452    
453    =item name
454    
455    Name of the corresponding ID file.
456    
457  =back  =back
458    
459  =cut  =cut
460    
461  sub ReadCorrespondingIDs {  sub ReadCorrespondingIDs {
462      # Get the parameters.      # Get the parameters.
463      my ($mergeH, $stats) = @_;      my ($mergeH, $stats, $name) = @_;
464      # Open the corresponding-ID file.      # Open the corresponding-ID file.
465      my $ih = Open(undef, "<$FIG_Config::global/id_correspondence");      my $ih = Open(undef, "<$name");
466      Trace("Processing corresponding IDs.") if T(2);      Trace("Processing corresponding IDs.") if T(2);
467      # Read the header record.      # Read the header record.
468      my ($type0, @types) = Tracer::GetLine($ih);      my ($type0, @types) = Tracer::GetLine($ih);
# Line 502  Line 508 
508      Trace("Corresponding IDs complete.") if T(2);      Trace("Corresponding IDs complete.") if T(2);
509  }  }
510    
511    =head3 CreateCorrespondingIdFile
512    
513        CreateCorrespondingIdFile($stats, $name);
514    
515    Create a corresponding-ID file from the data in the SEED database. The
516    outgoing file will contain a header record with the ID types followed by
517    a record for each ID group. Within a group, the field for a given ID type
518    will contain a semicolon-delimited list of the IDs of that type in the
519    group.
520    
521    When the SEED database goes away this method will need to be replaced.
522    
523    =over 4
524    
525    =item stats
526    
527    Statistics object to use for tracking progress.
528    
529    =item name
530    
531    Name to give to the corresponding-ID file.
532    
533    =back
534    
535    =cut
536    
sub CreateCorrespondingIdFile {
    # Build the corresponding-ID file: one header record naming the ID types,
    # then one record per ID group, with one field per type holding that
    # group's IDs of that type joined by "; ".
    #
    # Parameters:
    #   $stats  statistics object used for progress tracking
    #   $name   name to give to the output file
    my ($stats, $name) = @_;
    # Get the FIG database. FIG is pulled in at run time via require.
    require FIG;
    my $fig = FIG->new();
    my $dbh = $fig->db_handle();
    # Open the output file.
    my $oh = Open(undef, ">$name");
    Trace("Creating header for corresponding ID file.") if T(2);
    # Create the header record from the id types table. %types maps each
    # type ID to its name; @typeList fixes the column order used for both
    # the header and every data record.
    my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
    my @typeList = sort keys %types;
    my @header = map { $types{$_} } @typeList;
    Trace("Header is " . join(" ", @header) . ".") if T(3);
    Tracer::PutLine($oh, \@header);
    # Now we loop through the id correspondence table, creating groups (sets).
    # The grouping logic below requires all rows of a set to be adjacent, so
    # we ask the database for that order explicitly instead of relying on
    # whatever order the rows happen to come back in.
    my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence ORDER BY set_id");
    my $rc = $sth->execute();
    if (! $rc) {
        Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
    }
    # These variables contain the ID and content of the current group. Both
    # are undefined until the first record has been read.
    my ($set, $content);
    # This helper writes the current group. Each field is formed by joining
    # the IDs for that type into a string using semicolons.
    my $writeGroup = sub {
        my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
        Tracer::PutLine($oh, \@typeStrings);
    };
    while (my $record = $sth->fetchrow_arrayref()) {
        # Copy the fields out of the record immediately.
        my ($set_id, $protein_id, $type) = @$record;
        Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
        # Is this a new group?
        if (! defined $set || $set_id != $set) {
            # Yes. If there is an old group, we write it out.
            $writeGroup->() if defined $content;
            # Check for an error in the sort. This should not happen given
            # the ORDER BY above, but it is cheap insurance.
            if (defined $set && $set > $set_id) {
                Confess("Invalid set order in id_correspondence table: $set to $set_id.");
            }
            # Now start the new group with an empty list for every known type.
            $set = $set_id;
            $content = { map { $_ => [] } @typeList };
        }
        # An ID whose type is not in the types table has no output column.
        # Count it instead of silently storing it where it is never written.
        if (! defined $type || ! exists $content->{$type}) {
            $stats->Add(corrTableUnknownType => 1);
            next;
        }
        # Put this ID in this group.
        push @{$content->{$type}}, $protein_id;
    }
    Trace("End of correspondence table found.") if T(3);
    # Write the final group, if the table had any records at all.
    $writeGroup->() if defined $content;
    # Close up the output file. Buffered write errors surface at close, so
    # a failure here means the file is incomplete.
    Trace("Corresponding ID file created as $name.") if T(2);
    close $oh or Confess("Error closing corresponding ID file $name: $!");
}
601    
602    
603  =head3 WriteAlias  =head3 WriteAlias
604    
605      WriteAlias($oh, $alias, $conf, $type, $fid);      WriteAlias($oh, $alias, $conf, $type, $fid);

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3