[Bio] / Sprout / AliasCrunch.pl Repository:
ViewVC logotype

View of /Sprout/AliasCrunch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Sun Feb 13 13:02:30 2011 UTC (8 years ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Changes since 1.9: +6 -1 lines
Inverted some relationships to improve performance.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use Stats;
use AliasAnalysis;
use FIGRules;


=head1 AliasCrunch Script

=head2 Introduction

    AliasCrunch [options] 

This script finds all the aliases scattered throughout the SEED and puts them in
flat files organized by genome ID. The files are currently stored in the Sprout
data directory.

The aliases come from three sources: (1) the peg.synonyms file, (2) the C<tbl>
files in the organism directories, and (3) the ID correspondence file. The three
sources have different methods for sorting the data, and the data in the
peg.synonyms file is considered secondary to the data in the other files because
it identifies proteins, not features.

Each of the three sources has its own special rules. The C<tbl> files may
contain aliases in their natural form as well as the normalized form. The
ID correspondence file contains only natural forms, but the types are determined
by the position of the alias in each record. In addition, some of the RefSeq
IDs in the correspondence file are actually contig IDs and have to be deleted.
The C<peg.synonyms> file and the ID correspondence may have records which do
not correspond to any FIG IDs. These are ignored.

The output alias files will be lexically sorted and have the following fields
on each line.

=over 4

=item 1

%FIG{FIG ID}% of the relevant feature

=item 2

Alias identifier

=item 3

Alias type

=item 4

Confidence grade: C<A> for curated, C<B> for uploaded, C<C> for peg synonym

=back

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item output

Directory in which the output files should be placed. The output files will all
have names of the form C<alias.>I<genome>C<.tbl>, where I<genome> is the genome
ID, and will be sorted by %FIG{FIG ID}%.

=item clear

Clears any existing alias files from the directory before processing.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item keepTemp

If specified, the intermediate temporary files will not be deleted when the
script is finished using them.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=back

=cut

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw() ],
                                           {
                                              trace => ["3", "tracing level"],
                                              output => ["$FIG_Config::saplingData/AliasData", "output directory for alias files"],
                                              clear => ["", "if specified, existing alias files will be erased"],
                                              keepTemp => ["", "if specified, the intermediate temporary files will not be deleted"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Create a statistics object.
my $stats = Stats->new();
# Compute the merge file name.
my $mfileName = "$FIG_Config::temp/mergeAC$$.tmp.tbl";
# This will contain a list of the temporary files to delete at end of run.
my @tempFiles = $mfileName;
# Insure we catch errors.
eval {
    # Compute the output directory.
    my $output = $options->{output};
    if (! $output) {
        Confess("No output directory specified.");
    } elsif (! -d $output) {
        Confess("Invalid output directory \"$output\".");
    } else {
        # Here we have a valid output directory. Do we need to clear it?
        if ($options->{clear}) {
            # Yes. Find and delete any existing files.
            Trace("Clearing old alias files from $output directory.") if T(3);
            for my $file (grep { $_ =~ /^alias\.[0-9.]+\.tbl$/ } OpenDir($output)) {
                unlink "$output/$file";
                $stats->Add(filesCleared => 1);
            }
        }
        # Create the corresponding-ID file.
        CreateCorrespondingIdFile($stats, "$output/id_corresponding.tbl");
        # Open the merge file. Each record of the merge file will contain (1) a
        # normalized alias, (2) a confidence grade (A, B, C), (3) an alias type,
        # and (4) a feature ID. The merge file is then read back so that we can
        # determine the list of features associated with each alias. Only the FIG IDs
        # with the best confidence (A over B, B over C) for an alias will be kept.
        Trace("Creating merge file $mfileName.") if T(2);
        my $mergeH = Open(undef, "| sort -u >$mfileName");
        # Now read in the three sources of data.
        ReadCorrespondingIDs($mergeH, $stats, "$output/id_corresponding.tbl");
        ReadOrganismIDs($mergeH, $stats);
        ReadSynonyms($mergeH, $stats);
        # Close the merge file and reopen it for input.
        close $mergeH;
        Trace("Processing merge file results.") if T(2);
        my $ih = Open(undef, "<$mfileName");
        # This file will be used to sort the aliases by feature ID.
        my $sortFileName = "$FIG_Config::temp/sortAC$$.tmp.tbl";
        push @tempFiles, $sortFileName;
        my $oh = Open(undef, "| sort >$sortFileName");
        # Now we set up the data we'll be accumulating for each alias.
        # "$prevAlias" is the current alias, "$prevConf" is its confidence.
        # We emit a row if we encounter a new alias or an old alias at the
        # same confidence level.
        my ($prevAlias, $prevConf);
        # Loop through the merge file.
        while (! eof $ih) {
            # Get this row of data.
            my ($alias, $conf, $type, $fid) = Tracer::GetLine($ih);
            Trace($stats->Ask('mergeFileRecords') . " merge file records read.") if $stats->Check(mergeFileRecords => 5000) && T(3);
            # Should we emit this alias?
            if ($alias ne $prevAlias) {
                # Yes. This is a new alias.
                $prevAlias = $alias;
                $prevConf = $conf;
                # Emit this alias.
                WriteAlias($oh, $alias, $conf, $type, $fid);
            } elsif ($conf eq $prevConf) {
                # Yes. This is an old alias at the same confidence level.
                WriteAlias($oh, $alias, $conf, $type, $fid);
            }
        }
        # Close the sort file.
        Trace("Closing sort file.") if T(2);
        close $oh;
        # This will contain the current genome ID.
        my $currGenome;
        # This will contain the current output file handle.
        my $gh;
        # Now read the sort file and split it up by genome ID.
        $ih = Open(undef, "<$sortFileName");
        while (! eof $ih) {
            # Get the next record.
            my ($fid, $alias, $type, $conf) = Tracer::GetLine($ih);
            # Compute the genome ID.
            my ($genomeID) = FIGRules::ParseFeatureID($fid);
            # Is it new?
            if ($genomeID ne $currGenome) {
                # Yes. Close the old file and start a new one.
                if (defined $gh) {
                    close $gh;
                }
                $gh = Open(undef, ">$options->{output}/alias.$genomeID.tbl");
                Trace("Genome file for $genomeID created.") if T(3);
                $currGenome = $genomeID;
            }
            # Write this record to the current output file.
            Tracer::PutLine($gh, [$fid, $alias, $type, $conf]);
        }
        # Close the current genome output file.
        close $gh;
    }
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Delete the temporary files (if any).
for my $fileName (@tempFiles) {
    if (-f $fileName) {
        if ($options->{keepTemp}) {
            Trace("Temporary file $fileName was not deleted.") if T(3);
        } else {
            Trace("Deleting temporary file $fileName.") if T(3);
            unlink $fileName;
        }
    }
}
# Display the statistics.
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "AliasCrunch terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head2 Utility Methods

=head3 ReadOrganismIDs

    ReadOrganismIDs($mergeH, $stats);

Read all the data from the organism directories and output it to the
merge file. Organism directory aliases have medium confidence level
(C<B>).

=over 4

=item mergeH

Open output handle for the merge file. Each record of the merge file should
contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
alias type, and (4) a feature ID.

=item stats

Statistics object for tracking this operation.

=back

=cut

sub ReadOrganismIDs {
    # Get the parameters.
    my ($mergeH, $stats) = @_;
    # Loop through the organism directories.
    for my $orgDir (sort grep { $_ =~ /^\d+\.\d+$/ } OpenDir($FIG_Config::organisms)) {
        Trace("Processing $orgDir.") if T(3);
        $stats->Add(orgDirGenomes => 1);
        # We need to process all of this organism's TBL files.
        my $orgDirDir = "$FIG_Config::organisms/$orgDir/Features";
        if (! -d $orgDirDir) {
            Trace("No feature directory found for $orgDir.") if T(1);
            $stats->Add(orgDirMissing => 1);
        } else {
            for my $ftype (OpenDir($orgDirDir, 1)) {
                my $tblFileName = "$orgDirDir/$ftype/tbl";
                if (-s $tblFileName) {
                    Trace("Data found in $tblFileName.") if T(3);
                    # Read this TBL file.
                    $stats->Add(orgDirFiles => 1);
                    my $ih = Open(undef, "<$tblFileName");
                    while (! eof $ih) {
                        # Get the feature ID and its aliases.
                        my ($fid, undef, @aliases) = Tracer::GetLine($ih);
                        $stats->Add(orgDirFeatures => 1);
                        # Loop through the aliases.
                        for my $alias (@aliases) {
                            my $normalized;
                            # Determine the type.
                            my $aliasType = AliasAnalysis::TypeOf($alias);
                            $stats->Add(orgDirAll => 1);
                            # Is this a recognized type?
                            if ($aliasType) {
                                $stats->Add(orgDirNormal => 1);
                                # Yes. Write it normally.
                                WriteToMerge($mergeH, $alias, B => $aliasType, $fid);
                            } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
                                # No, but this is a specially-marked locus tag.
                                $normalized = "LocusTag:$1";
                                $stats->Add(orgDirLocus => 1);
                                WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
                            } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
                                # No, but this is a natural locus tag.
                                $stats->Add(orgDirLocus => 1);
                                WriteToMerge($mergeH, $normalized, B => 'LocusTag', $fid);
                            } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
                                # No, but this is a natural gene name.
                                $stats->Add(orgDirGene => 1);
                                WriteToMerge($mergeH, $normalized, B => 'GENE', $fid);
                            } elsif ($alias =~ /^\d+$/) {
                                # Here it's a naked number, which means it's a GI number
                                # of some sort. We only take these from the corresponding ID
                                # table.
                                $stats->Add(orgDirSkip => 1);
                            } elsif ($alias =~ /^protein_id\|(.+)/) {
                                # Here we have a REFSEQ protein ID.
                                $normalized = $1;
                                $stats->Add(orgDirProtein => 1);
                                WriteToMerge($mergeH, $normalized, C => 'RefSeq', $fid);
                            } elsif ($alias =~ /^geneId\|(.+)/) {
                                # Here we have an alternate-form Gene ID.
                                $normalized = "GeneID:$1";
                                $stats->Add(orgDirGeneID => 1);
                                WriteToMerge($mergeH, $normalized, B => 'GeneID', $fid);
                            } elsif ($alias =~ /[:|]/) {
                                # Here it's an alias of an unknown type.
                                $stats->Add(orgDirUnknown => 1);
                            } else {
                                # Here it's a miscellaneous type.
                                $stats->Add(orgDirMisc => 1);
                                WriteToMerge($mergeH, $alias, B => 'Miscellaneous', $fid);
                            }
                        }
                    }
                }
            }
        }
    }
    Trace("Organism directories complete.") if T(2);
}

=head3 ReadSynonyms

    ReadSynonyms($mergeH, $stats);

Read all the data from the C<peg.synonyms> file and output it to the
merge file. Data from the PEG synonyms file has the lowest confidence
level (C<C>), because all IDs with the same protein sequence are
conflated.

=over 4

=item mergeH

Open output handle for the merge file. Each record of the merge file should
contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
alias type, and (4) a feature ID.

=item stats

Statistics object for tracking this operation.

=back

=cut

sub ReadSynonyms {
    # Get the parameters.
    my ($mergeH, $stats) = @_;
    # Open the peg.synonyms file.
    my $synFileName = "$FIG_Config::global/peg.synonyms";
    my $ih = Open(undef, "<$synFileName");
    Trace("Processing $synFileName.") if T(2);
    # Loop through the file.
    while (! eof $ih) {
        # Get this record.
        my ($prot_id, $synonyms) = Tracer::GetLine($ih);
        Trace($stats->Ask('proteins') . " protein synonym records read.") if $stats->Check(proteins => 5000) && T(3);
        # Parse out the synonyms.
        my @synonyms = split /;/, $synonyms;
        # We'll save any FIG IDs in here.
        my %figIDs;
        # Other IDs go in here.
        my @aliasTuples;
        # Loop through the synonyms.
        for my $synonym (@synonyms) {
            # Strip off the length.
            my ($alias) = split /,/, $synonym, 2;
            # Convert NMPDR IDs to FIG IDs.
            $alias =~ s/^nmpdr/fig/;
            # Process according to the type.
            if ($alias =~ /^fig/) {
                $stats->Add(proteinFIG => 1);
                $figIDs{$alias} = 1;
            } else {
                # Here we have an external ID. If it's of a recognized type, we'll
                # keep it.
                my $type = AliasAnalysis::TypeOf($alias);
                if (! defined $type) {
                    # Not a recognized type, so ignore it.
                    $stats->Add(proteinSkip => 1);
                } else {
                    # A recognized type, so keep it.
                    push @aliasTuples, [$alias, $type];
                    $stats->Add(proteinNormal => 1);
                }
            }
        }
        # Now we have all the IDs in place. If there are any FIG IDs in the
        # bunch, write them to the merge file with all their aliases.
        for my $fid (keys %figIDs) {
            for my $aliasTuple (@aliasTuples) {
                my ($alias, $type) = @$aliasTuple;
                $stats->Add(proteinOut => 1);
                WriteToMerge($mergeH, $alias, C => $type, $fid);
            }
        }
    }
    Trace("Protein synonyms complete.") if T(2);
}

=head3 ReadCorrespondingIDs

    ReadCorrespondingIDs($mergeH, $stats, $name);

Read all the data from the corresponding ID table and output it to the
merge file. Corresponding IDs have the highest confidence level (C<A>).

=over 4

=item mergeH

Open output handle for the merge file. Each record of the merge file should
contain (1) a normalized alias, (2) the confidence grade C<B>, (3) an
alias type, and (4) a feature ID.

=item stats

Statistics object for tracking this operation.

=item name

Name of the corresponding ID file.

=back

=cut

sub ReadCorrespondingIDs {
    # Get the parameters.
    my ($mergeH, $stats, $name) = @_;
    # Open the corresponding-ID file.
    my $ih = Open(undef, "<$name");
    Trace("Processing corresponding IDs.") if T(2);
    # Read the header record.
    my ($type0, @types) = Tracer::GetLine($ih);
    # Insure SEED is the first column.
    Confess("Incorrect file format. SEED is not first.") if ($type0 ne 'SEED');
    # Skip the flag record.
    Tracer::GetLine($ih);
    # Loop through the file.
    while (! eof $ih) {
        # Get this FID and its synonym lists.
        my ($fidList, @others) = Tracer::GetLine($ih);
        Trace($stats->Ask('correspondingIDs') . " corresponding ID records read.") if $stats->Check(correspondingIDs => 5000) && T(3);
        if (! $fidList) {
            # Skip this record if there are no FIG IDs in it.
            $stats->Add(correspondingSkip => 1);
        } else {
            # Get the list of FIG IDs.
            my @fids = split /\s*;\s*/, $fidList;
            # Loop through the other aliases.
            for (my $i = 0; $i <= $#others; $i++) {
                # Get this alias type.
                my $type = $types[$i];
                # Loop through the alias list.
                for my $alias (split /\s*;\s*/, $others[$i]) {
                    # Ignore this alias if it's a RefSeq contig.
                    if ($type eq 'RefSeq' && $alias =~ /^[A-Z][CMT]/) {
                        $stats->Add(correspondingContig => 1);
                    } else {
                        # Check for a locus tag disguised as a CMR ID.
                        my $realType = $type;
                        if ($type eq 'CMR' && $alias =~ /^[A-Z]{2,3}_\d+$/) {
                            $realType = 'LocusTag';
                            $stats->Add(correspondingLocus => 1);
                        } else {
                            $stats->Add(correspondingNormal => 1);
                        }
                        # Normalize the alias.
                        my $normalized = AliasAnalysis::Normalize($type => $alias);
                        # Write it out once for each FIG ID.
                        for my $fid (@fids) {
                            WriteToMerge($mergeH, $normalized, A => $type, $fid);
                        }
                    }
                }
            }
        }
    }
    # Close the input file and the sort file.
    close $ih;
    Trace("Corresponding IDs complete.") if T(2);
}

=head3 CreateCorrespondingIdFile

    CreateCorrespondingIdFile($stats, $name);

Create a corresponding-ID file from the data in the SEED database. The
outgoing file will contain a header record with the ID types followed by
a record for each ID group. Within a group, the field for a given ID type
will contain a semicolon-delimited list of the IDs of that type in the
group.

When the SEED database goes away this method will need to be replaced.

=over 4

=item stats

Statistics object to use for tracking progress.

=item name

Name to give to the corresponding-ID file.

=back

=cut

sub CreateCorrespondingIdFile {
    # Get the parameters.
    my ($stats, $name) = @_;
    # Get the FIG database.
    require FIG;
    my $fig = new FIG;
    my $dbh = $fig->db_handle();
    # Open the output file.
    my $oh = Open(undef, ">$name");
    Trace("Creating header for corresponding ID file.") if T(2);
    # Create the header record from the id types table.
    my %types = map { $_->[0] => $_->[1] } @{$dbh->SQL("SELECT id, name FROM id_correspondence_type")};
    my @typeList = sort keys %types;
    my @header = map { $types{$_} } @typeList;
    Trace("Header is " . join(" ", @header) . ".") if T(3);
    Tracer::PutLine($oh, \@header);
    # Now we loop through the id correspondence table, creating groups (sets). We use
    # an SQL statement for this.
    my $sth = $dbh->prepare_command("SELECT set_id, protein_id, type FROM id_correspondence");
    my $rc = $sth->execute();
    if (! $rc) {
        Confess("SELECT error creating corresponding ID file: " . $sth->errstr());
    }
    # These variables contain the ID and content of the current group.
    my ($set, $content) = (-1, undef);
    # These variables will hold the fields from the current record.
    my ($set_id, $protein_id, $type);
    # This flag will be set to TRUE when we're done.
    my $done = 0;
    while (! $done) {
        # Get the next record.
        my $record = $sth->fetchrow_arrayref();
        if (! defined $record) {
            # No record, so we're done.
            Trace("End of correspondence table found.") if T(3);
            $done = 1;
        } else {
            # A record found, so we get its data.
            ($set_id, $protein_id, $type) = @$record;
            Trace($stats->Ask('corrTableRecords') . " corresponding ID table records read.") if $stats->Check(corrTableRecords => 5000) && T(3);
        }
        # Is this a new group?
        if ($done || $set_id != $set) {
            # Yes. If the old group has content, we write it out. Each field is
            # formed by joining the IDs for that type into a string using semicolons.
            if (defined $content) {
                my @typeStrings = map { join("; ", @{$content->{$_}}) } @typeList;
                Tracer::PutLine($oh, \@typeStrings);
            }
            # Check for an error in the sort.
            if ($set > $set_id) {
                Confess("Invalid set order in id_correspondence table: $set to $set_id.");
            }
            # Now start the new group.
            $set = $set_id;
            $content = { map { $_ => [] } @typeList };
        }
        # Put this ID in this group.
        push @{$content->{$type}}, $protein_id;
    }
    # Close up the output file.
    Trace("Corresponding ID file created as $name.") if T(2);
    close $oh;
}


=head3 WriteAlias

    WriteAlias($oh, $alias, $conf, $type, $fid);

Write an alias record to the sort file. The alias record
contains the alias ID, the alias type, its confidence level, and the
corresponding feature ID.

=over 4

=item oh

Open handle for the output file.

=item alias

Alias identifier.

=item conf

Confidence grade for this alias: C<A> is best, C<F> is worst.

=item type

Type of alias (e.g. NCBI, CMR, RefSeq).

=item fid

%FIG{FIG ID} corresponding to the alias.

=back

=cut

sub WriteAlias {
    # Get the parameters.
    my ($oh, $alias, $conf, $type, $fid) = @_;
    # Compute the genome ID.
    my $genomeID = FIGRules::ParseFeatureID($fid);
    # Write this alias to the output file.
    Tracer::PutLine($oh, [$fid, $alias, $type, $conf]);
    $stats->Add(aliasOut => 1);
    $stats->Add("aliasOut$type" => 1);
}

=head3 WriteToMerge

    WriteToMerge($mergeH, $alias, $grade => $aliasType, $fid);

Write an alias connection to the output merge file.

=over 4

=item mergeH

Open output handle for the merge file.

=item alias

Alias identifier.

=item conf

Confidence grade for this alias: C<A> is best, C<F> is worst.

=item type

Type of alias (e.g. NCBI, CMR, RefSeq).

=item fid

%FIG{FIG ID} corresponding to the alias.

=back

=cut

sub WriteToMerge {
    # Get the parameters.
    my ($mergeH, $alias, $grade, $aliasType, $fid) = @_;
    # Write the merge file record.
    Tracer::PutLine($mergeH, [$alias, $grade, $aliasType, $fid]);
    $stats->Add(mergeOut => 1);
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3