[Bio] / Sprout / BadIdFinder.pl Repository:
ViewVC logotype

View of /Sprout/BadIdFinder.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.1 - (download) (as text) (annotate)
Mon Jan 26 21:59:26 2009 UTC (11 years, 4 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
New program to find bad corresponding IDs in the SEED.

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

use strict;
use Tracer;
use FIG;
use Stats;

=head1 BadIdFinder Script

    BadIdFinder [options] 

Find errors in corresponding IDs

=head2 Introduction

This script runs through the corresponding_ids IsAlsoFoundIn relationship of the
%FIG{SproutDatabase}% looking for IDs that appear in ID sets for different
genomes. If any such IDs are found, they are output as errors.

=head2 Command-Line Options

=over 4

=item output

Name of the output directory. The output directory will contain three files
produced by this script-- C<contigs.tbl>, C<multiples.tbl>, and
C<incompatibles.tbl>. Each file is a standard tab-delimited file containing
identifiers followed by a comma-delimited list of file set numbers. The presence
of an identifier in a file indicates it has the type of problem indicated
by the file name: I<incompatible> indicates the identifier maps to FIG IDs
in multiple genomes, I<contig> indicates the identifier is a contig ID, and
I<multiple> indicates the identifier maps to multiple FIG IDs in the same
genome. The default output is to the sapling data directory if it exists
and the FIG temporary directory otherwise.

=item limit

Stop after the specified number of identifiers has been processed. This is a utility
parameter for testing purposes.

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.



# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(FIG) ],
                                              output => [$FIG_Config::saplingData || $FIG_Config::temp, "output directory name"],
                                              trace => ["2", "tracing level"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    Trace("Connecting to databases.") if T(2);
    # Create the FIG object.
    my $fig = FIG->new();
    # Create a statistics object.
    my $stats = Stats->new();
    # Get access to the database.
    my $dbh = $fig->db_handle();
    # Do demand-driven flow control. Otherwise it will take forever to start.
    # Because we're doing the demand-driven thing, this is a two-pass algorithm.
    # The first pass runs through the whole table in protein order. We'll
    # accumulate the proteins of interest in this output file.
    my $tempFileName = "$FIG_Config::temp/BadIdFinderQ$$.tbl";
    my $oh = Open(undef, ">$tempFileName");
    # We want to find all proteins that are in more than one identifier set. This
    # variable tracks the currently-active ID.
    my $thisID;
    # This is a hash of the sets for the current ID. Each set ID is mapped to a
    # file number.
    my %setIDs;
    # Create the initial query.
    my $sth = $dbh->prepare_command("SELECT protein_id, set_id, file_num FROM id_correspondence ORDER BY protein_id");
    $sth->execute() || Confess("Database error: " . $sth->errstr);
    Trace("Starting sort loop.") if T(2);
    # Loop through the results.
    while (my $rowPtr = $sth->fetch()) {
        # Extract the results.
        my ($protein, $set, $file) = @$rowPtr;
        # Accumulate our progress.
        Trace($stats->Ask('relationships') . " relationship records processed.")
            if $stats->Check(relationships => 10000) && T(3);
        # Is this a new identifier?
        if ($protein ne $thisID) {
            # Yes it is. Count it.
            $stats->Add(identifiers => 1);
            # Check the previous identifier and output it if we need to revisit it.
            my $flag = OutputCheck($oh, $thisID, \%setIDs);
            $stats->Add(possibleDuplicate => $flag);
            # Clear the hash.
            %setIDs = ();
            # Save the identifier.
            $thisID = $protein;
        # Save this identifier's set information.
        $setIDs{$set} = $file;
    # Check the final identifier.
    my $flag = OutputCheck($oh, $thisID, \%setIDs);
    $stats->Add(possibleDuplicate => $flag);
    # Clear the statement handle.
    undef $sth;
    Trace("Sort loop completed.") if T(2);
    # Close the output file and re-open it for input.
    close $oh;
    my $ih = Open(undef, "<$tempFileName");
    # Format the SELECT command for the next loop.
    my $command = "SELECT i.protein_id FROM id_correspondence i " .
                  "WHERE i.set_id = ? AND i.type = 1";
    $sth = $dbh->prepare_command($command);
    # Open the output files.
    my %files;
    for my $fileType (qw(incompatibles contigs multiples)) {
        $files{$fileType} = Open(undef, ">$options->{output}/$fileType.tbl");
    # Loop through the identifiers queued for processing.
    Trace("Starting second pass loop.") if T(2);
    while (! eof $ih) {
        # Get this identifier.
        my ($id, $sets, $files) = Tracer::GetLine($ih);
        # Accumulate our progress.
        Trace($stats->Ask('records') . " possibility records processed.")
            if $stats->Check(records => 1000) && T(3);
        # Is this a contig ID?
        if ($id =~ /^[A-Z][A-Z]_\d+$/) {
            Trace("Identifier $id from file $files is a contig ID.") if T(3);
            $stats->Add(contigs => 1);
            Tracer::PutLine($files{contigs}, [$id, $files]);
        } else {
            # We'll track the genomes containing the FIG IDs in here.
            my %genomes;
            # Get the FIG IDs for the sets in the incoming list.
            for my $set (split /,/, $sets) {
                $stats->Add(setQuery => 1);
                # Ask for this set's FIG IDs. There is generally only one. Sometimes there are two.
                $sth->execute($set) || Confess("Error in possibility query: " . $sth->errstr);
                my $rows = $sth->fetchall_arrayref();
                # Did we find anything?
                if (! @$rows) {
                    # No, this is a seedless set.
                    $stats->Add(seedless => 1);
                } else {
                    # Yes! Get the FIG IDs and count their genomes.
                    for my $row (@$rows) {
                        $stats->Add(figIDs => 1);
                        my $fid = $row->[0];
                        my $genome = FIG::genome_of($fid);
                        $genomes{$genome} = 1;
            # Now, if we have more than one genome in this set, it's a problem.
            if (scalar(keys %genomes) > 1) {
                Trace("Identifier $id is in incompatible sets $sets from files $files.") if T(1);
                $stats->Add(incompatibles => 1);
                Tracer::PutLine($files{incompatibles}, [$id, $files]);
            } else {
                my ($genome) = keys %genomes;
                Trace("Identifier $id is in multiple sets $sets for the genome $genome.") if T(3);
                $stats->Add(multiples => 1);
                Tracer::PutLine($files{multiples}, [$id, $files]);
    # Close the output files.
    for my $fileType (keys %files) {
        close $files{$fileType};
    # Display the statistics.
    Trace("Statistics from run:\n" . $stats->Show()) if T(2);
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "BadIdFinder terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);

=head3 OutputCheck

    my $flag = OutputCheck($oh, $thisID, \%setIDs);

Check the specified hash of set IDs. If more than one set ID is present,
output the specified ID along with its set and file information.

=over 4

=item oh

File handle for the output file.

=item thisID

Identifier of current interest.

=item setIDs

Reference to a hash mapping identifier set IDs to file numbers.

=item RETURN

Returns 1 if the identifier is output, else 0.



sub OutputCheck {
    # Get the parameters.
    my ($oh, $thisID, $setIDs) = @_;
    # Declare the return variable.
    my $retVal = 0;
    # Get the set IDs.
    my @sets = sort { $a <=> $b } keys %$setIDs;
    # If there is more than one set, we have a possible duplicate.
    if (@sets > 1) {
        # Denote we're writing out a possible.
        $retVal = 1;
        # Compute the list of file numbers.
        my %fileMap = map { $_ => 1 } values %$setIDs;
        # Output the identifier, the list of sets, and the list of file numbers.
        my $line = join("\t", $thisID,
                              join(",", @sets),
                              join(", ", keys %fileMap));
        print $oh "$line\n";
    # Return the activity indicator.
    return $retVal;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3