[Bio] / Sprout / EvCodeRefresh.pl Repository:
ViewVC logotype

View of /Sprout/EvCodeRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Tue Aug 12 06:06:02 2008 UTC (11 years ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.3: +13 -87 lines
Converted to new multi-table design concept for attributes.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use CustomAttributes;
use Stats;
use FIG;

=head1 EvCodeRefresh Script

    EvCodeRefresh [options] <filename>

Recompute evidence codes.

=head2 Introduction

This script generates and reloads evidence codes.

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item load

Specifies a file containing evidence codes. The file should contain 3
columns. Each record should contain a feature ID in the first column, the
literal C<evidence_code> in the middle column, and a corresponding evidence code
in the last column. If this option is specified, the evidence codes are loaded
from the file. Otherwise, the evidence codes are recomputed into a new temporary file.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=back

=cut

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
                                           {
                                              load => ["", "file containing evidence codes; if none specified, evidence codes will be computed"],
                                              trace => ["2", "tracing level"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
                                           },
                                           "",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Create the FIG object.
    my $fig = FIG->new();
    # We'll count our progress in here.
    my $stats = Stats->new();
    # Create a temporary directory for our calculations.
    my $dest=$FIG_Config::temp."/evidence_codes.".time;
    mkdir ($dest, 0775);
    # The file name for the evidence codes will be put in here.
    my $fileName;
    if ($options->{load}) {
        # Here we're loading from a precomputed file.
        $fileName = $options->{load};
        Trace("Evidence codes will be loaded from precomputed file $fileName") if T(2);
    } else {
        # Here we need to create the evidence codes.
        $fileName = GetEvCodes($fig, $dest, $stats);
        # Check for literature data.
        if (-s "$FIG_Config::data/Dlits/dlits") {
            # We have some, so roll it in.
            Trace("Loading dlits.") if T(3);
            system "$FIG_Config::bin/load_dlits";
            Trace("Generating ilits.") if T(3);
            system "$FIG_Config::bin/generate_ilits > $FIG_Config::data/Dlits/ilits";
            Trace("Generating dlit codes.") if T(3);
            system "$FIG_Config::bin/generate_dlit_ev_codes > $FIG_Config::data/Dlits/dlit.ev.codes";
            Trace("Generating ilit codes.") if T(3);
            system "$FIG_Config::bin/generate_ilit_ev_codes < $FIG_Config::data/Dlits/ilits > $FIG_Config::data/Dlits/ilit.ev.codes";
            Trace("Sorting and writing literature codes.") if T(3);
            system "cat $FIG_Config::data/Dlits/[di]lit.ev.codes | sort -u >> $fileName";
        }
        Trace("Evidence codes generated to file $fileName.") if T(2);
    }
    # Now we're ready to load. Get the attributes database.
    my $attr = CustomAttributes->new();
    # Load it.
    Trace("Loading evidence codes.") if T(2);
    $attr->LoadAttributesFrom($fileName, mode => 'concurrent');
    # Tell the user we're done, and show the statistics.
    Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "EvCodeRefresh terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 GetEvCodes

    my $fileName = GetEvCodes($fig, $dest, $stats);

Create a tab-delimited file of evidence codes. The output file will be in
the specified destination directory, and it will contain three columns.
The first column will contain the name of a feature, and the last will
contain an associated evidence code. The second column is not used. It is
put in solely for compatibility with external scripts.

=over 4

=item fig

A FIG-like object used to access the data store.

=item dest

Directory to contain the working files.

=item stats

A statistics object that can be used to track activity.

=item RETURN

Returns the name of a file in the destination directory. The file will contain the
new evidence codes.

=back

=cut

sub GetEvCodes {
    # Get the parameters.
    my ($fig, $dest, $stats) = @_;
    # The subsystem list goes in here.
    my @all_subsystems = sort grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
    # This will contain all the pegs in subsystems.
    my %insub;
    # Create the output file.
    my $retVal = "$dest/evidenceCodes.tbl";
    my $oh = Open(undef, ">$retVal");
    # Loop through the subsystems.
    for my $sub (@all_subsystems) {
        Trace("Processing $sub (" . $stats->Progress(subsystems => scalar @all_subsystems) . "%).") if T(3);
        my $subsystem = new Subsystem($sub,$fig,0);
        if (! $subsystem) {
            Trace("Error loading subsystem $sub.") if T(1);
            $stats->AddMessage("Could not load subsystem $sub.");
            $stats->Add(badSubsystem => 1);
        } else {
            $stats->Add(subsystem => 1);
            # Get ALL of the coupling data for the pegs in the subsystem.
            my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
            # returns triples [$peg1, $peg2, $score]. create %cdata that maps
            # peg1 => [peg2]
            my %cdata;
            map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;
            # Get the genomes and roles.
            my @genomes = $subsystem->get_genomes();
            my @roles = $subsystem->get_roles;
            # Loop through the genomes.
            for my $genome (@genomes) {
                Trace("Processing $genome for $sub.") if T(4);
                Trace($stats->Ask('subsystemRows') . " total spreadsheet rows processed.") if $stats->Check(subsystemRows => 500) && T(3);
                # Build hashes for the different types of evidence.
                my (%pegs_in_row, %isu, %icw, %isd);
                # Get the variant code. This tells us what roles to expect.
                my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
                # Only proceed if it's a real, known variant.
                if (($vcode ne "0") && ($vcode ne "-1")) {
                    # Loop through the roles.
                    for my $role (@roles) {
                        # Get the PEGs for this genome and role (one cell in the spreadsheet).
                        my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
                        # Mark each of the pegs as being in a subsystem.
                        foreach my $peg (@pegs) {
                            $insub{$peg} = 1;
                        }
                        # Accumulate the number of pegs.
                        $stats->Add(pegRoles => scalar @pegs);
                        # The evidence we'll deduce depends on the number of pegs in this cell.
                        if (@pegs == 1) {
                            # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
                            $isu{$pegs[0]} = 1;
                        } elsif (@pegs > 1) {
                            # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
                            # modified by the number of pegs in the cell.
                            foreach my $peg (@pegs) {
                                $isd{$peg} = @pegs;
                            }
                        }
                        # For each peg, note that it occurs in a this row.
                        for my $peg (@pegs) {
                            $pegs_in_row{$peg} = 1;
                        }
                    }
                }
                # Loop through all the pegs in the current row. Use a batch request for this.
                for my $peg (keys(%pegs_in_row)) {
                    # Get a list of all the pegs in the current row that are coupled to the current peg.
                    my $clist = $cdata{$peg};
                    if ($clist) {
                    my @coup = grep { $pegs_in_row{$_} } @$clist;
                        my $n = @coup;
                        if ($n > 0) {
                            # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
                            # the number of couplings.
                            $icw{$peg} = $n;
                        }
                    }
                }
                # Convert the subsystem name to a subsystem ID (spaces to underscores).
                $sub =~ s/\s+/_/g;
                # Loop through all the pegs in the current row.
                for my $peg (keys(%pegs_in_row)) {
                    # If this peg has ISU evidence, send it to the output.
                    if ($isu{$peg} ) {
                        Tracer::PutLine($oh, [$peg, 'evidence_code', "isu;$sub"]);
                    }
                    # If this peg has ICW evidence, send it to the output.
                    if (my $n = $icw{$peg} ) {
                        Tracer::PutLine($oh, [$peg, 'evidence_code', "icw($n);$sub"]);
                    } else {
                        # If there's no ICW evidence, ty IDU evidence.
                        if ($n = $isd{$peg} ) {
                            Tracer::PutLine($oh, [$peg, 'evidence_code', "idu($n);$sub"]);
                        }
                    }
                }
            }
        }
    }
    
    # I have chosen to implement the
    #
    #     ff    - in FIGfam
    #     cwn   - clustered with non-hypothetical
    #     cwh   - clustered, but only with hypotheticals
    #
    # in a simple, fast manner.  Perhaps, I should be going through access routines,
    # but I am not
    
    # ff
    # Get the families file and sort out duplicates.
    Trace("Processing fig-families.") if T(2);
    my $ffdata = &FIG::get_figfams_data();
    if ((-s "$ffdata/families.2c") &&
        Open(\*FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
        $stats->Add(familyFile => 1);
        # Read the file.
        while (defined($_ = <FF>)) {
            # If there is a PEG on this line, denote the PEG has fig-family evidence.
            if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
                Tracer::PutLine($oh, [$1, 'evidence_code', 'ff']);
            }
        }
        close(FF);
    }
    # cwn, cwh
    # Get the coupling scores file.
    Trace("Processing couplings.") if T(2);
    if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
        my $fc = <FC>;
        # Loop until we hit end-of-file or a line beginning with a space.
        while ($fc && ($fc =~ /^(\S+)/)) {
            $stats->Add(couplingRows => 1);
            my $curr = $1;
            my @set  = ();
            # We expect now to read a set of lines with the current peg in the
            # first column, followed by the coupled peg and a score.
            while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
                # If the score is high enough, add the coupled peg to the current set.
                if ($3 >= 5) {
                    push(@set,$2);
                }
                $fc = <FC>;
            }
            # Now we're at the beginning of the next peg's couplings, but we need to
            # look at the current peg's data. Only proceed if we found couplings
            # and the PEG is not in a subsystem.
            if (@set > 0 && ! $insub{$curr}) {
                # Find out if all of the coupled pegs are hypothetical.
                my $i;
                for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
                # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
                if ($i < @set) {
                    Tracer::PutLine($oh, [$curr, 'evidence_code', 'cwn']);
                } else {
                    Tracer::PutLine($oh, [$curr, 'evidence_code', 'cwh']);
                }
            }
        }
        close(FC);
    }
    # Close the output file.
    close $oh;
    # Return its name.
    return $retVal;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3