[Bio] / Sprout / EvCodeRefresh.pl Repository:
ViewVC logotype

View of /Sprout/EvCodeRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.3 - (download) (as text) (annotate)
Fri Jul 11 01:05:46 2008 UTC (11 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_rel_2008_0806
Changes since 1.2: +305 -95 lines
Incorporated methods from get_ev_codes.

#!/usr/bin/perl -w

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

use strict;
use Tracer;
use CustomAttributes;
use Stats;
use FIG;

=head1 EvCodeRefresh Script

    EvCodeRefresh [options] <filename>

Recompute evidence codes.

=head2 Introduction

This script generates and reloads evidence codes.

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item load

Specifies a file containing evidence codes. The file should contain either 2 or 3
columns. Each record should contain a feature ID in the first column and a
corresponding evidence code in the last column. If this option is specified,
the evidence codes are loaded from the file. Otherwise, the evidence codes are
recomputed into a new temporary file.

=item classes

If this option is specified, it should be the name of a tab-delimited file
containing the evidence classes. The evidence class table will be deleted and
reloaded from the file, which should be a valid ERDB load file for the
B<EvidenceClass> table.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.



# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
                                              load => ["", "file containing evidence codes; if none specified, evidence codes will be computed"],
                                              trace => ["2", "tracing level"],
                                              classes => ["", "evidence class file name"],
                                              phone => ["", "phone number (international format) to call when load finishes"]
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Create the FIG object.
    my $fig = FIG->new();
    # We'll count our progress in here.
    my $stats = Stats->new();
    # Create a temporary directory for our calculations.
    my $dest=$FIG_Config::temp."/evidence_codes.".time;
    mkdir ($dest, 0775);
    # The file name for the evidence codes will be put in here.
    my $fileName;
    if ($options->{load}) {
        # Here we're loading from a precomputed file.
        $fileName = $options->{load};
        Trace("Evidence codes will be loaded from precomputed file $fileName") if T(2);
    } else {
        # Here we need to create the evidence codes.
        $fileName = GetEvCodes($fig, $dest, $stats);
        # Check for literature data.
        if (-s "$FIG_Config::data/Dlits/dlits") {
            # We have some, so roll it in.
            Trace("Loading dlits.") if T(3);
            system "$FIG_Config::bin/load_dlits";
            Trace("Generating ilits.") if T(3);
            system "$FIG_Config::bin/generate_ilits > $FIG_Config::data/Dlits/ilits";
            Trace("Generating dlit codes.") if T(3);
            system "$FIG_Config::bin/generate_dlit_ev_codes > $FIG_Config::data/Dlits/dlit.ev.codes";
            Trace("Generating ilit codes.") if T(3);
            system "$FIG_Config::bin/generate_ilit_ev_codes < $FIG_Config::data/Dlits/ilits > $FIG_Config::data/Dlits/ilit.ev.codes";
            Trace("Sorting and writing literature codes.") if T(3);
            system "cat $FIG_Config::data/Dlits/[di]lit.ev.codes | sort -u >> $fileName";
        Trace("Evidence codes generated to file $fileName.") if T(2);
    # Now we're ready to load. Get the attributes database.
    my $attr = CustomAttributes->new();
    # Check for a class file.
    if ($options->{classes}) {
        # We have one. Load it into the evidence class table.
        Trace("Loading evidence classes from $options->{classes}.") if T(2);
        $attr->LoadTable($options->{classes}, 'EvidenceClass', truncate => 1);
        Trace("Evidence classes loaded.") if T(2);
    # Now we convert the evidence code file into a load file for the IsEvidencedBy
    # table. First, we open it.
    my $ih = Open(undef, "<$fileName");
    # Create the load file. We sort it to speed up the load.
    my $loadFileName = "$FIG_Config::temp/IsEvidencedBy$$.dtx";
    my $oh = Open(undef, "| sort >$loadFileName");
    # Finally, we use this hash to track all the evidence classes.
    my %classes = ();
    Trace("Reading evidence codes.") if T(3);
    # Loop through the input file, writing load records.
    while (! eof $ih) {
        # Read the input record.
        my @fields = Tracer::GetLine($ih);
        my $line = $.;
        Trace("$line input lines processed.") if T(3) && ($. % 10000 == 0);
        # Insure it's valid.
        my $last = $#fields;
        if ($last >= 3 || $last < 1) {
            Trace("Record $line in input file has incorrect number of columns.") if T(3);
            $stats->Add(errors => 1);
        } else {
            # Get the feature ID and the code.
            my ($fid, $code) = @fields[0, $last];
            # Validate the feature ID and the evidence code.
            if (! ($fid =~ /^fig\|\d+/)) {
                Trace("Record $line in input file has invalid feature ID \"$fid\".") if T(3);
                $stats->Add(errors => 1);
            } elsif (! ($code =~ /^([a-z]+)(.*)/)) {
                Trace("Record $line in input file has invalid evidence code \"$code\".") if T(3);
                $stats->Add(errors => 1);
            } else {
                # We have a valid input row. Produce the output line. Note that as a
                # result of the pattern match that validated the evidence code, $1
                # contains the class and $2 the modifier.
                my ($class, $modifier) = ($1, $2);
                Tracer::PutLine($oh, [$fid, $class, $modifier]);
                # Count this as an output row and as a member of the specified class.
                $stats->Add(rows => 1);
                $stats->Add($class => 1);
                $classes{$class} = 1;
    Trace("Evidence codes reformatted.") if T(2);
    # Close the files.
    close $oh;
    close $ih;
    Trace("Evidence codes will be loaded from $loadFileName.") if T(2);
    # Now we need to verify the incoming evidence codes against the known
    # evidence classes. We issue a message for every non-existent class. It's
    # not a serious error, but it's something the user should know.
    Trace("Verifying evidence classes.") if T(2);
    for my $class (keys %classes) {
        if (! $attr->Exists(EvidenceClass => $class)) {
            Trace("Evidence class \"$class\" not found in database!") if T(2);
            $stats->Add(bad_class => 1);
    # Now we load.
    Trace("Loading evidence codes.") if T(2);
    $attr->LoadTable($loadFileName, 'IsEvidencedBy', truncate => 1,
                     mode => 'concurrent');
    # Tell the user we're done, and show the statistics.
    Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "EvCodeRefresh terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);

=head3 GetEvCodes

    my $fileName = GetEvCodes($fig, $dest, $stats);

Create a tab-delimited file of evidence codes. The output file will be in
the specified destination directory, and it will contain three columns.
The first column will contain the name of a feature, and the last will
contain an associated evidence code. The second column is not used. It is
put in solely for compatibility with external scripts.

=over 4

=item fig

A FIG-like object used to access the data store.

=item dest

Directory to contain the working files.

=item stats

A statistics object that can be used to track activity.

=item RETURN

Returns the name of a file in the destination directory. The file will contain the
new evidence codes.



sub GetEvCodes {
    # Get the parameters.
    my ($fig, $dest, $stats) = @_;
    # The subsystem list goes in here.
    my @all_subsystems = sort grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
    # This will contain all the pegs in subsystems.
    my %insub;
    # Create the output file.
    my $retVal = "$dest/evidenceCodes.tbl";
    my $oh = Open(undef, ">$retVal");
    # Loop through the subsystems.
    for my $sub (@all_subsystems) {
        Trace("Processing $sub (" . $stats->Progress(subsystems => scalar @all_subsystems) . "%).") if T(3);
        my $subsystem = new Subsystem($sub,$fig,0);
        if (! $subsystem) {
            Trace("Error loading subsystem $sub.") if T(1);
            $stats->AddMessage("Could not load subsystem $sub.");
            $stats->Add(badSubsystem => 1);
        } else {
            $stats->Add(subsystem => 1);
            # Get ALL of the coupling data for the pegs in the subsystem.
            my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
            # returns triples [$peg1, $peg2, $score]. create %cdata that maps
            # peg1 => [peg2]
            my %cdata;
            map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;
            # Get the genomes and roles.
            my @genomes = $subsystem->get_genomes();
            my @roles = $subsystem->get_roles;
            # Loop through the genomes.
            for my $genome (@genomes) {
                Trace("Processing $genome for $sub.") if T(4);
                Trace($stats->Ask('subsystemRows') . " total spreadsheet rows processed.") if $stats->Check(subsystemRows => 500) && T(3);
                # Build hashes for the different types of evidence.
                my (%pegs_in_row, %isu, %icw, %isd);
                # Get the variant code. This tells us what roles to expect.
                my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
                # Only proceed if it's a real, known variant.
                if (($vcode ne "0") && ($vcode ne "-1")) {
                    # Loop through the roles.
                    for my $role (@roles) {
                        # Get the PEGs for this genome and role (one cell in the spreadsheet).
                        my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
                        # Mark each of the pegs as being in a subsystem.
                        foreach my $peg (@pegs) {
                            $insub{$peg} = 1;
                        # Accumulate the number of pegs.
                        $stats->Add(pegRoles => scalar @pegs);
                        # The evidence we'll deduce depends on the number of pegs in this cell.
                        if (@pegs == 1) {
                            # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
                            $isu{$pegs[0]} = 1;
                        } elsif (@pegs > 1) {
                            # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
                            # modified by the number of pegs in the cell.
                            foreach my $peg (@pegs) {
                                $isd{$peg} = @pegs;
                        # For each peg, note that it occurs in a this row.
                        for my $peg (@pegs) {
                            $pegs_in_row{$peg} = 1;
                # Loop through all the pegs in the current row. Use a batch request for this.
                for my $peg (keys(%pegs_in_row)) {
                    # Get a list of all the pegs in the current row that are coupled to the current peg.
                    my $clist = $cdata{$peg};
                    if ($clist) {
                    my @coup = grep { $pegs_in_row{$_} } @$clist;
                        my $n = @coup;
                        if ($n > 0) {
                            # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
                            # the number of couplings.
                            $icw{$peg} = $n;
                # Convert the subsystem name to a subsystem ID (spaces to underscores).
                $sub =~ s/\s+/_/g;
                # Loop through all the pegs in the current row.
                for my $peg (keys(%pegs_in_row)) {
                    # If this peg has ISU evidence, send it to the output.
                    if ($isu{$peg} ) {
                        Tracer::PutLine($oh, [$peg, '', "isu;$sub"]);
                    # If this peg has ICW evidence, send it to the output.
                    if (my $n = $icw{$peg} ) {
                        Tracer::PutLine($oh, [$peg, '', "icw($n);$sub"]);
                    } else {
                        # If there's no ICW evidence, ty IDU evidence.
                        if ($n = $isd{$peg} ) {
                            Tracer::PutLine($oh, [$peg, '', "idu($n);$sub"]);
    # I have chosen to implement the
    #     ff    - in FIGfam
    #     cwn   - clustered with non-hypothetical
    #     cwh   - clustered, but only with hypotheticals
    # in a simple, fast manner.  Perhaps, I should be going through access routines,
    # but I am not
    # ff
    # Get the families file and sort out duplicates.
    Trace("Processing fig-families.") if T(2);
    my $ffdata = &FIG::get_figfams_data();
    if ((-s "$ffdata/families.2c") &&
        Open(\*FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
        $stats->Add(familyFile => 1);
        # Read the file.
        while (defined($_ = <FF>)) {
            # If there is a PEG on this line, denote the PEG has fig-family evidence.
            if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
                Tracer::PutLine($oh, [$1, '', 'ff']);
    # cwn, cwh
    # Get the coupling scores file.
    Trace("Processing couplings.") if T(2);
    if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
        my $fc = <FC>;
        # Loop until we hit end-of-file or a line beginning with a space.
        while ($fc && ($fc =~ /^(\S+)/)) {
            $stats->Add(couplingRows => 1);
            my $curr = $1;
            my @set  = ();
            # We expect now to read a set of lines with the current peg in the
            # first column, followed by the coupled peg and a score.
            while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
                # If the score is high enough, add the coupled peg to the current set.
                if ($3 >= 5) {
                $fc = <FC>;
            # Now we're at the beginning of the next peg's couplings, but we need to
            # look at the current peg's data. Only proceed if we found couplings
            # and the PEG is not in a subsystem.
            if (@set > 0 && ! $insub{$curr}) {
                # Find out if all of the coupled pegs are hypothetical.
                my $i;
                for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
                # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
                if ($i < @set) {
                    Tracer::PutLine($oh, [$curr, '', 'cwn']);
                } else {
                    Tracer::PutLine($oh, [$curr, '', 'cwh']);
    # Close the output file.
    close $oh;
    # Return its name.
    return $retVal;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3