[Bio] / FigKernelScripts / get_ev_codes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_ev_codes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (download) (as text) (annotate)
Wed Sep 3 20:34:35 2008 UTC (11 years, 5 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, HEAD
Changes since 1.14: +1 -1 lines
Updated comments to mention the replacement scripts.

#!/usr/bin/perl -w

=head1 Get Evidence Codes (DEPRECATED)

Compute evidence codes for all the PEGs and write them to the standard output. The
single positional parameter, if specified, is the name of a file containing subsystem
names. Only the named subsystems will be processed.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item phone

Phone number to message when the script is complete.

=back

=cut

use strict;
use Tracer;
use Cwd;
use File::Copy;
use File::Path;
use FIG;
use Data::Dumper;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw() ],
                                           {
                                              trace => ["2-", "trace level"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "<subsysListFile>",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    my $fig = FIG->new();
    # The subsystem list goes in here.
    my @all_subsystems;
    if (@parameters > 0) {
        # Here we have a subsystem list file.
        my $subsys = $parameters[0];
        @all_subsystems = `cat $subsys`;
        chomp @all_subsystems;
    } else {
        # Here we default to all usable subsystems.
        @all_subsystems = grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
    }
    # This will be a counter of the total number of pegs.
    my $tot = 0;
    my %insub;
    # Loop through the subsystems.
    foreach $_ (@all_subsystems) {
        # If we read from a file, there may be extra spaces and blank lines. This
        # match fixes both problems.
        if ($_ =~ /^(\S[^\t]+\S)/) {
            # Get the subsystem data.
            my $sub = $1;
            Trace("Processing $sub.") if T(2);
            my $subsystem = new Subsystem($sub,$fig,0);
            if (! $subsystem) {
                Trace("SERIOUS ERROR: cannot load $sub") if T(0);
                next;
            }

	    # Get ALL of the coupling data for the pegs in the subsystem.
	    my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
	    # returns triples [$peg1, $peg2, $score]. create %cdata that maps
	    # peg1 => [peg2]
	    my %cdata;
	    map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;


            # Get the genomes and roles.
            my @genomes = $subsystem->get_genomes();
            my @roles = $subsystem->get_roles;
            # Loop through the genomes.
            foreach my $genome (@genomes) {
            Trace("Processing $genome for $sub.") if T(3);
                # Build hashes for the different types of evidence.
                my (%pegs_in_row, %isu, %icw, %isd);
                # Get the variant code. This tells us what roles to expect.
                my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
                # Only proceed if it's a real, known variant.
                if (($vcode ne "0") && ($vcode ne "-1")) {
                    # Loop through the roles.
                    foreach my $role (@roles) {
                        # Get the PEGs for this genome and role (one cell in the spreadsheet).
                        my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
                        # Mark each of the pegs as being in this subsystem.
                        foreach my $peg (@pegs) {
                            $insub{$peg} = 1;
                        }
                        # Accumulate the number of pegs.
                        $tot += @pegs;
                        # The evidence we'll deduce depends on the number of pegs in this cell.
                        if (@pegs == 1) {
                            # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
                            $isu{$pegs[0]} = 1;
                        } elsif (@pegs > 1) {
                            # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
                            # modified by the number of pegs in the cell.
                            foreach my $peg (@pegs) {
                                $isd{$peg} = @pegs;
                            }
                        }
                        # For each peg, note that it occurs in a this row.
                        foreach my $peg (@pegs) {
                            $pegs_in_row{$peg} = 1;
                        }
                    }
                }
                # Loop through all the pegs in the current row. Use a batch request for this.

                foreach my $peg (keys(%pegs_in_row)) {
                    # Get a list of all the pegs in the current row that are coupled to the current peg.
		    my $clist = $cdata{$peg};
		    next unless $clist;
                    my @coup = grep { $pegs_in_row{$_} } @$clist;
                    my $n = @coup;
                    if ($n > 0) {
                        # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
                        # the number of couplings.
                        $icw{$peg} = $n;
                    }
                }
                # Convert the subsystem name to a subsystem ID (spaces to underscores).
                $sub =~ s/\s+/_/g;
                # Loop through all the pegs in the current row.
                foreach my $peg (keys(%pegs_in_row)) {
                    # If this peg has ISU evidence, send it to the output.
                    if ($isu{$peg} ) {
                        print "$peg\tevidence_code\tisu;$sub\t\n";
                    }
                    # If this peg has ICW evidence, send it to the output.
                    if (my $n = $icw{$peg} ) {
                        print "$peg\tevidence_code\ticw($n);$sub\t\n";
                    } else {
                        # If there's no ICW evidence, ty IDU evidence.
                        if ($n = $isd{$peg} ) {
                            print "$peg\tevidence_code\tidu($n);$sub\t\n";
                        }
                    }
                }
            }
        }
    }
    Trace("Total peg-role connections in ss = $tot") if T(2);
    
    # I have chosen to implement the
    #
    #     ff    - in FIGfam
    #     cwn   - clustered with non-hypothetical
    #     cwh   - clustered, but only with hypotheticals
    #
    # in a simple, fast manner.  Perhaps, I should be going through access routines,
    # but I am not
    
    # ff
    $tot = 0;
    # Get the families file and sort out duplicates.
    Trace("Processing fig-families.") if T(2);
    my $ffdata = &FIG::get_figfams_data();
    if ((-s "$ffdata/families.2c") &&
        open(FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
        # Read the file.
        while (defined($_ = <FF>)) {
            # If there is a PEG on this line, denote the PEG has fig-family evidence.
            if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
                print "$1\tevidence_code\tff\n";
                $tot++;
            }
        }
        close(FF);
    }
    Trace("FigFamily evidence codes found = $tot.") if T(2);
    # cwn, cwh
    $tot = 0;
    # Get the coupling scores file.
    Trace("Processing couplings.") if T(2);
    if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
        my $fc = <FC>;
        # Loop until we hit end-of-file or a line beginning with a space.
        while ($fc && ($fc =~ /^(\S+)/)) {
            my $curr = $1;
            my @set  = ();
            # We expect now to read a set of lines with the current peg in the
            # first column, followed by the coupled peg and a score.
            while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
                # If the score is high enough, add the coupled peg to the current set.
                if ($3 >= 5) {
                    push(@set,$2);
                }
                $fc = <FC>;
            }
            # Now we're at the beginning of the next peg's couplings, but we need to
            # look at the current peg's data. Only proceed if we found couplings
            # and the PEG is not in a subsystem.
            if (@set > 0 && ! $insub{$curr}) {
                # Find out if all of the coupled pegs are hypothetical.
                my $i;
                for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
                # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
                if ($i < @set) {
                    print "$curr\tevidence_code\tcwn\n";
                } else {
                    print "$curr\tevidence_code\tcwh\n";
                }
                $tot++;
            }
        }
        close(FC);
    }
    Trace("Coupling evidence codes found = $tot.") if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "Get Evidence Codes terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}



1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3