[Bio] / FigKernelScripts / corr_build_connections.pl Repository:
ViewVC logotype

View of /FigKernelScripts/corr_build_connections.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Apr 7 20:32:35 2011 UTC (8 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_release_3_0_4, mgrast_dev_06072011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, rast_rel_2014_0729, mgrast_dev_05262011, mgrast_release_3_1_2, mgrast_release_3_1_1, rast_rel_2011_0928, mgrast_dev_04132011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_10262011, mgrast_dev_04082011, mgrast_release_3_1_0, HEAD
New stuff for Salmonella sets.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 Build Connections from Correspondences

    corr_build_connections [--m=<match%>] [--id=<id%>] <directory>

This script will read all of the files in a directory of correspondence files
and output a 2-column tab-delimited file containing pairs of genes that are
similar to a certain degree. Specifically, they must match over a specified
percentage of the total lengths and have a minimum identity score.

The single positional parameter is the name of the directory containing the
correspondence files. The following command-line options are supported.

=over 4

=item m

The percentage match length. The number of matching amino acids between the two
proteins must be at least this percentage of the total. The default is C<70>.

=item id

The minimum percentage identity between the two proteins. The default is C<70>.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 3. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=back

=cut

    use strict;
    use Tracer;
    use Stats;

# Parse the command line. StandardSetup (imported by "use Tracer") is given the
# script-specific option table, the usage string for the positional parameters,
# and the raw arguments; it hands back the option hash reference followed by the
# positional parameters. Each option entry below pairs a default value with a
# short description (presumably used for the help display -- confirm in Tracer).
my ($options, @parameters) = StandardSetup([],
                                       { m => [70, "minimum percentage match length"],
                                         id => [70, "minimum percentage match identity"],
                                         trace => ["3-", "tracing level"]
                                       }, "<inputDirectory>",
                                       @ARGV);

# Get the working directory. A missing parameter is reported explicitly instead
# of falling through to a file test on an undefined value.
my $dir = $parameters[0];
if (! defined $dir || $dir eq '') {
    Confess("No working directory specified.");
}
if (! -d $dir) {
    Confess("Invalid working directory $dir.");
}
# Validate the options. Both percentages must be well-formed, unsigned decimal
# numbers in the range (0, 100]. The anchored pattern rejects malformed input
# such as "1.2.3", which a simple character-class test would let through and
# Perl would then quietly numify to 1.2.
my $percentPattern = qr/\A(?:\d+(?:\.\d*)?|\.\d+)\z/;
my $match = $options->{m};
if ($match !~ $percentPattern || $match <= 0 || $match > 100) {
    Confess("Invalid match length percent \"$match\".");
}
my $idPercent = $options->{id};
if ($idPercent !~ $percentPattern || $idPercent <= 0 || $idPercent > 100) {
    Confess("Invalid identity percent \"$idPercent\".");
}
# Create a statistics object. It lives outside the eval so that whatever was
# counted before a failure can still be reported at the end.
my $stats = Stats->new();
eval {
    # We have valid options here, so we can proceed.
    Trace("Reading directory $dir.") if T(2);
    # NOTE(review): the second argument to OpenDir presumably filters out
    # non-data entries such as dot files -- confirm against Tracer.
    my @files = OpenDir($dir, 1);
    # Loop through the correspondence files.
    for my $file (@files) {
        Trace("Processing correspondence file $file.") if T(2);
        # Open the correspondence file for input.
        my $ih = Open(undef, "<$dir/$file");
        $stats->Add(files => 1);
        # Loop through the records.
        while (! eof $ih) {
            # Get the next line of correspondence data as a list of fields.
            my @row = Tracer::GetLine($ih);
            Trace($stats->Ask('lines') . " input lines read.") if $stats->Check(lines => 10000) && T(3);
            # The computation below reads columns 0 through 16. A blank or
            # truncated line cannot be evaluated, so count it and move on
            # rather than comparing undefined values.
            if (@row < 17) {
                $stats->Add(shortRows => 1);
                next;
            }
            # Only proceed if we meet the minimum percent identity.
            if ($row[9] >= $idPercent) {
                # The denominator is the sum of columns 13 and 16 (presumably the
                # two protein lengths -- confirm against the correspondence file
                # format). A non-positive sum would otherwise raise a
                # division-by-zero error and abort every remaining file, because
                # a single eval wraps the whole run.
                my $totalLen = $row[13] + $row[16];
                if ($totalLen <= 0) {
                    $stats->Add(zeroLengthRows => 1);
                } else {
                    # Compute the match length percent: the matched span
                    # (column 12 minus column 11) relative to the average of
                    # the two lengths, hence the factor of 200 rather than 100.
                    my $matchLen = 200 * ($row[12] - $row[11]) / $totalLen;
                    if ($matchLen >= $match) {
                        # The features meet both criteria for similarity, so we can safely write them to
                        # the output.
                        $stats->Add(kept => 1);
                        print "$row[0]\t$row[1]\n";
                    }
                }
            }
        }
        # Release the handle before moving to the next file so a large
        # directory does not exhaust the process's file descriptors.
        close $ih;
    }
};
if ($@) {
    Trace("Error in script: $@") if T(0);
}
Trace("Statistics for run:\n" . $stats->Show()) if T(2);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3