[Bio] / Sprout / DrugClean.pl Repository:
ViewVC logotype

View of /Sprout/DrugClean.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Tue Feb 5 05:47:32 2008 UTC (11 years, 10 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, rast_rel_2008_06_16, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +0 -2 lines
Removed obsolete use clauses.

#!/usr/bin/perl -w

=head1 Drug Cleaner

Clean up a flat file with PEGs in it.

This script runs through a tab-delimited text file, removing duplicate entries and entries
for features not in the Sprout database. The positional parameters should be the names of the
files to clean.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item macFile

If specified, the file is presumed to be in Macintosh format.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item col

Column in the input file that contains feature IDs. The default is C<6>. The
column count is 1-based.

=item phone

Phone number to message when the script is complete.

=back

=cut

use strict;
use Tracer;
use Cwd;
use File::Copy;
use File::Path;
use FIG;
use SFXlate;
use Stats;

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout) ],
                                           {
                                              col => ["6", "1-based index of the column containing feature IDs"],
                                              trace => ["2", "trace level"],
                                              macFile => ["", "If specified, the file is presumed to be in macintosh format."],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                           },
                                           "<fileName1> <fileName2> ... ",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Get a sprout object.
    my $sprout = SFXlate->new_sprout_only();
    # Get the 0-based index of the column containing feature IDs.
    my $col = $options->{col};
    if ($col !~ /^\d+$/) {
        Confess("Invalid column number \"$col\".");
    } else {
        $col--;
        # Check for macintosh format.
        if ($options->{macFile}) {
            # The input file is from the MAC, so use "\r" instead of "\n" for the input.
            # This will not affect output, so we'll be converting the file to Unix as
            # part of the cleaning.
            $/ = "\r";
        }
        # Loop through the files.
        for my $fileName (@parameters) {
            Trace("Processing $fileName.") if T(2);
            # Create a backup file name.
            my $tempFile = "$fileName.tmp~";
            # Create a hash of features. We will skip any feature whose ID is already in the hash.
            my %fids;
            # Open the two files.
            my $inh = Open(undef, "<$fileName");
            my $outh = Open(undef, ">$tempFile");
            # Get a statistics object.
            my $stats = Stats->new();
            # Loop through the input file.
            while (! eof $inh) {
                # Get the current record.
                my @fields = Tracer::GetLine($inh);
                $stats->Add(input => 1);
                # Pull out the feature ID.
                my $fid = $fields[$col];
                # Figure out what to do with this record.
                if (! $fid) {
                    # No feature ID, so this record is considered a bad line and skipped.
                    $stats->Add(badline => 1);
                } elsif (! exists $fids{$fid}) {
                    # Here we are seeing this feature for the first time. Make sure we
                    # don't process it again.
                    $fids{$fid} = 1;
                    # Now, find out if this feature exists.
                    if ($sprout->Exists('Feature', $fid)) {
                        # It does, so write it out.
                        Tracer::PutLine($outh, \@fields);
                        $stats->Add(output => 1);
                    } else {
                        Trace("Feature $fid not found.") if T(3);
                        $stats->Add(notFound => 1);
                    }
                } else {
                    $stats->Add(duplicate => 1);
                }
            }
            # Display the statistics.
            Trace("Statistics for $fileName:\n" . $stats->Show()) if T(2);
            # Close the files.
            close $inh;
            close $outh;
            # Kill the old file and rename the new one.
            my $okFlag = rename($tempFile, $fileName);
            if (! $okFlag) {
                Trace("Could not rename $tempFile to $fileName.") if T(0);
            }
        }
    }
    Trace("Processing complete.") if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "Drug Cleaner terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}



1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3