[Bio] / FigKernelScripts / convert_tbl_files.pl Repository:
ViewVC logotype

View of /FigKernelScripts/convert_tbl_files.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Apr 20 00:39:55 2011 UTC (8 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_06072011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, rast_rel_2014_0729, mgrast_dev_05262011, mgrast_release_3_1_2, mgrast_release_3_1_1, rast_rel_2011_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_10262011, mgrast_release_3_1_0, HEAD
New script for fancy tbl file conversion.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 Convert TBL Files

    convert_tbl_files [-genomes=file | -url=pattern] (csv | html) file1 file2 ...

This script reads tab-delimited files listed on the command line and converts them
to a different format, optionally converting FIG feature IDs to URLs.

The first positional parameter is the output file format. Each output file will
have the same name as an input file with the format as an extension. For
example, if the input file is C<frog.tbl> and the format is C<html>, the
corresponding output file name is C<frog.tbl.html>.

The remaining positional parameters are the input file names. If no input file name
is specified, the input is taken from the standard input and the output is sent to
the standard output.

The following output formats are supported.

=over 4

=item csv

Comma-separated values, suitable for import into Excel.

=item html

HTML table, suitable for display as a web page.

=back

The command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directed to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item genomes

If specified, the name of a file containing PUBSEED genome IDs, each on a line by
itself. When converting feature IDs to links, features belonging to the genomes listed
in this file are converted to PUBSEED links. Features of all other genomes are converted
using the URL pattern specified in the I<url> option, or to RAST links if that option
is omitted.

=item url

If specified, a URL pattern to be used to convert feature IDs to links. The actual
URL will be formed by appending the feature ID to the pattern. If neither this option
nor I<genomes> is specified, then the feature IDs will not be converted to links.

=item cols

If specified, the name of a file containing column titles, one per line.

=back

=cut

    use strict;
    use Tracer;
    use Stats;
    use SeedUtils;
    use CGI;

# Set of valid output format codes (used only for membership tests).
use constant FORMATS => { csv => 1, html => 1 };

# PUBSEED URL prefix. A feature ID is appended to it to form the full link.
use constant PUBSEEDURL => "http://pubseed.theseed.org/seedviewer.cgi?page=Annotation;feature=";

# Parse the command line. The result is a hash of option values followed by
# the positional parameters (the output format and then the input file names).
my ($options, @parameters) = StandardSetup([], {
                                                cols => ["", "if specified, name of a file containing column titles"],
                                                url => ["", "if specified, URL for converting feature IDs to links"],
                                                genomes => ["", "if specified, name of a file containing the IDs of genomes in the PUBSEED for URL conversion"],
                                                trace => ["2-", "tracing level"],
                                               }, "<format> <inputFileNames> ...",
                                       @ARGV);

# Create a statistics object. It is shared with the subroutines below.
my $stats = Stats->new();
# All of the real work is protected by an eval so that a failure is traced
# and the statistics are still displayed.
eval {
    # Compute and validate the format.
    my $format = $parameters[0];
    Confess("Missing conversion format.") if ! $format;
    Confess("Invalid conversion format \"$format\".") if ! FORMATS->{$format};
    # Determine the URL-processing method. This hash will contain the PUBSEED genomes.
    my %genomes;
    # This will be set to the default URL, or left undefined if no URL conversion is to take place.
    my $urlPrefix;
    # Check for a list of PUBSEED genomes.
    if ($options->{genomes}) {
        # Read in the PUBSEED genomes in the file.
        my $gh = Open(undef, "<$options->{genomes}");
        while (! eof $gh) {
            $stats->Add(genomeLine => 1);
            my ($genome) = Tracer::GetLine($gh);
            # Only a first field that looks like a genome ID counts. The
            # defined-check guards against a line that yields no fields
            # (presumably a blank line -- TODO confirm against Tracer::GetLine).
            if (defined $genome && $genome =~ /^\d+\.\d+$/) {
                $genomes{$genome} = 1;
            }
        }
        close $gh;
        Trace(scalar(keys %genomes) . " PUBSEED genomes found in $options->{genomes}.") if T(2);
        # Default the URL prefix to the RAST.
        $urlPrefix = "http://rast.nmpdr.org/seedviewer.cgi?page=Annotation;feature=";
    }
    # Check for column titles. This stays undefined if there is no title file.
    my $cols;
    if ($options->{cols}) {
        # Read in the column titles, one per line.
        my $ch = Open(undef, "<$options->{cols}");
        while (! eof $ch) {
            $stats->Add(colTitle => 1);
            my ($title) = Tracer::GetLine($ch);
            # A line with no fields still represents a column, so it is kept
            # as an empty title rather than an undefined one.
            push @$cols, (defined $title ? $title : "");
        }
        close $ch;
    }
    # An explicit URL prefix overrides the RAST default.
    if ($options->{url}) {
        $urlPrefix = $options->{url};
    }
    if ($urlPrefix) {
        Trace("Default feature ID URL is $urlPrefix.") if T(2);
    }
    # Check for input files. The count is tested (rather than the truth of
    # the first file name) so that a file named "0" is still processed.
    if (@parameters < 2) {
        # No input files, so process standard input to standard output.
        Trace("Copying from standard input to standard output.") if T(2);
        ProcessFile(\*STDIN, \*STDOUT, $format, $cols, \%genomes, $urlPrefix);
    } else {
        # Loop through the input files, processing them. Each output file
        # has the name of its input file with the format as an extension.
        for my $inFile (@parameters[1 .. $#parameters]) {
            my $outFile = "$inFile.$format";
            Trace("Copying from $inFile to $outFile.") if T(2);
            my $ih = Open(undef, "<$inFile");
            my $oh = Open(undef, ">$outFile");
            ProcessFile($ih, $oh, $format, $cols, \%genomes, $urlPrefix);
            close $ih;
            # Buffered write errors only surface when the handle is closed,
            # so the close of the output file must be checked.
            close($oh) or Confess("Error closing output file $outFile: $!");
        }
    }
};
if ($@) {
    Trace("Error in script: $@") if T(0);
}
Trace("Statistics for run:\n" . $stats->Show()) if T(2);

##
## Convert the input file to the output file.
##
## Each tab-delimited input line becomes one output row: an HTML table row
## for the html format, or a comma-separated line for the csv format. FIG
## feature IDs in the line are replaced by links, but only when a URL prefix
## was supplied.
##
## $ih          open input file handle
## $oh          open output file handle
## $format      output format: csv or html
## $cols        reference to a list of column titles, or undefined if there
##              are no titles
## $genomes     reference to a hash of PUBSEED genome IDs
## $urlPrefix   URL prefix to use for non-PUBSEED features, or undefined if no feature
##              link translation is to occur
##
sub ProcessFile {
    # Get the parameters.
    my ($ih, $oh, $format, $cols, $genomes, $urlPrefix) = @_;
    $stats->Add(filesIn => 1);
    # Remember whether we are producing HTML; anything else is treated as CSV.
    my $isHtml = ($format eq 'html');
    # If this is HTML, start the page.
    if ($isHtml) {
        PrintLine($oh, CGI::start_html());
        PrintLine($oh, CGI::start_table());
    }
    # Check for column headings.
    if (defined $cols) {
        if ($isHtml) {
            # In HTML, use table header cells.
            PrintLine($oh, CGI::Tr(CGI::th($cols)));
        } else {
            # For CSV, the titles get the same quoting as the data fields.
            PrintLine($oh, join(",", map { _CsvQuote($_) } @$cols));
        }
    }
    # Loop through the input lines.
    while (defined(my $line = <$ih>)) {
        chomp $line;
        $stats->Add(linesIn => 1);
        # Process the feature ID conversion. An undefined URL prefix means
        # the caller asked for no link translation, so the IDs are left alone.
        if (defined $urlPrefix) {
            $line =~ s/fig\|(\d+\.\d+)\.([a-z]{2,5}\.\d+)/GenerateLink($1, $2, $genomes, $urlPrefix)/ge;
        }
        # Split the line into fields. The negative limit keeps trailing empty
        # fields, so every row has the same number of columns as its input line.
        my @fields = split /\t/, $line, -1;
        # Convert it and write it to the output.
        if ($isHtml) {
            # In HTML, write out a table row.
            PrintLine($oh, CGI::Tr(CGI::td(\@fields)));
        } else {
            # For CSV, quote the fields that need it and join with commas.
            PrintLine($oh, join(",", map { _CsvQuote($_) } @fields));
        }
    }
    # If this is HTML, finish the page.
    if ($isHtml) {
        PrintLine($oh, CGI::end_table());
        PrintLine($oh, CGI::end_html());
    }
}

##
## Format a single value as a CSV field.
##
## A value containing a comma, a double quote, or a line break is wrapped in
## double quotes, with each embedded double quote doubled. Any other value is
## returned unchanged. An undefined value is treated as an empty string.
##
## $field       value to format
##
sub _CsvQuote {
    my ($field) = @_;
    $field = "" if ! defined $field;
    if ($field =~ /[",\r\n]/) {
        $field =~ s/"/""/g;
        $field = qq("$field");
    }
    return $field;
}

##
## Generate a feature ID link.
##
## The result is an HTML anchor whose text is the feature ID. If the genome
## is not a PUBSEED genome and no URL prefix is available, no link can be
## formed and the plain feature ID is returned instead.
##
## $genome      genome ID of the feature
## $suffix      type/number suffix of the feature
## $genomeH     reference to a hash of PUBSEED genomes
## $urlPrefix   URL prefix to use for non-PUBSEED genomes, or undefined if
##              there is none
##
sub GenerateLink {
    # Get the parameters.
    my ($genome, $suffix, $genomeH, $urlPrefix) = @_;
    # Compute the actual feature ID.
    my $fid = "fig|$genome.$suffix";
    # We'll compute the URL in here. It stays undefined if we have no URL
    # to use for this feature.
    my $url;
    # Is this a PUBSEED genome?
    if ($genomeH->{$genome}) {
        # Yes. Use the PUBSEED URL.
        $url = PUBSEEDURL . $fid;
        $stats->Add(pubseedFeatures => 1);
    } elsif (defined $urlPrefix) {
        # No. Use the standard URL.
        $url = $urlPrefix . $fid;
    }
    # Without a URL, leave the feature ID as plain text rather than
    # emitting a link with a meaningless target.
    return $fid if ! defined $url;
    # Create the link.
    my $retVal = CGI::a({ href => $url }, $fid);
    $stats->Add(links => 1);
    # Return it.
    return $retVal;
}

##
## Write one line of text to an output handle, followed by a new-line
## character, and count it in the "linesOut" statistic.
##
## $oh          open output handle
## $line        text to write (without the trailing new-line)
##
sub PrintLine {
    my $oh = shift;
    my $line = shift;
    # Count the line before writing it.
    $stats->Add(linesOut => 1);
    print {$oh} $line . "\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3