[Bio] / Sprout / SproutGFF.pl Repository:
ViewVC logotype

View of /Sprout/SproutGFF.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.3 - (download) (as text) (annotate)
Tue Feb 5 05:47:32 2008 UTC (12 years, 5 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, rast_rel_2008_06_16, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, rast_rel_2009_05_18, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +0 -2 lines
Removed obsolete use clauses.

#!/usr/bin/perl -w

# -*- perl -*-
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

=head1 SproutGFF

This is a fancy wrapper around B<seed2gff> that can be used to generate the
GFF3 files for the NMPDR. The single parameter is the output directory name.
The files will be organized by NMPDR group.

The currently-supported command-line options are as follows.

=over 4

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directed to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=item scan

If specified, the genomes will be collected and the directories created, but no GFF
files will be output. This is mostly useful for testing.

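=item phone

Phone number to message when the script is complete.

=item genome

ID of a single genome to process. If omitted, all NMPDR core genomes are
processed.

=back

=cut
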
use strict;
use Tracer;
use Cwd;
use File::Copy;
use File::Path;
use SFXlate;

# Get the command-line options and parameters.
# NOTE(review): the braces around the option hash and the trailing @ARGV argument are
# reconstructed; confirm them against the calling convention of Tracer::StandardSetup.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
                                           {
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                              genome => ["", "genome to process; the default is to process all NMPDR core genomes"],
                                              scan => ["", "if specified, the output directories will be created but no files will be written"],
                                           },
                                           "<output directory>",
                                           @ARGV);
# Set a variable to contain return type information.
my $rtype;
# Insure we catch errors.
eval {
    # Create a Sprout object.
    my $sprout = SFXlate->new_sprout_only();
    # Insure the output directory exists.
    my $outDir = $parameters[0];
    if (! $outDir) {
        Confess("No output directory specified.");
    } else {
        Insure($outDir, 0777);
        # Create the genome map. This lists all the genomes we want along with the corresponding
        # output file name.
        my %genomes;
        # Check for a single-genome situation.
        if ($options->{genome}) {
            # Get the genome name.
            my $genomeID = $options->{genome};
            my $genomeName = $sprout->GenusSpecies($genomeID);
            # Compute the file name. A single genome goes directly into the output directory.
            my $fileName = CleanGenomeName($genomeName);
            $genomes{$genomeID} = "$outDir/$fileName.gff";
        } else {
            # Here we want all the core organisms, split into super-groups. First, we get the
            # genomes for each group in a hash.
            my %baseGroups = $sprout->GetGroups();
            # Fix it into a hash by super-group.
            my %coreGroups = $sprout->Fix(%baseGroups);
            for my $coreGroup (keys %coreGroups) {
                # Compute the directory and insure it exists.
                my $superDirectory = "$outDir/$coreGroup";
                Insure($superDirectory, 0777);
                # Put all of this group's genomes in the output hash.
                for my $coreGenome (@{$coreGroups{$coreGroup}}) {
                    my $fileName = CleanGenomeName($sprout->GenusSpecies($coreGenome));
                    $genomes{$coreGenome} = "$superDirectory/$fileName.gff";
                }
            }
        }
        # Now we loop through %genomes, creating GFF files.
        for my $genome (sort keys %genomes) {
            my $fileName = $genomes{$genome};
            if ($options->{scan}) {
                Trace("$genome would be written to $fileName") if T(2);
            } else {
                Trace("Writing $genome to $fileName.") if T(3);
                # Do the conversion. The command is passed as a list so that no shell is
                # involved: the genome ID and file name reach seed2gff verbatim even if they
                # contain quotes, dollar signs, or other shell metacharacters.
                my @command = ('seed2gff', '-g', $genome, '-o', $fileName, '-s', '-t', 'all', '-nmpdr');
                my @output;
                if (open(my $gffPipe, '-|', @command)) {
                    @output = <$gffPipe>;
                    # Closing a pipe waits for the child; a false result means seed2gff
                    # exited with a nonzero status (left in $?) or the close itself failed.
                    if (! close($gffPipe)) {
                        Trace("seed2gff failed for $genome with status $?.") if T(1);
                    }
                } else {
                    # Keep going with the remaining genomes rather than aborting the run.
                    Trace("Could not run seed2gff for $genome: $!") if T(0);
                }
                # At trace level 3, we show the output.
                Trace("Output from seed2gff:\n" . join("\n", @output)) if T(3) && scalar(@output);
            }
        }
    }
};
# Record how the run ended so it can be reported in the phone message below.
if ($@) {
    Trace("Script failed with error: $@") if T(0);
    $rtype = "error";
} else {
    Trace("Script complete.") if T(2);
    $rtype = "no error";
}
# Send the completion message if a phone number was supplied.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "SproutGFF terminated with $rtype.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 CleanGenomeName

    my $cleaned = CleanGenomeName($name);

Clean up a genome name so it can be used as a file name.

=over 4

=item name

Name of the genome, for cleaning purposes.

=item RETURN

Returns the incoming name with each run of whitespace converted to a dot,
each resulting run of dots collapsed to a single dot, and each parenthesis
or colon converted to an underscore.

=back

=cut

sub CleanGenomeName {
    # Get the parameters.
    my ($name) = @_;
    # Declare the return variable.
    my $retVal = $name;
    # Convert spaces to dots.
    $retVal =~ s/\s+/\./g;
    # Collapse every run of dots to a single dot. Replacing only pairs would
    # leave ".." behind whenever three or more dots are adjacent.
    $retVal =~ s/\.{2,}/\./g;
    # Convert other bad guys to underscores.
    $retVal =~ tr/():/___/;
    # Return the result.
    return $retVal;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3