[Bio] / FigKernelScripts / build_sets_from_clusters.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_sets_from_clusters.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu Apr 7 20:41:11 2011 UTC (8 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_release_3_0_4, mgrast_dev_06072011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, rast_rel_2014_0729, mgrast_dev_05262011, mgrast_release_3_1_2, mgrast_release_3_1_1, rast_rel_2011_0928, mgrast_dev_04132011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_10262011, mgrast_dev_04082011, mgrast_release_3_1_0, HEAD
Changes since 1.1: +9 -3 lines
Minor fixes for debugging.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

=head1 Build Sets From Clusters

    build_sets_from_clusters genome_file < cluster_file > set_file

This script reads a file of gene sets (pegs only) output by L<cluster_objects.pl>
and forms it into a list of sets. Each set contains either (1) all the genes in
one of the clusters or (2) a single gene not in any cluster. All of the pegs in
all of the input genomes will be in at least one output cluster.

The single positional parameter is the name of a file containing the list of
input genomes. Each line of this file is either a genome directory or the
ID of a genome found in the current SEED. If the file is not specified, then
no singletons will be produced, only the clusters.

The standard output will be a tab-delimited file containing 2 columns: the cluster ID number
and the ID of a gene in the cluster.

The standard input should contain a tab-delimited list of all the genes in a cluster on
each line.

The command-line parameters are as follows.

=over 4

=item i

The name of the input file. If none is specified, the standard input will be used.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item trace

Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
where I<User> is the value of the B<user> option above.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item h

Display this command's parameters and options.

=back

=cut

    use strict;
    use Tracer;
    use Stats;

my ($options, @parameters) = StandardSetup([], { 
                                                trace => ["3-", "tracing level"],
                                                i => ["-", "input file name"]
                                               }, "genomeFile < inputClusters > outputSets",
                                       @ARGV);

# Get the genome file.
my $genomeFile = $parameters[0];
if ($genomeFile && ! -f $genomeFile) {
    Confess("Invalid genome file $genomeFile.");
}
# Create a statistics object.
my $stats = Stats->new();
eval {
    # We have valid options here, so we can proceed. First we create a hash of genes found.
    my %genesH;
    # This will contain the current cluster ID.
    my $clusterID = 1;
    # Now we loop through the input file, writing out the clusters.
    Trace("Reading cluster file.") if T(2);
    my $ih = Open(undef, "<$options->{i}");
    while (! eof $ih) {
        # Get the next cluster.
        my @cluster = Tracer::GetLine($ih);
        Trace($stats->Ask('clusters') . " clusters read.") if $stats->Check(clusters => 1000) && T(3);
        # Process the genes in the cluster.
        for my $gene (@cluster) {
            # Write out the gene's cluster membership.
            print "$clusterID\t$gene\n";
            $stats->Add(inClusters => 1);
            # Denote that we have this gene in a cluster.
            $genesH{$gene} = 1;
        }
        # Advance the cluster ID for the next cluster.
        $clusterID++;
    }
    # The next step is to run through the genome file generating singletons for the unused genes.
    # This process only occurs if a genome file was specified.
    if (! $genomeFile) {
        Trace("No genome file specified. Output contains only connected clusters.") if T(2);
    } else {
        # Open the genome file for input.
        my $ih = Open(undef, "<$genomeFile");
        # Loop through the genomes.
        while (! eof $ih) {
            my ($genome) = Tracer::GetLine($ih);
            $stats->Add(genomes => 1);
            # Compute the name of the genome's PEG file. This depends on whether the genome
            # is a directory name or a genome ID.
            my $pegFileName;
            if ($genome =~ /^\d+\.\d+$/) {
                # We have a SEED genome, so get the SEED genome directory's peg file.
                $pegFileName = "$FIG_Config::organisms/$genome/Features/peg/tbl";
                $stats->Add(seedGenomes => 1);
            } else {
                $pegFileName = "$genome/Features/peg/tbl";
                $stats->Add(fileGenomes => 1);
            }
            # Loop through the pegs in the file.
            Trace("Reading genome PEG file $pegFileName.") if T(2);
            my $ph = Open(undef, "<$pegFileName");
            while (! eof $ph) {
                # Get the next PEG.
                my ($peg) = Tracer::GetLine($ph);
                Trace($stats->Ask('pegs') . " pegs processed in genome files.") if $stats->Check(pegs => 1000) && T(3);
                # Has it been encountered in a cluster?
                if (! $genesH{$peg}) {
                    # No. Add it as a singleton.
                    print "$clusterID\t$peg\n";
                    $stats->Add(singletons => 1);
                    $clusterID++;
                }
            }
        }
    }
};
if ($@) {
    Trace("Error in script: $@") if T(0);
}
Trace("Statistics for run:\n" . $stats->Show()) if T(2);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3