[Bio] / FigKernelScripts / build_sets_from_clusters.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/build_sets_from_clusters.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     =head1 Build Sets From Clusters
21 :    
22 :     build_sets_from_clusters genome_file < cluster_file > set_file
23 :    
24 :     This script reads a file of gene sets (pegs only) output by L<cluster_objects.pl>
25 :     and forms it into a list of sets. Each set contains either (1) all the genes in
26 :     one of the clusters or (2) a single gene not in any cluster. All of the pegs in
27 :     all of the input genomes will be in at least one output cluster.
28 :    
29 :     The single positional parameter is the name of a file containing the list of
30 :     input genomes. Each line of this file is either a genome directory or the
31 :     ID of a genome found in the current SEED. If the file is not specified, then
32 :     no singletons will be produced, only the clusters.
33 :    
34 :     The standard output will be a tab-delimited file containing 2 columns: the cluster ID number
35 :     and the ID of a gene in the cluster.
36 :    
37 :     The standard input should contain a tab-delimited list of all the genes in a cluster on
38 :     each line.
39 :    
40 :     The command-line parameters are as follows.
41 :    
42 :     =over 4
43 :    
44 : parrello 1.2 =item i
45 :    
46 :     The name of the input file. If none is specified, the standard input will be used.
47 :    
48 : parrello 1.1 =item user
49 :    
50 :     Name suffix to be used for log files. If omitted, the PID is used.
51 :    
52 :     =item trace
53 :    
54 :     Numeric trace level. A higher trace level causes more messages to appear. The
55 :     default trace level is 2. Tracing will be directly to the standard output
56 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
57 :     where I<User> is the value of the B<user> option above.
58 :    
59 :     =item background
60 :    
61 :     Save the standard and error output to files. The files will be created
62 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
63 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
64 :     B<user> option above.
65 :    
66 :     =item h
67 :    
68 :     Display this command's parameters and options.
69 :    
70 :     =back
71 :    
72 :     =cut
73 :    
74 :     use strict;
75 :     use Tracer;
76 :     use Stats;
77 :    
78 :     my ($options, @parameters) = StandardSetup([], {
79 : parrello 1.2 trace => ["3-", "tracing level"],
80 :     i => ["-", "input file name"]
81 : parrello 1.1 }, "genomeFile < inputClusters > outputSets",
82 :     @ARGV);
83 :    
84 :     # Get the genome file.
85 :     my $genomeFile = $parameters[0];
86 :     if ($genomeFile && ! -f $genomeFile) {
87 :     Confess("Invalid genome file $genomeFile.");
88 :     }
89 :     # Create a statistics object.
90 :     my $stats = Stats->new();
91 :     eval {
92 :     # We have valid options here, so we can proceed. First we create a hash of genes found.
93 :     my %genesH;
94 :     # This will contain the current cluster ID.
95 :     my $clusterID = 1;
96 :     # Now we loop through the input file, writing out the clusters.
97 :     Trace("Reading cluster file.") if T(2);
98 : parrello 1.2 my $ih = Open(undef, "<$options->{i}");
99 :     while (! eof $ih) {
100 : parrello 1.1 # Get the next cluster.
101 : parrello 1.2 my @cluster = Tracer::GetLine($ih);
102 : parrello 1.1 Trace($stats->Ask('clusters') . " clusters read.") if $stats->Check(clusters => 1000) && T(3);
103 :     # Process the genes in the cluster.
104 :     for my $gene (@cluster) {
105 :     # Write out the gene's cluster membership.
106 :     print "$clusterID\t$gene\n";
107 :     $stats->Add(inClusters => 1);
108 :     # Denote that we have this gene in a cluster.
109 :     $genesH{$gene} = 1;
110 :     }
111 :     # Advance the cluster ID for the next cluster.
112 :     $clusterID++;
113 :     }
114 :     # The next step is to run through the genome file generating singletons for the unused genes.
115 :     # This process only occurs if a genome file was specified.
116 :     if (! $genomeFile) {
117 :     Trace("No genome file specified. Output contains only connected clusters.") if T(2);
118 :     } else {
119 :     # Open the genome file for input.
120 :     my $ih = Open(undef, "<$genomeFile");
121 :     # Loop through the genomes.
122 :     while (! eof $ih) {
123 :     my ($genome) = Tracer::GetLine($ih);
124 :     $stats->Add(genomes => 1);
125 :     # Compute the name of the genome's PEG file. This depends on whether the genome
126 :     # is a directory name or a genome ID.
127 :     my $pegFileName;
128 :     if ($genome =~ /^\d+\.\d+$/) {
129 :     # We have a SEED genome, so get the SEED genome directory's peg file.
130 :     $pegFileName = "$FIG_Config::organisms/$genome/Features/peg/tbl";
131 :     $stats->Add(seedGenomes => 1);
132 :     } else {
133 :     $pegFileName = "$genome/Features/peg/tbl";
134 :     $stats->Add(fileGenomes => 1);
135 :     }
136 :     # Loop through the pegs in the file.
137 :     Trace("Reading genome PEG file $pegFileName.") if T(2);
138 :     my $ph = Open(undef, "<$pegFileName");
139 :     while (! eof $ph) {
140 :     # Get the next PEG.
141 :     my ($peg) = Tracer::GetLine($ph);
142 :     Trace($stats->Ask('pegs') . " pegs processed in genome files.") if $stats->Check(pegs => 1000) && T(3);
143 :     # Has it been encountered in a cluster?
144 :     if (! $genesH{$peg}) {
145 :     # No. Add it as a singleton.
146 :     print "$clusterID\t$peg\n";
147 :     $stats->Add(singletons => 1);
148 :     $clusterID++;
149 :     }
150 :     }
151 :     }
152 :     }
153 :     };
154 :     if ($@) {
155 :     Trace("Error in script: $@") if T(0);
156 :     }
157 :     Trace("Statistics for run:\n" . $stats->Show()) if T(2);
158 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3