[Bio] / FigKernelScripts / analyze_loose_sets.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/analyze_loose_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     =head1 Analyze Loose Sets
21 :    
22 :     analyze_loose_sets set_file
23 :    
24 :     This script reads a file of loose gene sets with functional assignments and outputs
25 :     an analysis of the assignments found in each set.
26 :    
27 :     The input should be a tab-delimited file with set numbers in the first column, feature
28 :     IDs in the second column, and functional assignments in the third column. All features
29 :     in the same set are believed to be members of the same protein family, so it is
30 :     expected that they would all have the same function.
31 :    
32 :     There will be four output files, each with the same name as the input file plus a
33 :     suffix. Three output files will have the same format: a tab-delimited file
34 :     containing a set number in the first column, a functional assignment in the
35 :     second, a count in the third, and a comma-delimited list of feature IDs in the fourth.
36 :     The count indicates the number of features in the set having that functional
37 :     role. The first output file will have a suffix of C<.con> and will only contain
38 :     sets that have a single functional role. The second output file will have a
39 :     suffix of C<.pinc> and will contain sets that have multiple functional
40 :     roles of which one is believed to be correct. In this file the recommended role will be
41 :     output first. The third output file will have a suffix of C<.inc> and will contain
42 :     sets that have multiple functional roles for which no recommendation can be made. In
43 :     this file the roles will be output from most common to least common.
44 :    
45 :     The fourth output file will have a suffix of C<.chg> and will contain the recommended
46 :     functional role changes. It will have a feature ID in the first column, a constant
47 :     C<analysis> in the second column, and the recommended function ID in the third.
48 :    
49 :     The single positional parameter is the name of the input file and is required.
50 :    
51 :     The command-line parameters are as follows.
52 :    
53 :     =over 4
54 :    
55 :     =item user
56 :    
57 :     Name suffix to be used for log files. If omitted, the PID is used.
58 :    
59 :     =item trace
60 :    
61 :     Numeric trace level. A higher trace level causes more messages to appear. The
62 :     default trace level is 2. Tracing will be directly to the standard output
63 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
64 :     where I<User> is the value of the B<user> option above.
65 :    
66 :     =item background
67 :    
68 :     Save the standard and error output to files. The files will be created
69 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
70 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
71 :     B<user> option above.
72 :    
73 :     =item h
74 :    
75 :     Display this command's parameters and options.
76 :    
77 :     =item dbname
78 :    
79 :     Name of the Sapling database to use. This option is generally only useful for debugging.
80 :    
81 :     =item dbhost
82 :    
83 :     SQL host for the Sapling database to use. This option is generally only useful for debugging.
84 :    
85 :     =item dbport
86 :    
87 :     Database port to use for the Sapling database. This option is generally only useful for debugging.
88 :    
89 :     =back
90 :    
91 :     =cut
92 :    
93 :     use strict;
94 :     use Tracer;
95 :     use Stats;
96 :     use SeedUtils;
97 :     use Sapling;
98 :    
99 :     use RoleRuleSubstring;
100 :    
101 :     my ($options, @parameters) = StandardSetup([], {
102 :     dbname => [$FIG_Config::saplingDB, "name of the Sapling database to use"],
103 :     dbhost => ["", "host containing the Sapling database"],
104 :     dbport => ["", "port for connecting to the Sapling database"],
105 :     trace => ["3-", "tracing level"],
106 :     }, "<inputFileName>",
107 :     @ARGV);
108 :    
109 :     # Create a statistics object.
110 :     my $stats = Stats->new();
111 :     # Get the Sapling database.
112 :     my $sap = Sapling->new(dbName => $options->{dbname}, dbhost => $options->{dbhost},
113 :     port => $options->{dbport});
114 :     # Create the list of role rules.
115 :     my @RoleRules = (RoleRuleSubstring->new($sap, $stats));
116 :     eval {
117 :     # We have valid options here, so we can proceed. This variable will contain the current
118 :     # set number.
119 :     my $currentSet = "";
120 :     # This hash will list the IDs of the sets containing each non-hypothetical assignment found.
121 :     # At the end of the run, information about the multiply-occurring assignments will be displayed.
122 :     my %roleSets;
123 :     # This hash counts the number of times each role was found in the current set.
124 :     my %setRoles;
125 :     # Get the input file name.
126 :     my $inFileName = $parameters[0];
127 :     if (! $inFileName || $inFileName eq "-") {
128 :     Confess("No input file specified.");
129 :     }
130 :     # Open the five files.
131 :     Trace("Opening set file $inFileName.") if T(2);
132 :     my $ih = Open(undef, "<$inFileName");
133 :     Open(\*CONH, ">$inFileName.con");
134 :     Open(\*INCH, ">$inFileName.inc");
135 :     Open(\*PINCH, ">$inFileName.pinc");
136 :     Open(\*CHGH, ">$inFileName.chg");
137 :     # Loop through the input.
138 :     while (! eof $ih) {
139 :     # Get the next input line.
140 :     my ($set, $fid, $function) = Tracer::GetLine($ih);
141 :     $stats->Add(lines => 1);
142 :     # Is this a new set?
143 :     if ($set ne $currentSet) {
144 :     # Yes. Produce output for the old set.
145 :     ProcessOldSet($currentSet, \%setRoles, $stats);
146 :     # Set up for the new set.
147 :     %setRoles = ();
148 :     $currentSet = $set;
149 :     }
150 :     # Record this role for this set.
151 :     push @{$setRoles{$function}}, $fid;
152 :     }
153 :     # Produce output for the residual set.
154 :     ProcessOldSet($currentSet, \%setRoles, $stats);
155 :     # Close the files.
156 :     close $ih;
157 :     close CONH;
158 :     close INCH;
159 :     close PINCH;
160 :     close CHGH;
161 :     };
162 :     if ($@) {
163 :     Trace("Error in script: $@") if T(0);
164 :     }
165 :     Trace("Statistics for run:\n" . $stats->Show()) if T(2);
166 :    
167 :     # This function outputs the analysis of a single set. The parameters are
168 :     # $currentSet ID of the relevant set
169 :     # $setRoles reference to a hash mapping each function to the features for which it occurred in
170 :     # the set (used as input)
171 :     # $stats statistics object (updated)
172 :     sub ProcessOldSet {
173 :     # Get the parameters.
174 :     my ($currentSet, $setRoles, $stats) = @_;
175 :     # Only proceed if this is a real set.
176 :     if ($currentSet ne "") {
177 :     $stats->Add(sets => 1);
178 :     # The chosen output handle goes in here.
179 :     my $oh;
180 :     # The sorted list of roles to output goes here.
181 :     my $sortedRoles;
182 :     # The recommended role, if any, goes here.
183 :     my $recommendedRole;
184 :     # Is this a consistent set?
185 :     if (scalar(keys %$setRoles) == 1) {
186 :     # Yes. Output to the consistent file and denote we have another consistent set.
187 :     $oh = \*CONH;
188 :     $stats->Add(consistentSet => 1);
189 :     $sortedRoles = [keys %$setRoles];
190 :     } else {
191 :     # Here we have an inconsistent set. Loop through the role rules, stopping on the first match.
192 :     for my $roleRule (@RoleRules) { last if defined $sortedRoles;
193 :     # Try to match this rule.
194 :     $sortedRoles = $roleRule->Check($setRoles);
195 :     }
196 :     # If no rules matched, we have a purely inconsistent set.
197 :     if (! defined $sortedRoles) {
198 :     $stats->Add(inconsistentSet => 1);
199 :     $oh = \*INCH;
200 :     # Do a standard sort of the roles.
201 :     $sortedRoles = RoleRule::Sort($setRoles);
202 :     } else {
203 :     # Here we want to recommend a role change.
204 :     $stats->Add(fixableSet => 1);
205 :     $oh = \*PINCH;
206 :     # Get the recommended role.
207 :     $recommendedRole = $sortedRoles->[0];
208 :     }
209 :     }
210 :     # Loop through the roles.
211 :     for my $role (@$sortedRoles) {
212 :     # Output this role.
213 :     my @fids = @{$setRoles->{$role}};
214 :     print $oh join("\t", $currentSet, $role, scalar(@fids), join(", ", @fids)) . "\n";
215 :     $stats->Add(roleSetPairs => 1);
216 :     # If there's a recommended role, output the change instructions.
217 :     if (defined $recommendedRole && $role ne $recommendedRole) {
218 :     for my $fid (@fids) {
219 :     print CHGH "$fid\tanalysis\t$recommendedRole\n";
220 :     $stats->Add(changeRequest => 1);
221 :     }
222 :     }
223 :     }
224 :     }
225 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3