[Bio] / FigKernelScripts / analyze_loose_sets.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/analyze_loose_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     =head1 Analyze Loose Sets
21 :    
22 :     analyze_loose_sets set_file
23 :    
24 :     This script reads a file of loose gene sets with functional assignments and outputs
25 :     an analysis of the assignments found in each set.
26 :    
27 :     The input should be a tab-delimited file with set numbers in the first column, feature
28 :     IDs in the second column, and functional assignments in the third column. All features
29 :     in the same set are believed to be members of the same protein family, so it is
30 :     expected that they would all have the same function.
31 :    
32 : parrello 1.3 There will be six output files, each with the same name as the input file plus a
33 :     suffix.
34 :    
35 :     Four output files will have the same format: a tab-delimited file
36 : parrello 1.1 containing a set number in the first column, a functional assignment in the
37 :     second, a count in the third, and a comma-delimited list of feature IDs in the fourth.
38 :     The count indicates the number of features in the set having that functional
39 :     role. The first output file will have a suffix of C<.con> and will only contain
40 :     sets that have a single functional role. The second output file will have a
41 :     suffix of C<.pinc> and will contain sets that have multiple functional
42 :     roles of which one is believed to be correct. In this file the recommended role will be
43 : parrello 1.2 output first. The third output file will have a suffix of C<.hinc> and will contain sets
44 :     that have multiple functional roles which would be consistent or could be fixed to consistent
45 :     if the hypothetical roles were removed from consideration. The fourth output
46 :     file will have a suffix of C<.inc> and will contain sets that have multiple
47 :     functional roles for which no recommendation can be made. In this file the roles
48 :     will be output from most common to least common.
49 : parrello 1.1
50 : parrello 1.3 The fifth output file will have a suffix of C<.chg> and will contain the recommended
51 : parrello 1.1 functional role changes. It will have a feature ID in the first column, a constant
52 :     C<analysis> in the second column, and the recommended function ID in the third.
53 :    
54 : parrello 1.3 The sixth output file will be a comma-separated values file with a suffix of C<.csv>
55 :     suitable for loading into Excel. The first column in this file will be a set number, the
56 :     second column will be the best functional assignment for the set, and the third column
57 :     will be blank.
58 :    
59 : parrello 1.1 The single positional parameter is the name of the input file and is required.
60 :    
61 :     The command-line parameters are as follows.
62 :    
63 :     =over 4
64 :    
65 :     =item user
66 :    
67 :     Name suffix to be used for log files. If omitted, the PID is used.
68 :    
69 :     =item trace
70 :    
71 :     Numeric trace level. A higher trace level causes more messages to appear. The
72 :     default trace level is 2. Tracing will be directly to the standard output
73 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
74 :     where I<User> is the value of the B<user> option above.
75 :    
76 :     =item background
77 :    
78 :     Save the standard and error output to files. The files will be created
79 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
80 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
81 :     B<user> option above.
82 :    
83 :     =item h
84 :    
85 :     Display this command's parameters and options.
86 :    
87 :     =item dbname
88 :    
89 :     Name of the Sapling database to use. This option is generally only useful for debugging.
90 :    
91 :     =item dbhost
92 :    
93 :     SQL host for the Sapling database to use. This option is generally only useful for debugging.
94 :    
95 :     =item dbport
96 :    
97 :     Database port to use for the Sapling database. This option is generally only useful for debugging.
98 :    
99 :     =back
100 :    
101 :     =cut
102 :    
103 :     use strict;
104 :     use Tracer;
105 :     use Stats;
106 :     use SeedUtils;
107 :     use Sapling;
108 :    
109 :     use RoleRuleSubstring;
110 :    
111 :     my ($options, @parameters) = StandardSetup([], {
112 :     dbname => [$FIG_Config::saplingDB, "name of the Sapling database to use"],
113 :     dbhost => ["", "host containing the Sapling database"],
114 :     dbport => ["", "port for connecting to the Sapling database"],
115 :     trace => ["3-", "tracing level"],
116 :     }, "<inputFileName>",
117 :     @ARGV);
118 :    
119 :     # Create a statistics object.
120 :     my $stats = Stats->new();
121 :     # Get the Sapling database.
122 :     my $sap = Sapling->new(dbName => $options->{dbname}, dbhost => $options->{dbhost},
123 :     port => $options->{dbport});
124 :     # Create the list of role rules.
125 :     my @RoleRules = (RoleRuleSubstring->new($sap, $stats));
126 :     eval {
127 :     # We have valid options here, so we can proceed. This variable will contain the current
128 :     # set number.
129 :     my $currentSet = "";
130 :     # This hash will list the IDs of the sets containing each non-hypothetical assignment found.
131 :     # At the end of the run, information about the multiply-occurring assignments will be displayed.
132 :     my %roleSets;
133 :     # This hash counts the number of times each role was found in the current set.
134 :     my %setRoles;
135 :     # Get the input file name.
136 :     my $inFileName = $parameters[0];
137 :     if (! $inFileName || $inFileName eq "-") {
138 :     Confess("No input file specified.");
139 :     }
140 : parrello 1.2 # Open the input and output files.
141 : parrello 1.1 Trace("Opening set file $inFileName.") if T(2);
142 :     my $ih = Open(undef, "<$inFileName");
143 :     Open(\*CONH, ">$inFileName.con");
144 :     Open(\*INCH, ">$inFileName.inc");
145 :     Open(\*PINCH, ">$inFileName.pinc");
146 : parrello 1.2 Open(\*HINCH, ">$inFileName.hinc");
147 : parrello 1.1 Open(\*CHGH, ">$inFileName.chg");
148 : parrello 1.3 Open(\*CSVH, ">$inFileName.csv");
149 :     # Put column headers in the CSV file.
150 :     print CSVH qq("Set #","Best Functional Assignment","Recommendation"\n);
151 : parrello 1.1 # Loop through the input.
152 :     while (! eof $ih) {
153 :     # Get the next input line.
154 :     my ($set, $fid, $function) = Tracer::GetLine($ih);
155 :     $stats->Add(lines => 1);
156 :     # Is this a new set?
157 :     if ($set ne $currentSet) {
158 :     # Yes. Produce output for the old set.
159 :     ProcessOldSet($currentSet, \%setRoles, $stats);
160 :     # Set up for the new set.
161 :     %setRoles = ();
162 :     $currentSet = $set;
163 :     }
164 :     # Record this role for this set.
165 :     push @{$setRoles{$function}}, $fid;
166 :     }
167 :     # Produce output for the residual set.
168 :     ProcessOldSet($currentSet, \%setRoles, $stats);
169 :     # Close the files.
170 :     close $ih;
171 :     close CONH;
172 :     close INCH;
173 :     close PINCH;
174 : parrello 1.2 close HINCH;
175 : parrello 1.1 close CHGH;
176 :     };
177 :     if ($@) {
178 :     Trace("Error in script: $@") if T(0);
179 :     }
180 :     Trace("Statistics for run:\n" . $stats->Show()) if T(2);
181 :    
182 :     # This function outputs the analysis of a single set. The parameters are
183 :     # $currentSet ID of the relevant set
184 :     # $setRoles reference to a hash mapping each function to the features for which it occurred in
185 :     # the set (used as input)
186 :     # $stats statistics object (updated)
187 :     sub ProcessOldSet {
188 :     # Get the parameters.
189 :     my ($currentSet, $setRoles, $stats) = @_;
190 :     # Only proceed if this is a real set.
191 :     if ($currentSet ne "") {
192 :     $stats->Add(sets => 1);
193 :     # The chosen output handle goes in here.
194 :     my $oh;
195 :     # The sorted list of roles to output goes here.
196 :     my $sortedRoles;
197 :     # The recommended role, if any, goes here.
198 :     my $recommendedRole;
199 :     # Is this a consistent set?
200 :     if (scalar(keys %$setRoles) == 1) {
201 :     # Yes. Output to the consistent file and denote we have another consistent set.
202 :     $oh = \*CONH;
203 :     $stats->Add(consistentSet => 1);
204 :     $sortedRoles = [keys %$setRoles];
205 :     } else {
206 : parrello 1.2 # Check for outnumbered hypotheticals. Non-hypotheticals will be considered
207 :     # "regular" and hypotheticals will be considered "excess".
208 :     my $regularRoles = {};
209 :     my $excessRoles = {};
210 :     my $regularCount = 0;
211 :     my $excessCount = 0;
212 :     for my $role (keys %$setRoles) {
213 :     my $fids = $setRoles->{$role};
214 :     if (SeedUtils::hypo($role)) {
215 :     $excessRoles->{$role} = $fids;
216 :     $excessCount += scalar @$fids;
217 :     } else {
218 :     $regularRoles->{$role} = $fids;
219 :     $regularCount += scalar @$fids;
220 :     }
221 :     }
222 :     my $excessList = [];
223 :     # Do we have outnumbered hypotheticals?
224 :     if ($excessCount && $regularCount > $excessCount * 2) {
225 :     # Yes. Sort the hypotheticals into the excess list.
226 :     $excessList = RoleRule::Sort($excessRoles);
227 :     } else {
228 :     # No. Denote we have no excess rules.
229 :     $regularRoles = $setRoles;
230 :     $excessRoles = [];
231 :     $excessCount = 0;
232 :     }
233 :     # Loop through the role rules, stopping on the first match.
234 : parrello 1.1 for my $roleRule (@RoleRules) { last if defined $sortedRoles;
235 :     # Try to match this rule.
236 : parrello 1.2 $sortedRoles = $roleRule->Check($regularRoles);
237 : parrello 1.1 }
238 :     # If no rules matched, we have a purely inconsistent set.
239 :     if (! defined $sortedRoles) {
240 :     $stats->Add(inconsistentSet => 1);
241 :     $oh = \*INCH;
242 :     # Do a standard sort of the roles.
243 :     $sortedRoles = RoleRule::Sort($setRoles);
244 : parrello 1.2 # Denote the excess roles no longer matter.
245 :     $excessList = [];
246 :     $excessCount = 0;
247 : parrello 1.1 } else {
248 :     # Here we want to recommend a role change.
249 :     $stats->Add(fixableSet => 1);
250 : parrello 1.2 # Output to the proper file depending on whether hypotheticals are a factor.
251 :     $oh = ($excessCount ? \*HINCH : \*PINCH);
252 : parrello 1.1 # Get the recommended role.
253 :     $recommendedRole = $sortedRoles->[0];
254 : parrello 1.2 # Output the change instructions.
255 :     for my $role (@$sortedRoles) {
256 :     if ($role ne $recommendedRole) {
257 :     for my $fid (@{$setRoles->{$role}}) {
258 :     print CHGH "$fid\tanalysis\t$recommendedRole\n";
259 :     $stats->Add(changeRequest => 1);
260 :     }
261 :     }
262 :     }
263 :     # Add the excess roles (if any) to the list for the final output.
264 :     push @$sortedRoles, @$excessList;
265 : parrello 1.1 }
266 :     }
267 : parrello 1.3 # Send the best role to the CSV file.
268 :     print CSVH qq($currentSet,"$sortedRoles->[0]",\n);
269 : parrello 1.1 # Loop through the roles.
270 :     for my $role (@$sortedRoles) {
271 :     # Output this role.
272 :     my @fids = @{$setRoles->{$role}};
273 :     print $oh join("\t", $currentSet, $role, scalar(@fids), join(", ", @fids)) . "\n";
274 :     $stats->Add(roleSetPairs => 1);
275 :     }
276 :     }
277 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3