[Bio] / FigKernelScripts / add_assertions_of_function.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/add_assertions_of_function.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.38 - (view) (download) (as text)

1 : efrank 1.1 # -*- perl -*-
2 : olson 1.31 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : efrank 1.1
20 :     # usage: add_assertions_of_function [G1 G2 G3 ...]
21 :    
22 :     use strict;
23 :     use FIG;
24 : parrello 1.23 use Tracer;
# Connect to the SEED data store and grab its relational database handle.
my $fig = FIG->new;
my $dbf = $fig->db_handle;

# File-scoped working variables. NOTE: $temp_dir is also read by the
# add_*_to_db subroutines at the bottom of this file, so it must stay
# declared at file scope under this name.
my($temp_dir,$user,$prot_id,$quality,$made_by,$file,$line,$assigned_function,$org,@pieces,$piece,@terms,$term);
my($remove_dups,%assignments,$must_correct);

# Get the genome list. If all genomes are requested, $mode will be set to 'all'.
my ($mode, @genomes) = FIG::parse_genome_args(@ARGV);

Trace("Clearing old data.") if T(2);
if ($mode eq 'all') {
    # Full reload: throw away and recreate all three tables. The indexes are
    # rebuilt at the end of the script, after the bulk loads.
    $dbf->drop_table( tbl => "assigned_functions" );
    $dbf->create_table( tbl  => "assigned_functions",
                        flds => qq(prot varchar(64),
                                   made_by varchar(32),
                                   assigned_function text,
                                   quality char,
                                   org varchar(64),
                                   annotation_written char(1),
                                   last_modification timestamp
                                   )
                        );

    $dbf->drop_table( tbl => "assigned_functions_log" );
    $dbf->create_table( tbl  => "assigned_functions_log",
                        flds => qq(prot varchar(64),
                                   made_by varchar(32),
                                   assigned_function text,
                                   quality char,
                                   org varchar(64),
                                   created timestamp default current_timestamp
                                   )
                        );

    $dbf->drop_table( tbl => "roles" );
    $dbf->create_table( tbl  => "roles",
                        flds => "prot varchar(64), role varchar(255)," .
                                "made_by varchar(32), org varchar(64)"
                        );
}
else
{
    # Partial reload: purge only the rows belonging to the requested genomes.
    # Iterate over the parsed genome list (the same list that drives the file
    # scan below) rather than the raw command line, so that exactly the
    # genomes about to be reloaded are the ones cleared.
    # The assigned_functions_log table is left untouched here.
    foreach my $genome (@genomes)
    {
        $dbf->SQL("DELETE FROM assigned_functions WHERE ( org = \'$genome\' )");
        $dbf->SQL("DELETE FROM roles WHERE ( org = \'$genome\' )");
    }
}
75 : parrello 1.25
# Staging files for the bulk loads live in the configured temp directory.
# $temp_dir is file-scoped because the add_*_to_db subroutines read it too.
$temp_dir = $FIG_Config::temp;
Trace("Finding assignments.") if T(2);
foreach my $source (files_with_assignments($mode, @genomes))
{
    # Each entry is [ file name, annotator, remove-dups flag ]. The flag
    # indicates whether or not updates may have been appended to the end of
    # the file.
    my ($path, $annotator, $dedupe) = @$source;
    Trace("Processing $path for user $annotator: remove_dups = $dedupe") if T(3);
    $annotator =~ s/\s/_/g;

    if ($dedupe) {
        # Pass 1: remember the last assignment seen for every ID and note
        # whether any ID occurred more than once.
        my %latest;
        my $saw_repeat = 0;
        Open(\*TMP, "<$path");
        while (defined(my $entry = <TMP>))
        {
            next unless $entry =~ /^(\S+)\t(\S.*\S)\s*$/;
            my ($id, $func) = ($1, $2);
            $saw_repeat = 1 if $latest{$id};
            $latest{$id} = $func;
        }
        close(TMP);

        if ($saw_repeat)
        {
            # Rewrite the file with one line per ID, keeping the previous
            # version around as "<file>~".
            Trace("Removing duplicates for $path.") if T(3);
            unlink("$path~");
            rename($path,"$path~") || Confess("could not rename $path");
            Open(\*TMP, ">$path");
            foreach my $id (sort { &FIG::by_fig_id($a,$b) } keys(%latest))
            {
                print TMP "$id\t$latest{$id}\n";
            }
            close(TMP);
            chmod(0777,$path);
        }
    }

    # Now the file contains the assignments with all the duplicates removed.
    # These counters track how many rows are waiting in each staging file;
    # the add_*_to_db helpers reset them whenever they flush a batch.
    my ($aN, $aLN, $rN) = (0, 0, 0);

    unless (open(TMP, "<$path"))
    {
        Trace("Could not open $path.") if T(0);
        next;
    }

    my $assign_stage = "$temp_dir/tmp$$";
    my $log_stage    = "$temp_dir/tmpl$$";
    my $role_stage   = "$temp_dir/tmpgen$$";

    Open(\*ASSIGNMENTS, ">$assign_stage");
    Open(\*ASSIGNMENT_LOGS, ">$log_stage");
    Open(\*INDEX, "| sort -u -T $temp_dir > $role_stage");

    while (defined(my $record = <TMP>))
    {
        chomp $record;
        my ($id, $func, $qual) = split(/\t/,$record);
        $func =~ s/^\s+//;
        $func =~ s/(\t\S)?\s*$//;

        # For non-FIG IDs a third column is folded back into the function text.
        if (($id !~ /^fig\|/) && $qual)
        {
            $func = "$func $qual";
            $qual = "";
        }
        $func =~ s/\s+\[imported\]\s*$// if ($id =~ /^pir/);

        next if (! $id);

        my $genome = $fig->genome_of($id) || "unknown";

        $func =~ s/\\$//;     #...Backslashes appear to cause problems for PostGres...
        $func =~ s/\\/ /g;    #...Backslashes appear to cause problems for PostGres...

        # Skip rows that lack a field or carry a multi-character quality code.
        next unless (defined($id) && defined($func));
        next unless ((! $qual) || (length($qual) == 1));
        $qual ||= "";

        next unless verify_row($id,$annotator,$func,$qual,$genome);

        add_assignment_to_db($dbf,$assign_stage,\$aN,
                             "$id\t$annotator\t$func\t$qual\t$genome\tL\t\\N\n");
        add_assignment_log_to_db($dbf,$log_stage,\$aLN,
                                 "$id\t$annotator\t$func\t$qual\t$genome\t\\N\n");

        $func =~ s/\s*[\!\#].*$//;     # clear comment from roles

        # Only FIG proteins whose function fits the role column get role rows.
        next unless ((length($func) <= 255) && ($id =~ /^fig\|/));

        add_role_to_db($dbf,$role_stage,\$rN,"$id\t$func\t$annotator\t$genome\n");

        # Index every sufficiently long piece of a multi-part function, plus
        # any dotted four-number term (EC-number shaped) found inside a piece.
        my @parts = grep { length($_) > 3 } split(/\s*;\s+|\s+[\@\/]\s+/,$func);
        foreach my $part (@parts)
        {
            my @role_terms = ($part =~ /\d+\.\d+\.\d+\.\d+/g);
            push(@role_terms,$part) if (@parts > 1);

            foreach my $role_term (@role_terms)
            {
                add_role_to_db($dbf,$role_stage,\$rN,"$id\t$role_term\t$annotator\t$genome\n");
            }
        }
    }
    close(TMP);
    close(ASSIGNMENTS);
    close(ASSIGNMENT_LOGS);
    close(INDEX);

    # Load whatever is left in each staging file after the last batch flush.
    $dbf->load_table( tbl  => "assigned_functions",
                      file => $assign_stage ) if ($aN > 0);
    $dbf->load_table( tbl  => "assigned_functions_log",
                      file => $log_stage ) if ($aLN > 0);
    $dbf->load_table( tbl  => "roles",
                      file => $role_stage ) if ($rN > 0);
}
204 :    
# After a full reload the tables were recreated from scratch, so their
# indexes have to be rebuilt and the tables vacuumed.
if ($mode eq 'all')
{
    Trace("Creating assignments index.") if T(2);
    $dbf->create_index( idx  => "assignments_ix",
                        tbl  => "assigned_functions",
                        type => "btree",
                        flds => "prot,made_by" );
    $dbf->create_index( idx  => "assignments_ix2",
                        tbl  => "assigned_functions",
                        type => "btree",
                        flds => "org" );
    $dbf->create_index( idx  => "assignments_log_ix",
                        tbl  => "assigned_functions_log",
                        type => "btree",
                        flds => "prot,made_by" );
    Trace("Creating roles index.") if T(2);
    $dbf->create_index( idx  => "roles_ix",
                        tbl  => "roles",
                        type => "btree",
                        flds => "role,org" );
    Trace("Creating protein index.") if T(2);
    $dbf->create_index( idx  => "roles_ix2",
                        tbl  => "roles",
                        type => "btree",
                        flds => "prot" );
    Trace("Creating org index.") if T(2);
    $dbf->create_index( idx  => "roles_ix3",
                        tbl  => "roles",
                        type => "btree",
                        flds => "org" );
    Trace("Vaccuuming tables.") if T(2);
    $dbf->vacuum_it("assigned_functions");
    $dbf->vacuum_it("assigned_functions_log");
    $dbf->vacuum_it("roles");
}

# Remove all three staging files: assignments, assignment log, and roles.
# unlink quietly skips any that were never created.
unlink("$temp_dir/tmp$$","$temp_dir/tmpl$$","$temp_dir/tmpgen$$");
undef $fig;
Trace("Function assertions added.") if T(2);
244 : efrank 1.1
# Build the list of assignment files to load.
#
# Parameters:
#   $mode     "all" to scan every genome directory under
#             $FIG_Config::organisms (and to include the global
#             ext_func.table file); anything else uses @genomes as given.
#   @genomes  genome directory names to look at when $mode is not "all".
#
# Returns a list of triples [ file name, annotator name, remove-dups flag ].
# Only non-empty per-genome assigned_functions files are listed.
sub files_with_assignments {
    my($mode,@genomes) = @_;
    my @found;

    if ($mode eq "all")
    {
        push(@found, ["$FIG_Config::global/ext_func.table","master",0]);

        # Replace the genome list with every directory entry that looks
        # like a genome ID, in numeric order.
        opendir(my $org_dh,"$FIG_Config::organisms") || die "Where are the organisms?";
        @genomes = sort { $a <=> $b } grep { /^\d+\.\d+$/ } readdir($org_dh);
        closedir($org_dh);
    }

    ##### I am turning off user models as of June 1, 2006 [RAO]
    my $scan_user_models = 0;

    foreach my $genome_id (@genomes)
    {
        my $genome_dir  = "$FIG_Config::organisms/$genome_id";
        my $master_file = "$genome_dir/assigned_functions";
        push(@found, [$master_file,"master",1]) if (-s $master_file);

        # Per-user assignment files; unreachable while the flag above is 0.
        next unless $scan_user_models;
        next unless (-d "$genome_dir/UserModels");
        next unless opendir(my $user_dh,"$genome_dir/UserModels");

        my @user_names = grep { $_ !~ /^\./ } readdir($user_dh);
        closedir($user_dh);
        foreach my $user_name (@user_names)
        {
            my $user_file = "$genome_dir/UserModels/$user_name/assigned_functions";
            push(@found, [$user_file,$user_name,1]) if (-s $user_file);
        }
    }
    return @found;
}
282 :    
# The idea here is we build up the assignment file, and if it exceeds a certain
# size, we copy it into the database.
#
# Parameters:
#   $dbf   database handle; its load_table method performs the bulk load.
#   $file  name of the staging file behind the ASSIGNMENTS handle.
#   $aNP   reference to the count of rows currently staged; reset on flush.
#   $row   one tab-delimited, newline-terminated row to stage.
#
# The caller must already have ASSIGNMENTS open for writing on $file.
sub add_assignment_to_db {
    my($dbf, $file, $aNP, $row) = @_;
    if ($$aNP > 50000)
    {
        # Only build the timestamp when the trace will actually be emitted.
        Trace("Copying assignments: " . scalar(localtime)) if T(4);

        # Close first so every buffered row is on disk before the load.
        close(ASSIGNMENTS);

        # Load the staging file we were handed (not a separately hard-coded
        # path), so the file loaded is always the file reopened below.
        $dbf->load_table( tbl  => "assigned_functions",
                          file => $file );

        $$aNP = 0;
        # Reopening with ">" truncates the file, starting a new batch.
        Open(\*ASSIGNMENTS, ">$file");
    }
    print ASSIGNMENTS $row;
    $$aNP++;
}
302 :    
# Stage one row for the assigned_functions_log table, copying the staging
# file into the database whenever it grows past the batch size.
#
# Parameters:
#   $dbf   database handle; its load_table method performs the bulk load.
#   $file  name of the staging file behind the ASSIGNMENT_LOGS handle.
#   $aNP   reference to the count of log rows currently staged; reset on flush.
#   $row   one tab-delimited, newline-terminated row to stage.
#
# The caller must already have ASSIGNMENT_LOGS open for writing on $file.
sub add_assignment_log_to_db {
    my($dbf, $file, $aNP, $row) = @_;
    if ($$aNP > 50000)
    {
        # Only build the timestamp when the trace will actually be emitted.
        Trace("Copying assignment logs: " . scalar(localtime)) if T(4);

        # Close first so every buffered row is on disk before the load.
        close(ASSIGNMENT_LOGS);

        # Load the LOG staging file. Loading the assigned_functions staging
        # file here would push rows of the wrong shape into the log table
        # and silently drop the staged log rows when $file is truncated below.
        $dbf->load_table( tbl  => "assigned_functions_log",
                          file => $file );

        $$aNP = 0;
        # Reopening with ">" truncates the file, starting a new batch.
        Open(\*ASSIGNMENT_LOGS, ">$file");
    }
    print ASSIGNMENT_LOGS $row;
    $$aNP++;
}
320 :    
# Stage one row for the roles table, copying the staging file into the
# database whenever the number of staged rows passes the batch size.
#
# Parameters:
#   $dbf   database handle; its load_table method performs the bulk load.
#   $file  name of the file that the INDEX sort pipeline writes to.
#   $rNP   reference to the count of rows sent to the pipeline; reset on flush.
#   $row   one tab-delimited, newline-terminated row to stage.
#
# Rows are written through a "sort -u" pipe, so duplicates within a batch are
# collapsed before loading. The caller must already have INDEX open on a
# pipeline that writes to $file.
sub add_role_to_db {
    my($dbf,$file,$rNP,$row) = @_;

    if ($$rNP > 50000)
    {
        Trace("Copying role index for $file.") if T(3);

        # Closing the pipe lets sort finish writing $file before the load.
        close(INDEX);

        # Load the file we were handed (not a separately hard-coded path),
        # so the file loaded is always the file the pipeline writes.
        $dbf->load_table( tbl  => "roles",
                          file => $file );
        $$rNP = 0;
        Open(\*INDEX, "| sort -u -T $temp_dir > $file");
    }
    print INDEX $row;
    $$rNP++;
}
337 : overbeek 1.14
# Check that a row's fixed-width fields fit their database columns.
#
# Parameters: the five row fields, in table order. The limits are 64
# characters for $prot_id and $org, 32 for $made_by and 1 for $quality.
# $assigned_function is not length-checked; it only appears in the trace
# message.
#
# Returns 1 when every checked field fits. Otherwise traces the offending
# row (at trace level 0) and returns 0.
sub verify_row {
    my($prot_id,$made_by,$assigned_function,$quality,$org) = @_;

    my @width_limits = ( [ $prot_id, 64 ],
                         [ $made_by, 32 ],
                         [ $quality,  1 ],
                         [ $org,     64 ] );

    my @too_wide = grep { length($_->[0]) > $_->[1] } @width_limits;
    return 1 unless @too_wide;

    Trace("Field-width overflow in entry: \"$prot_id\t$made_by\t$assigned_function\t$quality\t$org\"") if T(0);
    return 0;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3