[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.46 - (view) (download) (as text)

1 : olson 1.29 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 : parrello 1.45 #
7 : olson 1.29 # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 : parrello 1.45 # Public License.
10 : olson 1.29 #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : overbeek 1.1 use FIG;
19 : parrello 1.14 use Tracer;
20 : redwards 1.17 use strict;
21 : parrello 1.14
# Shared FIG object used by the script body and by every sub below.
# Direct method-call syntax avoids the ambiguous indirect-object form.
my $fig = FIG->new;
23 :    
24 : parrello 1.34 =head1 Attribute Loader
25 : redwards 1.16
26 : parrello 1.34 This script loads attributes into the FIG database. The load process drops and re-creates
27 :     the attribute table and then applies any transactions present in the logs.
28 : redwards 1.16
29 : parrello 1.34 This script begins by deleting the database tables for ALL attributes. It then
30 :     reloads the data. It then processes through each of the genome directories according
31 :     to C<< $fig->genomes() >> and looks for attributes in each directory. These are written to
32 :     a temporary file and then loaded.
33 : overbeek 1.27
34 : parrello 1.34 Note that key names can only contain the characters matched by the \w character class
35 :     (i.e. [a-zA-Z0-9_]).
36 : redwards 1.16
37 : parrello 1.34 The following command-line options are supported.
38 : redwards 1.16
39 : parrello 1.34 =over 4
40 : redwards 1.16
41 : parrello 1.34 =item trace
42 :    
43 :     Tracing level. A higher trace level means more messages will appear. The default
44 :     trace level is 2. Tracing will be to the file C<trace.log> in the FIG temporary
45 :     directory as well as to the standard output.
46 :    
47 :     =item sql
48 :    
49 :     Turn on tracing for SQL commands.
50 :    
51 :     =item links
52 :    
53 :     Include the links as attributes. Currently, only pubmed IDs are loaded as links.
54 :    
55 :     =item keep
56 :    
57 :     Keep the temporary files. The temporary files are used to load the database.
58 :    
59 :     =item noglobal
60 :    
61 :     Ignore attributes in the global directory. This means only feature attributes will
62 :     be loaded.
63 :    
64 : parrello 1.35 =item safe
65 : parrello 1.34
66 : parrello 1.35 Normally, if errors or bad keys are found in an input file, the input file is replaced
67 :     with a cleaned copy (the original is kept under the same name with a C<~> appended). If this
68 :     flag is set, the input file will be left alone and the cleaned copy will remain in the FIG temporary directory.
69 : parrello 1.34
70 :     =back
71 : redwards 1.16
72 : parrello 1.37 In addition to the command-line options, the user can specify one or more genome IDs as
73 :     positional parameters. If specified, only these genomes would be processed; however, the
74 :     entire data table is dropped, so this option should only be used in testing.
75 :    
76 : redwards 1.16 =cut
77 :    
# Parse the command line. Whatever is left over after the options is taken
# as the list of genome IDs to process.
my ($options, @genomes) = StandardSetup(
    [],
    {
        links    => [0, "include the links as attributes"],
        safe     => [0, "do not replace input files with clean copies"],
        keep     => [0, "do not delete temporary load files"],
        noglobal => [0, "ignore attributes in the global directory"],
    },
    "",
    @ARGV
);

Trace("Deleting and Recreating attribute table.") if T(2);

# Attribute keys that parse_file_to_temp silently skips.
my %IGNORE_ATTR = (evidence_code => 1);

# Drop and re-create both tables: the attribute table itself and the table
# that holds metadata about the attribute keys.
my $dbf = $fig->db_handle;
my @table_specs = (
    [ attribute          => "genome varchar(255), ftype varchar(64), id varchar(64), tag varchar(64), val text, url text" ],
    [ attribute_metadata => "attrkey varchar(64), metakey varchar(64), metaval text" ],
);
for my $table_spec (@table_specs) {
    my ($table, $fields) = @{$table_spec};
    $dbf->drop_table( tbl => $table );
    $dbf->create_table( tbl => $table, flds => $fields );
}

# When pre-indexing is configured the indexes are built before the load;
# otherwise they are built after all data is in (see the end of the script).
create_indexes() if $FIG_Config::preIndex;

# Transaction logs found while scanning directories; processed at the end.
my @tlogs;
# Attribute metadata files found while scanning directories; processed at the end.
my @akeys;

# With no genome IDs on the command line, process every genome FIG knows.
# The attribute data is written to flat files which are then bulk-loaded.
@genomes = $fig->genomes unless @genomes;
Trace("Processing genomes.") if T(2);
# Build one flat load file per genome and bulk-load it into the attribute table.
foreach my $genome (@genomes) {
    # Get a unique load-file name for this genome by probing for one that does
    # not exist yet. Few collisions are expected, since keeping the files is
    # nonstandard.
    my $filecount = 1;
    while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") { $filecount++ }
    my $attributesFN = "$FIG_Config::temp/load_attributes.$$.$genome.$filecount";
    my $attributesFH = Open(undef, ">$attributesFN");

    # Genome-level attributes live in $FIG_Config::organisms/$genome/Attributes.
    process_directory("$FIG_Config::organisms/$genome/Attributes", $attributesFH);

    # Feature attributes live one level further down: there is one
    # subdirectory of Features per feature type (peg, rna, etc.), each with its
    # own Attributes directory.
    my $fattdir = "$FIG_Config::organisms/$genome/Features";
    foreach my $ftype (OpenDir($fattdir, 1, 1)) {
        # Optionally convert the feature type's links file into attributes.
        my $linksFN = "$fattdir/$ftype/$ftype.links";
        if ($options->{links} && -e $linksFN) {
            Trace("Loading links for feature directory $ftype.") if T(4);
            links_file($linksFN, $attributesFH);
        }
        # Process the feature attribute directory for this feature type.
        process_directory("$fattdir/$ftype/Attributes", $attributesFH);
    }
    close($attributesFH);

    if (! -s $attributesFN) {
        # Nothing was found for this genome, so discard the empty file.
        unlink($attributesFN);
    } else {
        # Load everything gathered for this genome in one pass.
        my $result = $dbf->load_table( tbl => "attribute", file => $attributesFN );
        Trace("Got $result for " . $fig->genus_species($genome) . " ($genome) while trying to load database.") if T(3);
        if (! $options->{keep}) {
            unlink($attributesFN);
        } else {
            # Guarded like the matching message for the global load file.
            Trace("Genome load file $attributesFN kept.") if T(2);
        }
    }
}
169 :    
# Load the attributes kept in the global directory, unless the user asked
# for them to be skipped.
if ($options->{noglobal}) {
    Trace("Global attributes not requested.") if T(2);
} else {
    Trace("Processing global attributes.") if T(2);
    my $globalFN = "$FIG_Config::temp/global_attributes";
    my $globalFH = Open(undef, ">$globalFN");
    process_directory("$FIG_Config::global/Attributes", $globalFH);
    close $globalFH;
    # Only attempt a load when something was actually written.
    if (-s $globalFN) {
        my $result = $dbf->load_table( tbl => "attribute", file => $globalFN );
        Trace("Got $result for global load from $globalFN") if T(2);
    }
    if ($options->{keep}) {
        Trace("Global load file $globalFN kept.") if T(2);
    } else {
        unlink $globalFN;
    }
}
190 : redwards 1.17
# Last, apply the transaction logs and the attribute metadata files that were
# collected during the directory scans. Each step is skipped when its list is
# empty.
if (@tlogs) {
    parse_transaction_logs(\@tlogs);
}
if (@akeys) {
    parse_attributes_metadata(\@akeys);
}

# If the indexes were not built up front, build them now that the data is in.
create_indexes() unless $FIG_Config::preIndex;

Trace("Attributes loaded.") if T(2);
exit(0);
204 :    
# Create the btree indexes on the attribute and attribute_metadata tables.
# The attribute table's fields are: genome ftype id tag val url.
# An index on val and a three-column (attrkey, metakey, metaval) metadata index
# existed in earlier code but are disabled; only the indexes below are built.
sub create_indexes {
    Trace("Creating indexes.") if T(2);

    # Index name => indexed fields, in creation order, for the attribute table.
    my @attribute_indexes = (
        [ attribute_genome_ix       => "id,genome,ftype" ],
        [ attribute_genome_ftype_ix => "genome, ftype" ],
        [ attribute_key_ix          => "tag" ],
    );
    for my $index_spec (@attribute_indexes) {
        my ($index_name, $fields) = @{$index_spec};
        $dbf->create_index( idx => $index_name, tbl => "attribute", type => "btree", flds => $fields );
    }

    # The metadata index is created last, as the final statement of the sub.
    $dbf->create_index( idx => "attribute_metadata_ix", tbl => "attribute_metadata", type => "btree", flds => "attrkey, metakey");
}
218 :    
219 :    
220 : parrello 1.34 =head3 process_directory
221 :    
222 : parrello 1.45 process_directory($dir, $attributesFH);
223 : parrello 1.34
224 :     Process attribute files in a particular directory. Transaction log file names will be
225 :     stored in the global C<@tlogs> and metadata files will be stored in C<@akeys>. All
226 :     other non-temporary files in the directory will be parsed into the file handle in
227 :     I<$attributesFH>. I<$dir> must be the directory name.
228 :    
229 :     =cut
230 :    
# Sort the files of one attribute directory into three groups.
#   $dir           directory to scan
#   $attributesFH  open handle of the load file being built
# A file named "transaction_log" is queued in @tlogs, a file named
# "attribute_keys" or "attribute_metadata" is queued in @akeys, and every
# other file is parsed into $attributesFH by parse_file_to_temp.
# The directory listing comes from OpenDir($dir, 1, 1); presumably those
# flags filter out temporary files and ignore a directory that cannot be
# opened, as the original comment here described -- confirm against Tracer.
sub process_directory {
    my ($dir, $attributesFH) = @_;
    # A named loop variable is used instead of $_ because parse_file_to_temp
    # reads files line by line and must not disturb the caller's iteration.
    for my $file (OpenDir($dir, 1, 1)) {
        my $path = "$dir/$file";
        if ($file eq "transaction_log") {
            push @tlogs, $path;
        } elsif ($file eq "attribute_keys" || $file eq "attribute_metadata") {
            push @akeys, $path;
        } else {
            parse_file_to_temp($path, $attributesFH);
        }
    }
    return;
}
245 : redwards 1.17
246 :     =head3 links_file()
247 :    
248 : parrello 1.34 Read the links and write them to the output filehandle provided. Requires two arguments -
249 :     the links file and the filehandle where they should be written to
250 : redwards 1.17
251 :     =cut
252 :    
# Convert the PUBMED entries of a links file into attribute lines.
#   $links_file  path of the links file; a missing file is silently skipped
#   $write_to    handle the attribute lines are printed to
# The links file also holds things like gi and uniprot IDs, which are really
# aliases; only the pubmed links are kept for now. Lines are grouped by link
# text, and a group with more than 100 members is dropped, on the theory that
# an article hit that often is the genome paper rather than a feature-specific
# reference.
sub links_file {
    my ($links_file, $write_to) = @_;
    return unless (-e $links_file);

    my $linksFH = Open(undef, "<$links_file");
    # Link text => list of attribute lines carrying that text.
    my %lines_for;
    while (my $line = <$linksFH>) {
        # We only process PUBMED links.
        next unless ($line =~ /pubmed/i);
        chomp $line;
        # Parse out the FIG ID, the link, and the link text. The captures are
        # taken straight from the match so a failed match yields undefs.
        my ($peg, $url, $val) =
            $line =~ m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
        unless ($peg && $url && $val) {
            Trace("Error parsing\n>>>$line<<<\n") if T(1);
            next;
        }
        # Remove the pubmed title from the link text.
        $val =~ s/pubmed\s+//i;
        # Create a feature attribute for the PUBMED link.
        push @{$lines_for{$val}}, "$peg\tPUBMED\t$val\t$url\n";
    }
    close $linksFH;

    # Only output a set of links if there are 100 or fewer.
    foreach my $key (keys %lines_for) {
        next if (scalar @{$lines_for{$key}} > 100);
        print $write_to @{$lines_for{$key}};
    }
    return;
}
290 :    
291 :    
292 :    
293 :     =head2 parse_file_to_temp()
294 :    
295 : parrello 1.34 This method takes two arguments, the name of a file to read and a filehandle to write to.
296 : redwards 1.43 The file is opened, comments and blank lines are ignored, a couple of tests are applied,
297 : parrello 1.34 and the data is written to the filehandle. The incoming file must be an attribute file.
298 : redwards 1.17
299 : redwards 1.43 Note, we also ignore the attributes stored in the hash %IGNORE_ATTR. These are mainly computed attributes.
300 :    
301 : redwards 1.17 =cut
302 :    
# Parse one attribute file into the load file and write a cleaned copy of it.
#   $from  path of the attribute file to read; nothing happens if it is empty
#   $to    handle of the load file; STDOUT is opened if none is given
# Each input line is tab-separated: ID, key, value and an optional URL.
# Blank and comment lines are echoed to the cleaned copy only. Lines whose key
# is in %IGNORE_ATTR are skipped before anything is written, so they appear in
# neither the load file nor the cleaned copy. Invalid lines are written to the
# cleaned copy as "## ERROR ##" comments. Valid lines have their key cleaned if
# needed and go to both files.
# If any key was cleaned or any error was found, the cleaned copy replaces the
# input file (the original is kept as "$from~") unless the safe option is set;
# otherwise the cleaned copy is deleted.
sub parse_file_to_temp {
    my ($from, $to) = @_;
    return unless ($from);
    unless ($to) {
        # Open $to to STDOUT if needed.
        open ($to, ">-");
    }

    Trace("Parsing $from.") if T(3);
    my $inFH = Open(undef, "<$from");

    # Create a file to contain a cleaned copy of the data. The name is built
    # from the directory above Attributes plus the file name, when the path has
    # that shape, and a counter is appended to make it unique.
    my $cleanName;
    if ($from =~ m#([^/]+)/Attributes/(.*)$#i) {
        $cleanName = "$FIG_Config::temp/$1$2.$$.cleaned";
    } else {
        $cleanName = "$FIG_Config::temp/attr.$$.cleaned";
    }
    my $fileCount = 1;
    while (-e "$cleanName$fileCount") {
        $fileCount++;
    }
    $cleanName = "$cleanName$fileCount";
    my $cleanFH = Open(undef, ">$cleanName");

    # Count the input lines, the error lines, and the keys that needed cleaning.
    my ($lineCount, $errorCount, $cleanCount) = (0, 0, 0);
    while (my $rawLine = <$inFH>) {
        $lineCount++;
        # Unlike chomp, Strip removes \r\n when needed.
        my $inputLine = Tracer::Strip($rawLine);
        # Fix internal \r characters.
        $inputLine =~ s/\r/ /g;

        # Echo blank and comment lines unmodified to the cleaned copy.
        if ($inputLine =~ /^\s*\#/ || $inputLine =~ /^\s*$/) {
            print $cleanFH "$inputLine\n";
            next;
        }

        my @line = split /\t/, $inputLine;
        # Quietly ignore the IGNORE_ATTR keys.
        next if (defined $line[1] && $IGNORE_ATTR{$line[1]});

        # Error lines are converted to comments in the cleaned copy.
        my $problem = _attribute_line_error(\@line, $lineCount, $from);
        if (defined $problem) {
            Trace($problem) if T(1);
            print $cleanFH "## ERROR ## $inputLine\n";
            $errorCount++;
            next;
        }

        # Clean the key if it contains non-word characters.
        if ($line[1] =~ /\W/) {
            $cleanCount++;
            $line[1] = $fig->clean_attribute_key($line[1]);
        }
        # Insure we have a URL value.
        $line[3] = "" unless defined $line[3];
        # Rejoin the line and print it to the clean file.
        print $cleanFH join("\t", @line) . "\n";

        # Now the load file: replace the first element with whatever
        # split_attribute_oid returns for it.
        splice(@line, 0, 1, $fig->split_attribute_oid($line[0]));
        # Unescape the periods. Postgres behaves in a goofy way regarding
        # escape sequences.
        my $loadLine = join "\t", @line;
        $loadLine =~ s/\\\./\./g;
        print $to "$loadLine\n";
    }
    close $inFH;
    Trace("$lineCount lines read from $from.") if T(4);
    close $cleanFH;

    # Now we figure out what to do with the clean file. If we did real work,
    # then we replace the original file with it. Otherwise, we delete it.
    if ($cleanCount || $errorCount) {
        Trace("$cleanCount malformed keys and $errorCount errors found in $from.") if T(1);
        if (! $options->{safe}) {
            # The clean file is in the temporary directory, which may be on a
            # different filesystem, so use move() (rename with a copy
            # fallback) and never leave $from missing.
            require File::Copy;
            if (! rename($from, "$from~")) {
                Trace("Could not back up $from: $!. Clean file $cleanName kept.") if T(1);
            } elsif (! File::Copy::move($cleanName, $from)) {
                Trace("Could not replace $from with $cleanName: $!. Original restored.") if T(1);
                rename("$from~", $from);
            }
        } else {
            Trace("Clean file $cleanName kept.") if T(3);
        }
    } else {
        unlink $cleanName;
    }
    return;
}

# Return the trace message describing what is wrong with a split attribute
# line, or nothing if the line is acceptable. A field counts as missing only
# when it is undefined or empty, so a literal "0" is a valid ID, key or value.
sub _attribute_line_error {
    my ($fields, $lineCount, $from) = @_;
    my ($id, $key, $value) = @{$fields};
    my $is_missing = sub { my ($field) = @_; return !(defined $field && length $field) };
    return "No ID at line $lineCount in $from."    if $is_missing->($id);
    return "No key at line $lineCount in $from."   if $is_missing->($key);
    return "No value at line $lineCount in $from." if $is_missing->($value);
    return "Key is longer than 64 characters at line $lineCount in $from."
        if length($key) > 64;
    return "Line $lineCount in $from has more than 4 columns."
        if @{$fields} > 4;
    return;
}
421 : redwards 1.18
422 : parrello 1.34 =head2 parse_transaction_logs
423 : redwards 1.18
424 : parrello 1.34 This method takes a reference to an array of paths to transactions_logs and will read
425 :     and process them
426 : redwards 1.18
427 :     =cut
428 :    
# Hand each transaction log to FIG for processing.
#   first argument: reference to an array of transaction log paths;
#   nothing is done when it is not supplied.
sub parse_transaction_logs {
    my ($log_paths) = @_;
    return if !$log_paths;
    for my $log_path (@{$log_paths}) {
        Trace("Parsing transaction log $log_path") if T(2);
        $fig->read_attribute_transaction_log($log_path);
    }
}
437 : overbeek 1.27
438 : parrello 1.34 =head2 parse_attributes_metadata
439 : overbeek 1.27
440 : parrello 1.34 This method takes a reference to an array of attributes metadata files and loads
441 :     them into the database. It will also rename attribute_keys to attribute_metadata
442 :     to be consistent and hopefully clearer.
443 : overbeek 1.27
444 :     =cut
445 :    
# Load attribute metadata files into the database.
#   first argument: reference to an array of metadata file paths;
#   nothing is done when it is not supplied.
# Files named attribute_keys are first passed to update_attributes_metadata,
# and the location it returns is read instead. Each data line is
# tab-separated: attribute key, metadata key, metadata value.
sub parse_attributes_metadata {
    my $akeys = shift;
    return unless ($akeys);

    # First work out the set of files to read. A hash is used so that a
    # location reached from more than one input is only read once.
    my %attributekeys;
    foreach my $ak (@$akeys) {
        if ($ak =~ /attribute_keys$/) {
            # Old-style name: let FIG fold it into attribute_metadata and tell
            # us where the data ended up.
            my $location = $fig->update_attributes_metadata($ak);
            $attributekeys{$location} = 1;
        } else {
            $attributekeys{$ak} = 1;
        }
    }

    foreach my $ak (keys %attributekeys) {
        Trace("Parsing attribute metadata $ak.") if T(4);
        my $metaFH = Open(undef, "<$ak");
        while (my $record = <$metaFH>) {
            # Skip comments, and blank lines, which carry no fields at all.
            next if ($record =~ /^\s*\#/ || $record =~ /^\s*$/);
            chomp $record;
            my @line = split /\t/, $record;
            # Pass in the attribute key and a reference to a hash mapping the
            # metadata key to its value.
            $fig->key_info($line[0], { $line[1] => $line[2] }, 1);
        }
        close $metaFH;
    }
    return;
}
474 : parrello 1.34
475 : overbeek 1.42 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3