[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.37 - (view) (download) (as text)

1 : olson 1.29 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : overbeek 1.1 use FIG;
19 : parrello 1.14 use Tracer;
20 : redwards 1.17 use strict;
21 : parrello 1.14
# Create the FIG object used for all database and genome access below. The
# direct method-call form is used rather than the indirect-object form
# ("new FIG"), which Perl can mis-parse when a sub named "new" is in scope.
my $fig = FIG->new();
23 :    
24 : parrello 1.34 =head1 Attribute Loader
25 : redwards 1.16
26 : parrello 1.34 This script loads attributes into the FIG database. The load process drops and re-creates
27 :     the attribute table and then applies any transactions present in the logs.
28 : redwards 1.16
29 : parrello 1.34 This script begins by deleting the database tables for ALL attributes. It then
30 :     reloads the data. It then processes through each of the genome directories according
31 :     to C<< $fig->genomes() >> and looks for attributes in each directory. These are written to
32 :     a temporary file and then loaded.
33 : overbeek 1.27
34 : parrello 1.34 Note that key names can only contain the characters matched by the \w character class
35 :     (i.e. [a-zA-Z0-9_]).
36 : redwards 1.16
37 : parrello 1.34 The following command-line options are supported.
38 : redwards 1.16
39 : parrello 1.34 =over 4
40 : redwards 1.16
41 : parrello 1.34 =item trace
42 :    
43 :     Tracing level. A higher trace level means more messages will appear. The default
44 :     trace level is 2. Tracing will be to the file C<trace.log> in the FIG temporary
45 :     directory as well as to the standard output.
46 :    
47 :     =item sql
48 :    
49 :     Turn on tracing for SQL commands.
50 :    
51 :     =item links
52 :    
53 :     Include the links as attributes. Currently, only pubmed IDs are loaded as links.
54 :    
55 :     =item keep
56 :    
57 :     Keep the temporary files. The temporary files are used to load the database.
58 :    
59 :     =item noglobal
60 :    
61 :     Ignore attributes in the global directory. This means only feature attributes will
62 :     be loaded.
63 :    
64 : parrello 1.35 =item safe
65 : parrello 1.34
66 : parrello 1.35 Normally, if errors or bad keys are found in an input file, the input file is replaced
67 :     with a cleaned copy. If this flag is set, the input file will be left alone and the
68 :     cleaned copy will remain in the FIG temporary directory.
69 : parrello 1.34
70 :     =back
71 : redwards 1.16
72 : parrello 1.37 In addition to the command-line options, the user can specify one or more genome IDs as
73 :     positional parameters. If specified, only these genomes would be processed; however, the
74 :     entire data table is dropped, so this option should only be used in testing.
75 :    
76 : redwards 1.16 =cut
77 :    
# Parse the command line. All four script-specific switches default to off;
# whatever is left after the switches is treated as a list of genome IDs.
my ($options, @genomes) = StandardSetup(
    [],
    { links => 0, safe => 0, keep => 0, noglobal => 0 },
    @ARGV
);

Trace("Deleting and Recreating attribute table.") if T(2);

# Rebuild the two database tables from scratch: the attribute table itself and
# a table of data about the attribute keys. Each table is dropped and then
# immediately re-created, in the order listed.
my $dbf = $fig->db_handle;
foreach my $tableSpec (
    [ 'attribute',
      "genome varchar(64), ftype varchar(64), id varchar(64), tag varchar(64), val text, url text" ],
    [ 'attribute_metadata',
      "attrkey varchar(64), metakey varchar(64), metaval text" ],
) {
    my ($tableName, $fieldList) = @{$tableSpec};
    $dbf->drop_table( tbl => $tableName );
    $dbf->create_table( tbl => $tableName, flds => $fieldList );
}

# Paths of any transaction logs found while scanning directories. They are
# collected here and processed once all the flat files have been loaded.
my @tlogs;
# Paths of any attribute metadata files found while scanning directories. These
# are likewise processed at the end.
my @akeys;
99 :    
# Loop through the genomes. The attribute data for each genome is written to a
# flat file, and that file is then loaded into the attribute table in one step.
# If no genome IDs were given on the command line, every genome is processed.
if (! @genomes) {
    @genomes = $fig->genomes;
}
Trace("Processing genomes.") if T(2);
foreach my $genome (@genomes) {
    # Get a unique load-file name for this genome. We look for a name that does
    # not yet exist. We don't expect many collisions, since keeping the files
    # is nonstandard.
    my $filecount = 1;
    while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {
        $filecount++;
    }
    my $attributesFN = "$FIG_Config::temp/load_attributes.$$.$genome.$filecount";
    # Open the load file for output.
    my $attributesFH = Open(undef, ">$attributesFN");

    # Attribute files live in two places:
    # 1. Genome attributes:  $FIG_Config::organisms/$genome/Attributes
    # 2. Feature attributes: $FIG_Config::organisms/$genome/Features/*/Attributes

    # Process the genome attribute directory.
    process_directory("$FIG_Config::organisms/$genome/Attributes", $attributesFH);

    # Now walk the feature-type directories. OpenDir returns bare entry names,
    # so every path below has to be rebuilt from $fattdir.
    my $fattdir = "$FIG_Config::organisms/$genome/Features";
    foreach my $featureType (OpenDir($fattdir, 1, 1)) {
        my $featureDir = "$fattdir/$featureType";
        # Look for hyperlinks in the feature directory.
        if ($options->{links} && -e "$featureDir/$featureType.links") {
            Trace("Loading links for feature directory $featureType.") if T(4);
            # Convert the links into attributes.
            links_file("$featureDir/$featureType.links", $attributesFH);
        }
        # Process the feature attribute directory. The full path to the
        # Attributes sub-directory must be passed here; passing only the bare
        # feature-type name would be resolved against the current working
        # directory and the feature attribute files would never be found.
        process_directory("$featureDir/Attributes", $attributesFH);
    }
    close($attributesFH);
    if (! -s $attributesFN) {
        # Nothing was found for this genome, so delete its empty load file.
        unlink($attributesFN);
    } else {
        # Finally, load all the attributes for this genome.
        my $result = $dbf->load_table( tbl => "attribute",
                                       file => $attributesFN );
        Trace("Got $result for ", $fig->genus_species($genome), " ($genome) while trying to load database.") if T(3);
        # The load file is only retained when the "keep" option is set.
        unlink($attributesFN) unless $options->{keep};
    }
}
153 :    
# Load the global attribute files, unless the user asked for them to be skipped.
if ($options->{noglobal}) {
    Trace("Global attributes not requested.") if T(2);
} else {
    Trace("Processing global attributes.") if T(2);
    # The global attributes are gathered into a single flat file in the
    # temporary directory and loaded from there.
    my $loadFile = "$FIG_Config::temp/global_attributes";
    my $loadHandle = Open(undef, ">$loadFile");
    process_directory("$FIG_Config::global/Attributes", $loadHandle);
    close $loadHandle;
    # Only attempt the database load if something was actually written.
    if (-s $loadFile) {
        my $loadResult = $dbf->load_table( tbl => "attribute", file => $loadFile );
        Trace("Got $loadResult for global load from $loadFile") if T(2);
    }
    # The flat file is deleted unless the "keep" option is set.
    unlink($loadFile) unless $options->{keep};
}
172 : redwards 1.17
# Apply the transaction logs and the attribute metadata collected during the
# directory scans. Each step is skipped when its list is empty.
parse_transaction_logs(\@tlogs) if @tlogs;
parse_attributes_metadata(\@akeys) if @akeys;

Trace("Creating indexes.") if T(2);
# The attribute table fields are: genome ftype id tag val url.
# Each entry below is [index name, table name, indexed fields]. Indexes on the
# attribute "val" column and on the metadata "metaval" column have been tried
# in earlier revisions and are deliberately not created here.
foreach my $indexSpec (
    [ "attribute_genome_ix",       "attribute",          "id,genome,ftype"  ],
    [ "attribute_genome_ftype_ix", "attribute",          "genome, ftype"    ],
    [ "attribute_key_ix",          "attribute",          "tag"              ],
    [ "attribute_metadata_ix",     "attribute_metadata", "attrkey, metakey" ],
) {
    my ($indexName, $tableName, $fieldList) = @{$indexSpec};
    $dbf->create_index( idx => $indexName, tbl => $tableName, type => "btree", flds => $fieldList );
}

Trace("Attributes loaded.") if T(2);
exit(0);
191 :    
=head3 process_directory

C<< process_directory($dir, $attributesFH); >>

Process the attribute files in a particular directory. The directory is read with
C<OpenDir($dir, 1, 1)>, so temporary file names are filtered out and a failure to open
the directory is ignored. For each remaining entry:

=over 4

=item *

a file named C<transaction_log> has its path appended to the global C<@tlogs>;

=item *

a file named C<attribute_keys> or C<attribute_metadata> has its path appended to the
global C<@akeys>;

=item *

any other file is parsed into the load file by L</parse_file_to_temp>.

=back

I<$dir> must be the directory name and I<$attributesFH> an open output handle for the
database load file.

=cut

sub process_directory {
    my ($dir, $attributesFH) = @_;
    # A named loop variable is used instead of $_ because parse_file_to_temp
    # reads files itself and must not be able to disturb the directory loop.
    foreach my $fileName (OpenDir($dir, 1, 1)) {
        my $path = "$dir/$fileName";
        if ($fileName eq "transaction_log") {
            # Transaction logs are saved for processing after the load.
            push @tlogs, $path;
        } elsif ($fileName eq "attribute_keys" || $fileName eq "attribute_metadata") {
            # Metadata files are likewise saved for processing after the load.
            push @akeys, $path;
        } else {
            # Everything else is treated as an attribute file.
            parse_file_to_temp($path, $attributesFH);
        }
    }
    return;
}
217 : redwards 1.17
=head3 links_file

C<< links_file($links_file, $write_to); >>

Read a feature links file and write its PubMed links, as attribute rows, to the output
file handle provided. Requires two arguments: the name of the links file and the file
handle the rows should be written to. If the links file does not exist, nothing is done.

Only lines containing C<pubmed> are considered. Each such line must contain a FIG feature
ID followed by an HTML anchor; lines that cannot be parsed are traced and skipped. Rows
are grouped by link text, and any link text that occurs on more than 100 lines is dropped
entirely; the intent is to exclude a publication such as the genome article, which would
be linked from a large number of features.

=cut

sub links_file {
    # The links file holds many kinds of link (gi, uniprot id, and so on) that
    # are really aliases. Only the pubmed links are turned into attributes.
    my ($links_file, $write_to) = @_;
    return unless (-e $links_file);

    # A lexical handle is used so that this reader cannot interfere with any
    # other open file, and so that the handle can be closed when we are done.
    my $linksFH = Open(undef, "<$links_file");
    # Maps each link text to the list of load-file rows that use it.
    my %rowsFor;
    while (my $line = <$linksFH>) {
        # We only process PUBMED links.
        next unless ($line =~ /pubmed/i);
        chomp $line;
        # Parse out the FIG ID, the link, and the link text. The captures are
        # taken from the match's own return value so that a failed match can
        # never hand back values left over from some other match.
        my ($peg, $url, $val) =
            ($line =~ m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i);
        unless ($peg && $url && $val) {
            Trace("Error parsing\n>>>$line<<<\n") if T(1);
            next;
        }
        # Remove the pubmed title from the link text.
        $val =~ s/pubmed\s+//i;
        # Build the row for the PUBMED link. The feature ID is passed through
        # split_attribute_oid exactly as parse_file_to_temp does, because both
        # routines write rows to the same load file for the same table.
        # NOTE(review): assumes split_attribute_oid yields the leading ID
        # column(s) of the attribute table -- confirm against FIG.pm.
        my $oid = join("\t", $fig->split_attribute_oid($peg));
        push @{$rowsFor{$val}}, "$oid\tPUBMED\t$val\t$url\n";
    }
    close $linksFH;

    # Only output a set of links if there are 100 or fewer of them.
    foreach my $linkText (keys %rowsFor) {
        next if (scalar @{$rowsFor{$linkText}} > 100);
        print $write_to @{$rowsFor{$linkText}};
    }
    return;
}
262 :    
263 :    
264 :    
=head2 parse_file_to_temp

C<< parse_file_to_temp($from, $to); >>

This method takes two arguments: the name of an attribute file to read and a file handle
to write to. If no file handle is given, the output goes to the standard output. If no
file name is given, nothing is done.

Each input line is tab-delimited with up to four columns: object ID, key, value, and an
optional URL. Blank lines and comment lines (starting with C<#>) are skipped. A line is
rejected if the ID, key, or value is missing, if the key is longer than 64 characters, or
if there are more than four columns. Keys containing non-word characters are cleaned with
C<< $fig->clean_attribute_key >>. Each accepted line is written to I<$to> with the object
ID replaced by the result of C<< $fig->split_attribute_oid >>.

While reading, a cleaned copy of the input is written to the FIG temporary directory:
comments and blank lines are echoed, rejected lines are turned into C<## ERROR ##>
comments, and accepted lines are written with their cleaned keys. If any key was cleaned
or any line rejected, the cleaned copy replaces the input file, unless the C<safe> option
is set, in which case the cleaned copy is left in the temporary directory. If nothing
needed fixing, the cleaned copy is deleted.

=cut

sub parse_file_to_temp {
    my ($from, $to) = @_;
    return unless ($from);
    # Write to STDOUT if the caller did not supply an output handle.
    $to = \*STDOUT unless ($to);

    Trace("Parsing $from.") if T(3);
    # Lexical handles are used so that nested or later readers in this script
    # cannot clobber the files being processed here.
    my $inFH = Open(undef, "<$from");

    # Create a file to contain a cleaned copy of the data. The name is built
    # from the directory above "Attributes" plus the file name where possible,
    # and a counter is appended to make it unique.
    my $cleanName;
    if ($from =~ m#([^/]+)/Attributes/(.*)$#i) {
        $cleanName = "$FIG_Config::temp/$1$2.$$.cleaned";
    } else {
        $cleanName = "$FIG_Config::temp/attr.$$.cleaned";
    }
    my $fileCount = 1;
    while (-e "$cleanName$fileCount") {
        $fileCount++;
    }
    $cleanName = "$cleanName$fileCount";
    my $cleanFH = Open(undef, ">$cleanName");

    # Count the input lines, the rejected lines, and the keys that were cleaned.
    my $lineCount = 0;
    my $errorCount = 0;
    my $cleanCount = 0;
    while (my $rawLine = <$inFH>) {
        $lineCount++;
        # Unlike chomp, Strip removes \r\n when needed.
        my $inputLine = Tracer::Strip($rawLine);
        # Fix internal \r characters.
        $inputLine =~ s/\r/ /g;

        # Blank and comment lines are echoed unmodified to the cleaned file and
        # never reach the load file.
        if ($inputLine =~ /^\s*\#/ || $inputLine =~ /^\s*$/) {
            print $cleanFH "$inputLine\n";
            next;
        }

        # Split the line into its columns and validate them.
        my @line = split /\t/, $inputLine;
        my $error = 0;
        if (! $line[0]) {
            Trace("No ID at line $lineCount in $from.") if T(1);
            $error = 1;
        } elsif (! $line[1]) {
            Trace("No key at line $lineCount in $from.") if T(1);
            $error = 1;
        } elsif (! defined $line[2] || $line[2] eq "") {
            # The value is tested for presence rather than truth, because "0"
            # is a perfectly good attribute value.
            Trace("No value at line $lineCount in $from.") if T(1);
            $error = 1;
        } elsif (length($line[1]) > 64) {
            Trace("Key is longer than 64 characters at line $lineCount in $from.") if T(1);
            $error = 1;
        } elsif ($#line > 3) {
            Trace("Line $lineCount in $from has more than 4 columns.") if T(1);
            $error = 1;
        }

        # Rejected lines are kept in the cleaned file as comments.
        if ($error) {
            print $cleanFH "## ERROR ## $inputLine\n";
            $errorCount++;
            next;
        }

        # Clean the key if it contains anything other than word characters.
        if ($line[1] =~ /\W/) {
            $cleanCount++;
            $line[1] = $fig->clean_attribute_key($line[1]);
        }
        # Write the reassembled line to the cleaned file.
        print $cleanFH join("\t", @line) . "\n";

        # Now build the load-file row. The optional URL column is defaulted
        # while the columns are still in input order; after the splice below
        # the column positions shift and index 3 no longer refers to the URL.
        $line[3] = "" unless (defined $line[3]);
        # Replace the object ID with its split form.
        splice(@line, 0, 1, $fig->split_attribute_oid($line[0]));
        # Unescape the periods. Postgres behaves in a goofy way regarding
        # escape sequences.
        my $loadLine = join "\t", @line;
        $loadLine =~ s/\\\./\./g;
        print $to "$loadLine\n";
    }
    close $inFH;
    Trace("$lineCount lines read from $from.") if T(4);
    close $cleanFH;

    # Now we figure out what to do with the clean file. If we did real work,
    # then we replace the original file with it (unless running in safe mode).
    # Otherwise, we delete it.
    if ($cleanCount || $errorCount) {
        Trace("$cleanCount malformed keys and $errorCount errors found in $from.") if T(1);
        if (! $options->{safe}) {
            # The cleaned copy lives in the temporary directory, which may be
            # on a different file system from the input file. File::Copy::move
            # renames when it can and copies otherwise, and the result is
            # checked so that a failed replacement is reported.
            require File::Copy;
            unless (File::Copy::move($cleanName, $from)) {
                Trace("Could not replace $from with $cleanName: $!") if T(1);
            }
        }
    } else {
        unlink $cleanName;
    }
    return;
}
382 : redwards 1.18
=head2 parse_transaction_logs

C<< parse_transaction_logs(\@logs); >>

This method takes a reference to an array of paths to transaction log files. Each file
in the array is handed, in order, to C<< $fig->read_attribute_transaction_log >> for
processing. If no array reference is supplied, nothing is done.

=cut

sub parse_transaction_logs {
    my ($logFiles) = @_;
    return unless $logFiles;
    for my $logFile (@{$logFiles}) {
        Trace("Parsing transaction log $logFile") if T(3);
        $fig->read_attribute_transaction_log($logFile);
    }
}
398 : overbeek 1.27
=head2 parse_attributes_metadata

C<< parse_attributes_metadata(\@files); >>

This method takes a reference to an array of attribute metadata file names and loads
their contents into the database. If no array reference is supplied, nothing is done.

Any file whose name ends in C<attribute_keys> is first passed to
C<< $fig->update_attributes_metadata >>, and the location that call returns is read
instead of the original file. (This is presumably where C<attribute_keys> data is moved
into an C<attribute_metadata> file; confirm against FIG.pm.) Each distinct file is read
only once.

Each metadata line is tab-delimited: attribute key, metadata key, metadata value. Comment
lines (starting with C<#>) and blank lines are skipped. Every other line is passed to
C<< $fig->key_info >>.

=cut

sub parse_attributes_metadata {
    my $akeys = shift;
    return unless ($akeys);
    # First work out the set of files to read. A hash is used so that a file
    # reached by more than one route is only parsed once.
    my %metadataFiles;
    foreach my $path (@$akeys) {
        if ($path =~ /attribute_keys$/) {
            # Old-style attribute_keys files are handed to the FIG object, and
            # we read whatever location it reports back.
            my $location = $fig->update_attributes_metadata($path);
            $metadataFiles{$location} = 1;
        } else {
            $metadataFiles{$path} = 1;
        }
    }
    foreach my $path (keys %metadataFiles) {
        Trace("Parsing attribute metadata $path.") if T(4);
        # A lexical handle is used so each file is closed as soon as it has
        # been read, and so no other reader in this script is disturbed.
        my $metaFH = Open(undef, "<$path");
        while (my $rawLine = <$metaFH>) {
            # Strip handles \r\n line endings as well as plain \n.
            my $line = Tracer::Strip($rawLine);
            # Skip comments, and skip blank lines so that key_info is never
            # called with an undefined attribute key.
            next if ($line =~ /^\s*\#/ || $line =~ /^\s*$/);
            my ($attrKey, $metaKey, $metaVal) = split /\t/, $line;
            # Pass in the attribute key and a reference to a hash mapping the
            # metadata key to its value.
            $fig->key_info($attrKey, { $metaKey => $metaVal }, 1);
        }
        close $metaFH;
    }
    return;
}
435 : parrello 1.34
436 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3