[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.30 - (view) (download) (as text)

1 : olson 1.29 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : overbeek 1.1 use FIG;
19 : parrello 1.14 use Tracer;
20 : redwards 1.17 use strict;
21 : parrello 1.14
22 : overbeek 1.1 my $fig = new FIG;
23 :    
24 : redwards 1.16 =pod
25 :    
26 :     =head1 load_attributes
27 :    
28 :     load_attributes is used to reload the attributes file. You can run it with the following command line prompts
29 :     -v be verbose and -vv be very verbose. This may be superceded by the trace commands?
30 : redwards 1.23 -links include the links as attributes. At the moment only pubmed ID's are loaded as links.
31 : redwards 1.17 -keep keeps the temporary files. Usually the temporary files are deleted, but this will keep (at least some of) them
32 :     -noglobal normally attributes in $FIG::Config::global are also processed. This will ignore those
33 : overbeek 1.27 -force will remove non [a-zA-Z0-9_] characters from key names before adding the key
34 :    
35 : redwards 1.16
36 :     load_attributes begins by deleting the database tables for ALL attributes, and then reloads the data. This is NOT selective.
37 :    
38 : redwards 1.17 We then process through each of the genome directories according to $fig->genomes() and look for attributes in each directory. These are written to a temporary file and then loaded.
39 : redwards 1.16
40 : overbeek 1.27 Note that key names can only contain the characters matched by the \w method (i.e. [a-zA-Z0-9_])
41 : redwards 1.16
42 :     =cut
43 :    
44 : overbeek 1.27 my ($verbose, $flush, $links, $keep, $doglobal, $force)=(0,0,0,0,1,0);
45 : redwards 1.8 while (@ARGV)
46 :     {
47 : parrello 1.14 my $try=shift(@ARGV);
48 :     if ($try eq "-v") {$verbose=1}
49 : redwards 1.17 elsif ($try eq "-vv") {$verbose=2}
50 : redwards 1.20 elsif ($try eq "-links") {$links=1}
51 : redwards 1.17 elsif ($try eq "-keep") {$keep=1}
52 :     elsif ($try eq "-noglobal") {$doglobal=0}
53 : overbeek 1.27 elsif ($try eq "-force") {$force=1}
54 : redwards 1.8 }
55 : redwards 1.5
56 : redwards 1.17 Trace("Deleting and Recreating attribute table.") if T(2);
57 : parrello 1.14 my $dbf = $fig->db_handle;
58 : overbeek 1.1 $dbf->drop_table( tbl => "attribute" );
59 : overbeek 1.30 $dbf->create_table( tbl => 'attribute', flds => "genome varchar(64), ftype varchar(64), id varchar(64), tag varchar(64), val text, url text");
60 : overbeek 1.27 $dbf->drop_table( tbl => "attribute_metadata" );
61 :     $dbf->create_table( tbl => 'attribute_metadata', flds => "attrkey varchar(64), metakey varchar(64), metaval text");
62 : redwards 1.17
63 : redwards 1.18 my @tlogs; # we are going to store any transaction_logs we encounter here, and then process them at the end
64 : overbeek 1.27 my @akeys; # we are going to store any attributes metadata we encounter here, and then process them at the end
65 : redwards 1.18
66 : redwards 1.17 # This is where we are going to store all the attributes data, and then we will load it all at once
67 :     foreach my $genome ($fig->genomes)
68 : redwards 1.18 #foreach my $genome (qw[158879.1 9986.1 83333.1]) # this is for testing so we only load 2 genomes!
69 : redwards 1.17 {
70 :     # if we are keeping the file, this will be incremented so that we don't overwrite each genome
71 :     my $filecount=1;
72 :     while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {$filecount++}
73 :    
74 :     my $attributesFH;
75 :     open($attributesFH, ">$FIG_Config::temp/load_attributes.$$.$genome.$filecount")
76 :     || die "can't open $FIG_Config::temp/load_attributes.$$.$genome.$filecount for writing";
77 :     my %kv;
78 :     # I have rewritten this to allow the following things:
79 :     # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
80 :     # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
81 : redwards 1.4
82 : redwards 1.17 my $dir = "$FIG_Config::organisms/$genome/Attributes";
83 :     # note that this grep ignores emacs editing files and . and ..
84 : redwards 1.18 map
85 :     {
86 :     $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::organisms/$genome/Attributes/$_" :
87 : overbeek 1.27 ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys, "$FIG_Config::organisms/$genome/Attributes/$_" :
88 : redwards 1.18 &parse_file_to_temp("$FIG_Config::organisms/$genome/Attributes/$_", $attributesFH);
89 :     }
90 :     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,$dir));
91 : redwards 1.4
92 : redwards 1.17 # Now find the other attributes files
93 :     # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
94 :    
95 :     my $fattdir="$FIG_Config::organisms/$genome/Features";
96 :     opendir(FEATURES, $fattdir) || die "Can't open dir $fattdir";
97 :     foreach my $dir (grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(FEATURES))
98 :     {
99 : redwards 1.23 &links_file("$fattdir/$dir/$dir.links", $attributesFH) if ($links && -e "$fattdir/$dir/$dir.links");
100 : redwards 1.18 map
101 :     {
102 :     $_ eq "transaction_log" ? push @tlogs, "$fattdir/$dir/Attributes/$_" :
103 : overbeek 1.27 ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys,"$fattdir/$dir/Attributes/$_" :
104 : redwards 1.18 &parse_file_to_temp("$fattdir/$dir/Attributes/$_", $attributesFH);
105 :     }
106 :     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,"$fattdir/$dir/Attributes"));
107 : redwards 1.17 }
108 :    
109 :     close($attributesFH);
110 :     #remove the file if it has zero size. No need in continuing
111 :     if (!-s "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount"); next}
112 : redwards 1.4
113 : redwards 1.17 # finally load all the attributes
114 :     my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/load_attributes.$$.$genome.$filecount" );
115 : overbeek 1.27 if ($verbose && $result != -1) {print STDERR "Got $result for ", $fig->genus_species($genome), " ($genome) while trying to load.\n"}
116 : redwards 1.17 if (!$keep) {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount")}
117 :     }
118 :    
119 :     # now we need to load the global attributes files
120 :     if ($doglobal)
121 :     {
122 :     if (opendir(DIR, "$FIG_Config::global/Attributes/"))
123 :     {
124 :     my $globalFH;
125 :     open($globalFH, ">$FIG_Config::temp/global_attributes") || die "Can't open $FIG_Config::temp/global_attributes for writing";
126 : redwards 1.18 map
127 :     {
128 :     $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::global/Attributes/$_" :
129 : overbeek 1.27 ($_ eq "attribute_keys" || $_ eq "attribute_metadata") ? push @akeys,"$FIG_Config::global/Attributes/$_" :
130 : redwards 1.18 &parse_file_to_temp("$FIG_Config::global/Attributes/$_", $globalFH);
131 :     }
132 :     grep {$_ !~ /^\./ && $_ !~ /^\#/} readdir(DIR);
133 : redwards 1.17 close $globalFH;
134 :     }
135 :     my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/global_attributes" ) if (-e "$FIG_Config::temp/global_attributes");
136 :     if ($verbose) {print STDERR "Got $result for $FIG_Config::temp/global_attributes\n"}
137 :     if (!$keep) {unlink("$FIG_Config::temp/global_attributes")}
138 : overbeek 1.1 }
139 : redwards 1.17 else {print STDERR "$FIG_Config::global/Attributes/ was not parsed\n"}
140 :    
141 :    
142 : overbeek 1.27 # finally parse the transaction_log files and attributes_metadata
143 : redwards 1.18 &parse_transaction_logs(\@tlogs) if (scalar(@tlogs));
144 : overbeek 1.27 &parse_attributes_metadata(\@akeys) if (scalar(@akeys));
145 : redwards 1.18
146 :    
147 : parrello 1.14 Trace("Creating index.") if T(2);
148 : overbeek 1.27 # rob messing with indexes
149 :     # fields are now : genome ftype id key val url
150 :     $dbf->create_index( idx => "attribute_genome_ix", tbl => "attribute", type => "btree", flds => "id,genome,ftype");
151 :     $dbf->create_index( idx => "attribute_genome_ftype_ix", tbl => "attribute", type => "btree", flds => "genome, ftype");
152 : overbeek 1.30 $dbf->create_index( idx => "attribute_key_ix", tbl => "attribute", type => "btree", flds => "tag" );
153 : overbeek 1.27 $dbf->create_index( idx => "attribute_val_ix", tbl => "attribute", type => "btree", flds => "val");
154 :     $dbf->create_index( idx => "attribute_metadata_ix", tbl => "attribute_metadata", type => "btree", flds => "attrkey, metakey, metaval");
155 :    
156 :    
157 : overbeek 1.15 Trace("Attributes loaded.") if T(2);
158 : redwards 1.17 exit(0);
159 :    
160 :    
161 :     =head3 links_file()
162 :    
163 :     Read the links and write them to the output filehandle provided. Requires two arguments - the links file and the filehandle where they should be written to
164 :    
165 :     =cut
166 :    
167 :     sub links_file {
168 :     # we are going to parse the links into a temporary file, and then read them
169 :     # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases
170 :     # and I am not sure why they are in links.
171 :     # I am just going to keep the pubmed links for now
172 :     # however, I am going to parse out any pubmed link that may be for the genome article.
173 :     # this will be done by removing any article with some large number of hits
174 :     my ($links_file, $write_to)=@_;
175 :     return unless (-e $links_file);
176 :    
177 :     open (IN, $links_file) || die "Can't open $links_file";
178 :     my $output;
179 :     while (<IN>)
180 :     {
181 :     next unless (/pubmed/i);
182 :     chomp;
183 :     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
184 :     unless ($1 && $2 && $3) {print STDERR "Error parsing\n>>>$_<<<\n"; next}
185 :     my ($peg, $url, $val)=($1, $2, $3);
186 :     $val =~ s/pubmed\s+//i;
187 :     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
188 :     }
189 :     # only output if we want to keep it
190 :     if ($output)
191 :     {
192 :     foreach my $key (keys %$output)
193 :     {
194 :     next if (scalar @{$output->{$key}} > 100);
195 :     print $write_to @{$output->{$key}};
196 :     }
197 :     }
198 :     }
199 :    
200 :    
201 :    
202 :     =head2 parse_file_to_temp()
203 :    
204 :     This method takes two arguments, the name of a file to read and a filehandle to write to. The file is opened, comments and blank lines are ignored, a couple of tests are applied, and the data is written to the filehandle.
205 :    
206 :     =cut
207 :    
208 :     sub parse_file_to_temp {
209 :     my ($from, $to)=@_;
210 :     return unless ($from);
211 :     unless ($to) {open ($to, ">-")} #open $to to STDOUT
212 : redwards 1.18
213 :     if ($verbose == 2) {print STDERR "Parsing from $from to $to\n"}
214 : redwards 1.17 open (IN, "$from") || die "can't open $from for reading";
215 :     while (<IN>)
216 :     {
217 : overbeek 1.26 if (/\r.*\r/) {print STDERR "The file $from appears to have multiple \\r delimters. Please remove these, as this file has been skipped\n"; next}
218 :     s/\r//;# catch lines that end \r\n
219 : redwards 1.17 chomp;
220 :     next if (/^\s*\#/); # ignore comments
221 :     next if (/^\s*$/); # ignore blanks or whitespace only lines
222 :    
223 :     # there is a problem with periods. They can not be escaped or else the dbh load will fail. We unescape them here
224 :     s/\\\./\./g;
225 :     my @line=split /\t/;
226 : overbeek 1.28 unless ($line[0]) {if ($verbose) {print STDERR "No ID at line in $from at:\n$_\n"} next}
227 :     unless ($line[1]) {if ($verbose) {print STDERR "No key at line in $from at:\n$_\n"} next}
228 :     unless ($line[2]) {if ($verbose) {print STDERR "No value at line in $from at:\n$_\n"} next} # can you have a key without a value?
229 :     if (length($line[1]) > 64) {print STDERR "Key is longer than 64 characters in $from at:\n$_\n"; next}
230 : redwards 1.22 if ($#line ==2) {$line[3]=''}
231 : overbeek 1.28 unless ($#line == 3) {print STDERR "Line \n$_\n in $from have more than 4 columns. You are only allowed feature, key, value, and url\n"; next}
232 : redwards 1.19 # clean the key
233 : overbeek 1.27 if ($line[1] =~ /\W/ && !$force)
234 :     {
235 :     print STDERR "the key: $line[1] from $from has characters that are not [a-zA-Z0-9] and _. Please correct this or use the force option to clean the key\n";
236 :     next;
237 :     }
238 :     elsif ($line[1] =~ /\W/ && $force)
239 :     {
240 :     $line[1] = $fig->clean_attribute_key($line[1]);
241 :     }
242 :    
243 :     # replace the first element in the line with the split feature as appropriate
244 :     splice(@line, 0, 1, $fig->split_attribute_oid($line[0]));
245 : redwards 1.17 unless (defined $line[3]) {$line[3] = ""}
246 :     print $to (join "\t", @line) . "\n";
247 :     }
248 :     }
249 : redwards 1.18
250 :     =head2 parse_transaction_logs()
251 :    
252 :     This method takes a reference to an array of paths to transactions_logs and will read and process them
253 :    
254 :     =cut
255 :    
256 :     sub parse_transaction_logs {
257 :     my $logs=shift;
258 :     return unless $logs;
259 : overbeek 1.27 foreach my $l (@$logs)
260 :     {
261 : redwards 1.18 if ($verbose) {print STDERR "Parsing transaction log $l\n"}
262 :     $fig->read_attribute_transaction_log($l);
263 :     }
264 :     }
265 : overbeek 1.27
266 :    
267 :     =head2 parse_attributes_metadata()
268 :    
269 :     This method takes a reference to an array of attributes metadata files and loads them into the database. It will also rename attribute_keys to attribute_metadata to be consistent and hopefully clearer.
270 :    
271 :     =cut
272 :    
273 :     sub parse_attributes_metadata {
274 :     my $akeys=shift;
275 :     return unless ($akeys);
276 :    
277 :     # first we are going to see if we need to rename or append any files
278 :     my %attributekeys;
279 :     foreach my $ak (@$akeys)
280 :     {
281 :     # rename attribute_keys to attribute_metadata by appending to a file in case there is more data there.
282 :     if ($ak =~ /attribute_keys$/)
283 :     {
284 :     my $location=$fig->update_attributes_metadata($ak);
285 :     $attributekeys{$location}=1;
286 :     }
287 :     else
288 :     {
289 :     $attributekeys{$ak}=1;
290 :     }
291 :     }
292 :    
293 :     foreach my $ak (keys %attributekeys)
294 :     {
295 :     if ($verbose) {print STDERR "Parsing attribute metadata $ak\n"}
296 :     open(IN, $ak) || die "Can't open $ak";
297 :     while (<IN>)
298 :     {
299 :     next if (/^\s*\#/);
300 :     chomp;
301 :     my @line=split /\t/;
302 :     # here we pass in the attribute key (line[0]) and a reference to an array with metakey and key info
303 :     $fig->key_info($line[0], {$line[1]=>$line[2]}, 1);
304 :     }
305 :     }
306 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3