[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (view) (download) (as text)

1 : overbeek 1.1 use FIG;
2 : parrello 1.14 use Tracer;
3 : redwards 1.17 use strict;
4 : parrello 1.14
5 : overbeek 1.1 my $fig = new FIG;
6 :    
7 : redwards 1.16 =pod
8 :    
9 :     =head1 load_attributes
10 :    
11 :     load_attributes is used to reload the attributes file. You can run it with the following command line prompts
12 :     -v be verbose and -vv be very verbose. This may be superceded by the trace commands?
13 : redwards 1.23 -links include the links as attributes. At the moment only pubmed ID's are loaded as links.
14 : redwards 1.17 -keep keeps the temporary files. Usually the temporary files are deleted, but this will keep (at least some of) them
15 :     -noglobal normally attributes in $FIG::Config::global are also processed. This will ignore those
16 : redwards 1.16
17 :     load_attributes begins by deleting the database tables for ALL attributes, and then reloads the data. This is NOT selective.
18 :    
19 : redwards 1.17 We then process through each of the genome directories according to $fig->genomes() and look for attributes in each directory. These are written to a temporary file and then loaded.
20 : redwards 1.16
21 :    
22 :    
23 :     =cut
24 :    
25 : redwards 1.20 my ($verbose, $flush, $links, $keep, $doglobal)=(0,0,0,0,1);
26 : redwards 1.8 while (@ARGV)
27 :     {
28 : parrello 1.14 my $try=shift(@ARGV);
29 :     if ($try eq "-v") {$verbose=1}
30 : redwards 1.17 elsif ($try eq "-vv") {$verbose=2}
31 : redwards 1.20 elsif ($try eq "-links") {$links=1}
32 : redwards 1.17 elsif ($try eq "-keep") {$keep=1}
33 :     elsif ($try eq "-noglobal") {$doglobal=0}
34 : redwards 1.8 }
35 : redwards 1.5
36 : redwards 1.4
37 : redwards 1.17 Trace("Deleting and Recreating attribute table.") if T(2);
38 : parrello 1.14 my $dbf = $fig->db_handle;
39 : overbeek 1.1 $dbf->drop_table( tbl => "attribute" );
40 : redwards 1.17 $dbf->create_table( tbl => 'attribute', flds => "fid varchar(64), tag varchar(64), val text, url text");
41 :    
42 : redwards 1.18 my @tlogs; # we are going to store any transaction_logs we encounter here, and then process them at the end
43 :    
44 : redwards 1.17 # This is where we are going to store all the attributes data, and then we will load it all at once
45 :     foreach my $genome ($fig->genomes)
46 : redwards 1.18 #foreach my $genome (qw[158879.1 9986.1 83333.1]) # this is for testing so we only load 2 genomes!
47 : redwards 1.17 {
48 :     # if we are keeping the file, this will be incremented so that we don't overwrite each genome
49 :     my $filecount=1;
50 :     while (-e "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {$filecount++}
51 :    
52 :     my $attributesFH;
53 :     open($attributesFH, ">$FIG_Config::temp/load_attributes.$$.$genome.$filecount")
54 :     || die "can't open $FIG_Config::temp/load_attributes.$$.$genome.$filecount for writing";
55 :     my %kv;
56 : overbeek 1.24 #Trace("Processing $genome.") if T(3);
57 : redwards 1.17 # I have rewritten this to allow the following things:
58 :     # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
59 :     # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
60 : redwards 1.4
61 : redwards 1.17 my $dir = "$FIG_Config::organisms/$genome/Attributes";
62 :     # note that this grep ignores emacs editing files and . and ..
63 : redwards 1.18 map
64 :     {
65 :     $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::organisms/$genome/Attributes/$_" :
66 :     &parse_file_to_temp("$FIG_Config::organisms/$genome/Attributes/$_", $attributesFH);
67 :     }
68 :     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,$dir));
69 : redwards 1.4
70 : redwards 1.17 # Now find the other attributes files
71 :     # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
72 :    
73 :     my $fattdir="$FIG_Config::organisms/$genome/Features";
74 :     opendir(FEATURES, $fattdir) || die "Can't open dir $fattdir";
75 :     foreach my $dir (grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(FEATURES))
76 :     {
77 : redwards 1.23 &links_file("$fattdir/$dir/$dir.links", $attributesFH) if ($links && -e "$fattdir/$dir/$dir.links");
78 : redwards 1.18 map
79 :     {
80 :     $_ eq "transaction_log" ? push @tlogs, "$fattdir/$dir/Attributes/$_" :
81 :     &parse_file_to_temp("$fattdir/$dir/Attributes/$_", $attributesFH);
82 :     }
83 :     grep { $_ !~ /^\./ && $_ !~ /^\#/ } readdir(ATTR) if (opendir(ATTR,"$fattdir/$dir/Attributes"));
84 : redwards 1.17 }
85 :    
86 :     close($attributesFH);
87 :     #remove the file if it has zero size. No need in continuing
88 :     if (!-s "$FIG_Config::temp/load_attributes.$$.$genome.$filecount") {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount"); next}
89 : redwards 1.4
90 : redwards 1.17 # finally load all the attributes
91 :     my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/load_attributes.$$.$genome.$filecount" );
92 :     if ($verbose) {print STDERR "Got $result for ", $fig->genus_species($genome), "\n"}
93 :     if (!$keep) {unlink("$FIG_Config::temp/load_attributes.$$.$genome.$filecount")}
94 :     }
95 :    
96 :     # now we need to load the global attributes files
97 :     if ($doglobal)
98 :     {
99 :     if (opendir(DIR, "$FIG_Config::global/Attributes/"))
100 :     {
101 :     my $globalFH;
102 :     open($globalFH, ">$FIG_Config::temp/global_attributes") || die "Can't open $FIG_Config::temp/global_attributes for writing";
103 : redwards 1.18 map
104 :     {
105 :     $_ eq "transaction_log" ? push @tlogs, "$FIG_Config::global/Attributes/$_" :
106 :     $_ eq "attribute_keys" ? 1 :
107 :     &parse_file_to_temp("$FIG_Config::global/Attributes/$_", $globalFH);
108 :     }
109 :     grep {$_ !~ /^\./ && $_ !~ /^\#/} readdir(DIR);
110 : redwards 1.17 close $globalFH;
111 :     }
112 :     my $result = $dbf->load_table( tbl => "attribute", file => "$FIG_Config::temp/global_attributes" ) if (-e "$FIG_Config::temp/global_attributes");
113 :     if ($verbose) {print STDERR "Got $result for $FIG_Config::temp/global_attributes\n"}
114 :     if (!$keep) {unlink("$FIG_Config::temp/global_attributes")}
115 : overbeek 1.1 }
116 : redwards 1.17 else {print STDERR "$FIG_Config::global/Attributes/ was not parsed\n"}
117 :    
118 :    
119 : redwards 1.18 # finally parse the transaction_log files
120 :     &parse_transaction_logs(\@tlogs) if (scalar(@tlogs));
121 :    
122 :    
123 : parrello 1.14 Trace("Creating index.") if T(2);
124 : overbeek 1.3 $dbf->create_index( idx => "attribute_fid_ix",
125 : redwards 1.17 tbl => "attribute",
126 :     type => "btree",
127 :     flds => "fid"
128 :     );
129 : overbeek 1.15 Trace("Attributes loaded.") if T(2);
130 : redwards 1.17 exit(0);
131 :    
132 :    
133 :     =head3 links_file()
134 :    
135 :     Read the links and write them to the output filehandle provided. Requires two arguments - the links file and the filehandle where they should be written to
136 :    
137 :     =cut
138 :    
139 :     sub links_file {
140 :     # we are going to parse the links into a temporary file, and then read them
141 :     # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases
142 :     # and I am not sure why they are in links.
143 :     # I am just going to keep the pubmed links for now
144 :     # however, I am going to parse out any pubmed link that may be for the genome article.
145 :     # this will be done by removing any article with some large number of hits
146 :     my ($links_file, $write_to)=@_;
147 :     return unless (-e $links_file);
148 :    
149 :     open (IN, $links_file) || die "Can't open $links_file";
150 :     my $output;
151 :     while (<IN>)
152 :     {
153 :     next unless (/pubmed/i);
154 :     chomp;
155 :     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
156 :     unless ($1 && $2 && $3) {print STDERR "Error parsing\n>>>$_<<<\n"; next}
157 :     my ($peg, $url, $val)=($1, $2, $3);
158 :     $val =~ s/pubmed\s+//i;
159 :     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
160 :     }
161 :     # only output if we want to keep it
162 :     if ($output)
163 :     {
164 :     foreach my $key (keys %$output)
165 :     {
166 :     next if (scalar @{$output->{$key}} > 100);
167 :     print $write_to @{$output->{$key}};
168 :     }
169 :     }
170 :     }
171 :    
172 :    
173 :    
174 :     =head2 parse_file_to_temp()
175 :    
176 :     This method takes two arguments, the name of a file to read and a filehandle to write to. The file is opened, comments and blank lines are ignored, a couple of tests are applied, and the data is written to the filehandle.
177 :    
178 :     =cut
179 :    
180 :     sub parse_file_to_temp {
181 :     my ($from, $to)=@_;
182 :     return unless ($from);
183 :     unless ($to) {open ($to, ">-")} #open $to to STDOUT
184 : redwards 1.18
185 :     if ($verbose == 2) {print STDERR "Parsing from $from to $to\n"}
186 : redwards 1.17 open (IN, "$from") || die "can't open $from for reading";
187 :     while (<IN>)
188 :     {
189 :     chomp;
190 : redwards 1.20 s/\r/\n/;
191 : redwards 1.17 next if (/^\s*\#/); # ignore comments
192 :     next if (/^\s*$/); # ignore blanks or whitespace only lines
193 :    
194 :     # there is a problem with periods. They can not be escaped or else the dbh load will fail. We unescape them here
195 :     s/\\\./\./g;
196 :     my @line=split /\t/;
197 :     unless ($line[0]) {print STDERR "No ID at line in $from at:\n$_"; next}
198 :     unless ($line[1]) {print STDERR "No key at line in $from at:\n$_"; next}
199 :     if (length($line[1]) > 64) {print STDERR "Key is longer than 64 characters in $from at:\n$_"; next}
200 : redwards 1.22 if ($#line ==2) {$line[3]=''}
201 : redwards 1.21 unless ($#line == 3) {print STDERR "Lines in $from have more than 4 columns. You are only allowed feature, key, value, and url\n"; next}
202 : redwards 1.19 # clean the key
203 :     $line[1] = $fig->clean_attribute_key($line[1]);
204 : redwards 1.17 unless (defined $line[3]) {$line[3] = ""}
205 :     print $to (join "\t", @line) . "\n";
206 :     }
207 :     }
208 : redwards 1.18
209 :     =head2 parse_transaction_logs()
210 :    
211 :     This method takes a reference to an array of paths to transactions_logs and will read and process them
212 :    
213 :     =cut
214 :    
215 :     sub parse_transaction_logs {
216 :     my $logs=shift;
217 :     return unless $logs;
218 :     foreach my $l (@$logs) {
219 :     if ($verbose) {print STDERR "Parsing transaction log $l\n"}
220 :     $fig->read_attribute_transaction_log($l);
221 :     }
222 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3