[Bio] / FigKernelScripts / load_attributes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/load_attributes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : overbeek 1.1 use FIG;
2 :     my $fig = new FIG;
3 :    
4 : redwards 1.5 # there are several hidden utilities here. If you call the file:
5 : redwards 1.4 # Filename Output
6 :     # load_attributes nothing
7 : redwards 1.8 # -v information about those organisms that we add information about
8 :     # -vv information about all organisms
9 :     # -flush remove any "assigned attributes" file, so that only the base attributes are loaded.
10 :     # -link take the links from proteins and store them as attributes
11 : redwards 1.5
12 : redwards 1.8 my ($verbose, $flush, $links)=(0,0,0);
13 :     while (@ARGV)
14 :     {
15 :     my $try=shift(@ARGV);
16 :     if ($try eq "-v") {$verbose=1}
17 :     if ($try eq "-vv") {$verbose=2}
18 :     if ($try eq "-flush") {$flush=1}
19 :     if ($try eq "-link") {$link=1}
20 :     }
21 : redwards 1.5
22 : redwards 1.4
23 :    
24 : overbeek 1.1 my $dbf = $fig->{_dbf};
25 :     $dbf->drop_table( tbl => "attribute" );
26 :     $dbf->create_table( tbl => 'attribute',
27 : overbeek 1.2 flds => "fid varchar(64), tag varchar(64), val text, url text"
28 : overbeek 1.1 );
29 : overbeek 1.3
30 :     my(%kv,$genome);
31 :     foreach $genome ($fig->genomes)
32 : overbeek 1.1 {
33 : redwards 1.4 # I have rewritten this to allow the following things:
34 :     # 1. Attributes for genomes are now available in $FIG_Config::organisms/$genome/Attributes
35 :     # 2. Attributes for features (not just pegs) are now available in $FIG_Config::organisms/$genome/Features/*/Attributes
36 :     # 3. in 2, above, we don't know what * is. It should minimally be pegs and tRNA's but there is no reason it is not
37 :     # something else.
38 :    
39 : overbeek 1.3 undef %kv;
40 : redwards 1.4 my $stderr; # error messages that show progress, but I only want to print if we have something interesting to say
41 :     # The files that we will read for this genome are stored in @allfiles.
42 :     my @allfiles;
43 :    
44 :     # Find the genome attributes files.
45 :     $stderr .= "Reading from ". $fig->genus_species($genome). " ($genome)\n";
46 :    
47 :     my $dir = "$FIG_Config::organisms/$genome/Attributes";
48 : overbeek 1.3 if (opendir(ATTR,$dir))
49 :     {
50 : redwards 1.4 my @files = grep { $_ !~ /^\./ } readdir(ATTR);
51 : redwards 1.9 $stderr .= "\tFound ". (scalar @files) . " genome attributes\n";
52 : redwards 1.4 closedir(ATTR);
53 :    
54 : redwards 1.8 if ($flush) {
55 : redwards 1.5 if (-e "$dir/assigned_attributes") {`rm -f $dir/assigned_attributes`}
56 :     }
57 : redwards 1.4 # here is an alternate way of making assigned_attributes at the end of the list if the file exists
58 :     # my speculation is that this is so these overwrite all other attributes, si?
59 :     push @allfiles, map { $_ = "$dir/$_"} grep { $_ ne "assigned_attributes" } @files;
60 :     push @allfiles, "$dir/assigned_attributes" if (-e "$dir/assigned_attributes");
61 :     }
62 :    
63 :     # Now find the other attributes files
64 :     # We should use File::Find here, but I am not sure if that is in the default distro, so I'll just write a quickie. Not as good, though
65 :    
66 :     opendir(FEATURES, "$FIG_Config::organisms/$genome/Features") || die "Can't open dir $FIG_Config::organisms/$genome/Features/";
67 :     foreach my $dir (readdir(FEATURES)) {
68 :     next if ($dir =~ /^\./);
69 : redwards 1.8 if ($flush) {
70 : redwards 1.5 if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes") {
71 :     `rm -f $FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes`;
72 :     }
73 :     }
74 : redwards 1.4 if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes" && opendir(ATTR, "$FIG_Config::organisms/$genome/Features/$dir/Attributes"))
75 :     {
76 :     my @files = grep { $_ !~ /^\./ } readdir(ATTR);
77 : overbeek 1.3 closedir(ATTR);
78 : redwards 1.4 next unless (scalar @files); # you could have an empty attributes dir, no problem
79 :     $stderr .= "\tFound ". (scalar @files) . " attributes files in $dir\n";
80 :     push @allfiles, map { $_ = "$FIG_Config::organisms/$genome/Features/$dir/Attributes/$_" } grep { $_ ne "assigned_attributes" } @files;
81 :     push @allfiles, "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes"
82 :     if (-e "$FIG_Config::organisms/$genome/Features/$dir/Attributes/assigned_attributes");
83 :     }
84 : redwards 1.9 if ($link && -e "$FIG_Config::organisms/$genome/Features/$dir/$dir.links")
85 :     {
86 :     # we are going to parse the links into a temporary file, and then read them
87 :     # at the moment there is something weird where links has lots of things like gi, uniprot id, and so on. These are aliases
88 :     # and I am not sure why they are in links.
89 :     # I am just going to keep the pubmed links for now
90 :    
91 :     # however, I am going to parse out any pubmed link that may be for the genome article.
92 :     # this will be done by removing any article with some large number of hits
93 :     open (IN, "$FIG_Config::organisms/$genome/Features/$dir/$dir.links") || die "Can't open $FIG_Config::organisms/$genome/Features/$dir/$dir.links";
94 :     my $output;
95 :     while (<IN>)
96 :     {
97 :     next unless (/pubmed/i);
98 :     chomp;
99 :     m#^(fig\|\d+\.\d+\.\w\w\w\.\d+).*(http.*)>(.*?)</a>#i;
100 :     unless ($1 && $2 && $3)
101 :     {
102 :     print STDERR "Error parsing\n>>>$_<<<\n";
103 :     next;
104 :     }
105 :     my ($peg, $url, $val)=($1, $2, $3);
106 :     $val =~ s/pubmed\s+//i;
107 :     push (@{$output->{$val}}, "$peg\tPUBMED\t$val\t$url\n");
108 :     }
109 :     # only output if we want to keep it
110 :     if ($output)
111 :     {
112 :     open (OUT, ">>$FIG_Config::temp/linkstmp$$") || die "Can't open $FIG_Config::temp/linkstmp$$ for writing";
113 :     foreach my $key (keys %$output)
114 :     {
115 :     next if (scalar @{$output->{$key}} > 100);
116 :     print OUT @{$output->{$key}};
117 :     }
118 :     close OUT;
119 :     }
120 :    
121 :     }
122 : redwards 1.4 }
123 :    
124 : redwards 1.9 push @allfiles, "$FIG_Config::temp/linkstmp$$" if (-e "$FIG_Config::temp/linkstmp$$");
125 : overbeek 1.3
126 : redwards 1.4 # now read each of the files and save the data
127 :     foreach my $file (@allfiles)
128 :     {
129 :     if ((-s "$file") && open(TMPATTR,"<$file"))
130 :     {
131 :     while (defined($_ = <TMPATTR>))
132 :     {
133 : redwards 1.9 chomp;
134 :     # allow comments
135 : redwards 1.8 next if (/^\s*\#/);
136 :     my ($id, $tag, $val, $url)=split /\t/; # we can allow spaces in the elements
137 :     if ($id && $tag)
138 : redwards 1.4 {
139 : redwards 1.6 $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);
140 :     if ($val)
141 : redwards 1.4 {
142 : redwards 1.9 push @{$kv{"$id\t$tag"}}, "$val\t$url\n";
143 : redwards 1.4 }
144 :     else
145 :     {
146 : redwards 1.6 delete $kv{"$id\t$tag"};
147 : redwards 1.4 }
148 :     }
149 : redwards 1.8 else
150 :     {
151 :     print STDERR "There was an error parsing $_ from $file\n";
152 :     next;
153 :     }
154 : redwards 1.4 }
155 :     close(TMPATTR);
156 :     }
157 :     }
158 : redwards 1.9
159 :     # delete the temp file that has the links data in it
160 :     unlink("$FIG_Config::temp/linkstmp$$");
161 :    
162 : redwards 1.8 if ($verbose == 1 && scalar keys %kv)
163 : redwards 1.4 {
164 :     print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
165 :     }
166 : redwards 1.8 elsif ($verbose == 2) {
167 : redwards 1.4 print "$stderr\tWe have ", scalar keys %kv, " attributes to add for ", $fig->genus_species($genome), " ($genome)\n";
168 :     }
169 :    
170 :     if (open(TMPATTR,">$FIG_Config::temp/tmp$$"))
171 :     {
172 : overbeek 1.3 my($pegK,$k,$v);
173 :     foreach $pegK (sort keys(%kv))
174 :     {
175 :     ($peg,$k) = split(/\t/,$pegK);
176 : redwards 1.9 foreach my $l (@{$kv{$pegK}})
177 :     {
178 :     print TMPATTR "$peg\t$k\t$l";
179 :     }
180 : overbeek 1.3 }
181 :     close(TMPATTR);
182 : overbeek 1.1 $dbf->load_table( tbl => "attribute",
183 : overbeek 1.3 file => "$FIG_Config::temp/tmp$$" );
184 :     unlink("$FIG_Config::temp/tmp$$");
185 : redwards 1.4 }
186 :     else
187 :     {
188 : overbeek 1.3 die "$FIG_Config::temp/tmp$$";
189 : overbeek 1.1 }
190 :     }
191 : overbeek 1.3 $dbf->create_index( idx => "attribute_fid_ix",
192 :     tbl => "attribute",
193 :     type => "btree",
194 :     flds => "fid"
195 :     );

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3