[Bio] / FigKernelPackages / raelib.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/raelib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 : parrello 1.13 =head1 RAE Library
6 : redwards 1.1
7 :     Some routines and things that Rob uses. Please feel free to use at will and incorporate into
8 :     your own code or move them into FIG.pm or elsewhere.
9 :    
10 :     =cut
11 :    
12 :     package raelib;
13 :     use strict;
14 :     use FIG;
15 :     my $fig=new FIG;
16 :    
17 : redwards 1.5 =head2 new
18 :    
19 :     Just instantiate the object and return $self
20 :    
21 :     =cut
22 :    
23 :     sub new {
24 :     my $self=shift;
25 :     return $self
26 :     }
27 :    
28 :    
29 :    
30 :    
31 : redwards 1.4 =head2 features_on_contig
32 :    
33 :     Returns a reference to an array containing all the features on a contig in a genome.
34 :    
35 :     use:
36 :    
37 :     my $arrayref=$rae->features_on_contig($genome, $contig);
38 :    
39 :     or
40 :    
41 :     foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) {
42 :     ... blah blah ...
43 :     }
44 :    
45 :     returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs
46 :    
47 :     v. experimental and guaranteed not to work!
48 :    
49 :     =cut
50 :    
51 :     sub features_on_contig {
52 :     my ($self, $genome, $contig)=@_;
53 :     # were this in FIG.pm you'd use this line:
54 :     #my $rdbH = $self->db_handle;
55 :    
56 :     my $rdbH = $fig->db_handle;
57 :     my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')');
58 :     # this is complicated. A reference to an array of references to arrays, and we only want the first element.
59 :     # simplify.
60 :     my @results;
61 :     foreach my $res (@$relational_db_response) {push @results, $res->[0]}
62 :     return \@results;
63 :     }
64 :    
65 :    
66 :    
67 :    
68 :    
69 :    
70 :    
71 : redwards 1.7 =head2 pirsfcorrespondence
72 : redwards 1.1
73 : redwards 1.7 Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written.
74 : redwards 1.1 This is so that we can easily go back and forth.
75 :    
76 : redwards 1.7 The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat
77 : redwards 1.1
78 : redwards 1.9 This method takes three arguments:
79 :     from : pirsfinfo.dat file
80 :     to : file to write information to
81 :     verbose : report on progress
82 :    
83 :     Returns the number of lines in the pirsinfo file that were read.
84 :    
85 : redwards 1.1 =cut
86 :    
87 : redwards 1.7 sub pirsfcorrespondence {
88 : redwards 1.9 my ($self, $from, $to, $verbose)=@_;
89 : redwards 1.10 unless (-e $from) {
90 :     print STDERR "File $from does not exist as called in $0\n";
91 :     return 0;
92 :     }
93 : redwards 1.1 open (IN, $from) || die "Can't open $from";
94 : redwards 1.8 open (OUT, ">$to") || die "Can't write to $to";
95 : redwards 1.9 my $linecount;
96 : redwards 1.1 while (<IN>) {
97 : redwards 1.9 $linecount++;
98 : redwards 1.14 if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
99 : redwards 1.1 if (/^>/) {print OUT; next}
100 :     chomp;
101 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($_)) {
102 : redwards 1.1 print OUT $_, "\t", $peg, "\n";
103 :     }
104 : redwards 1.14 }
105 :     close IN;
106 :     close OUT;
107 :     return $linecount;
108 :     }
109 :    
110 :     =head2 uniprotcorrespondence
111 :    
112 :     Generate a correspondence table between uniprot knowledge base IDs and FIG ID's.
113 :    
114 :     The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name
115 :    
116 :     This method takes three arguments:
117 :     from : uniprotKB file
118 :     to : file to write information to
119 :     verbose : report on progress
120 :    
121 :     Returns the number of lines in the pirsinfo file that were read.
122 :    
123 :     =cut
124 :    
125 :     sub uniprotcorrespondence {
126 :     my ($self, $from, $to, $verbose)=@_;
127 :     unless (-e $from) {
128 :     print STDERR "File $from does not exist as called in $0\n";
129 :     return 0;
130 :     }
131 :     open (IN, $from) || die "Can't open $from";
132 :     open (OUT, ">$to") || die "Can't write to $to";
133 :     my $linecount;
134 :     while (<IN>) {
135 :     chomp;
136 :     $linecount++;
137 :     if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"}
138 :     my @line=split /\s+\|\s+/;
139 : redwards 1.16 my $added;
140 : redwards 1.14 foreach my $peg ($self->swiss_pir_ids($line[0])) {
141 :     print OUT "$_ | $peg\n";
142 : redwards 1.16 $added=1;
143 : redwards 1.12 }
144 : redwards 1.16 unless ($added) {print OUT "$_\n"}
145 : redwards 1.1 }
146 :     close IN;
147 :     close OUT;
148 : redwards 1.9 return $linecount;
149 : redwards 1.1 }
150 :    
151 : redwards 1.14
152 :    
153 :     =head2 swiss_pir_ids()
154 :    
155 :     SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's.
156 :    
157 :     =cut
158 :    
159 :     sub swiss_pir_ids {
160 :     my ($self, $id)=@_;
161 :     return () unless ($id);
162 :    
163 : redwards 1.15 my @return=($fig->by_alias("uni|$id"));
164 : redwards 1.14 return @return if ($return[0]);
165 :    
166 : redwards 1.15 @return=($fig->by_alias("tr|$id"));
167 : redwards 1.14 return @return if ($return[0]);
168 :    
169 : redwards 1.15 @return=($fig->by_alias("sp|$id"));
170 : redwards 1.14 return @return if ($return[0]);
171 :    
172 :     return ();
173 :     }
174 : redwards 1.1
175 :     =head2 ss_by_id
176 :    
177 :     Generate a list of subsystems that a peg occurs in. This is a ; separated list.
178 :     This is a wrapper that removes roles and ignores essential things
179 :    
180 :     =cut
181 :    
182 :     sub ss_by_id {
183 :     my ($self, $peg)=@_;
184 :     my $ssout;
185 :     foreach my $ss (sort $fig->subsystems_for_peg($peg))
186 :     {
187 :     next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems
188 :     $ssout.=$$ss[0]."; ";
189 :     }
190 :     $ssout =~ s/; $//;
191 :     return $ssout;
192 :     }
193 :    
194 : redwards 1.3 =head2 ss_by_homol
195 :    
196 :     Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list.
197 :     This is also a wrapper around sims and ss, but makes everything unified
198 :    
199 :     =cut
200 :    
201 :     sub ss_by_homol {
202 :     my ($self, $peg)=@_;
203 :     return unless ($peg);
204 :     my ($maxN, $maxP)=(50, 1e-20);
205 :    
206 :     # find the sims
207 :     my @sims=$fig->sims($peg, $maxN, $maxP, 'fig');
208 :    
209 :     # we are only going to keep the best hit for each peg
210 :     # in a subsystem
211 :     my $best_ss_score; my $best_ss_id;
212 :     foreach my $sim (@sims)
213 :     {
214 :     my $simpeg=$$sim[1];
215 :     my $simscore=$$sim[10];
216 :     my @subsys=$fig->subsystems_for_peg($simpeg);
217 :     foreach my $ss (@subsys)
218 :     {
219 :     if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg}
220 :     elsif ($best_ss_score->{$$ss[0]} > $simscore)
221 :     {
222 :     $best_ss_score->{$$ss[0]}=$simscore;
223 :     $best_ss_id->{$$ss[0]}=$simpeg;
224 :     }
225 :     }
226 :     }
227 :    
228 :     my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id);
229 :    
230 :     $ssoutput =~ s/, $//;
231 :     return $ssoutput;
232 :     }
233 :    
234 :     =head2 tagvalue
235 :    
236 :     This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar)
237 :    
238 :     e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values;
239 :    
240 :     Returns an empty array if no tag/value appropriate.
241 :    
242 :     Just because I use this a lot I don't want to waste rewriting it.
243 :    
244 :     =cut
245 :    
246 :     sub tagvalue {
247 :     my ($self, $peg, $tag)=@_;
248 :     my @return;
249 :     my @attr=$fig->feature_attributes($peg);
250 :     foreach my $attr (@attr) {
251 : redwards 1.11 my ($gotpeg, $gottag, $val, $link)=@$attr;
252 : redwards 1.3 push @return, $val if ($gottag eq $tag);
253 :     }
254 :     return wantarray ? @return : join "; ", @return;
255 :     }
256 : redwards 1.1
257 : redwards 1.5 =head2 locations_on_contig
258 :    
259 :     Return the locations of a sequence on a contig.
260 :    
261 :     This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations.
262 :    
263 :     my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef);
264 :     foreach my $bp (@$locations) { ... do something ... }
265 :    
266 :     first argument : genome number
267 :     second argument : contig name
268 :     third argument : sequence to look for
269 :     fourth argument : beginning position to start looking from (can be undef)
270 :     fifth argument : end position to stop looking from (can be undef)
271 :     sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc)
272 :    
273 :     Note, the position is calculated before the sequence is rc'd
274 :    
275 :     =cut
276 :    
277 :     sub locations_on_contig {
278 :     my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_;
279 :     my $return=[];
280 :    
281 :     # get the dna sequence of the contig, and make sure it is uppercase
282 :     my $contig_ln=$fig->contig_ln($genome, $contig);
283 :     return $return unless ($contig_ln);
284 :     unless ($from) {$from=1}
285 :     unless ($to) {$to=$contig_ln}
286 :     if ($from > $to) {($from, $to)=($to, $from)}
287 :     my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to);
288 :     $dna_seq=uc($dna_seq);
289 :    
290 :     # if we want to check the rc, we actually rc the query
291 :     $sequence=$fig->reverse_comp($sequence) if ($check_reverse);
292 :     $sequence=uc($sequence);
293 :    
294 :     # now find all the matches
295 :     my $posn=index($dna_seq, $sequence, 0);
296 :     while ($posn > -1) {
297 :     push @$return, $posn;
298 :     $posn=index($dna_seq, $sequence, $posn+1);
299 :     }
300 :     return $return;
301 :     }
302 :    
303 :    
304 :     =head2 scrolling_org_list
305 :    
306 :     This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once.
307 :    
308 :     use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple);
309 :    
310 :     multiple selections will only be set if $multiple is true
311 :    
312 :     =cut
313 :    
314 :     sub scrolling_org_list {
315 :     my ($self, $cgi, $multiple)=@_;
316 :     unless ($multiple) {$multiple=0}
317 :    
318 :     my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' );
319 :    
320 :     #
321 :     # Canonical names must match the keywords used in the DBMS. They are
322 :     # defined in compute_genome_counts.pl
323 :     #
324 :     my %canonical = (
325 :     'All' => undef,
326 :     'Archaea' => 'Archaea',
327 :     'Bacteria' => 'Bacteria',
328 :     'Eucarya' => 'Eukaryota',
329 :     'Viruses' => 'Virus',
330 :     'Environmental samples' => 'Environmental Sample'
331 :     );
332 :    
333 :     my $req_dom = $cgi->param( 'domain' ) || 'All';
334 :     my @domains = $cgi->radio_group( -name => 'domain',
335 :     -default => $req_dom,
336 :     -override => 1,
337 :     -values => [ @display ]
338 :     );
339 :    
340 :     my $n_domain = 0;
341 :     my %dom_num = map { ( $_, $n_domain++ ) } @display;
342 :     my $req_dom_num = $dom_num{ $req_dom } || 0;
343 :    
344 :     #
345 :     # Viruses and Environmental samples must have completeness = All (that is
346 :     # how they are in the database). Otherwise, default is Only "complete".
347 :     #
348 :     my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All'
349 :     : $cgi->param( 'complete' ) || 'Only "complete"';
350 :     my @complete = $cgi->radio_group( -name => 'complete',
351 :     -default => $req_comp,
352 :     -override => 1,
353 :     -values => [ 'All', 'Only "complete"' ]
354 :     );
355 :     #
356 :     # Use $fig->genomes( complete, restricted, domain ) to get org list:
357 :     #
358 :     my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete";
359 :    
360 :     my $orgs; my $label;
361 :     @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } );
362 :    
363 :     foreach (@$orgs) {
364 :     my $gs = $fig->genus_species($_);
365 :     my $gc=scalar $fig->all_contigs($_);
366 :     $label->{$_} = "$gs ($_) [$gc contigs]";
367 :     }
368 :    
369 :     @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs;
370 :    
371 :     my $n_genomes = @$orgs;
372 :    
373 :     return ( "<TABLE>\n",
374 :     " <TR>\n",
375 :     " <TD>",
376 : redwards 1.6 $cgi->scrolling_list( -name => 'korgs',
377 :     -values => $orgs,
378 :     -labels => $label,
379 :     -size => 10,
380 :     -multiple => $multiple,
381 : redwards 1.5 ), $cgi->br,
382 :     "$n_genomes genomes shown ",
383 :     $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br,
384 :     " </TD>",
385 :     " <TD>",
386 :     join( "<br>", "<b>Domain(s) to show:</b>", @domains), "<br>\n",
387 :     join( "<br>", "<b>Completeness?</b>", @complete), "\n",
388 :     "</TD>",
389 :     " </TR>\n",
390 :     "</TABLE>\n",
391 :     $cgi->hr
392 :     );
393 :     }
394 :    
395 :    
396 : redwards 1.1
397 :    
398 :    
399 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3