# -*- perl -*- =pod =head1 RAE Library Some routines and things that Rob uses. Please feel free to use at will and incorporate into your own code or move them into FIG.pm or elsewhere. =cut package raelib; use strict; use FIG; my $fig=new FIG; =head2 new Just instantiate the object and return $self =cut sub new { my $self=shift; return $self } =head2 features_on_contig Returns a reference to an array containing all the features on a contig in a genome. use: my $arrayref=$rae->features_on_contig($genome, $contig); or foreach my $peg (@{$rae->features_on_contig($genome, $contig)}) { ... blah blah ... } returns undef if contig is not a part of genome or there is nothing to return, otherwise returns a list of pegs v. experimental and guaranteed not to work! =cut sub features_on_contig { my ($self, $genome, $contig)=@_; # were this in FIG.pm you'd use this line: #my $rdbH = $self->db_handle; my $rdbH = $fig->db_handle; my $relational_db_response=$rdbH->SQL('SELECT id FROM features WHERE (genome = \'' . $genome . '\' AND location ~* \'' . $contig . '\')'); # this is complicated. A reference to an array of references to arrays, and we only want the first element. # simplify. my @results; foreach my $res (@$relational_db_response) {push @results, $res->[0]} return \@results; } =head2 pirsfcorrespondence Generate the pirsf->fig id correspondence. This is only done once and the correspondence file is written. This is so that we can easily go back and forth. The correspondence has PIR ID \t FIG ID\n, and is probably based on ftp://ftp.pir.georgetown.edu/pir_databases/pirsf/data/pirsfinfo.dat This method takes three arguments: from : pirsfinfo.dat file to : file to write information to verbose : report on progress Returns the number of lines in the pirsinfo file that were read. =cut sub pirsfcorrespondence { my ($self, $from, $to, $verbose)=@_; unless (-e $from) { print STDERR "File $from does not exist as called in $0\n"; return 0; } open (IN, $from) || die "Can't open $from"; open (OUT, ">$to") || die "Can't write to $to"; my $linecount; while () { $linecount++; if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"} if (/^>/) {print OUT; next} chomp; foreach my $peg ($self->swiss_pir_ids($_)) { print OUT $_, "\t", $peg, "\n"; } } close IN; close OUT; return $linecount; } =head2 uniprotcorrespondence Generate a correspondence table between uniprot knowledge base IDs and FIG ID's. The uniprot KB file is in the form: UniProtKB_Primary_Accession | UniProtKB_ID | Section | Protein Name This method takes three arguments: from : uniprotKB file to : file to write information to verbose : report on progress Returns the number of lines in the pirsinfo file that were read. =cut sub uniprotcorrespondence { my ($self, $from, $to, $verbose)=@_; unless (-e $from) { print STDERR "File $from does not exist as called in $0\n"; return 0; } open (IN, $from) || die "Can't open $from"; open (OUT, ">$to") || die "Can't write to $to"; my $linecount; while () { chomp; $linecount++; if ($verbose && !($linecount % 10000)) {print STDERR "Parsed $linecount lines\n"} my @line=split /\s+\|\s+/; my $added; foreach my $peg ($self->swiss_pir_ids($line[0])) { print OUT "$_ | $peg\n"; $added=1; } unless ($added) {print OUT "$_\n"} } close IN; close OUT; return $linecount; } =head2 swiss_pir_ids() SwissProt/PIR have lots of ID's that we want to get, usually in this order - uni --> tr --> sp. This routine will map swissprot/pir ids to fig id's. =cut sub swiss_pir_ids { my ($self, $id)=@_; return () unless ($id); my @return=($fig->by_alias("uni|$id")); return @return if ($return[0]); @return=($fig->by_alias("tr|$id")); return @return if ($return[0]); @return=($fig->by_alias("sp|$id")); return @return if ($return[0]); return (); } =head2 ss_by_id Generate a list of subsystems that a peg occurs in. This is a ; separated list. This is a wrapper that removes roles and ignores essential things =cut sub ss_by_id { my ($self, $peg)=@_; my $ssout; foreach my $ss (sort $fig->subsystems_for_peg($peg)) { next if ($$ss[0] =~ /essential/i); # Ignore the Essential B-subtilis subsystems $ssout.=$$ss[0]."; "; } $ssout =~ s/; $//; return $ssout; } =head2 ss_by_homol Generate a list of subsystems that homologs of a peg occur in. This is a ; separated list. This is also a wrapper around sims and ss, but makes everything unified =cut sub ss_by_homol { my ($self, $peg)=@_; return unless ($peg); my ($maxN, $maxP)=(50, 1e-20); # find the sims my @sims=$fig->sims($peg, $maxN, $maxP, 'fig'); # we are only going to keep the best hit for each peg # in a subsystem my $best_ss_score; my $best_ss_id; foreach my $sim (@sims) { my $simpeg=$$sim[1]; my $simscore=$$sim[10]; my @subsys=$fig->subsystems_for_peg($simpeg); foreach my $ss (@subsys) { if (! defined $best_ss_score->{$$ss[0]}) {$best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg} elsif ($best_ss_score->{$$ss[0]} > $simscore) { $best_ss_score->{$$ss[0]}=$simscore; $best_ss_id->{$$ss[0]}=$simpeg; } } } my $ssoutput=join "", (map {"$_ (".$best_ss_id->{$_}."), "} keys %$best_ss_id); $ssoutput =~ s/, $//; return $ssoutput; } =head2 tagvalue This will just check for tag value pairs and return either an array of values or a single ; separated list (if called as a scalar) e.g. $values=raelib->tagvalue($peg, "PIRSF"); print join "\n", @$values; Returns an empty array if no tag/value appropriate. Just because I use this a lot I don't want to waste rewriting it. =cut sub tagvalue { my ($self, $peg, $tag)=@_; my @return; my @attr=$fig->feature_attributes($peg); foreach my $attr (@attr) { my ($gotpeg, $gottag, $val, $link)=@$attr; push @return, $val if ($gottag eq $tag); } return wantarray ? @return : join "; ", @return; } =head2 locations_on_contig Return the locations of a sequence on a contig. This will look for exact matches to a sequence on a contig, and return a reference to an array that has all the locations. my $locations=$raelib->locations_on_contig($genome, $contig, 'GATC', undef); foreach my $bp (@$locations) { ... do something ... } first argument : genome number second argument : contig name third argument : sequence to look for fourth argument : beginning position to start looking from (can be undef) fifth argument : end position to stop looking from (can be undef) sixth argument : check reverse complement (0 or undef will check forward, 1 or true will check rc) Note, the position is calculated before the sequence is rc'd =cut sub locations_on_contig { my ($self, $genome, $contig, $sequence, $from, $to, $check_reverse)=@_; my $return=[]; # get the dna sequence of the contig, and make sure it is uppercase my $contig_ln=$fig->contig_ln($genome, $contig); return $return unless ($contig_ln); unless ($from) {$from=1} unless ($to) {$to=$contig_ln} if ($from > $to) {($from, $to)=($to, $from)} my $dna_seq=$fig->dna_seq($genome, $contig."_".$from."_".$to); $dna_seq=uc($dna_seq); # if we want to check the rc, we actually rc the query $sequence=$fig->reverse_comp($sequence) if ($check_reverse); $sequence=uc($sequence); # now find all the matches my $posn=index($dna_seq, $sequence, 0); while ($posn > -1) { push @$return, $posn; $posn=index($dna_seq, $sequence, $posn+1); } return $return; } =head2 scrolling_org_list This is the list from index.cgi, that I call often. It has one minor modification: the value returned is solely the organisms id and does not contain genus_species information. I abstracted this here: 1, so I could call it often, and 2, so I could edit it once. use like this push @$html, $raelib->scrolling_org_list($cgi, $multiple); multiple selections will only be set if $multiple is true =cut sub scrolling_org_list { my ($self, $cgi, $multiple)=@_; unless ($multiple) {$multiple=0} my @display = ( 'All', 'Archaea', 'Bacteria', 'Eucarya', 'Viruses', 'Environmental samples' ); # # Canonical names must match the keywords used in the DBMS. They are # defined in compute_genome_counts.pl # my %canonical = ( 'All' => undef, 'Archaea' => 'Archaea', 'Bacteria' => 'Bacteria', 'Eucarya' => 'Eukaryota', 'Viruses' => 'Virus', 'Environmental samples' => 'Environmental Sample' ); my $req_dom = $cgi->param( 'domain' ) || 'All'; my @domains = $cgi->radio_group( -name => 'domain', -default => $req_dom, -override => 1, -values => [ @display ] ); my $n_domain = 0; my %dom_num = map { ( $_, $n_domain++ ) } @display; my $req_dom_num = $dom_num{ $req_dom } || 0; # # Viruses and Environmental samples must have completeness = All (that is # how they are in the database). Otherwise, default is Only "complete". # my $req_comp = ( $req_dom_num > $dom_num{ 'Eucarya' } ) ? 'All' : $cgi->param( 'complete' ) || 'Only "complete"'; my @complete = $cgi->radio_group( -name => 'complete', -default => $req_comp, -override => 1, -values => [ 'All', 'Only "complete"' ] ); # # Use $fig->genomes( complete, restricted, domain ) to get org list: # my $complete = ( $req_comp =~ /^all$/i ) ? undef : "complete"; my $orgs; my $label; @$orgs = $fig->genomes( $complete, undef, $canonical{ $req_dom } ); foreach (@$orgs) { my $gs = $fig->genus_species($_); my $gc=scalar $fig->all_contigs($_); $label->{$_} = "$gs ($_) [$gc contigs]"; } @$orgs = sort {$label->{$a} cmp $label->{$b}} @$orgs; my $n_genomes = @$orgs; return ( "\n", " \n", " ", " ", " \n", "
", $cgi->scrolling_list( -name => 'korgs', -values => $orgs, -labels => $label, -size => 10, -multiple => $multiple, ), $cgi->br, "$n_genomes genomes shown ", $cgi->submit( 'Update List' ), $cgi->reset, $cgi->br, " ", join( "
", "Domain(s) to show:", @domains), "
\n", join( "
", "Completeness?", @complete), "\n", "
\n", $cgi->hr ); } 1;