[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

View of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.15 - (download) (as text) (annotate)
Tue Nov 24 19:23:36 2009 UTC (10 years, 4 months ago) by parrello
Branch: MAIN
Changes since 1.14: +195 -6 lines
New methods for Sapling server.

#!/usr/bin/perl -w

# This is a SAS component.

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

package SeedUtils;

    use strict;
    no warnings qw(once);
    use base qw(Exporter);
    our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
                     rev_comp genome_of min max sims verify_dir between translate

=head1 SEED Utility Methods

=head2 Introduction

This is a simple utility package that performs functions useful for
bioinformatics, but that do not require access to the databases.

=head2 Public Methods

=head3 boundaries_of

    my ($contig, $min, $max) = boundaries_of($locs);

Return the boundaries of a set of locations. The contig, the leftmost
location, and the rightmost location will be returned to the caller. If
more than one contig is represented, the method will return an undefined
value for the contig (indicating failure).

=over 4

=item locs

Reference to a list of location strings. A location string contains a contig ID,
and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).

=item RETURN

Returns a 3-element list. The first element is the contig ID from all the locations,
the second is the offset of leftmost base pair represented in the locations, and the
third is the offset of the rightmost base pair represented in the locations.



sub boundaries_of {
    # Get the parameters.
    my ($locs) = @_;
    # Declare the return variables.
    my ($contig, $min, $max);
    # We'll put all the starting and ending offsets found in here.
    my @offsets;
    # This will count the number of errors found.
    my $error = 0;
    # Loop through the locations.
    for my $loc (@$locs) {
        # Parse this location.
        if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
            my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
            # Is this contig valid?
            if ($contig && $newContig ne $contig) {
                # No, skip this location.
            } else {
                # Save the contig.
                $contig = $newContig;
                # Compute the ending offset.
                my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
                # Save both offsets.
                push @offsets, $begin, $end;
        } else {
            # The location is invalid, so it's an error,
    # If there's an error, clear the contig ID.
    if ($error) {
        $contig = undef;
    # Compute the min and max from the offsets collected.
    $min = min(@offsets);
    $max = max(@offsets);
    # Return the results.
    return ($contig, $min, $max);

=head3 max

    my $max = max(@nums);

Return the maximum number from all the values in the specified list.

=over 4

=item nums

List of numbers to examine.

=item RETURN

Returns the maximum numeric value from the specified parameters, or
an undefined value if an empty list is passed in.



sub max {
    my ($retVal, @nums) = @_;
    for my $num (@nums) {
        if ($num > $retVal) {
            $retVal = $num;
    return $retVal;

=head3 min

    my $min = min(@nums);

Return the minimum number from all the values in the specified list.

=over 4

=item nums

List of numbers to examine.

=item RETURN

Returns the minimum numeric value from the specified parameters, or
an undefined value if an empty list is passed in.



sub min {
    my ($retVal, @nums) = @_;
    for my $num (@nums) {
        if ($num < $retVal) {
            $retVal = $num;
    return $retVal;

=head3 create_fasta_record

    my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);

Create a FASTA record from the specified DNA or protein sequence. The
sequence will be split into 60-character lines, and the record will
include an identifier line.

=over 4

=item id

ID for the sequence, to be placed at the beginning of the identifier

=item comment (optional)

Comment text to place after the ID on the identifier line. If this parameter
is empty, undefined, or 0, no comment will be placed.

=item sequence

Sequence of letters to form into FASTA. For purposes of convenience, whitespace
characters in the sequence will be removed automatically.

=item stripped (optional)

If TRUE, then the sequence will be returned unmodified instead of converted
to FASTA format. The default is FALSE.

=item RETURN

Returns the desired sequence in FASTA format.



sub create_fasta_record {
    # Get the parameters.
    my ($id, $comment, $sequence, $stripped) = @_;
    # Declare the return variable.
    my $retVal;
    # If we're in stripped mode, we just return the sequence.
    if ($stripped) {
        $retVal = $sequence;
    } else {
        # Here we have to do the FASTA conversion. Start with the ID.
        my $header = ">$id";
        # Add a comment, if any.
        if ($comment) {
            $header .= " $comment";
        # Clean up the sequence.
        $sequence =~ s/\s+//g;
        # We need to format the sequence into 60-byte chunks. We use the infamous
        # grep-split trick. The split, because of the presence of the parentheses,
        # includes the matched delimiters in the output list. The grep strips out
        # the empty list items that appear between the so-called delimiters, since
        # the delimiters are what we want.
        my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
        # Add the chunks and the trailer.
        $retVal = join("\n", $header, @chunks) . "\n";
    # Return the result.
    return $retVal;

=head3 rev_comp

    my $revcmp = rev_comp($dna);



Return the reverse complement of a DNA string.

=over 4

=item dna

Either a DNA string, or a reference to a DNA string.

=item RETURN

If the input is a DNA string, returns the reverse complement. If the
input is a reference to a DNA string, the string itself is reverse



sub rev_comp {
    # Get the parameters.
    my ($dna) = @_;
    # Determine how we were called.
    my ($retVal, $refMode);
    if (ref $dna eq 'SCALAR') {
        $retVal = lc reverse $dna;
        $refMode = 0;
    } else {
        $retVal = lc reverse $$dna;
        $refMode = 1;
    # Now $retVal contains the reversed DNA string in all lower case, and
    # $refMode is TRUE iff the user passed in a reference. The following
    # translation step complements the string.
    $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;
    # Return the result in the method corresponding to the way it came in.
    if ($refMode) {
        $$dna = $retVal;
    } else {
        return $retVal;

=head3 by_fig_id

    my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids;

Compare two feature IDs.

This function is designed to assist in sorting features by ID. The sort is by
genome ID followed by feature type and then feature number.

=over 4

=item a

First feature ID.

=item b

Second feature ID.

=item RETURN

Returns a negative number if the first parameter is smaller, zero if both parameters
are equal, and a positive number if the first parameter is greater.



sub by_fig_id {
    my($a,$b) = @_;
    if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
         ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
        ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
    } else {
        $a cmp $b;

=head3 genome_of

    my $genomeID = genome_of($fid);

Return the Genome ID embedded in the specified FIG feature ID.

=over 4

=item fid

Feature ID of interest.

=item RETURN

Returns the genome ID in the middle portion of the FIG feature ID. If the
feature ID is invalid, this method returns an undefined value.



sub genome_of {
    # Get the parameters.
    my ($fid) = @_;
    # Declare the return variable.
    my $retVal;
    # Parse the feature ID.
    if ($fid =~ /^fig\|(\d+\.\d+)\./) {
        $retVal = $1;
    # Return the result.
    return $retVal;

=head3 sims

    my $sims = sims($id, \%seen, $maxN, $maxP, $select, $max_expand, $filters);

Retrieve similarities from the network similarity server. The similarity retrieval
is performed using an HTTP user agent that returns similarity data in multiple
chunks. An anonymous subroutine is passed to the user agent that parses and
reformats the chunks as they come in. The similarites themselves are returned
as B<Sim> objects. Sim objects are actually list references with 15 elements.
The Sim object methods allow access to the elements by name.

Similarities can be either raw or expanded. The raw similarities are basic
hits between features with similar DNA. Expanding a raw similarity drags in any
features considered substantially identical. So, for example, if features B<A1>,
B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.

=over 4

=item id

ID of the feature whose similarities are desired, or reference to a list
of the IDs of the features whose similarities are desired.

=item maxN

Maximum number of similarities to return.

=item maxP

The maximum allowable similarity score.

=item select

Selection criterion: C<raw> means only raw similarities are returned; C<fig>
means only similarities to FIG features are returned; C<all> means all expanded
similarities are returned; and C<figx> means similarities are expanded until the
number of FIG features equals the maximum.

=item max_expand

The maximum number of features to expand.

=item filters

Reference to a hash containing filter information, or a subroutine that can be
used to filter the sims.

=item RETURN

Returns a list of L<Sim> objects.



sub sims {
    # Get the parameters.
    my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
    # Get the URL for submitting to the sims server.
    my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
    # Get a list of the IDs to process.
    my @ids;
    if (ref($id) eq "ARRAY") {
        @ids = @$id;
    } else {
        @ids = ($id);
    # Form a list of the parameters to pass to the server.
    my %args = ();
    $args{id} = \@ids;
    $args{maxN} = $maxN if defined($maxN);
    $args{maxP} = $maxP if defined($maxP);
    $args{select} = $select if defined($select);
    $args{max_expand} = $max_expand if defined($max_expand);
    # If the filter is a hash, put the filters in the argument list.
    if (ref($filters) eq 'HASH') {
        for my $k (keys(%$filters))
            $args{"filter_$k"}= $filters->{$k};
    # Get the user agent.
    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new();
    # Insure we have the Sim module.
    require Sim;
    # Our next task is to create the anonymous subroutine that will process the
    # chunks that come back from the server. We require three global variables:
    # @sims to hold the similarities found, $tail to remember the unprocessed
    # data from the previous chunk, and $chunks to count the chunks.
    my @retVal;
    my $tail;
    my $chunks = 0;
    my $cb = sub {
        eval {
            # Get the parameters.
            my ($data, $command) = @_;
            # Check for a reset command. If we get one, we discard any data
            # in progress.
            if ($command && $command eq 'reset') {
                $tail = '';
            } else {
                # Get the data to process. Note we concatenate it to the incoming
                # tail from last time.
                my $c = $tail . $data;
                # Make sure the caller hasn't messed up the new-line character.
                # FASTA readers in particular are notorious for doing things
                # like that.
                local $/ = "\n";
                # Split the input into lines.
                my @lines = split(/\n/, $c);
                # If the input does not end with a new-line, we have a partial
                # chunk and need to put it in the tail for next time. If not,
                # there is no tail for next time.
                if (substr($c, -1, 1) ne "\n") {
                    $tail = pop @lines;
                } else {
                    $tail = '';
                # Loop through the lines. Note there's no need to chomp because
                # the SPLIT took out the new-line characters.
                for my $l (@lines) {
                    # Split the line into fields.
                    my @s = split(/\t/, $l);
                    # Insure we have all the fields we need.
                    if (@s >= 9) {
                        # Check to see if we've seen this SIM before.
                        my $id1 = $s[0];
                        my $id2 = $s[1];
                        # Add it to the result list.
                        push(@retVal, bless \@s, 'Sim');
    # Now we're ready to start. Because networking is an iffy thing, we set up
    # to try our request multiple times.
    my $n_retries = 10;
    my $attempts = 0;
    # Set the timeout value, in seconds.
    # Loop until we succeed or run out of retries.
    my $done = 0;
    while (! $done && $attempts++ < $n_retries) {
        # Reset the content processor. This clears the tail.
        &$cb(undef, 'reset');
        my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
        if ($resp->is_success) {
            # If the response was successful, get the content. This triggers
            # the anonymous subroutine.
            my $x = $resp->content;
            # Denote we've been successful.
            $done = 1;
    return @retVal;

=head3 verify_dir


Insure that the specified directory exists. If the directory does not
exist, it will be created.

=over 4

=item dirName

Name of the relevant directory.



sub verify_dir {
    # Get the parameters.
    my ($dirName) = @_;
    # Strip off the final slash, if any.
    $dirName =~ s#/$##;
    # Only proceed if the directory does NOT already exist.
    if (! -d $dirName) {
        # If there is a parent directory, recursively insure it is there.
        if ($dirName =~ m#(.+)/[^/]+$#) {
        # Create this particular directory with full permissions.
        mkdir $dirName, 0777;

=head3 parse_fasta_record

    my ($id, $comment, $seq) = parse_fasta_record($string);

Extract the ID, comment, and sequence from a single FASTA record. For
backward compatability, instead of a FASTA record the ID and sequence can
be specified separated by a comma. In this case, the returned comment
will be empty.

=over 4

=item string

A single FASTA record, or an ID and sequence separated by a single comma,
an unadorned sequence, a 2-element list consisting of an ID and a sequence,
or a 3-element list consisting of an ID, a comment, and a sequence.

=item RETURN

Returns a three-element list consisting of the incoming ID, the associated
comment, and the specified DNA or protein sequence. If the incoming string is
invalid, all three list elements will come back undefined. If no ID is
specified, an MD5 will be provided.



sub parse_fasta_record {
    # Get the parameters.
    my ($string) = @_;
    # Declare the return variables.
    my ($id, $comment, $seq);
    # Check the type of input string.
    if (! defined $string) {
        # Do nothing if no string was passed in. This extra check prevents a
        # warning at runtime.
    } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
        # Here we have a standard FASTA string.
        ($id, $comment, $seq) = ($1, $2, $3);
        # Remove white space from the sequence string.
        $seq =~ s/\s+//sg;
        # Trim front of comment.
        $comment =~ s/^s+//;
    } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
        ($id, $comment, $seq) = ($1, '', $2);
    } elsif (ref $string eq 'ARRAY') {
        # Here the data came in pre-formatted as a list reference.
        ($id, $comment, $seq) = @$string;
        # If there's no comment, we need to adjust.
        if (! defined $seq) {
            $seq = $comment;
            $comment = '';
    } else {
        # Here we have only a sequence. We need to construct the ID.
        $seq = $string;
        require Digest::MD5;
        $id = "md5|" . Digest::MD5::md5_base64($seq);
        $comment = "";
    # Return the results.
    return ($id, $comment, $seq);

=head3 hypo

    my $flag = hypo($func);

Return TRUE if the specified functional role is hypothetical, else FALSE.
Hypothetical functional roles are identified by key words in the text,
such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).

=over 4

=item func

Text of the functional role whose nature is to be determined.

=item RETURN

Returns TRUE if the role is hypothetical, else FALSE.



sub hypo {
    my ($func) = @_;
    if (! $func)                             { return 1 }
    if ($func =~ /lmo\d+ protein/i)          { return 1 }
    if ($func =~ /hypoth/i)                  { return 1 }
    if ($func =~ /conserved protein/i)       { return 1 }
    if ($func =~ /gene product/i)            { return 1 }
    if ($func =~ /interpro/i)                { return 1 }
    if ($func =~ /B[sl][lr]\d/i)             { return 1 }
    if ($func =~ /^U\d/)                     { return 1 }
    if ($func =~ /^orf[^_]/i)                { return 1 }
    if ($func =~ /uncharacterized/i)         { return 1 }
    if ($func =~ /pseudogene/i)              { return 1 }
    if ($func =~ /^predicted/i)              { return 1 }
    if ($func =~ /AGR_/)                     { return 1 }
    if ($func =~ /similar to/i)              { return 1 }
    if ($func =~ /similarity/i)              { return 1 }
    if ($func =~ /glimmer/i)                 { return 1 }
    if ($func =~ /unknown/i)                 { return 1 }
    if (($func =~ /domain/i) ||
        ($func =~ /^y[a-z]{2,4}\b/i) ||
        ($func =~ /complete/i) ||
        ($func =~ /ensang/i) ||
        ($func =~ /unnamed/i) ||
        ($func =~ /EG:/) ||
        ($func =~ /orf\d+/i) ||
        ($func =~ /RIKEN/) ||
        ($func =~ /Expressed/i) ||
        ($func =~ /[a-zA-Z]{2,3}\|/) ||
        ($func =~ /predicted by Psort/) ||
        ($func =~ /^bh\d+/i) ||
        ($func =~ /cds_/i) ||
        ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
        ($func =~ /similar to/i) ||
        ($func =~ / identi/i) ||
        ($func =~ /ortholog of/i) ||
        ($func =~ /structural feature/i))    { return 1 }
    return 0;


=head3 between

    my $flag = between($x, $y, $z);

Determine whether or not $y is between $x and $z.

=over 4

=item x

First edge number.

=item y

Number to examine.

=item z

Second edge number.

=item RETURN

Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.


#: Return Type $;
sub between {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($x,$y,$z) = @_;

    if ($x < $z) {
        return (($x <= $y) && ($y <= $z));
    } else {
        return (($x >= $y) && ($y >= $z));

=head3 standard_genetic_code

    my $code = standard_genetic_code();

Return a hash containing the standard translation of nucleotide triples to proteins.
Methods such as L</translate> can take a translation scheme as a parameter. This method
returns the default translation scheme. The scheme is implemented as a reference to a
hash that contains nucleotide triplets as keys and has protein letters as values.


sub standard_genetic_code {

    my $code = {};

    $code->{"AAA"} = "K";
    $code->{"AAC"} = "N";
    $code->{"AAG"} = "K";
    $code->{"AAT"} = "N";
    $code->{"ACA"} = "T";
    $code->{"ACC"} = "T";
    $code->{"ACG"} = "T";
    $code->{"ACT"} = "T";
    $code->{"AGA"} = "R";
    $code->{"AGC"} = "S";
    $code->{"AGG"} = "R";
    $code->{"AGT"} = "S";
    $code->{"ATA"} = "I";
    $code->{"ATC"} = "I";
    $code->{"ATG"} = "M";
    $code->{"ATT"} = "I";
    $code->{"CAA"} = "Q";
    $code->{"CAC"} = "H";
    $code->{"CAG"} = "Q";
    $code->{"CAT"} = "H";
    $code->{"CCA"} = "P";
    $code->{"CCC"} = "P";
    $code->{"CCG"} = "P";
    $code->{"CCT"} = "P";
    $code->{"CGA"} = "R";
    $code->{"CGC"} = "R";
    $code->{"CGG"} = "R";
    $code->{"CGT"} = "R";
    $code->{"CTA"} = "L";
    $code->{"CTC"} = "L";
    $code->{"CTG"} = "L";
    $code->{"CTT"} = "L";
    $code->{"GAA"} = "E";
    $code->{"GAC"} = "D";
    $code->{"GAG"} = "E";
    $code->{"GAT"} = "D";
    $code->{"GCA"} = "A";
    $code->{"GCC"} = "A";
    $code->{"GCG"} = "A";
    $code->{"GCT"} = "A";
    $code->{"GGA"} = "G";
    $code->{"GGC"} = "G";
    $code->{"GGG"} = "G";
    $code->{"GGT"} = "G";
    $code->{"GTA"} = "V";
    $code->{"GTC"} = "V";
    $code->{"GTG"} = "V";
    $code->{"GTT"} = "V";
    $code->{"TAA"} = "*";
    $code->{"TAC"} = "Y";
    $code->{"TAG"} = "*";
    $code->{"TAT"} = "Y";
    $code->{"TCA"} = "S";
    $code->{"TCC"} = "S";
    $code->{"TCG"} = "S";
    $code->{"TCT"} = "S";
    $code->{"TGA"} = "*";
    $code->{"TGC"} = "C";
    $code->{"TGG"} = "W";
    $code->{"TGT"} = "C";
    $code->{"TTA"} = "L";
    $code->{"TTC"} = "F";
    $code->{"TTG"} = "L";
    $code->{"TTT"} = "F";

    return $code;

=head3 translate

    my $aa_seq = translate($dna_seq, $code, $fix_start);

Translate a DNA sequence to a protein sequence using the specified genetic code.
If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
C<M>. (In the standard genetic code, these two combinations normally translate
to C<V> and C<L>, respectively.)

=over 4

=item dna_seq

DNA sequence to translate. Note that the DNA sequence can only contain
known nucleotides.

=item code

Reference to a hash specifying the translation code. The hash is keyed by
nucleotide triples, and the value for each key is the corresponding protein
letter. If this parameter is omitted, the L</standard_genetic_code> will be

=item fix_start

TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
then a value of C<TTG> or C<GTG> in the first position will be translated to
C<M> instead of the value specified in the translation code.

=item RETURN

Returns a string resulting from translating each nucleotide triple into a
protein letter.


#: Return Type $;
sub translate {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);

    my( $dna,$code,$start ) = @_;
    my( $i,$j,$ln );
    my( $x,$y );
    my( $prot );
    if (! defined($code)) {
        $code = &FIG::standard_genetic_code;
    $ln = length($dna);
    $prot = "X" x ($ln/3);
    $dna =~ tr/a-z/A-Z/;

    for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
        $x = substr($dna,$i,3);
        if ($y = $code->{$x}) {
            substr($prot,$j,1) = $y;

    if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
        substr($prot,0,1) = 'M';
    return $prot;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3