[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

View of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Thu Oct 28 19:12:11 2010 UTC (9 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_02212011, rast_rel_2010_1206, rast_rel_2011_0119, mgrast_dev_02222011
Changes since 1.7: +3 -0 lines
kmer info changes

#!/usr/bin/perl -w
use strict;

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
package ANNO;

use strict;
use ERDB;
use Tracer;
use SeedUtils;
use ServerThing;

sub new {
    my ($class) = @_;
    # Create the sapling object.
    my $sap = ERDB::GetDatabase('Sapling');
    # Create the server object.
    my $retVal = { db => $sap };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}


=head2 Primary Methods

=head3 methods

    my $methodList =        $ssObject->methods();

Return a list of the methods allowed on this object.

=cut

use constant METHODS => [qw(metabolic_reconstruction
			    assign_function_to_prot
			    call_genes
			    find_rnas
			    assign_functions_to_DNA
			    find_special_proteins
                            assign_functions_to_dna_small
			    get_dataset
			    get_vector_basis_sets
			    get_active_datasets
                        )];

sub methods {
    # Get the parameters.
    my ($self) = @_;
    # Return the result.
    return METHODS;
}

#
# Docs are in ANNOserver.pm.
#

sub find_special_proteins {
    # Get the parameters.
    my ($self, $args) = @_;
    # Pull in the special protein finder.
    require find_special_proteins;
    # Convert the hash to the form expected by find_special_proteins.
    my $params = {
        contigs => $args->{-contigs},
        is_init => $args->{-is_init},
        is_alt => $args->{-is_alt},
        is_term => $args->{-is_term},
        comment => $args->{-comment}
        };
    if (exists $args->{-templates}) {
        my $templates = $args->{-templates};
        if (ref $templates eq 'ARRAY') {
            $params->{references} = $templates;
        } elsif ($templates =~ /^pyr/) {
            $params->{pyrrolysine} = 1
        }
    }
    # Process the input.
    my @retVal = find_special_proteins::find_selenoproteins($params);
    # Return the result.
    return \@retVal;
}

sub metabolic_reconstruction {
    # Get the parameters.
    my ($self, $args) = @_;

    my $sapling = $self->{db};
    my $retVal = [];

    # This counter will be used to generate user IDs for roles without them.
    my $next = 1000;

    my $id_roles = $args->{-roles};
    my @id_roles1 = map { (ref $_ ? $_ : [$_, "FR" . ++$next]) } @$id_roles;

    my @id_roles = ();
    foreach my $tuple (@id_roles1)
    {
	my($function,$id) = @$tuple;
	foreach my $role (split(/(?:; )|(?: [\]\@] )/,$function))
	{
	    push(@id_roles,[$role,$id]);
	}
    }

    my %big;
    my $id_display = 1;
    map {push(@{$big{$_->[0]}}, $_->[1])} @id_roles;
    my @resultRows = $sapling->GetAll("Subsystem Includes Role", 
                            'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
			    [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
    my %ss_roles;
    foreach my $row (@resultRows) {
        my ($sub, $role, $abbr) = @$row;
        $ss_roles{$sub}->{$role} = $abbr;
    }
    foreach my $sub (keys %ss_roles) {
        my $roles = $ss_roles{$sub};
	my @rolesubset = grep { $big{$_} } keys %$roles;
        my @abbr = map{$roles->{$_}} @rolesubset;
        my $set =  join(" ",  @abbr);
        if (@abbr > 0) {
            my ($variant, $size) = $self->get_max_subset($sub, $set);
            if ($variant) {
                foreach my $role (keys %$roles) {
                    if ($id_display) {
			if (exists $big{$role}) {
			    foreach my $id (@{$big{$role}}) {
			        push (@$retVal, [$variant, $role, $id]);
			    }
			}
                    } else {
                        push (@$retVal, [$variant, $role]);
                    }
                }
            }
        }
    }
    # Return the result.
    return $retVal;
}

=head3 assign_functions_to_dna_small

    my $idHash =            $annoObject->assign_functions_to_dna_small({
                                -seqs => [[$id1, $comment1, $seq1],
                                          [$id2, $comment2, $seq2],
                                          ... ],
                                -kmer => 10,
                                -minHits => 3,
                                -maxGap => 600,
                            });

This method uses FIGfams to assign functions to sequences. It is intended for smaller
sequence sets than the main method, because it eschews the normal flow control; however,
it is easier to use for things like the EXCEL interface.

The parameters are as follows.

=item parameter

The parameter should be a reference to a hash with the following keys.

=over 8

=item -seqs

Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID and
(1) a comment, and (2) a sequence associated with the ID.

=item -kmer

KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
sizes are more accurate.

=item -minHits (optional)

A number from 1 to 10, indicating the minimum number of matches required to
consider a protein as a candidate for assignment to a FIGfam. A higher value
indicates a more reliable matching algorithm; the default is C<3>.

=item -maxGap (optional)

When looking for a match, if two sequence elements match and are closer than
this distance, then they will be considered part of a single match. Otherwise,
the match will be split. The default is C<600>.

=back

=item RETURN

Returns a hash mapping each incoming ID to a list of hit regions. Each hit
region is a n-tuple consisting of (0) the number of matches to the function, (1) the
start location, (2) the stop location, (3) the proposed function, (4) the name
of the Genome Set from which the gene is likely to have originated, (5) the ID
number of the OTU (or C<undef> if the OTU was not found), and (6) the IDs of the
roles represented in the function, if any of them have IDs.


=back

=cut

sub assign_functions_to_dna_small {
    # Get the parameters.
    my ($self, $args) = @_;
    # Get the Kmers object.
    my $kmers = $self->{kmers};
    # Analyze the options.
    my $maxGap = $args->{-maxGap} || 600;
    my $minHits = $args->{-minHits} || 3;
    # Get the KMER size.
    my $kmer = $args->{-kmer};
    # Declare the return variable.
    my $retVal = {};
    # Get the sapling database.
    my $sap = $self->{db};
    # Get the sequence tuples.
    my $seqs = ServerThing::GetIdList(-seqs => $args);
    # Loop through the sequences, finding assignments.
    for my $seqTuple (@$seqs) {
        # Extract the ID and sequence.
        my ($id, undef, $seq) = @$seqTuple;
        # Compute the assignment.
        my $assignment = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq,
                                                                 $minHits, $maxGap);
        # Loop through the assignments, adding the function and OTU IDs.
        for my $tuple (@$assignment) {
            # Extract the function and OTU.
            my $function = $tuple->[3];
            my $otu = $tuple->[4];
            # Get the IDs for the roles (if any).
            my @roleIdx;
            if ($function) {
                # We have a function, so split it into roles.
                my @roles = roles_of_function($function);
                # Accumulate the IDs for the roles found.
                for my $role (@roles) {
                    push @roleIdx, $sap->GetEntityValues(Role => $role, ['role-index']);
                }
            }
            # Get the ID for the OTU (if any).
            my $otuIdx;
            if ($otu) {
                ($otuIdx) = $sap->GetFlat("Genome IsCollectedInto", 
                            'Genome(scientific-name) = ?', [$otu], 
                            'IsCollectedInto(to-link)');
            }
            # Update the tuple.
            splice @$tuple, 5, undef, $otuIdx, @roleIdx;
        }
        # Store the result.
        $retVal->{$id} = $assignment;
    }
    # Return the results.
    return $retVal;
}


=head2 Internal Utility Methods

=head3 set_kmer_data

    $annoObject->set_kmer_data($kmers);

Store the default KMER object for this annotation service.

=cut

sub set_kmer_data {
    # Get the parameters.
    my ($self, $kmers) = @_;
    # Store the specified object.
    $self->{kmers} = $kmers;
}

=head3 get_max_subset

    my ($max_variant, $max_size) = $ssObject->get_max_subset($sub, $setA);

Given a subsystem ID and a role rule, return the ID of the variant for
the subsystem that matches the most roles in the rule and the number of
roles matched.

=over 4

=item sub

Name (ID) of the subsystem whose variants are to be examined.

=item setA

A space-delimited list of role abbreviations, lexically ordered. This provides
a unique specification of the roles in the set.

=item RETURN

Returns a 2-element list consisting of name variant found (subsystem name, colon,
and variant code) and the number of roles matched.

=back

=cut

sub get_max_subset {
    my ($self, $sub, $setA) = @_;
    my $sapling = $self->{db};
    my $max_size = 0;
    my $max_set;
    my $max_variant;
    my %set_hash;
    my $qh = $sapling->Get("Subsystem Describes Variant", 'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
    while (my $resultRow = $qh->Fetch()) {
        my @variantRoleRule = $resultRow->Value('Variant(role-rule)');
        my ($variantCode) = $resultRow->Value('Variant(code)');
        my $variantId = $sub.":".$variantCode;
        foreach my $setB (@variantRoleRule) {
                    my $size = is_A_a_superset_of_B($setA, $setB);
                    if ($size  && $size > $max_size) {
                            $max_size = $size;
                            $max_set = $setB;
                            $max_variant = $variantId;
                    }
        }
    }
    #if ($max_size) {
            #print STDERR "Success $max_variant, $max_set\n";
    #}
    return($max_variant, $max_size);
}


=head3 is_A_a_superset_of_B

    my $size = SS::is_A_a_superset_of_B($a, $b);

This method takes as input two role rules, and returns 0 if the first
role rule is NOT a superset of the second; otherwise, it returns the size
of the second rule. A role rule is a space-delimited list of role
abbreviations in lexical order. This provides a unique identifier for a
set of roles in a subsystem.

=over 4

=item a

First role rule.

=item b

Second role rule.

=item RETURN

Returns 0 if the first rule is NOT a superset of the second and the size of the
second rule if it is. As a result, if the first rule IS a superset, this method
will evaluate to TRUE, and to FALSE otherwise.

=back

=cut

sub is_A_a_superset_of_B {
    my ($a, $b) = @_;
    my @a = split(" ", $a);
    my @b = split(" ", $b);
    if (@b > @a) {
            return(0);
    }
    my %given;
    map { $given{$_} = 1} @a;
    map { if (! $given{$_}) {return 0}} split(" ", $b);
    my $l = scalar(@b);
    return scalar(@b);
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3