[Bio] / FigKernelPackages / ANNO.pm Repository:
ViewVC logotype

View of /FigKernelPackages/ANNO.pm

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.6 - (download) (as text) (annotate)
Wed Aug 25 15:49:01 2010 UTC (9 years, 7 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2010_0928, rast_rel_2010_0827
Changes since 1.5: +103 -0 lines
Added support for small DNA function service.

#!/usr/bin/perl -w
use strict;

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
package ANNO;

use strict;
use ERDB;
use Tracer;
use SeedUtils;
use ServerThing;

sub new {
    my ($class) = @_;
    # Create the sapling object.
    my $sap = ERDB::GetDatabase('Sapling');
    # Create the server object.
    my $retVal = { db => $sap };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;

=head2 Primary Methods

=head3 methods

    my $methodList =        $ssObject->methods();

Return a list of the methods allowed on this object.


use constant METHODS => [qw(metabolic_reconstruction

sub methods {
    # Get the parameters.
    my ($self) = @_;
    # Return the result.
    return METHODS;

# Docs are in ANNOserver.pm.

sub find_special_proteins {
    # Get the parameters.
    my ($self, $args) = @_;
    # Pull in the special protein finder.
    require find_special_proteins;
    # Convert the hash to the form expected by find_special_proteins.
    my $params = {
        contigs => $args->{-contigs},
        is_init => $args->{-is_init},
        is_alt => $args->{-is_alt},
        is_term => $args->{-is_term},
        comment => $args->{-comment}
    if (exists $args->{-templates}) {
        my $templates = $args->{-templates};
        if (ref $templates eq 'ARRAY') {
            $params->{references} = $templates;
        } elsif ($templates =~ /^pyr/) {
            $params->{pyrrolysine} = 1
    # Process the input.
    my @retVal = find_special_proteins::find_selenoproteins($params);
    # Return the result.
    return \@retVal;

sub metabolic_reconstruction {
    # Get the parameters.
    my ($self, $args) = @_;

    my $sapling = $self->{db};
    my $retVal = [];

    # This counter will be used to generate user IDs for roles without them.
    my $next = 1000;

    my $id_roles = $args->{-roles};
    my @id_roles1 = map { (ref $_ ? $_ : [$_, "FR" . ++$next]) } @$id_roles;

    my @id_roles = ();
    foreach my $tuple (@id_roles1)
	my($function,$id) = @$tuple;
	foreach my $role (split(/(?:; )|(?: [\]\@] )/,$function))

    my %big;
    my $id_display = 1;
    map {push(@{$big{$_->[0]}}, $_->[1])} @id_roles;
    my @resultRows = $sapling->GetAll("Subsystem Includes Role", 
                            'Subsystem(usable) = ? ORDER BY Subsystem(id), Includes(sequence)',
			    [1], [qw(Subsystem(id) Role(id) Includes(abbreviation))]);
    my %ss_roles;
    foreach my $row (@resultRows) {
        my ($sub, $role, $abbr) = @$row;
        $ss_roles{$sub}->{$role} = $abbr;
    foreach my $sub (keys %ss_roles) {
        my $roles = $ss_roles{$sub};
	my @rolesubset = grep { $big{$_} } keys %$roles;
        my @abbr = map{$roles->{$_}} @rolesubset;
        my $set =  join(" ",  @abbr);
        if (@abbr > 0) {
            my ($variant, $size) = $self->get_max_subset($sub, $set);
            if ($variant) {
                foreach my $role (keys %$roles) {
                    if ($id_display) {
			if (exists $big{$role}) {
			    foreach my $id (@{$big{$role}}) {
			        push (@$retVal, [$variant, $role, $id]);
                    } else {
                        push (@$retVal, [$variant, $role]);
    # Return the result.
    return $retVal;

=head3 assign_functions_to_dna_small

    my $idHash =            $annoObject->assign_functions_to_dna_small({
                                -seqs => [[$id1, $comment1, $seq1],
                                          [$id2, $comment2, $seq2],
                                          ... ],
                                -kmer => 10,
                                -minHits => 3,
                                -maxGap => 600,

This method uses FIGfams to assign functions to sequences. It is intended for smaller
sequence sets than the main method, because it eschews the normal flow control; however,
it is easier to use for things like the EXCEL interface.

The parameters are as follows.

=item parameter

The parameter should be a reference to a hash with the following keys.

=over 8

=item -seqs

Reference to a list of 3-tuples, each consisting of (0) an arbitrary unique ID and
(1) a comment, and (2) a sequence associated with the ID.

=item -kmer

KMER size (7 to 12) to use for the FIGfam analysis. Larger sizes are faster, smaller
sizes are more accurate.

=item -minHits (optional)

A number from 1 to 10, indicating the minimum number of matches required to
consider a protein as a candidate for assignment to a FIGfam. A higher value
indicates a more reliable matching algorithm; the default is C<3>.

=item -maxGap (optional)

When looking for a match, if two sequence elements match and are closer than
this distance, then they will be considered part of a single match. Otherwise,
the match will be split. The default is C<600>.


=item RETURN

Returns a hash mapping each incoming ID to a list of hit regions. Each hit
region is a 5-tuple consisting of (0) the number of matches to the function, (1) the
start location, (2) the stop location, (3) the proposed function, and (4) the name
of the Genome Set from which the gene is likely to have originated.



sub assign_functions_to_dna_small {
    # Get the parameters.
    my ($self, $args) = @_;
    # Get the Kmers object.
    my $kmers = $self->{kmers};
    # Analyze the options.
    my $maxGap = $args->{-maxGap} || 600;
    my $minHits = $args->{-minHits} || 3;
    # Get the KMER size.
    my $kmer = $args->{-kmer};
    # Declare the return variable.
    my $retVal = {};
    # Get the sequence tuples.
    my $seqs = ServerThing::GetIdList(-seqs => $args);
    # Loop through the sequences, finding assignments.
    for my $seqTuple (@$seqs) {
        # Extract the ID and sequence.
        my ($id, undef, $seq) = @$seqTuple;
        # Compute the assignment.
        my $assignment = $kmers->assign_functions_to_PEGs_in_DNA($kmer, $seq,
                                                                 $minHits, $maxGap);
        # Store the result.
        $retVal->{$id} = $assignment;
    # Return the results.
    return $retVal;

=head2 Internal Utility Methods

=head3 set_kmer_data


Store the default KMER object for this annotation service.


sub set_kmer_data {
    # Get the parameters.
    my ($self, $kmers) = @_;
    # Store the specified object.
    $self->{kmers} = $kmers;

=head3 get_max_subset

    my ($max_variant, $max_size) = $ssObject->get_max_subset($sub, $setA);

Given a subsystem ID and a role rule, return the ID of the variant for
the subsystem that matches the most roles in the rule and the number of
roles matched.

=over 4

=item sub

Name (ID) of the subsystem whose variants are to be examined.

=item setA

A space-delimited list of role abbreviations, lexically ordered. This provides
a unique specification of the roles in the set.

=item RETURN

Returns a 2-element list consisting of name variant found (subsystem name, colon,
and variant code) and the number of roles matched.



sub get_max_subset {
    my ($self, $sub, $setA) = @_;
    my $sapling = $self->{db};
    my $max_size = 0;
    my $max_set;
    my $max_variant;
    my %set_hash;
    my $qh = $sapling->Get("Subsystem Describes Variant", 'Subsystem(id) = ? AND Variant(type) = ?', [$sub, 'normal']);
    while (my $resultRow = $qh->Fetch()) {
        my @variantRoleRule = $resultRow->Value('Variant(role-rule)');
        my ($variantCode) = $resultRow->Value('Variant(code)');
        my $variantId = $sub.":".$variantCode;
        foreach my $setB (@variantRoleRule) {
                    my $size = is_A_a_superset_of_B($setA, $setB);
                    if ($size  && $size > $max_size) {
                            $max_size = $size;
                            $max_set = $setB;
                            $max_variant = $variantId;
    #if ($max_size) {
            #print STDERR "Success $max_variant, $max_set\n";
    return($max_variant, $max_size);

=head3 is_A_a_superset_of_B

    my $size = SS::is_A_a_superset_of_B($a, $b);

This method takes as input two role rules, and returns 0 if the first
role rule is NOT a superset of the second; otherwise, it returns the size
of the second rule. A role rule is a space-delimited list of role
abbreviations in lexical order. This provides a unique identifier for a
set of roles in a subsystem.

=over 4

=item a

First role rule.

=item b

Second role rule.

=item RETURN

Returns 0 if the first rule is NOT a superset of the second and the size of the
second rule if it is. As a result, if the first rule IS a superset, this method
will evaluate to TRUE, and to FALSE otherwise.



sub is_A_a_superset_of_B {
    my ($a, $b) = @_;
    my @a = split(" ", $a);
    my @b = split(" ", $b);
    if (@b > @a) {
    my %given;
    map { $given{$_} = 1} @a;
    map { if (! $given{$_}) {return 0}} split(" ", $b);
    my $l = scalar(@b);
    return scalar(@b);


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3