[Bio] / FigKernelScripts / evaluate_annotation_consistency.pl Repository:
ViewVC logotype

View of /FigKernelScripts/evaluate_annotation_consistency.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Mar 11 18:42:02 2013 UTC (6 years, 8 months ago) by fangfang
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
script for evaluating annotation consistency

use strict;
use Carp;
use Data::Dumper;
use Digest::MD5;

my $usage = "Usage: $0 MD5-sets [id-pattern] < protein.fasta \n
    - MD5-sets is a two-column tab-delimited table [ family-index, md5 ] that defines a set of protein families.
    - protein.fasta is the input file the functional annotation as comments whose consistency is to be computed.
\n";

my $md5_sets = shift @ARGV;
(-s $md5_sets) || die "You need to specify the file of md5 sets as the first argument";
my $id_pat = shift @ARGV;
if (! $id_pat) { $id_pat = '\\S' }

my %seen_md5;
my $N=0;

my %keep_md5s = map { ($_ =~ /^\S+\t(\S+)$/) ? ($1 => 1) : ()  } `cat $md5_sets`;
my %md5_to_func;

$/ = "\n>";
while (defined($_ = <STDIN>))
{
    chomp;
    if ($_ =~ /^>?(\S+)\s+([^\n]*)\n(.*)/s)
    {
	my $id = $1;
	my $hdr = $2;
	my $sequence = $3;
	$sequence =~ s/\s//g;
	my $md5 = Digest::MD5::md5_hex( uc $sequence );
	if ($keep_md5s{$md5})
	{
	    my @id_def_org = nr_def_as_id_def_org("$id $hdr");
	    my @lookat = grep { $_->[0] =~ /$id_pat/ } @id_def_org;
	    foreach my $tuple (@lookat)
	    {
		my($id,$func,$org) = @$tuple;

                # Remove genome-specific string from function
		$func =~ s/\s\S+\=.*$//;                      # sprot/trembl   function OS=Frog virus 3 GN=FV3-004R ...
		$func =~ s/\s\S+\[[^\]]*$//;                  # SEED and nr:   function [Organism ID]
                $func =~ s/^(Missing locus_\S+|\w+_\w+)\s+//; # IMG:           locus_ID function

		$md5_to_func{$md5}->{$func}++;
	    }
	    $N++ unless $seen_md5{$md5}++;
	}
    }
}
$/ = "\n";
open(SETS,"<","$md5_sets") || die "$md5_sets needs to be specified in the first argument";
my $all  = 0;
my $bad  = 0;

my $last = <SETS>;
while ($last && ($last =~ /^(\S+)/))
{
    my @set;
    my $curr = $1;
    while ($last && ($last =~ /^(\S+)\t(\S+)/) && ($curr eq $1))
    {
	push(@set,$2);
	$last = <SETS>;
    }

    my %by_func;
    foreach my $md5 (@set)
    {
	my $funcs = $md5_to_func{$md5};
	if ($funcs)
	{
	    foreach my $f (keys(%$funcs))
	    {
		$by_func{$f} += $funcs->{$f};
	    }
	}
    }
    my @vals = map { $by_func{$_} } keys(%by_func);
    my $tot=0;
    foreach $_ (@vals)
    {
	$tot += $_;
    }
    $all += int($tot * ($tot-1)/2);

    my $i;
    my $x = 0;
    my $d = 0;
    foreach my $v (@vals) {
        $x += $v;
        $d += $v * $v;
    }
    $bad += int(($x*$x - $d)/2);
    
    # my $i;
    # my $j;
    # for ($i=0; ($i < $#vals); $i++)
    # {
    #     for ($j=$i+1; ($j < @vals); $j++)
    #     {
    #         $bad += $vals[$i] * $vals[$j];
    #     }
    # }
}
my $frac = $bad / $all;
print "unique-md5s-in-sets=$N all=$all bad=$bad ";
print sprintf("inconsistency=%0.6f",$frac),"\n";


# functions from gjoseqlib.pm

#-------------------------------------------------------------------------------
#  Some functions to split fasta file def lines into [id, definition, organism]
#  These forms DO NOT want a > at the beinging of the line (it will become part
#  of the id).
#
#     @id_def_org = nr_def_as_id_def_org( $nr_seq_title )
#     @id_def_org = split_id_def_org( @seq_titles );
#
#-------------------------------------------------------------------------------
#
#  @id_def_org = nr_def_as_id_def_org( $nr_id_def_line );
#
sub nr_def_as_id_def_org
{
    defined($_[0]) ? split_id_def_org( split /\001/, $_[0] ) : ();
}
 

#
#  @id_def_org = split_id_def_org( @id_def_org_lines );
#
sub split_id_def_org
{
    map { ! defined( $_ )                           ? ()              
        : ! /^\s*\S/                                ? ()              
        : /^\s*(\S+)\s+(.*\S)\s+\[([^\[\]]+)\]\s*$/ ? [ $1, $2, $3 ]  # id def org
        : /^\s*(\S+)\s+\[([^\[\]]+)\]\s*$/          ? [ $1, '', $2 ]  # id     org
        : /^\s*(\S+)\s+(.*[^\]\s])\s*$/             ? [ $1, $2, '' ]  # id def
        : /^\s*(\S+)\s*$/                           ? [ $1, '', '' ]  # id
        :                                             split_id_def_org_2( $_ );
        } @_;
}

#
#  @id_def_org = split_id_def_org( @id_def_org_lines );
#
sub split_id_def_org_2
{
    local $_ = $_[0];
    s/^\s+//;
    s/\s+$//;
    # split into segments starting with [ or ]
    my @parts = grep { defined($_) && length($_) } split /([\]\[][^\]\[]*)/;
    my $n = 0;
    for ( my $i = @parts-1; $i > 0; $i-- )
    {
        # a part starts with [ or ].
        if    ( $parts[$i] =~ /^\[/ )
        {
            $n--;
            #  If the open bracket balances, and there
            #  is white space before, we are done
            if ( $n == 0 && $parts[$i-1] =~ s/\s+$// )
            {
                my $def = join( '', @parts[  0 .. $i-1 ] );
                $parts[$i]       =~ s/^\[\s*//;  # remove open bracket
                $parts[@parts-1] =~ s/\s*\]$//;  # remove close bracket
                my $org = join( '', @parts[ $i .. @parts-1 ] );
                return $def =~ /^(\S+)\s+(\S.*)$/ ? [ $1, $2, $org ] : [ $def, '', $org ];
            }
        }
        elsif ( $parts[$i] =~ /^\]/ )
        {
            $n++;
        }
        else
        {
            carp "Logic error in split_id_def_org_2()";
            return /^(\S+)\s+(\S.*)$/ ? [ $1, $2, '' ] : [ $_, '', '' ];
        }
    }

    #  Fall through means that we failed to balance organism brackets

    /^(\S+)\s+(\S.*)$/ ? [ $1, $2, '' ] : [ $_, '', '' ];
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3