[Bio] / FigKernelScripts / add_structured_english.pl Repository:
ViewVC logotype

View of /FigKernelScripts/add_structured_english.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon May 19 18:02:28 2008 UTC (11 years, 6 months ago) by disz
Branch: MAIN
Ross's code to generate annotations for NCBI

use CGI;
my $cgi = new CGI;
use FIG;

my $fig = new FIG;

while (defined($_ = <STDIN>))
{
    chop;
    ($peg,$refseq,$seq,undef,$fixed_func,$ecs,undef,undef,undef) = split(/\t/,$_);
    $funcSeed = $fig->function_of($peg,undef,1);
    my($func1,$ecs1) = &fix_func($funcSeed);
    if (($func1 ne $fixed_func) || ($ecs1 ne $ecs))
    {
	print STDERR &Dumper($fixed_func,$func1,$ecs,$ecs1);
	print STDERR "$_\n";
    }
    else
    {
	my $structured_english = &to_structured_english($fig,$peg);
	my $text = $cgi->unescape($structured_english);
	if ($structured_english)
	{
	    print join("\t",($peg,$refseq,$seq,'',$fixed_func,$ecs,$structured_english,'','')),"\n";
	}
    }
}

sub to_structured_english {
    my($fig,$peg) = @_;

    my @ev_codes = &evidence_codes($fig,$peg);
    my $by_sub = {};
    foreach my $code (@ev_codes)
    {
	if ($code =~ /^isu;(\S.*\S)/)                { $by_sub->{$1}->{'isu'} = 1  }
	if ($code =~ /^icw\((\d+)\);(\S.*\S)/)       { $by_sub->{$2}->{'icw'} = $1 }
    }

    my @insubs = grep { $fig->usable_subsystem($_,1) } $fig->peg_to_subsystems($peg,1);
    my %subs = map { $_ => 1 } @insubs;
    $funcSeed = $fig->function_of($peg,undef,1);
    if (@insubs < 1) { return "" }

    my $pieces = [];
    &add_func_assertion($pieces,$funcSeed);
    &add_in_subs($pieces,\@insubs);
    foreach my $sub (@insubs)
    {
	&add_clustering_and_dup($pieces,$by_sub->{$sub},$sub);
    }
    return &render($pieces);
}

sub render {
    my($pieces) = @_;

    my @lines = ();
    my $curr  = "";
    foreach my $piece (@$pieces)
    {
	$piece = "$piece  ";
	$curr = $curr . $piece;

	while (length($curr) > 100)
	{
	    my($p1,$p2) = &split_piece($curr,100);
	    $p1 =~ s/^\s+//;
	    push(@lines, $p1);
	    $curr = $p2;
	}
    }
    if ($curr) 
    { 
	$curr =~ s/^\s+//; 
	push(@lines,$curr) ;
    }

    my $encoded = $cgi->escape(join("\n",@lines) . "\n");
#    print &Dumper($pieces,$encoded);
    return $encoded;
}

sub split_piece {
    my($piece,$n) = @_;

    my $i;
    for ($i = $n; ($i > 0) && (substr($piece,$i,1) ne " "); $i--) {}
    if ($i)
    {
	return (substr($piece,0,$i+1),substr($piece,$i+1));
    }
    else
    {
	return ($piece,"");
    }
}


sub add_clustering_and_dup {
    my($pieces,$by_sub_entry,$sub) = @_;

    if ($by_sub_entry)
    {
	if ($by_sub_entry->{isu} || $by_sub_entry->{icw})
	{
	    my $fixed_sub = &fix_sub_name($sub);
	    push(@$pieces,"In $fixed_sub, " . &isu_and_icw($by_sub_entry->{isu},$by_sub_entry->{icw}));
	}
    }
}

sub isu_and_icw {
    my($isu,$icw) = @_;

    if ($isu && $icw) { return "it appears to play a functional role that we have not associated with any other gene, and it occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
    if ($isu)         { return "it appears to play a functional role that we have not associated with any other gene." }
    if ($icw)         { "It occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
}

sub add_func_assertion {
    my($pieces,$func) = @_;

    push(@$pieces,"We currently believe that the function of the encoded protein is \"$funcSeed\".");
    return;
}

sub add_in_subs {
    my($pieces,$insubs) = @_;

    if (@$insubs > 0)
    {
	my $n = @$insubs;
	if ($n > 0)
	{
	    my $in_sub_state = "The protein occurs in " .
		               (($n == 1) ? "1 subsystem" : "$n subsystems") . ': ' . &subs($insubs) . ".";
	    push(@$pieces,$in_sub_state);
	}
    }
}

sub subs {
    my($subs) = @_;

    if (@$subs == 1) { return &fix_sub_name($subs->[0]) }
    my @subL = map { &fix_sub_name($_) } @$subs;
    $subL[$#subL] = "and $subL[$#subL]";
    return join(", ",@subL);
}

sub fix_sub_name {
    my($x) = @_;

    $x =~ s/_/ /g;
    return "\"$x\"";
}

sub evidence_codes {
    my($fig,$peg) = @_;

    if ($peg !~ /^fig\|\d+\.\d+\.peg\.\d+$/) { return "" }

    my @codes = grep { $_->[1] =~ /^evidence_code/i } $fig->get_attributes($peg);
    return map { $_->[2] } @codes;
}


sub fix_func {
    my($func) = @_;
    my($ecs);

    $ecs = {};
    while ($func =~ /^(.*\S)\s*\(EC ([^\)]+)\)(.*)$/)
    {
        $ecs->{$2} = 1;
        $func = $3 ? $1 . $3 : $1;
    }
    $func =~ s/\s+\@\s+/ AND /g;
    $func =~ s/;\s+/ AND\/OR /g;
    return ($func,join(";",sort keys(%$ecs)));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3