[Bio] / FigKernelScripts / add_structured_english.pl Repository:
ViewVC logotype

View of /FigKernelScripts/add_structured_english.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri May 23 20:53:03 2008 UTC (11 years, 6 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2008_06_18, rast_rel_2008_06_16
Changes since 1.1: +31 -6 lines
Fixes for NCBI upload.

use CGI;
my $cgi = new CGI;
use FIG;

my $fig = new FIG;


#fig|169963.1.peg.1      NC_003210       318_1673        NP_463534.1     isu,icw(1),ff   dnaA    Chromosomal replication initiator protein dnaA  CDD:pfam00004,CDD:pfam00308,CDD:pfam01695,CDD:smart00382,GeneID:984365,InterPro:IPR001957,InterPro:IPR003593,NCBI_gi:16802049,Pfam:PF00308,SMART:Q8YAW2,SMART:SM00382,UniProtKB:Q8YAW2                  http://www.nmpdr.org/linkin.cgi?id=fig|169963.1.peg.1   MQSIED

while (defined($_ = <STDIN>))
{
    chop;
    ($peg,$contig_refseq,$contig_refseq_coords,$refseq,$ev, $gene, $fixed_func, $dbxref, $kegg, $subsystems, $link, $figfam, $seq) = split(/\t/,$_);
    $funcSeed = $fig->function_of($peg,undef,1);
    my($func1,$ecs1) = &fix_func($funcSeed);
    if (0) 
    #if (($func1 ne $fixed_func) || ($ecs1 ne $ecs))
    #if ($func1 ne $fixed_func)
    {
	print STDERR &Dumper($fixed_func,$func1,$ecs,$ecs1);
	print STDERR "$_\n";
    }
    else
    {
	my ($evcodes, $subs, $structured_english) = &to_structured_english($fig,$peg);
	my $text = $cgi->unescape($structured_english);
	
	my $nmpdr_id_text = "nmpdr_id=$peg";
	my $contig_refseq_text= "reference_contig_refseq=$contig_refseq";
	my $refseq_coords_text = "reference_contig_refseq_coordinates=$contig_refseq_coords";
	my $refseq_id_text= "reference_protein_refseq=$refseq";
	my $evc_text= "evidence_code=$ev";
	my $gene_symbol_text = "gene_symbol=$gene";
	my $desc_text = "description=\"$func1\"";
	#my $desc_text = "description=\"$fixed_func\"";
	my $dbxref_text = "dbxref=$dbxref";
	my $kegg_text = "kegg=$kegg";
	my $subsystem_text = "subsystem=\"$subs\"";
	my $nmpdr_link_text = "nmpdr_web_page=\"$link\"";
	my $structured_eng_text= "structured_description=\`$structured_english\`";
	my $figfam_text= "FIGfam=$figfam";
	my $seq_text= "protein_sequence=\"$seq\"";

	if ($structured_english && $seq)
	{
	    print join("\n",($nmpdr_id_text, $contig_refseq_text, $refseq_coords_text, $refseq_id_text, $evc_text, $gene_symbol_text, $desc_text, $dbxref_text, $subsystem_text, $nmpdr_link_text, $figfam_text, $structured_eng_text, $seq_text)), "\n//\n"; 
	    #print join("\n",($nmpdr_id_text, $contig_refseq_text, $refseq_coords_text, $refseq_id_text, $evc_text, $gene_symbol_text, $desc_text, $dbxref_text, $kegg_text, $subsystem_text, $nmpdr_link_text, $structured_eng_text, $seq_text)), "\n"; 
	    #print join("\t",($peg,$refseq,$seq,'',$fixed_func,$ecs,$structured_english,'','')),"\n";
	}
    }
}

sub to_structured_english {
    my($fig,$peg) = @_;

    my @ev_codes = &evidence_codes($fig,$peg);
    my $by_sub = {};
    foreach my $code (@ev_codes)
    {
	if ($code =~ /^isu;(\S.*\S)/)                { $by_sub->{$1}->{'isu'} = 1  }
	if ($code =~ /^icw\((\d+)\);(\S.*\S)/)       { $by_sub->{$2}->{'icw'} = $1 }
    }

    my @insubs = grep { $fig->usable_subsystem($_,1) } $fig->peg_to_subsystems($peg,1);
    my %subs = map { $_ => 1 } @insubs;
    $funcSeed = $fig->function_of($peg,undef,1);
    if (@insubs < 1) { return "" }

    my $pieces = [];
    &add_func_assertion($pieces,$funcSeed);
    &add_in_subs($pieces,\@insubs);
    foreach my $sub (@insubs)
    {
	&add_clustering_and_dup($pieces,$by_sub->{$sub},$sub);
    }
    return join(",", @ev_codes), join(",", @insubs), &render($pieces);
}

sub render {
    my($pieces) = @_;

    my @lines = ();
    my $curr  = "";
    foreach my $piece (@$pieces)
    {
	$piece = "$piece  ";
	$curr = $curr . $piece;

	while (length($curr) > 100)
	{
	    my($p1,$p2) = &split_piece($curr,100);
	    $p1 =~ s/^\s+//;
	    push(@lines, $p1);
	    $curr = $p2;
	}
    }
    if ($curr) 
    { 
	$curr =~ s/^\s+//; 
	push(@lines,$curr) ;
    }

    my $encoded = $cgi->escape(join("\n",@lines) . "\n");
#    print &Dumper($pieces,$encoded);
    return $encoded;
}

sub split_piece {
    my($piece,$n) = @_;

    my $i;
    for ($i = $n; ($i > 0) && (substr($piece,$i,1) ne " "); $i--) {}
    if ($i)
    {
	return (substr($piece,0,$i+1),substr($piece,$i+1));
    }
    else
    {
	return ($piece,"");
    }
}


sub add_clustering_and_dup {
    my($pieces,$by_sub_entry,$sub) = @_;

    if ($by_sub_entry)
    {
	if ($by_sub_entry->{isu} || $by_sub_entry->{icw})
	{
	    my $fixed_sub = &fix_sub_name($sub);
	    push(@$pieces,"In $fixed_sub, " . &isu_and_icw($by_sub_entry->{isu},$by_sub_entry->{icw}));
	}
    }
}

sub isu_and_icw {
    my($isu,$icw) = @_;

    if ($isu && $icw) { return "it appears to play a functional role that we have not associated with any other gene, and it occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
    if ($isu)         { return "it appears to play a functional role that we have not associated with any other gene." }
    if ($icw)         { "It occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
}

sub add_func_assertion {
    my($pieces,$func) = @_;

    push(@$pieces,"We currently believe that the function of the encoded protein is \"$funcSeed\".");
    return;
}

sub add_in_subs {
    my($pieces,$insubs) = @_;

    if (@$insubs > 0)
    {
	my $n = @$insubs;
	if ($n > 0)
	{
	    my $in_sub_state = "The protein occurs in " .
		               (($n == 1) ? "1 subsystem" : "$n subsystems") . ': ' . &subs($insubs) . ".";
	    push(@$pieces,$in_sub_state);
	}
    }
}

sub subs {
    my($subs) = @_;

    if (@$subs == 1) { return &fix_sub_name($subs->[0]) }
    my @subL = map { &fix_sub_name($_) } @$subs;
    $subL[$#subL] = "and $subL[$#subL]";
    return join(", ",@subL);
}

sub fix_sub_name {
    my($x) = @_;

    $x =~ s/_/ /g;
    return "\"$x\"";
}

sub evidence_codes {
    my($fig,$peg) = @_;

    if ($peg !~ /^fig\|\d+\.\d+\.peg\.\d+$/) { return "" }

    my @codes = grep { $_->[1] =~ /^evidence_code/i } $fig->get_attributes($peg);
    return map { $_->[2] } @codes;
}


sub fix_func {
    my($func) = @_;
    my($ecs);

    $ecs = {};
    while ($func =~ /^(.*\S)\s*\(EC ([^\)]+)\)(.*)$/)
    {
        $ecs->{$2} = 1;
        $func = $3 ? $1 . $3 : $1;
    }
    $func =~ s/\s+\@\s+/ AND /g;
    $func =~ s/;\s+/ AND\/OR /g;
    return ($func,join(";",sort keys(%$ecs)));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3