[Bio] / FigKernelScripts / add_structured_english.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/add_structured_english.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.2
2 : disz 1.1 use CGI;
3 :     my $cgi = new CGI;
4 :     use FIG;
5 :    
6 :     my $fig = new FIG;
7 :    
8 : olson 1.2
9 :     #fig|169963.1.peg.1 NC_003210 318_1673 NP_463534.1 isu,icw(1),ff dnaA Chromosomal replication initiator protein dnaA CDD:pfam00004,CDD:pfam00308,CDD:pfam01695,CDD:smart00382,GeneID:984365,InterPro:IPR001957,InterPro:IPR003593,NCBI_gi:16802049,Pfam:PF00308,SMART:Q8YAW2,SMART:SM00382,UniProtKB:Q8YAW2 http://www.nmpdr.org/linkin.cgi?id=fig|169963.1.peg.1 MQSIED
10 :    
11 : disz 1.1 while (defined($_ = <STDIN>))
12 :     {
13 :     chop;
14 : olson 1.2 ($peg,$contig_refseq,$contig_refseq_coords,$refseq,$ev, $gene, $fixed_func, $dbxref, $kegg, $subsystems, $link, $figfam, $seq) = split(/\t/,$_);
15 : disz 1.1 $funcSeed = $fig->function_of($peg,undef,1);
16 :     my($func1,$ecs1) = &fix_func($funcSeed);
17 : olson 1.2 if (0)
18 :     #if (($func1 ne $fixed_func) || ($ecs1 ne $ecs))
19 :     #if ($func1 ne $fixed_func)
20 : disz 1.1 {
21 :     print STDERR &Dumper($fixed_func,$func1,$ecs,$ecs1);
22 :     print STDERR "$_\n";
23 :     }
24 :     else
25 :     {
26 : olson 1.2 my ($evcodes, $subs, $structured_english) = &to_structured_english($fig,$peg);
27 : disz 1.1 my $text = $cgi->unescape($structured_english);
28 : olson 1.2
29 :     my $nmpdr_id_text = "nmpdr_id=$peg";
30 :     my $contig_refseq_text= "reference_contig_refseq=$contig_refseq";
31 :     my $refseq_coords_text = "reference_contig_refseq_coordinates=$contig_refseq_coords";
32 :     my $refseq_id_text= "reference_protein_refseq=$refseq";
33 :     my $evc_text= "evidence_code=$ev";
34 :     my $gene_symbol_text = "gene_symbol=$gene";
35 :     my $desc_text = "description=\"$func1\"";
36 :     #my $desc_text = "description=\"$fixed_func\"";
37 :     my $dbxref_text = "dbxref=$dbxref";
38 :     my $kegg_text = "kegg=$kegg";
39 :     my $subsystem_text = "subsystem=\"$subs\"";
40 :     my $nmpdr_link_text = "nmpdr_web_page=\"$link\"";
41 :     my $structured_eng_text= "structured_description=\`$structured_english\`";
42 :     my $figfam_text= "FIGfam=$figfam";
43 :     my $seq_text= "protein_sequence=\"$seq\"";
44 :    
45 :     if ($structured_english && $seq)
46 : disz 1.1 {
47 : olson 1.2 print join("\n",($nmpdr_id_text, $contig_refseq_text, $refseq_coords_text, $refseq_id_text, $evc_text, $gene_symbol_text, $desc_text, $dbxref_text, $subsystem_text, $nmpdr_link_text, $figfam_text, $structured_eng_text, $seq_text)), "\n//\n";
48 :     #print join("\n",($nmpdr_id_text, $contig_refseq_text, $refseq_coords_text, $refseq_id_text, $evc_text, $gene_symbol_text, $desc_text, $dbxref_text, $kegg_text, $subsystem_text, $nmpdr_link_text, $structured_eng_text, $seq_text)), "\n";
49 :     #print join("\t",($peg,$refseq,$seq,'',$fixed_func,$ecs,$structured_english,'','')),"\n";
50 : disz 1.1 }
51 :     }
52 :     }
53 :    
54 :     sub to_structured_english {
55 :     my($fig,$peg) = @_;
56 :    
57 :     my @ev_codes = &evidence_codes($fig,$peg);
58 :     my $by_sub = {};
59 :     foreach my $code (@ev_codes)
60 :     {
61 :     if ($code =~ /^isu;(\S.*\S)/) { $by_sub->{$1}->{'isu'} = 1 }
62 :     if ($code =~ /^icw\((\d+)\);(\S.*\S)/) { $by_sub->{$2}->{'icw'} = $1 }
63 :     }
64 :    
65 :     my @insubs = grep { $fig->usable_subsystem($_,1) } $fig->peg_to_subsystems($peg,1);
66 :     my %subs = map { $_ => 1 } @insubs;
67 :     $funcSeed = $fig->function_of($peg,undef,1);
68 :     if (@insubs < 1) { return "" }
69 :    
70 :     my $pieces = [];
71 :     &add_func_assertion($pieces,$funcSeed);
72 :     &add_in_subs($pieces,\@insubs);
73 :     foreach my $sub (@insubs)
74 :     {
75 :     &add_clustering_and_dup($pieces,$by_sub->{$sub},$sub);
76 :     }
77 : olson 1.2 return join(",", @ev_codes), join(",", @insubs), &render($pieces);
78 : disz 1.1 }
79 :    
80 :     sub render {
81 :     my($pieces) = @_;
82 :    
83 :     my @lines = ();
84 :     my $curr = "";
85 :     foreach my $piece (@$pieces)
86 :     {
87 :     $piece = "$piece ";
88 :     $curr = $curr . $piece;
89 :    
90 :     while (length($curr) > 100)
91 :     {
92 :     my($p1,$p2) = &split_piece($curr,100);
93 :     $p1 =~ s/^\s+//;
94 :     push(@lines, $p1);
95 :     $curr = $p2;
96 :     }
97 :     }
98 :     if ($curr)
99 :     {
100 :     $curr =~ s/^\s+//;
101 :     push(@lines,$curr) ;
102 :     }
103 :    
104 :     my $encoded = $cgi->escape(join("\n",@lines) . "\n");
105 :     # print &Dumper($pieces,$encoded);
106 :     return $encoded;
107 :     }
108 :    
109 :     sub split_piece {
110 :     my($piece,$n) = @_;
111 :    
112 :     my $i;
113 :     for ($i = $n; ($i > 0) && (substr($piece,$i,1) ne " "); $i--) {}
114 :     if ($i)
115 :     {
116 :     return (substr($piece,0,$i+1),substr($piece,$i+1));
117 :     }
118 :     else
119 :     {
120 :     return ($piece,"");
121 :     }
122 :     }
123 :    
124 :    
125 :     sub add_clustering_and_dup {
126 :     my($pieces,$by_sub_entry,$sub) = @_;
127 :    
128 :     if ($by_sub_entry)
129 :     {
130 :     if ($by_sub_entry->{isu} || $by_sub_entry->{icw})
131 :     {
132 :     my $fixed_sub = &fix_sub_name($sub);
133 :     push(@$pieces,"In $fixed_sub, " . &isu_and_icw($by_sub_entry->{isu},$by_sub_entry->{icw}));
134 :     }
135 :     }
136 :     }
137 :    
138 :     sub isu_and_icw {
139 :     my($isu,$icw) = @_;
140 :    
141 :     if ($isu && $icw) { return "it appears to play a functional role that we have not associated with any other gene, and it occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
142 :     if ($isu) { return "it appears to play a functional role that we have not associated with any other gene." }
143 :     if ($icw) { "It occurs in close proximity on the chromosome with " . (($icw == 1) ? "another gene from the subsystem." : "$icw other genes from the subsystem.") }
144 :     }
145 :    
146 :     sub add_func_assertion {
147 :     my($pieces,$func) = @_;
148 :    
149 :     push(@$pieces,"We currently believe that the function of the encoded protein is \"$funcSeed\".");
150 :     return;
151 :     }
152 :    
153 :     sub add_in_subs {
154 :     my($pieces,$insubs) = @_;
155 :    
156 :     if (@$insubs > 0)
157 :     {
158 :     my $n = @$insubs;
159 :     if ($n > 0)
160 :     {
161 :     my $in_sub_state = "The protein occurs in " .
162 :     (($n == 1) ? "1 subsystem" : "$n subsystems") . ': ' . &subs($insubs) . ".";
163 :     push(@$pieces,$in_sub_state);
164 :     }
165 :     }
166 :     }
167 :    
168 :     sub subs {
169 :     my($subs) = @_;
170 :    
171 :     if (@$subs == 1) { return &fix_sub_name($subs->[0]) }
172 :     my @subL = map { &fix_sub_name($_) } @$subs;
173 :     $subL[$#subL] = "and $subL[$#subL]";
174 :     return join(", ",@subL);
175 :     }
176 :    
177 :     sub fix_sub_name {
178 :     my($x) = @_;
179 :    
180 :     $x =~ s/_/ /g;
181 :     return "\"$x\"";
182 :     }
183 :    
184 :     sub evidence_codes {
185 :     my($fig,$peg) = @_;
186 :    
187 :     if ($peg !~ /^fig\|\d+\.\d+\.peg\.\d+$/) { return "" }
188 :    
189 :     my @codes = grep { $_->[1] =~ /^evidence_code/i } $fig->get_attributes($peg);
190 :     return map { $_->[2] } @codes;
191 :     }
192 :    
193 :    
194 :     sub fix_func {
195 :     my($func) = @_;
196 :     my($ecs);
197 :    
198 :     $ecs = {};
199 :     while ($func =~ /^(.*\S)\s*\(EC ([^\)]+)\)(.*)$/)
200 :     {
201 :     $ecs->{$2} = 1;
202 :     $func = $3 ? $1 . $3 : $1;
203 :     }
204 :     $func =~ s/\s+\@\s+/ AND /g;
205 :     $func =~ s/;\s+/ AND\/OR /g;
206 :     return ($func,join(";",sort keys(%$ecs)));
207 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3