[Bio] / FigKernelScripts / promote_orfs_to_pegs.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/promote_orfs_to_pegs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (view) (download) (as text)

1 : overbeek 1.1 # -*- perl -*-
2 : gdpusch 1.11 ########################################################################
3 : overbeek 1.1 # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 : gdpusch 1.11 ########################################################################
18 : overbeek 1.1
19 : olson 1.9 use strict;
20 : gdpusch 1.13 use warnings;
21 :    
22 : overbeek 1.1 use FIG;
23 : gdpusch 1.14 my $fig = FIG->new();
24 : overbeek 1.1
25 :     use NewGenome;
26 :     use ToCall;
27 :    
28 : gdpusch 1.13 $0 =~ m/([^\/]+)$/o;
29 :     my $self = $1;
30 : overbeek 1.5 my $usage = "usage: promote_orfs_to_pegs To_Call_Dir FoundFamiliesFile";
31 : overbeek 1.1
32 :     if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
33 :     print STDERR "\n $usage\n\n";
34 :     exit (0);
35 :     }
36 :    
37 : overbeek 1.5 my ($to_call_dir, $found_file);
38 : overbeek 1.1
39 :     (
40 :     ($to_call_dir = shift @ARGV)
41 : overbeek 1.5 &&
42 :     ($found_file = shift @ARGV)
43 : overbeek 1.1 )
44 : overbeek 1.2 || die "\n $usage\n\n";
45 : overbeek 1.1
46 : overbeek 1.3 $to_call_dir =~ s/\/$//o;
47 :    
48 : overbeek 1.1 my $to_call;
49 : gdpusch 1.13 $to_call = NewGenome->new($to_call_dir, "all");
50 : overbeek 1.5
51 : gdpusch 1.14
52 :     my @neighbors = ();
53 :     my $neighbors_file;
54 :     if ((-s ($neighbors_file = "$to_call_dir/closest.genomes")) ||
55 :     (-s ($neighbors_file = "$to_call_dir/neighbors"))
56 :     ) {
57 :     @neighbors = map { m/^(\d+\.\d+)/o ? ($1) : () } &FIG::file_read($neighbors_file);
58 :     }
59 : gdpusch 1.15 else {
60 :     warn qq(WARNING: $self could not read either nearest neighbors file\n);
61 :     }
62 : overbeek 1.5
63 : gdpusch 1.13 open(CALLED_BY, ">>$to_call_dir/called_by")
64 :     || die "could not open $to_call_dir/called_by";
65 :    
66 : overbeek 1.5 #...Build file of "nearby" PEGs
67 : gdpusch 1.15 if (@neighbors == 0) {
68 :     warn qq($self: No nearby genomes found --- Skipping promotion based on sims\n);
69 :     }
70 :     else {
71 : gdpusch 1.12 my $tmp_close = "$FIG_Config::temp/tmp_close_$$.fasta";
72 :     system("/bin/rm -f $tmp_close*");
73 : overbeek 1.5
74 : gdpusch 1.12 foreach my $org (@neighbors) {
75 :     print STDERR "Appending $org to $tmp_close\n" if $ENV{VERBOSE};
76 :     system("cat $FIG_Config::organisms/$org/Features/peg/fasta >> $tmp_close");
77 : overbeek 1.5 }
78 : gdpusch 1.15
79 :     if (!-s $tmp_close) {
80 :     warn qq($self: Something is wrong --- Could not find any PEG sequences for nearby genomes);
81 :     }
82 :     else {
83 :     &FIG::run("formatdb -i$tmp_close -pT");
84 :    
85 :     my @promoted_fids;
86 :     my $tmp_sims = "$FIG_Config::temp/tmp_sims.$$";
87 :     open(TMP_SIMS, ">$tmp_sims")
88 :     || die "Could not write-open $tmp_sims";
89 :     foreach my $orf_id ($to_call->get_fids_for_type('orf')) {
90 : gdpusch 1.12 my $sims = &blast_against($to_call->get_feature_sequence($orf_id), $tmp_close);
91 :    
92 :     if (@$sims) {
93 : gdpusch 1.13 my $num_sims = (scalar @$sims);
94 :     print STDERR "Promoting $orf_id based on $num_sims sims\n" if $ENV{VERBOSE};
95 : gdpusch 1.12
96 :     my $orf = &NewGenome::ORF::new('NewGenome::ORF', $to_call, $orf_id);
97 : gdpusch 1.13 my $annot = [ qq(RAST),
98 :     qq(Called by "$self" based on $num_sims sims.)
99 :     ];
100 :     my $fid = $orf->promote_to_peg($sims, undef, $annot);
101 :     if ($fid) {
102 :     push @promoted_fids, $fid;
103 :     @$sims = map { $_->[0] = $fid; $_ } @$sims;
104 :     print TMP_SIMS map { join("\t", @$_) . qq(\n) } @$sims;
105 :    
106 :     print CALLED_BY "$fid\tpromote_remaining_orfs (with sims)\n";
107 :    
108 :     print STDERR "Succeeded:\t$orf_id --> $fid\n";
109 :     }
110 :     else {
111 :     print STDERR "Could not promote ORF $orf_id to a PEG\n";
112 :     }
113 : gdpusch 1.12 }
114 :     else {
115 :     print STDERR "No sims for $orf_id\n" if $ENV{VERBOSE};
116 :     }
117 :     }
118 :    
119 : gdpusch 1.15 if (not $ENV{DEBUG}) {
120 :     #...Clean up...
121 :     close(TMP_SIMS) || die "Could not close $tmp_sims";
122 :     unlink($tmp_sims) || die "Could not remove $tmp_sims";
123 :     unlink($tmp_close) || die "Could not remove $tmp_close";
124 : overbeek 1.5 }
125 :     }
126 : gdpusch 1.12
127 : gdpusch 1.15 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
128 :     # Disable the old auto assignment of the fids we just called.
129 :     #=======================================================================
130 :     # if (0)
131 :     # {
132 :     # my $tmp_seqs = "$FIG_Config::temp/tmp_seqs.$$";
133 :     # open(TMP_SEQS, ">$tmp_seqs")
134 :     # || die "Could not write-open $tmp_seqs";
135 :     # print TMP_SEQS map { join("\t", ($_, $to_call->get_feature_sequence($_))) . qq(\n) } @promoted_fids;
136 :     # close(TMP_SEQS) || die "Could not close $tmp_seqs";
137 :     # my @auto_assigns = `cat $tmp_seqs | auto_assign sims=$tmp_sims`;
138 :     #
139 :     # open(FOUND, ">>$found_file")
140 :     # || die "Could not append-open FoundFamiliesFile $found_file";
141 :     # foreach my $entry (@auto_assigns) {
142 :     # if ($entry =~ m/^(\S+)\t(.*)$/o) {
143 :     # if ($to_call->set_function($1, $2)) {
144 :     # print FOUND "$1\tCLOSE_SIMS\t$2\n";
145 :     # }
146 :     # else {
147 :     # die "Could not set function of $1 to $2";
148 :     # }
149 :     # }
150 :     # else {
151 :     # die "Could not parse auto-assignment: $entry";
152 :     # }
153 :     # }
154 :     # close(FOUND) || die "Could not close $found_file";
155 :     # unlink($tmp_seqs) || die "Could not remove $tmp_seqs";
156 :     # }
157 :     #-----------------------------------------------------------------------
158 :    
159 : overbeek 1.5 }
160 : gdpusch 1.15
161 : gdpusch 1.12
162 : overbeek 1.5
163 : gdpusch 1.12 #...Promote any remaining unpromoted ORFs...
164 : gdpusch 1.13 $to_call->promote_remaining_orfs(\*CALLED_BY);
165 : gdpusch 1.11
166 : gdpusch 1.13 close(CALLED_BY);
167 : gdpusch 1.12
168 :     #...NOTE: export_features also writes assigned_functions
169 : overbeek 1.1 $to_call->export_features;
170 : overbeek 1.2
171 : olson 1.10
172 : overbeek 1.4 if (! -s "$to_call_dir/Features/orf/tbl") {
173 : olson 1.6 # Hrm, weird NFS effects here. There was a failure of this even though the directory
174 :     # was indeed empty.
175 : gdpusch 1.7 if (system("/bin/rm -fRv $to_call_dir/Features/orf")) {
176 : olson 1.6 warn "First remove did not work; sleeping and trying again";
177 : gdpusch 1.7 sleep(30);
178 :     system("/bin/rm -fRv $to_call_dir/Features/orf");
179 : olson 1.6 }
180 : overbeek 1.2 }
181 :     else {
182 :     die "Something is wrong --- $to_call_dir/Features/orf/tbl is non-empty";
183 :     }
184 : gdpusch 1.12 exit(0);
185 : overbeek 1.5
186 :    
187 :    
188 :     sub blast_against {
189 :     my ($seq, $db) = @_;
190 :     my $seq_len = length($seq);
191 :    
192 :     my $tmp_seq = "$FIG_Config::temp/tmp_seq.$$";
193 :     open( TMP, ">$tmp_seq" ) || die "Could not write-open $tmp_seq";
194 :     print TMP ">query\n$seq\n";
195 :     close(TMP) || die "Could not close $tmp_seq";
196 :    
197 :     my $sims = [];
198 :     @$sims = map { chomp $_;
199 : olson 1.9 my @x = split(/\t/, $_);
200 : overbeek 1.5 push @x, ($seq_len, $fig->translation_length($x[1]));
201 :     bless( [@x], "Sim" )
202 :     } `blastall -i $tmp_seq -d $db -p blastp -m8 -e1.0e-10`;
203 :    
204 :     unlink($tmp_seq) || die "Could not unlink $tmp_seq";
205 :     return $sims;
206 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3