[Bio] / FigKernelScripts / run_glimmer3.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/run_glimmer3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (view) (download) (as text)

1 : overbeek 1.1 # -*- perl -*-
2 :    
3 : gdpusch 1.2 use strict;
4 :     use warnings;
5 : overbeek 1.1
6 : gdpusch 1.2 use FIG;
7 :     my $fig = FIG->new();
8 : overbeek 1.1
9 : gdpusch 1.2 $0 =~ m/([^\/]+)$/;
10 :     my $self = $1;
11 : parrello 1.14 my $usage = "$self [-verbose] [-code=genetic_code_num] [-minlen=min_training_len] [-train=training_tbl[,training_contigs]] [-skip_called] Taxon_ID contigs";
12 : gdpusch 1.2
13 : gdpusch 1.3
14 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 :     #...PATH environment variables
16 :     #-----------------------------------------------------------------------
17 : gdpusch 1.4 my $elph_bin = $ENV{ELPH_BIN} || $FIG_Config::ext_bin;
18 :     my $glimmer_bin = $ENV{GLIMMER_3_BIN} || "$FIG_Config::ext_bin/../apps/glimmer3/bin";
19 :     my $glimmer_scripts = $ENV{GLIMMER_3_SCRIPTS} || "$FIG_Config::ext_bin/../apps/glimmer3/bin";
20 : gdpusch 1.3
21 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
22 :     #...Defaults for vars set during argument parsing
23 :     #-----------------------------------------------------------------------
24 :     my $genetic_code = 11; #...Default to the "standard" code
25 : gdpusch 1.2
26 :     my $train = qq(); #...Flag associated with '-train' switch
27 :     my $training_tbl = qq();
28 :     my $training_contigs = qq();
29 :    
30 : gdpusch 1.11 my $min_training_len = 2000; #...Shortest contig used for self-training
31 :    
32 : gdpusch 1.9 my $glimmeropts = qq(-o50 -g110 -t30 -l); #...NOTE: Make these the default for a switch
33 : gdpusch 1.2
34 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
35 :     # NOTE: '-skip' is an obscure option introduced to handle one job
36 :     # for which new contigs had been added to a set of existing contigs,
37 :     # and it was required that we not re-call the existing contigs,
38 :     # but instead use their calls as the 'training set' for the new contigs.
39 :     # It seems quite unlikely that this option will ever be used again.
40 :     #-----------------------------------------------------------------------
41 :     my %skip; #...Hash storing already-called contig IDs to be skipped.
42 :     my $skip_called = qq(); #...Flag indicating whether '-skip' switch is present.
43 :    
44 :    
45 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
46 :     #... Parse argument list
47 :     #-----------------------------------------------------------------------
48 :     my $trouble = 0;
49 : overbeek 1.5 for (my $arg=$#ARGV; ($arg >= 0); $arg--)
50 : overbeek 1.1 {
51 : gdpusch 1.10 if ($ARGV[$arg] =~ m/^-{1,2}help/) {
52 :     warn "\n\tusage: $usage\n\n";
53 :     exit(0);
54 :     }
55 :     elsif ($ARGV[$arg] =~ m/^-{1,2}code=(\d+)/) {
56 : gdpusch 1.2 $genetic_code = $1;
57 : gdpusch 1.7 splice(@ARGV, $arg, 1);
58 : overbeek 1.1 }
59 : gdpusch 1.11 elsif ($ARGV[$arg] =~ m/^-{1,2}minlen=(\d+)/) {
60 :     $min_training_len = $1;
61 :     splice(@ARGV, $arg, 1);
62 :     }
63 : parrello 1.14 elsif ($ARGV[$arg] =~ m/^-{1,2}verbose/) {
64 :     $ENV{VERBOSE} = 1;
65 :     splice(@ARGV, $arg, 1);
66 :     }
67 : overbeek 1.5 elsif ($ARGV[$arg] =~ m/^-{1,2}train=(\S+)/) {
68 : gdpusch 1.2 $train = $1;
69 :     ($training_tbl, $training_contigs) = split /,/, $train;
70 :    
71 :     if (-s $training_tbl) {
72 :     print STDERR "Training features will be taken from $training_tbl\n"
73 :     if $ENV{VERBOSE};
74 :     }
75 :     else {
76 :     $trouble = 1;
77 :     warn "Training tbl $training_tbl does not exist";
78 :     }
79 :    
80 :     if ($training_contigs) {
81 :     if (-s $training_contigs) {
82 :     print STDERR "Training set will be extracted from $training_contigs\n"
83 :     if $ENV{VERBOSE};
84 : overbeek 1.1 }
85 : gdpusch 1.2 else {
86 : overbeek 1.1 $trouble = 1;
87 : gdpusch 1.2 warn "Training contigs $training_contigs do not exist";
88 : overbeek 1.1 }
89 :     }
90 : gdpusch 1.7 splice(@ARGV, $arg, 1);
91 : overbeek 1.1 }
92 : overbeek 1.5 elsif ($ARGV[$arg] =~ m/^-{1,2}skip_called/) {
93 :     $skip_called = $ARGV[$arg];
94 : gdpusch 1.2 print STDERR "Skip option is set\n" if $ENV{VERBOSE};
95 : overbeek 1.5 splice(@ARGV,$arg,1);
96 : overbeek 1.1 }
97 : overbeek 1.5 elsif ($ARGV[$arg] =~ /^-/) {
98 : gdpusch 1.2 $trouble = 1;
99 : overbeek 1.5 warn "Unrecognized argument $ARGV[$arg]";
100 : overbeek 1.1 }
101 :     }
102 :    
103 : gdpusch 1.2 my $genetic_code_switch = qq(-z $genetic_code);
104 : overbeek 1.1
105 :    
106 : gdpusch 1.2 my $taxon_ID = qq();
107 :     my $contigs_file = qq();
108 : overbeek 1.1
109 : gdpusch 1.2 if (@ARGV == 2) {
110 :     ($taxon_ID, $contigs_file) = @ARGV;
111 : overbeek 1.6 if (! $training_contigs) { $training_contigs = $contigs_file }
112 :    
113 : gdpusch 1.2 if ($taxon_ID && ($taxon_ID =~ m/^\d+\.\d+$/)) {
114 :     print STDERR "Taxon-ID is $taxon_ID\n" if $ENV{VERBOSE};
115 :     } else {
116 :     $trouble = 1;
117 :     warn "Bad Taxon-ID $taxon_ID\n";
118 : overbeek 1.1 }
119 :    
120 : gdpusch 1.2 if (-s $contigs_file) {
121 :     print STDERR "Contigs file is $contigs_file\n" if $ENV{VERBOSE};
122 :     } else {
123 :     $trouble = 1;
124 :     warn "Missing or zero-size contigs file $contigs_file";
125 : overbeek 1.1 }
126 :     }
127 : gdpusch 1.7 else {
128 :     $trouble = 1;
129 :     print STDERR
130 :     qq(\nWrong number of mandatory arguments: ),
131 :     &FIG::flatten_dumper(@ARGV),
132 :     qq(\n);
133 :     }
134 : gdpusch 1.2 die "\n\tusage: $usage\n\n" if $trouble;
135 :    
136 :    
137 :    
138 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
139 :     # ... Initial Training Pass
140 :     #-----------------------------------------------------------------------
141 :     if (not $training_contigs) {
142 :     $training_contigs = $contigs_file;
143 :     }
144 :    
145 : gdpusch 1.3 my $tmp_prefix = "$FIG_Config::temp/tmp$$";
146 :     $fig->run("rm -f $tmp_prefix.*");
147 :    
148 : gdpusch 1.11 my $tmp_contig = "$tmp_prefix.contig";
149 :     my $tmp_coords = "$tmp_prefix.coords";
150 :     my $tmp_train = "$tmp_prefix.train";
151 :     my $tmp_model = "$tmp_prefix.icm";
152 : gdpusch 1.2
153 :     my ($contig_id, $seqP);
154 :    
155 :     if (not $train) {
156 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
157 :     print STDERR "\nFinding training ORFs using GLIMMER default procedure\n"
158 :     if $ENV{VERBOSE};
159 :     #-----------------------------------------------------------------------
160 : overbeek 1.1 if (-s "$tmp_train") {
161 :     system("rm -f $tmp_train")
162 :     && die "Could not remove $tmp_train";
163 :     }
164 : parrello 1.14 print STDERR "Training file is $training_contigs\n" if $ENV{VERBOSE};
165 : overbeek 1.1 open(CONTIGS, "<$training_contigs")
166 : gdpusch 1.7 || die "could not read-open training contigs file: $training_contigs";
167 : overbeek 1.1
168 : gdpusch 1.3 $training_tbl = "$tmp_prefix.train.tbl";
169 : gdpusch 1.7 open(TRAINING, ">$training_tbl") || die "Could not write-open training_tbl file: $training_tbl";
170 : gdpusch 1.2
171 :     my $orf_num = 0;
172 :     my %len_of; #...Hash storing training contig lens
173 : overbeek 1.1 while (($contig_id, $seqP) = &read_fasta_record(\*CONTIGS))
174 :     {
175 : gdpusch 1.10 my $len = $len_of{$contig_id} = length($$seqP);
176 : overbeek 1.1
177 :     open( TMP, ">$tmp_contig") || die "Could not write-open $tmp_contig";
178 :     &display_id_and_seq($contig_id, $seqP, \*TMP);
179 :     close(TMP) || die "Could not close $tmp_contig";
180 :    
181 : gdpusch 1.11 if ($len >= $min_training_len) {
182 : gdpusch 1.10 print STDERR "\nScanning contig $contig_id for long orfs\n" if $ENV{VERBOSE};
183 :     }
184 :     else {
185 :     print STDERR "\nSkipping contig $contig_id --- too short (len=$len)\n" if $ENV{VERBOSE};
186 :     next;
187 :     }
188 : gdpusch 1.2
189 : gdpusch 1.11 my $longorfs_err = qq();
190 :     my $tmp_longorfs_err = "$tmp_prefix.longorfs.err";
191 :     my $rc = system("$glimmer_bin/long-orfs -l -n -t 1.15 $genetic_code_switch $tmp_contig $tmp_coords > $tmp_longorfs_err 2>&1");
192 :     if (-s $tmp_longorfs_err) {
193 :     $longorfs_err = &FIG::file_read($tmp_longorfs_err);
194 :     print STDERR $longorfs_err if $ENV{VERBOSE};
195 :     }
196 : gdpusch 1.12
197 :     if ($rc) {
198 :     if ($longorfs_err) {
199 :     if ($longorfs_err =~ m/WHAT: ERROR: No valid orfs found below entropy cutoff/so) {
200 :     #...Error is harmless --- ignore it
201 :     }
202 :     else {
203 :     die (qq(Could not extract training ORFs from contig $contig_id --- return-code $rc:\n), $longorfs_err);
204 :     }
205 : gdpusch 1.11 }
206 :     else {
207 : gdpusch 1.12 die qq(Could not extract training ORFs from contig $contig_id --- return-code $rc);
208 : gdpusch 1.11 }
209 :     }
210 : gdpusch 1.3
211 :     system("$glimmer_bin/extract -t $tmp_contig $tmp_coords >> $tmp_train")
212 : overbeek 1.1 && die "Could not extract training sequences from contig $contig_id";
213 : gdpusch 1.2
214 :     #... Translate GLIMMER $tmp_coords into SEED $training_tbl
215 :     open(TMP_COORDS, "<$tmp_coords") || die "Could not read-open $tmp_coords";
216 :     print TRAINING map { ++$orf_num;
217 :     chomp $_;
218 :     my (undef, $beg, $end) = split /\s+/o, $_;
219 :     die "Bad coords in entry: $_" unless ($beg && $end);
220 :     my $fid = qq(orf) . (qq(0)x(5-length($orf_num))) . $orf_num;
221 :     my $loc = join(qq(_), ($contig_id, $beg, $end));
222 :     $_ = qq($fid\t$loc\n)
223 :     } <TMP_COORDS>;
224 :     }
225 :     close(TRAINING) || die "Could not close $training_tbl";
226 :     close(CONTIGS) || die "Could not close $contigs_file";
227 :     }
228 :    
229 :    
230 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
231 :     print STDERR ("\n",
232 :     "Training using:\n",
233 :     " contigs --- $training_contigs\n",
234 :     " ORFs --- $training_tbl\n"
235 :     )
236 :     if $ENV{VERBOSE};
237 :     #-----------------------------------------------------------------------
238 :     open(CONTIGS, "<$training_contigs")
239 :     || die "Could not read-open $training_contigs";
240 :    
241 :     my ($len_of, $seq_of);
242 :     while (($contig_id, $seqP) = &read_fasta_record(\*CONTIGS)) {
243 :     $len_of->{$contig_id} = length($$seqP);
244 :     $seq_of->{$contig_id} = $$seqP;
245 :     }
246 :     close(CONTIGS) || die "Could not close $contigs_file";
247 :    
248 :     my $entry;
249 :     my $orf_num;
250 :     my $max_orf_num = 0;
251 :     my %training_tbl;
252 :     open(TBL, "<$training_tbl") || die "Could not read-open $training_tbl";
253 :     open(TRAIN, ">$tmp_train") || die "Could not write-open $tmp_train";
254 :     while (defined($entry = <TBL>)) {
255 :     chomp $entry;
256 :     my ($fid, $locus) = split /\t/, $entry;
257 :    
258 :     if ($fid =~ m/(\d+)$/) {
259 :     $orf_num = $1;
260 :     $max_orf_num = ($orf_num > $max_orf_num) ? $orf_num : $max_orf_num;
261 :     }
262 :     else {
263 :     die "Could not parse FID $fid for training entry $.: $entry";
264 :     }
265 :    
266 :     #...If -skip_called option was selected, record which contigs are in the training set
267 :     if ($skip_called) { $skip{$contig_id} = 1; }
268 :    
269 :     my $training_seq = qq();
270 :     my @exons = split /,/, $locus;
271 :     foreach my $exon (@exons) {
272 :     if ($exon =~ m/^(\S+)_(\d+)_(\d+)/) {
273 :     my ($contig, $beg, $end) = ($1, $2, $3);
274 :    
275 :     my $contig_seq = $seq_of->{$contig};
276 :    
277 :     my $dna = &get_dna_seq($contig, $beg, $end, $len_of, $seq_of);
278 :    
279 :     $training_seq .= lc($dna);
280 :     }
281 :     else {
282 :     die "Could not parse exon $exon for training entry $.: $entry";
283 :     }
284 :    
285 :     my $training_ID = qq(orf) . (qq(0) x (5-length($orf_num))) . $orf_num;
286 :     &display_id_and_seq($training_ID, \$training_seq, \*TRAIN);
287 : overbeek 1.1 }
288 :     }
289 : gdpusch 1.2 close(TRAIN) || die "Could not close $tmp_train";
290 :     close(TBL) || die "Could not close $training_tbl";
291 : overbeek 1.1
292 : gdpusch 1.2 if (($_ = `grep -c "^>" $tmp_train`) && ($_ =~ m/^\s*(\d+)/)) {
293 :     print STDERR "\nExtracted $1 training sequences\n\n" if $ENV{VERBOSE};
294 : overbeek 1.1 }
295 : gdpusch 1.2 else {
296 : overbeek 1.1 die "\nCould not extract any training sequences";
297 :     }
298 :    
299 :    
300 : gdpusch 1.2
301 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
302 :     #... Build ICM (interpolated context model)
303 :     #-----------------------------------------------------------------------
304 :     print STDERR ("Building interpolated context model ---\n",
305 :     " output in $tmp_model\n\n"
306 :     ) if $ENV{VERBOSE};
307 :    
308 :     if (-s "$tmp_model") {
309 : overbeek 1.1 system("rm -f $tmp_model") && die "Could not remove $tmp_model";
310 :     }
311 :    
312 : gdpusch 1.3 $fig->run("$glimmer_bin/build-icm -r $tmp_model < $tmp_train");
313 : gdpusch 1.2
314 :    
315 :    
316 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
317 :     #... First GLIMMER pass
318 :     #-----------------------------------------------------------------------
319 :     print STDERR "Running first GLIMMER pass\n" if $ENV{VERBOSE};
320 : gdpusch 1.8 $fig->run("$glimmer_bin/glimmer3 $glimmeropts $genetic_code_switch $contigs_file $tmp_model $tmp_prefix.pass_1");
321 : gdpusch 1.2
322 : overbeek 1.1
323 : gdpusch 1.2
324 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
325 :     #... Extract upstream regions and START codon counts
326 :     #-----------------------------------------------------------------------
327 : gdpusch 1.3 my $tmp_predictions_pass_1 = "$tmp_prefix.pass_1.predict";
328 : gdpusch 1.2 print STDERR "\nExtracting upstream regions and START counts from $tmp_predictions_pass_1\n"
329 :     if $ENV{VERBOSE};
330 :    
331 :     open(PREDICT, "<$tmp_predictions_pass_1")
332 :     || die "Could not read-open $tmp_predictions_pass_1";
333 :    
334 : gdpusch 1.3 my $tmp_upstream = "$tmp_prefix.upstream";
335 : gdpusch 1.2 open(UPSTREAM, ">$tmp_upstream")
336 :     || die "Could not write-open $tmp_upstream";
337 :    
338 :     my %start_counts; #...Hash to hold START codon counts
339 : gdpusch 1.13 $start_counts{0} = 0;
340 :     $start_counts{atg} = 0;
341 :     $start_counts{gtg} = 0;
342 :     $start_counts{ttg} = 0;
343 : gdpusch 1.2
344 :     $contig_id = qq();
345 :     while(defined($entry = <PREDICT>)) {
346 :     chomp $entry;
347 :    
348 :     if ($entry =~ m/^>(\S+)/) {
349 :     $contig_id = $1;
350 : overbeek 1.1 next;
351 :     }
352 : gdpusch 1.2 elsif ($entry =~ m/^(\S+)\s+(\d+)\s+(\d+)/) {
353 :     my ($id, $beg, $end) = ($1, $2, $3);
354 :    
355 :     my $sign = ($beg < $end) ? +1 : -1;
356 :    
357 :     my $end_start = $beg + $sign * 2;
358 :     my $start_codon = &get_dna_seq($contig_id, $beg, $end_start, $len_of, $seq_of);
359 :    
360 :     ++$start_counts{0};
361 :     ++$start_counts{$start_codon};
362 :    
363 :     my $up_id = $id;
364 :     $up_id =~ s/^orf/ups/o;
365 :     my $up_beg = $beg - $sign * 25;
366 :     my $up_end = $beg - $sign;
367 :    
368 :     my $up_seq;
369 :     if ($up_seq = &get_dna_seq($contig_id, $up_beg, $up_end, $len_of, $seq_of)) {
370 :     &display_id_and_seq($id, \$up_seq, \*UPSTREAM);
371 :     }
372 :     }
373 :     else {
374 :     die "Could not parse prediction $.: $entry";
375 :     }
376 :     }
377 :    
378 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
379 :     #... Compute pseudocount-stabilized START codon frequencies
380 :     #-----------------------------------------------------------------------
381 :     my $atg_freq = ($start_counts{atg} + 80) / ($start_counts{0} + 100);
382 :     my $gtg_freq = ($start_counts{gtg} + 15) / ($start_counts{0} + 100);
383 :     my $ttg_freq = ($start_counts{ttg} + 5) / ($start_counts{0} + 100);
384 :     my $start_freqs = "$atg_freq,$gtg_freq,$ttg_freq";
385 :    
386 :     print STDERR "start_freqs = $start_freqs\n\n" if $ENV{VERBOSE};
387 :    
388 :    
389 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
390 :     #... Build Position Weight Matrix for upstream regions using ELPH
391 :     #-----------------------------------------------------------------------
392 : gdpusch 1.4 print STDERR "\nExtracting upstream motifs from $tmp_predictions_pass_1\n";
393 :    
394 :     my $motif_width = 6;
395 :     my $tmp_motifs = "$tmp_prefix.motifs";
396 :     system("$elph_bin/elph $tmp_upstream LEN=$motif_width > $tmp_motifs")
397 :     && ((-s $tmp_motifs)
398 :     || die("Could not extract upstream motifs")
399 :     );
400 :    
401 :    
402 :     print STDERR "\nBuilding PWM from upstream motifs in $tmp_motifs\n";
403 :    
404 :     open( MOTIFS, "<$tmp_motifs") || die "Could not read-open $tmp_motifs";
405 :     my @motifs = <MOTIFS>;
406 :     close(MOTIFS) || die "Could not close $tmp_motifs";
407 :     while ($motifs[0] !~ m/^Motif counts/) { shift @motifs };
408 :     shift @motifs;
409 :     my $last = 0;
410 :     for (my $i=0; $i < @motifs; ++$i) {
411 :     last unless ($motifs[$i] =~ m/\S+/);
412 :     if ($motifs[$i] =~ m/^[acgt]:((\s+\d+){$motif_width})/) {
413 :     $last = $i;
414 :     }
415 :     }
416 :     $#motifs = $last;
417 : gdpusch 1.2
418 : gdpusch 1.3 my $tmp_matrix = "$tmp_prefix.pwm";
419 : gdpusch 1.4 open( MATRIX, ">$tmp_matrix") || die "Could not write-open $tmp_matrix";
420 :     print MATRIX "$motif_width\n";
421 :     foreach my $line (@motifs) {
422 :     chomp $line;
423 :     my @fields = split /\s+/, $line;
424 :     my $base = substr((shift @fields), 0, 1);
425 :     print MATRIX ($base, (map { sprintf(qq(%7u), $_) } @fields), qq(\n));
426 :     }
427 :     close(MATRIX) || die "Could not close $tmp_matrix";
428 :    
429 :     (-s $tmp_matrix) || die "Could not construct PWM --- empty file $tmp_matrix";
430 : gdpusch 1.2
431 :    
432 :    
433 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
434 :     #... Re-call PEGs using ICM and PWM
435 :     #-----------------------------------------------------------------------
436 :     print STDERR "Re-calling PEGs using trained ICM\n" if $ENV{VERBOSE};
437 :    
438 : gdpusch 1.8 $fig->run("$glimmer_bin/glimmer3 $glimmeropts $genetic_code_switch -b $tmp_matrix -P $start_freqs $contigs_file $tmp_model $tmp_prefix");
439 : gdpusch 1.3
440 : gdpusch 1.2
441 :    
442 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
443 :     #... Parse and write final predictions
444 :     # NOTE: Does not yet handle '-skip' option
445 :     #-----------------------------------------------------------------------
446 :     my $tmp_predictions = "$tmp_prefix.predict";
447 :     print STDERR "\nParsing $tmp_predictions\n"
448 :     if $ENV{VERBOSE};
449 :    
450 :     open(PREDICT, "<$tmp_predictions")
451 :     || die "Could not read-open $tmp_predictions";
452 :    
453 :     my $fid_num = 0;
454 :    
455 :     $contig_id = qq();
456 :     while(defined($entry = <PREDICT>)) {
457 :     chomp $entry;
458 : overbeek 1.1
459 : gdpusch 1.2 if ($entry =~ m/^>(\S+)/) {
460 :     $contig_id = $1;
461 :     next;
462 :     }
463 :     elsif ($entry =~ m/^(\S+)\s+(\d+)\s+(\d+)/) {
464 :     my ($id, $beg, $end) = ($1, $2, $3);
465 : overbeek 1.1
466 : gdpusch 1.2 if ($beg && $end) {
467 :     ++$fid_num;
468 :     my $fid = qq(fig|$taxon_ID.peg.) . $fid_num;
469 :     print STDOUT (qq($fid\t), join(qq(_), ($contig_id, $beg, $end)), qq(\n));
470 : overbeek 1.1 }
471 : gdpusch 1.2 else {
472 :     die "Error in $tmp_predictions line $.: $entry";
473 : overbeek 1.1 }
474 :     }
475 : gdpusch 1.2 else {
476 :     die "Could not parse prediction $.: $entry";
477 :     }
478 : overbeek 1.1 }
479 :    
480 : gdpusch 1.4 if (not $ENV{VERBOSE}) {
481 : gdpusch 1.3 $fig->run("rm -f $tmp_prefix.*");
482 :     }
483 : gdpusch 1.4 else {
484 :     print STDERR "\nKeeping tmp files\n\n";
485 :     }
486 : overbeek 1.1
487 :     exit 0;
488 :    
489 :    
490 : gdpusch 1.2 ########################################################################
491 :     sub get_dna_seq {
492 :     my ($contig, $beg, $end, $len_of, $seq_of) = @_;
493 :     my $dna = qq();
494 :    
495 :     my $contig_len = $len_of->{$contig};
496 :     my $contig_seq = $seq_of->{$contig};
497 :    
498 :     return undef if (&max($beg, $end) > $contig_len);
499 :     return undef if (&min($beg, $end) < 1);
500 :    
501 :     if ($beg < $end) {
502 :     $dna = substr($contig_seq, ($beg-1), ($end+1-$beg));
503 :     }
504 :     else {
505 :     $dna = substr($contig_seq, ($end-1), ($beg+1-$end));
506 :     $dna = $ { &rev_comp(\$dna) };
507 :     }
508 :    
509 :     return lc($dna);
510 :     }
511 :    
512 : overbeek 1.1
513 :     sub read_fasta_record
514 :     {
515 :     my ($file_handle) = @_;
516 :     my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );
517 :    
518 :     if (not defined($file_handle)) { $file_handle = \*STDIN; }
519 :    
520 :     $old_end_of_record = $/;
521 :     $/ = "\n>";
522 :    
523 :     if (defined($fasta_record = <$file_handle>))
524 :     {
525 :     chomp $fasta_record;
526 :     @lines = split( /\n/, $fasta_record );
527 :     $head = shift @lines;
528 :     $head =~ s/^>?//;
529 :     $head =~ m/^(\S+)/;
530 :     $seq_id = $1;
531 :    
532 :     if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; }
533 :    
534 :     $sequence = join( "", @lines );
535 :    
536 :     @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
537 :     }
538 :     else
539 :     {
540 :     @parsed_fasta_record = ();
541 :     }
542 :    
543 :     $/ = $old_end_of_record;
544 :    
545 :     return @parsed_fasta_record;
546 :     }
547 :    
548 :     sub display_id_and_seq
549 :     {
550 :     my( $id, $seq, $fh ) = @_;
551 :    
552 :     if (! defined($fh) ) { $fh = \*STDOUT; }
553 :    
554 :     print $fh ">$id\n";
555 :     &display_seq($seq, $fh);
556 :     }
557 :    
558 :     sub display_seq
559 :     {
560 :     my ( $seq, $fh ) = @_;
561 :     my ( $i, $n, $ln );
562 :    
563 :     if (! defined($fh) ) { $fh = \*STDOUT; }
564 :    
565 :     $n = length($$seq);
566 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
567 :     for ($i=0; ($i < $n); $i += 60)
568 :     {
569 :     if (($i + 60) <= $n)
570 :     {
571 :     $ln = substr($$seq,$i,60);
572 :     }
573 :     else
574 :     {
575 :     $ln = substr($$seq,$i,($n-$i));
576 :     }
577 :     print $fh "$ln\n";
578 :     }
579 :     }
580 :    
581 :     sub rev_comp
582 :     {
583 :     my( $seqP ) = @_;
584 :     my( $rev );
585 :    
586 :     $rev = reverse( $$seqP );
587 :     $rev =~ tr/a-z/A-Z/;
588 :     $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
589 :     return \$rev;
590 :     }
591 :    
592 :     sub min { my ($x, $y) = @_; return (($x < $y) ? $x : $y); }
593 :    
594 :     sub max { my ($x, $y) = @_; return (($x > $y) ? $x : $y); }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3