use FIG; my $fig = new FIG; $usage = "usage: build_initial_objects_for_start_predictions < pegs_in_cluster > initial_objects"; @pegs = ; chomp @pegs; # print "NPEGS=", scalar(@pegs), "\n"; for ($i=0; $i < @pegs; $i++) # strip off all but peg ids at front { if ($pegs[$i] =~ /(\S+)\t.*/) { $pegs[$i] = $1; } } foreach $peg (@pegs) { $genome = $fig->genome_of($peg); $loc = $fig->feature_location($peg); # print "$len{$peg}\t$peg\t$loc\n"; if ($loc =~ /,/) { print STDERR "skipping $peg - do not handle complex locs\n"; next; } my $lnC = $fig->contig_ln($genome,$contig); ($contig,$begin,$end) = $fig->boundaries_of($loc); if ($begin < $end) { $l = sprintf("%s_%d_%d",$contig,1,$begin-1); if ($end < ($lnC-3)) { $end += 3; } } else { $l = sprintf("%s_%d_%d",$contig,$begin+1,$fig->contig_ln($genome,$contig)); if ($end > 3) { $end -= 3; } } $loc = join("_",($contig,$begin,$end)); $seq = uc $fig->dna_seq($genome,$loc); if (! $seq) { print STDERR &Dumper($peg,$loc,$seq); die "could not get sequence for $peg"; } # The following hack handles uncertainty about whether SEED gives back the STOP or not if (substr($seq,-6,3) =~ /TAA|TAG|TGA/i) { $end = ($begin < $end) ? $end-3 : $end+3; $seq = substr($seq,0,length($seq) - 3); } elsif (substr($seq,-3,3) !~ /TAA|TAG|TGA/i) { warn "BAD STOP: $peg $genome $loc $seq\n"; } $loc = join("_",($contig,$begin,$end)); # BACK INTO THE STOP FROM START OF PEG $pre_peg = uc $fig->dna_seq($genome,$l); $found = 0; $len_pre_peg = length($pre_peg); for ($i=$len_pre_peg-3; $i > 0 && ! $found; $i-=3) { $stop = substr($pre_peg,$i,3); if ($stop eq "TAG" || $stop eq "TAA" || $stop eq "TGA") { # print "FOUND $stop for $l\n"; $orf = substr($pre_peg,$i+3) . $seq; if (($i-27) > 0) { $pre_orf = substr($pre_peg,$i-27,30); my $gs = $fig->org_of($peg); my @aliases = grep { $_ =~ /^([NXYZA]P_[0-9\.]+)|(uni\|)|(sp\|)/ } $fig->feature_aliases($peg); print "ID=$peg\n"; print "GENOME=$gs\n"; if (@aliases > 0) { print "ALIASES=",join(",",@aliases),"\n"; } print "CONTIG=$contig\n"; #####print "BEGIN=$begin\n"; if ($begin < $end) { print "BEGIN=", $begin-($len_pre_peg-$i-3), "\n"; } else { print "BEGIN=", $begin+($len_pre_peg-$i-3), "\n"; } print "END=$end\n"; print "SEQ=$orf\n"; print "PREFIX=$pre_orf\n"; print "OLD_START_POS=", $len_pre_peg-$i-2, "\n"; print "///\n"; $found = 1; } else { print STDERR "skipping $peg - not enough prefix before orf\n"; } last; } } if ( ! $found) { print STDERR "DID NOT FIND STOP BEFORE $peg $contig $begin $end $l\n"; } }