[Bio] / FigKernelScripts / get_fasta_for_tbl_entries.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_fasta_for_tbl_entries.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (download) (as text) (annotate)
Tue Sep 22 04:10:24 2009 UTC (10 years, 1 month ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.10: +1 -1 lines
Modified to remove complaints about uninitialed values when TBL entry does not contain aliases.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################


use FIG;
use strict;
use warnings;

my $usage = "get_fasta_for_tbl_entries [-code=genetic_code_number] Contigs [Stops] < tbl > fasta";

my ($contigs_file, $code, $stop);
my (%seq_of, %len_of, $id, $seqP);
my ($loc, @loc, $exon, $contig, $beg, $end, $dna, $subseq, $prot);

my $code_num = 11;   #...Default genetic code

my $trouble  =  0;
while (@ARGV && ($ARGV[0] =~ m/^-/)) {
    if ($ARGV[0] =~ m/-code=(\d+)/) {
	$code_num = $1;
	unless (($code_num == 4) || ($1 == 11)) {
	    $trouble = 1;
	    warn "Sorry, cannot handle genetic code = $code_num;"
		, " we currently only support Genetic Codes 11 and 4\n";
	}
    }
    else {
	$trouble = 1;
	print STDERR "Invalid arg: $ARGV[0]\n";
    }
    
    shift @ARGV;
}
die "\n\n  usage: $usage\n\n" if $trouble;


(($contigs_file = shift @ARGV) && (-s $contigs_file))
    || die "Contigs file $contigs_file does not exist or has zero size";

$code = &FIG::genetic_code($code_num);
my $stops = "";
if (@ARGV > 0) 
{ 
    $stops = shift @ARGV; 
    $stops = "\'$stops\'";
    foreach $stop (("TGA","TAA","TAG"))
    {
	if ($stops !~ /$stop/i)
	{
	    $code->{$stop} = "X";
	}
    }
}


$trouble = 0;
open(CONTIGS, "<$contigs_file") || die "Could not read-open $contigs_file";
while (($id, $seqP) = &FIG::read_fasta_record(\*CONTIGS))
{
    if (not defined($len_of{$id})) {
	$seq_of{$id} = $$seqP;
	$len_of{$id} = length($$seqP);
    }
    else {
	$trouble = 1;
	warn "Duplicate contig ID: $id\n";
    }
}
close(CONTIGS) || die "Could not close contigs file $contigs_file";

if ($trouble) {
    die "\nAborting due to duplicate contig IDs\n\n";
}


my $entry;
while (defined($entry = <STDIN>)) {
    $trouble = 0;
    if ($entry =~ /^(\S+)\t(\S+)(\t(.*))?/) {
	my $comment;
	($id, $loc) = ($1, $2);
	@loc = split /,/, $loc;
	$comment = $4 ? qq( $4) : qq();
	
	$dna = "";
	foreach $exon (@loc) {
	    ($contig, $beg, $end) = &FIG::boundaries_of($exon);
	    
	    if (defined($seq_of{$contig}) && $len_of{$contig}) {
		if ( ($beg < 1) || ($beg > $len_of{$contig}) ||
		     ($end < 1) || ($end > $len_of{$contig})
		     )
		{
		    warn "For FID $id, exon $exon is out of bounds: contig=$contig, beg=$beg, end=$end, len=$len_of{$contig}\n";
		    $trouble = 1;
		    next;
		}
		else {
		    if ($beg < $end) {
			$subseq = substr($seq_of{$contig}, ($beg-1), ($end-$beg+1));
		    }
		    else {
			$subseq = substr($seq_of{$contig}, ($end-1), ($beg-$end+1));
			$subseq = $ { &FIG::rev_comp(\$subseq) };
		    }
		    
		    $dna .= $subseq;
		}
	    }
	    else {
		warn "For FID $id, exon $exon, no sequence or undefined contig $contig";
		$trouble = 1;
		next;
	    }
	}
	
	if ($trouble) {
	    warn "For FID $id, bad location $loc --- skipping\n\n";
	    next;
	}
	
	if ($prot = &FIG::translate($dna,$code,1)) {
	    if ($prot =~ m/\*$/)  { chop $prot; }
	    print ">$id$comment\n$prot\n";
	}
	else {
	    warn "No translation for $id";
	}
    }
    else
    {
	die "malformed tbl entry: $entry";
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3