[Bio] / FigKernelScripts / parse_genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/parse_genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Dec 3 19:39:29 2003 UTC (16 years, 5 months ago) by efrank
Branch: MAIN
CVS Tags: delong-01, delong-02
Changes since 1.1: +86 -10 lines

fixed a bunch of changes

use FIG;

$usage = "usage: parse_genbank Genome Dir < genbank.entry";
(
 $genome = shift @ARGV &&
 $dir    = shift @ARGV
)
    || die $usage;

$prefixP = "fig|$genome.peg.";
$prefixR = "fig|$genome.rna.";

&verify_dir("$dir/Features/peg");
&verify_dir("$dir/Features/rna");
open(CONTIGS,">$dir/contigs") "" die "could not open $dir/contigs";
open(TBLPEG,">$dir/Features/peg/tbl") || die "could not open $dir/Features/peg/tbl";
open(FASTAPEG,">$dir/Features/peg/fasta") || die "could not open $dir/Features/peg/fasta";
open(ASSIGNMENTS,">$dir/assigned_functions") "" die "could not open $dir/assigned_functions";
open(TBLRNA,">$dir/Features/rna/tbl") || die "could not open $dir/Features/rna/tbl";
open(FASTARNA,">$dir/Features/rna/fasta") || die "could not open $dir/Features/rna/fasta";

$/ = "\nLOCUS";
while ($contig = <STDIN>)
{
    if ($contig =~ /\nACCESSION\s+(\S+)/s)
    {
	$id = $1;
	if ($contig =~ /ORIGIN(.*)(\/\/|LOCUS)/s)
	{
	    $seq = $1;
	    $seq =~ s/\s//gs;
	    $seq =~ s/\d//g;
	    undef $contigs;
	    $contigs->{$id} = $seq;
	    &FIG::display_id_and_seq($id,\$seq,\*CONTIGS);
	}
	else
	{
	    print STDERR &Dumper($contig);
	    die "could not find sequence";
	}

	while ($contig =~ /\n\s{4,6}CDS\s+([^\n]+(\n\s{20,22}\S[^\n]*)+)/gs)
	{
	    $cds = $1;
	    &process_cds($id,\$cds,$prefix,\$idN,$contigs,\*TBLPEG,\*FASTAPEG,\*ASSIGNMENTS);
	}

	while ($contig =~ /\n\s{3,6}[tr]RNA\s+([^\n]+(\n\s{20,22}\S[^\n]*)+)/gs)
	{
	    $rna = $1;
	    &process_rna($id,\$rna,$prefixR,\$idNr,$contigs,\*TBLRNA,\*FASTARNA);
	}
    }
}
close(CONTIGS);
close(TBLPEG);
close(FASTPEG);
close(ASSIGNMENTS);
close(TBLRNA);
close(FASTARNA);

if (! -s "$dir/contigs")            { unlink("$dir/contigs"); print STDERR "no contigs in $dir\n"; }
if (! -s "$dir/assigned_functions") { unlink("$dir/assigned_functions"); print STDERR "no assigned_functions in $dir }
if (! -s "$dir/Features/peg/tbl")   { system "rm -rf $dir/Features/peg"; print STDERR "no PEGs in $dir }
if (! -s "$dir/Features/rna/tbl")   { system "rm -rf $dir/Features/rna"; print STDERR "no RNAs in $dir }


sub process_cds {
    my($contigID,$cdsP,$prefix,$idNP,$contigs,$fh_tbl,$fh_fasta,$fh_ass,$fh_dna) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise);

    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    @aliases = &get_aliases($cdsP);
    $func    = &get_func($cdsP);
    $trans   = &get_trans($cdsP);

    if ($loc || $trans)
    {
	my $dna  = &FIG::extract_seq($contigs,$loc);
	if ($precise)
	{
	    my $prot = &FIG::translate($dna,undef,1);
	    if ($trans && (uc $prot ne uc $trans))
	    {
		print STDERR "BAD TRANSLATION: $contigID $$cdsP\n";
		print STDERR &Dumper($trans,$prot,$dna),"\n";
	    }
	}

	$id = $prefix . "$$idNP";
	$$idNP++;
	print $fh_tbl "$id\t$loc\t",join("\t",@aliases),"\n";
	&FIG::display_id_and_seq($id,\$dna,$fh_dna);
	if ($func)
	{
	    print $fh_ass "$id\t$func\n";
	}
	if ($trans)
	{
	    &FIG::display_id_and_seq($id,\$trans,$fh_fasta);
	}
    }
    else
    {
	print &Dumper($cdsP); die "aborted";
    }
}

sub get_loc {
    my($contigID,$cdsP) = @_;
    my($beg,$end,$loc,$locS,$iter,$n,$args,@pieces);
    my($precise);

    if ($$cdsP =~ /^(([^\n]+)((\n\s{21,23}[^\/ ][^\n]+)+)?)/s)
    {
	@loc = ();
	$locS = $1;
	$locS =~ s/\s//g;
	$precise = ($locS !~ /[<>]/);

	@pieces = ();
	$n = 0;
	$iter = 500;
	while (($locS !~ /^\[\d+\]$/) && $iter)
	{
	    $iter--;
	    if ($locS =~ s/[<>]?(\d+)\.\.[<>]?(\d+)/\[$n\]/)
	    {
		push(@pieces,["loc",$1,$2]);
		$n++;
	    }

	    if ($locS =~ s/([,\(])(\d+)([,\)])/$1\[$n\]$3/)
	    {
		push(@pieces,["loc",$2,$2]);
		$n++;
	    }

	    if ($locS =~ s/join\((\[\d+\](,\[\d+\])*)\)/\[$n\]/)
	    {
		$args = $1;
		push(@pieces,["join",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }

	    if ($locS =~ s/complement\((\[\d+\](,\[\d+\])*)\)/\[$n\]/g)
	    {
		$args = $1;
		push(@pieces,["complement",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }
	}
	if ($locS =~ /^\[(\d+)\]$/)
	{
	    $loc = &conv(\@pieces,$1,$contigID);
	}
	else
	{
	    print STDERR &Dumper(["could not parse $locS $iter",$cdsP]);
	    die "aborted";
	}

	my @locs = split(/,/,$loc);
	&trim_stop(\@locs);
	$loc = join(",",@locs);
    }
    else
    {
	print STDERR &Dumper($cdsP); die "could not parse location";
	die "aborted";
    }
    return ($loc,$precise);
}

sub trim_stop {
    my($locs) = @_;
    my($to_trim,$n);

    $to_trim = 3;
    while ($to_trim && (@$locs > 0))
    {
	$n  = @$locs - 1;
	if ($locs->[$n] =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    if ($2 <= ($3-$to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3-$to_trim));
		$to_trim = 0;
	    }
	    elsif ($2 >= ($3 + $to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3+$to_trim));
		$to_trim = 0;
	    }
	    else
	    {
		$to_trim -= abs($3-$2) + 1;
		pop @$locs;
	    }
	}
	else
	{
	    die "could not parse $locs->[$n]";
	}
    }
}


sub conv {
    my($pieces,$n,$contigID) = @_;

    if ($pieces->[$n]->[0] eq "loc")
    {
	return join("_",$contigID,@{$pieces->[$n]}[1..2]);
    }
    elsif ($pieces->[$n]->[0] eq "join")
    {
	return join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}]);
    }
    elsif ($pieces->[$n]->[0] eq "complement")
    {
	return join(",",&complement(join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}])));;
    }
}

sub complement {
    my($loc) = @_;
    my(@locs);

    @locs = reverse split(/,/,$loc);
    foreach $loc (@locs)
    {
	$loc =~ /^(\S+)_(\d+)_(\d+)$/;
	$loc = join("_",($1,$3,$2));
    }
    return join(",",@locs);
}

sub get_aliases {
    my($cdsP) = @_;
    my($db_ref);

    my @aliases = ();
    if ($$cdsP =~ /\/(gene|locus_tag)=\"([^\"]+)\"/s)
    {
	push(@aliases,$2);
    }
    
    if ($$cdsP =~ /\/db_xref=\"([^\"]+)\"/s)
    {
	$db_ref = $1;
	$db_ref =~ s/^GI:/gi\|/;
	push(@aliases,$db_ref);
    }

    return @aliases;
}

sub get_trans {
    my($cdsP) = @_;
    my $tran = "";

    if ($$cdsP =~ /\/translation=\"([^"]*)\"/s)
    {
	$tran = $1;
	$tran =~ s/\s//gs;
    }
    return $tran;
}

sub get_func {
    my($cdsP) = @_;
    my $func = "";

    if ($$cdsP =~ /\/product=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    return $func;
}

sub process_rna {
    my($contigID,$cdsP,$prefix,$idNP,$contigs,$fh_tbl,$fh_dna) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise);

    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    $func    = &get_rna_func($cdsP);

    if ($loc)
    {
	my $dna  = &FIG::extract_seq($contigs,$loc);
	$id = $prefix . "$$idNP";
	$$idNP++;
	if (! $func )
	{
	    warn "could not get func $$cdsP\n";
	}
	else
	{
	    print $fh_tbl "$id\t$loc\t$func\n";
	    &FIG::display_id_and_seq($id,\$dna,$fh_dna);
	}
    }
    else
    {
	print &Dumper($cdsP); die "aborted";
    }
}

sub get_rna_func {
    my($cdsP) = @_;
    my $func = "";

    if ($$cdsP =~ /\/product=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/gene=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/note=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    return $func;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3