[Bio] / FigKernelScripts / parse_genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/parse_genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Dec 1 20:46:40 2003 UTC (16 years, 6 months ago) by efrank
Branch: MAIN
CVS Tags: V00-00-01, V00-00-00

Makefile:
	get it to work with the release tools

All the rest:
	had to rename foo to foo.pl so that makefiles could recognize
	perl source from, say, Makefiles and READMEs

use FIG;

$usage = "usage: parse_genbank Tbl Fasta Assignments Prefix idN ContigFasta pegDNA < genbank.entry";
(($tbl = shift @ARGV) && open(TBL,">$tbl") &&
 ($fasta = shift @ARGV) && open(FASTA,">$fasta") &&
 ($assignments = shift @ARGV) && open(ASSIGNMENTS,">$assignments") &&
 ($prefix = shift @ARGV) &&
 ($idN = shift @ARGV) &&
 ($contigF = shift @ARGV) && open(CONTIGS,">$contigF") &&
 ($pegDNA = shift @ARGV) && open(PEGDNA,">$pegDNA")
)
    || die $usage;

$/ = "\nLOCUS";
while ($contig = <STDIN>)
{
    if ($contig =~ /\nACCESSION\s+(\S+)/s)
    {
	$id = $1;
	if ($contig =~ /ORIGIN(.*)(\/\/|LOCUS)/s)
	{
	    $seq = $1;
	    $seq =~ s/\s//gs;
	    $seq =~ s/\d//g;
	    undef $contigs;
	    $contigs->{$id} = $seq;
	    &FIG::display_id_and_seq($id,\$seq,\*CONTIGS);
	}
	else
	{
	    print STDERR &Dumper($contig);
	    die "could not find sequence";
	}

	while ($contig =~ /\n\s{4,6}CDS\s+([^\n]+(\n\s{20,22}\S[^\n]*)+)/gs)
	{
	    $cds = $1;
	    &process_cds($id,\$cds,$prefix,\$idN,$contigs,\*TBL,\*FASTA,\*ASSIGNMENTS,\*PEGDNA);
	}
    }
}

sub process_cds {
    my($contigID,$cdsP,$prefix,$idNP,$contigs,$fh_tbl,$fh_fasta,$fh_ass,$fh_dna) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise);

    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    @aliases = &get_aliases($cdsP);
    $func    = &get_func($cdsP);
    $trans   = &get_trans($cdsP);

    if ($loc || $trans)
    {
	my $dna  = &FIG::extract_seq($contigs,$loc);
	if ($precise)
	{
	    my $prot = &FIG::translate($dna,undef,1);
	    if ($trans && (uc $prot ne uc $trans))
	    {
		print STDERR "BAD TRANSLATION: $contigID $$cdsP\n";
		print STDERR &Dumper($trans,$prot,$dna),"\n";
	    }
	}

	$id = $prefix . "$$idNP";
	$$idNP++;
	print $fh_tbl "$id\t$loc\t",join("\t",@aliases),"\n";
	&FIG::display_id_and_seq($id,\$dna,$fh_dna);
	if ($func)
	{
	    print $fh_ass "$id\t$func\n";
	}
	if ($trans)
	{
	    &FIG::display_id_and_seq($id,\$trans,$fh_fasta);
	}
    }
    else
    {
	print &Dumper($cdsP); die "aborted";
    }
}

sub get_loc {
    my($contigID,$cdsP) = @_;
    my($beg,$end,$loc,$locS,$iter,$n,$args,@pieces);
    my($precise);

    if ($$cdsP =~ /^(([^\n]+)((\n\s{21,23}[^\/][^\n]+)+)?)/s)
    {
	@loc = ();
	$locS = $1;
	$locS =~ s/\s//g;
	$precise = ($locS !~ /[<>]/);

	@pieces = ();
	$n = 0;
	$iter = 500;
	while (($locS !~ /^\[\d+\]$/) && $iter)
	{
	    $iter--;
	    if ($locS =~ s/[<>]?(\d+)\.\.[<>]?(\d+)/\[$n\]/)
	    {
		push(@pieces,["loc",$1,$2]);
		$n++;
	    }

	    if ($locS =~ s/([,\(])(\d+)([,\)])/$1\[$n\]$3/)
	    {
		push(@pieces,["loc",$2,$2]);
		$n++;
	    }

	    if ($locS =~ s/join\((\[\d+\](,\[\d+\])*)\)/\[$n\]/)
	    {
		$args = $1;
		push(@pieces,["join",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }

	    if ($locS =~ s/complement\((\[\d+\](,\[\d+\])*)\)/\[$n\]/g)
	    {
		$args = $1;
		push(@pieces,["complement",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }
	}
	if ($locS =~ /^\[(\d+)\]$/)
	{
	    $loc = &conv(\@pieces,$1,$contigID);
	}
	else
	{
	    print STDERR &Dumper(["could not parse $locS $iter",$cdsP]);
	    die "aborted";
	}

	my @locs = split(/,/,$loc);
	&trim_stop(\@locs);
	$loc = join(",",@locs);
    }
    else
    {
	print STDERR &Dumper($cdsP); die "could not parse location";
	die "aborted";
    }
    return ($loc,$precise);
}

sub trim_stop {
    my($locs) = @_;
    my($to_trim,$n);

    $to_trim = 3;
    while ($to_trim && (@$locs > 0))
    {
	$n  = @$locs - 1;
	if ($locs->[$n] =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    if ($2 <= ($3-$to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3-$to_trim));
		$to_trim = 0;
	    }
	    elsif ($2 >= ($3 + $to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3+$to_trim));
		$to_trim = 0;
	    }
	    else
	    {
		$to_trim -= abs($3-$2) + 1;
		pop @$locs;
	    }
	}
	else
	{
	    die "could not parse $locs->[$n]";
	}
    }
}


sub conv {
    my($pieces,$n,$contigID) = @_;

    if ($pieces->[$n]->[0] eq "loc")
    {
	return join("_",$contigID,@{$pieces->[$n]}[1..2]);
    }
    elsif ($pieces->[$n]->[0] eq "join")
    {
	return join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}]);
    }
    elsif ($pieces->[$n]->[0] eq "complement")
    {
	return join(",",&complement(join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}])));;
    }
}

sub complement {
    my($loc) = @_;
    my(@locs);

    @locs = reverse split(/,/,$loc);
    foreach $loc (@locs)
    {
	$loc =~ /^(\S+)_(\d+)_(\d+)$/;
	$loc = join("_",($1,$3,$2));
    }
    return join(",",@locs);
}

sub get_aliases {
    my($cdsP) = @_;
    my($db_ref);

    my @aliases = ();
    if ($$cdsP =~ /\/(gene|locus_tag)=\"([^\"]+)\"/s)
    {
	push(@aliases,$2);
    }
    
    if ($$cdsP =~ /\/db_xref=\"([^\"]+)\"/s)
    {
	$db_ref = $1;
	$db_ref =~ s/^GI:/gi\|/;
	push(@aliases,$db_ref);
    }

    return @aliases;
}

sub get_trans {
    my($cdsP) = @_;
    my $tran = "";

    if ($$cdsP =~ /\/translation=\"([^"]*)\"/s)
    {
	$tran = $1;
	$tran =~ s/\s//gs;
    }
    return $tran;
}

sub get_func {
    my($cdsP) = @_;
    my $func = "";

    if ($$cdsP =~ /\/product=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    return $func;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3