[Bio] / FigKernelScripts / parse_genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/parse_genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (download) (as text) (annotate)
Sun Oct 2 14:54:34 2005 UTC (14 years, 6 months ago) by overbeek
Branch: MAIN
CVS Tags: caBIG-dataload-0, caBIG-00-00-00
Changes since 1.10: +22 -8 lines
Fixed bug in function-parsing logic. -- /gdp

use FIG;

$usage = "usage: parse_genbank Genome Dir < genbank.entry";
while ($ARGV[0] =~ m/^-/)
{
    if    ($ARGV[0] =~ m/^-{1,2}genome=\S+/) 
    {
	$force_genome =  shift @ARGV;
	$force_genome =~ s/^-{1,2}genome=\"?//;
	$force_genome =~ s/\"$//;
	$force_genome =~ s/\s+/ /g;
	
	print STDERR "\nGenome bioname will be taken as: \"$force_genome\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}tax=\S+/) 
    {
	$force_tax =  shift @ARGV;
	$force_tax =~ s/^-{1,2}tax=\"?//;
	$force_tax =~ s/\"$//;
	$force_tax =~ s/\s+/ /g;
	if ($force_tax !~ m/\.$/) { $force_tax .= "."; }
	
	print STDERR "\nTaxonomy will be taken as: \"$force_tax\"\n";
    }
    else
    {
	die "Could not handle $ARGV[0]";
    }
}

(
 ($genome = shift(@ARGV)) &&
 ($dir    = shift(@ARGV))
)
    || die $usage;

$prefixP = "fig|$genome.peg.";
$prefixR = "fig|$genome.rna.";

&FIG::verify_dir("$dir/Features/peg");
&FIG::verify_dir("$dir/Features/rna");
open(CONTIGS,">$dir/contigs") || die "could not open $dir/contigs";
open(TBLPEG,">$dir/Features/peg/tbl") || die "could not open $dir/Features/peg/tbl";
open(FASTAPEG,">$dir/Features/peg/fasta") || die "could not open $dir/Features/peg/fasta";
open(ASSIGNMENTS,">$dir/assigned_functions") || die "could not open $dir/assigned_functions";
open(TBLRNA,">$dir/Features/rna/tbl") || die "could not open $dir/Features/rna/tbl";
open(FASTARNA,">$dir/Features/rna/fasta") || die "could not open $dir/Features/rna/fasta";

open(TAX,">$dir/TAXONOMY") || die "could not open $dir/TAXONOMY";
open(GENOME,">$dir/GENOME") || die "could not open $dir/GENOME";
open(PROJECT,">$dir/PROJECT") || die "could not open $dir/PROJECT";
print PROJECT "Parsed NCBI Reference Sequences\n";
close(PROJECT);

$idNp = 1;
$idNr = 1;

$/ = "\n//\n";
while ($record = <STDIN>)
{
    if ($record !~ /LOCUS\s+(\S+)/s)
    {
	
	print STDERR "No LOCUS line for record begining with:\n"
	    , substr($record, 0, 160), "\n\n";
    }
    else
    {
	$id = $1;
	if ($record =~ /ORIGIN(.*?)(\/\/|LOCUS)/s)
	{
	    $seq = $1;
	    $seq =~ s/\s//gs;
	    $seq =~ s/\d//g;
	    undef $contigs;
	    $contigs->{$id} = $seq;
	    &FIG::display_id_and_seq($id,\$seq,\*CONTIGS);
	}
	else
	{
	    warn "could not find contig sequence for $id\n";
	    next;
	}

	if ($record =~ /\n {0,4}ORGANISM\s+(\S[^\n]+\S)((\n\s{10,14}\S[^\n]+\S)+)/s)
	{
	    $genome = $1;
	    $tax = $2;
	    $tax =~ s/\n\s+//g;
	    $tax =~ s/ {2,}/ /g;
	    if (! $written_genome)
	    {
		if ($force_genome) { $genome = $force_genome; }
		print GENOME "$genome\n";
		close(GENOME);
		
		if ($force_tax) { $tax = $force_tax; }
		print TAX "$tax\n";
		close(TAX);

		$written_genome = "$tax\t$genome";
	    }
	    elsif (($written_genome ne "$tax\t$genome") && (!$force_genome || !$force_tax)) 
	    {
		print STDERR "serious mismatch in GENOME/TAX for $id\n$written_genome\n$tax\t$genome\n\n";
	    }
	}

	while ($record =~ /\n\s{4,6}CDS\s+([^\n]+(\n {20,}[^\n]*)+)/gs)
	{
	    $cds = $1;
	    if (($cds !~ /\/pseudo/) && (($cds !~ /\/exception/) || ($cds =~ /\/translation/)))
	    {
		&process_cds($id,\$cds,$prefixP,\$idNp,$contigs,\*TBLPEG,\*FASTAPEG,\*ASSIGNMENTS);
	    }
	}

	while ($record =~ /\n\s{3,6}[tr]RNA\s+([^\n]+(\n\s{20,22}\S[^\n]*)+)/gs)
	{
	    $rna = $1;
	    &process_rna($id,\$rna,$prefixR,\$idNr,$contigs,\*TBLRNA,\*FASTARNA);
	}
    }
}
close(CONTIGS);
close(TBLPEG);
close(FASTAPEG);
close(ASSIGNMENTS);
close(TBLRNA);
close(FASTARNA);

if (! -s "$dir/contigs")            { unlink("$dir/contigs"); print STDERR "no contigs in $dir\n"; }
if (! -s "$dir/assigned_functions") { unlink("$dir/assigned_functions"); print STDERR "no assigned_functions in $dir\n"; }
if (! -s "$dir/Features/peg/tbl")   { system "rm -rf $dir/Features/peg"; print STDERR "no PEGs in $dir\n"; }
if (! -s "$dir/Features/rna/tbl")   { system "rm -rf $dir/Features/rna"; print STDERR "no RNAs in $dir\n"; }
if ((! -s "$dir/contigs") && (! -s "$dir/Features/peg/tbl")) { system "rm -rf $dir" }


sub process_cds {
    my($contigID,$cdsP,$prefix,$idNp,$contigs,$fh_tbl,$fh_fasta,$fh_ass) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise,$dna,$prot);
    
    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    @aliases = &get_aliases($cdsP);
    $func    = &get_func($cdsP);
    $trans   = &get_trans($cdsP);

    if ($loc || $trans)
    {
	if ($dna  = &FIG::extract_seq($contigs,$loc))
	{
	    if ($precise)
	    {
		$prot = &FIG::translate($dna,undef,1);
		if ($trans && (uc $prot ne uc $trans))
		{
#		print STDERR "BAD TRANSLATION: $contigID $$cdsP\n";
#		print STDERR &Dumper($trans,$prot,$dna),"\n";
		}
	    }
	}
	else
	{
	    warn "Could not get DNA sequence for CDS at $loc for entry:\n$$cdsP\nof record begining with:\n"
	    , substr($record, 0, 160), "\n\n";
	} 
	
	$id = $prefix . "$$idNp";
	$$idNp++;
	print $fh_tbl "$id\t$loc\t",join("\t",@aliases),"\n";

	if ($func)
	{
	    print $fh_ass "$id\t$func\n";
	}
	
	if ($trans)
	{
	    &FIG::display_id_and_seq($id,\$trans,$fh_fasta);
	}
	else
	{
	    warn "No translation for $id";
	}
    }
    else
    {
	print &Dumper($cdsP); die "aborted";
    }
}

sub get_loc {
    my($contigID,$cdsP) = @_;
    my($beg,$end,$loc,$locS,$iter,$n,$args,@pieces);
    my($precise);

    if ($$cdsP =~ /^(([^\n]+)((\n\s{21,23}[^\/ ][^\n]+)+)?)/s)
    {
	$locS = $1;
	$locS =~ s/\s//g;
	$precise = ($locS !~ /[<>]/);
	
	@pieces = ();
	$n = 0;
	$iter = 500;
	while (($locS !~ /^\[\d+\]$/) && $iter)
	{
	    $iter--;
	    if ($locS =~ s/[<>]?(\d+)\.\.[<>]?(\d+)/\[$n\]/)
	    {
		push(@pieces,["loc",$1,$2]);
		$n++;
	    }
	    
	    if ($locS =~ s/([,\(])(\d+)([,\)])/$1\[$n\]$3/)
	    {
		push(@pieces,["loc",$2,$2]);
		$n++;
	    }
	    
	    if ($locS =~ s/join\((\[\d+\](,\[\d+\])*)\)/\[$n\]/)
	    {
		$args = $1;
		push(@pieces,["join",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }
	    
	    if ($locS =~ s/complement\((\[\d+\](,\[\d+\])*)\)/\[$n\]/g)
	    {
		$args = $1;
		push(@pieces,["complement",map { $_ =~ /^\[(\d+)\]$/; $1 } split(/,/,$args)]);
		$n++;
	    }
	}
	
	if ($locS =~ /^\[(\d+)\]$/)
	{
	    $loc = &conv(\@pieces,$1,$contigID);
	}
	else
	{
	    print STDERR &Dumper(["could not parse $locS $iter",$cdsP]);
	    die "aborted";
	}
	
	my @locs = split(/,/,$loc);
#...STOP is now included, so don't trim it off...
#	&trim_stop(\@locs);
	$loc = join(",",@locs);
    }
    else
    {
	print STDERR &Dumper($cdsP); die "could not parse location";
	die "aborted";
    }
    return ($loc,$precise);
}

sub trim_stop {
    my($locs) = @_;
    my($to_trim,$n);

    $to_trim = 3;
    while ($to_trim && (@$locs > 0))
    {
	$n  = @$locs - 1;
	if ($locs->[$n] =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    if ($2 <= ($3-$to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3-$to_trim));
		$to_trim = 0;
	    }
	    elsif ($2 >= ($3 + $to_trim))
	    {
		$locs->[$n] = join("_",($1,$2,$3+$to_trim));
		$to_trim = 0;
	    }
	    else
	    {
		$to_trim -= abs($3-$2) + 1;
		pop @$locs;
	    }
	}
	else
	{
	    die "could not parse $locs->[$n]";
	}
    }
}


sub conv {
    my($pieces,$n,$contigID) = @_;

    if ($pieces->[$n]->[0] eq "loc")
    {
	return join("_",$contigID,@{$pieces->[$n]}[1..2]);
    }
    elsif ($pieces->[$n]->[0] eq "join")
    {
	return join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}]);
    }
    elsif ($pieces->[$n]->[0] eq "complement")
    {
	return join(",",&complement(join(",",map { &conv($pieces,$_,$contigID) } @{$pieces->[$n]}[1..$#{$pieces->[$n]}])));;
    }
}

sub complement {
    my($loc) = @_;
    my(@locs);

    @locs = reverse split(/,/,$loc);
    foreach $loc (@locs)
    {
	$loc =~ /^(\S+)_(\d+)_(\d+)$/;
	$loc = join("_",($1,$3,$2));
    }
    return join(",",@locs);
}

sub get_aliases {
    my($cdsP) = @_;
    my($db_ref);

    my @aliases = ();
    while ($$cdsP =~ /\/(protein_id|gene|locus_tag)=\"([^\"]+)\"/sg)
    {
	push(@aliases,$2);
    }
    
    while ($$cdsP =~ /\/db_xref=\"([^\"]+)\"/sg)
    {
	$db_ref = $1;
	$db_ref =~ s/^GI:/gi\|/;
	$db_ref =~ s/SWISS-PROT:/sp\|/;
	push(@aliases,$db_ref);
    }

    return @aliases;
}

sub get_trans {
    my($cdsP) = @_;
    my $tran = "";

    if ($$cdsP =~ /\/translation=\"([^"]*)\"/s)
    {
	$tran = $1;
	$tran =~ s/\s//gs;
    }
    elsif ($$cdsP =~ /\/protein_id=\"([^"]+)\"/)
    {
	$tran = $1;
	$tran =~ s/\s//gs;
    }
    return $tran;
}

sub get_func {
    my($cdsP) = @_;
    my $func = "";
    
    ++$recnum;
#   print STDERR "\nRecord $recnum:\n$$cdsP\n";
    if (($$cdsP =~ /\/function=\"([^"]*)\"/s) && &ok_func($1))
    {
        $func = $1;
#       print STDERR "Branch 1:\n$func -->\n";
    }
    elsif (($$cdsP =~ /\/product=\"([^"]*)\"/s) && &ok_func($1) &&
            (($1 =~ / /) || ($$cdsP !~ /\/note/)))
    {
	$func = $1;
#        print STDERR "Branch 2:\n$func -->\n";
    }
    elsif (($$cdsP =~ /\/note=\"([^"]*)\"/s) && &ok_func($1))
    {
        $func = $1;
#       print STDERR "Branch 3:\n$func -->\n";
    }
    else
    {
#       print STDERR "No non-hypo found\n";
    }
    $func =~ s/\s+/ /gs;
#   print STDERR "--> $func\n";

    $func = &fixup_func($func);
#   print STDERR "Returning func = $func\n\n";

    return $func;
}

sub fixup_func {
    my($func) = @_;

    $func =~ s/^COG\d+:\s+//i;
    $func =~ s/^ORFID:\S+\s+//i;
    return $func;
}

sub ok_func {
    my($func) = @_;

    return (
            ($func !~ /^[a-zA-Z]{1,3}\d+[a-zA-Z]{0,3}(\.\d+([a-zA-Z])?)?$/) &&
            ($func !~ /^\d+$/) &&
            ($func !~ /ensangp/i) &&
            ($func !~ /agr_/i) &&
	    ($func !~ /^SC.*:/) &&
	    ($func !~ /^RIKEN/) &&
	    ($func !~ /\slen: \d+/) &&
	    ($func !~ /^similar to/i) &&
	    ($func !~ /^CPX/i) &&
	    ($func !~ /^\S{3,4}( or \S+ protein)?$/i) &&
	    ($func !~ /^putative$/) &&
	    ($func !~ /\scomes from/i) &&
	    ($func !~ /^unknown( function)?$/i) &&
	    ($func !~ /^hypothetical( (function|protein))?$/i) &&
            ($func !~ /^orf /i) &&
            ($func !~ /^ORF\s?\d+[lr]?$/i) &&
            ($func !~ /^[a-z]{1,3}\s?\d+$/i)
	   )
}

sub process_rna {
    my($contigID,$cdsP,$prefix,$idNr,$contigs,$fh_tbl,$fh_dna) = @_;
    my($loc,@aliases,$func,$trans,$id,$precise,$DNA);
    
    ($loc,$precise)  = &get_loc($contigID,$cdsP);
    $func    = &get_rna_func($cdsP);

    if ($loc)
    {
	if ($dna  = &FIG::extract_seq($contigs,$loc))
	{
	    $id = $prefix . "$$idNr";
	    $$idNr++;
	    if (! $func )
	    {
		warn "could not get func $$cdsP\n";
	    }
	    else
	    {
		print $fh_tbl "$id\t$loc\t$func\n";
		&FIG::display_id_and_seq($id,\$dna,$fh_dna);
	    }
	}
	else
	{
	    warn "Could not get DNA sequence for RNA at $loc for entry\n$$cdsP\nof record begining with:\n"
	    , substr($record, 0, 160), "\n\n";
	}
    }
    else
    {
	print &Dumper($cdsP); die "aborted";
    }
}

sub get_rna_func {
    my($cdsP) = @_;
    my $func = "";

    if ($$cdsP =~ /\/product=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/gene=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    elsif ($$cdsP =~ /\/note=\"([^"]*)\"/s)
    {
	$func = $1;
	$func =~ s/\s+/ /gs;
    }
    return $func;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3