[Bio] / FigKernelScripts / to_genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/to_genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Wed Oct 10 20:41:34 2007 UTC (12 years, 5 months ago) by gdpusch
Branch: MAIN
CVS Tags: rast_rel_2008_06_18, rast_rel_2008_06_16, rast_rel_2008_07_21, rast_2008_0924, rast_rel_2008_04_23, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, mgrast_rel_2008_1110, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, rast_rel_2008_08_07
Changes since 1.3: +4 -1 lines
Added support for multiple contig files. -- /gdp

# -*- perl -*-

use FIG;
$fig = new FIG;

$usage = "usage: parse_genbank [-genome=genus_species] [-strain=strain] [-tax=Taxonomy] [-def=definition_line] TaxID [Dir] > genbank.output";

while ($ARGV[0] =~ m/^-/)
{
    if    ($ARGV[0] =~ m/^-{1,2}genome=\S+/) 
    {
	$force_genome =  shift @ARGV;
	$force_genome =~ s/^-{1,2}genome=\"?//;
	$force_genome =~ s/\"$//;
	$force_genome =~ s/\s+/ /g;
	
	print STDERR "\nGenus and species will be taken as: \"$force_genome\"\n";
    }
    if    ($ARGV[0] =~ m/^-{1,2}strain=\S+/) 
    {
	$force_strain =  shift @ARGV;
	$force_strain =~ s/^-{1,2}strain=\"?//;
	$force_strain =~ s/\"$//;
	$force_strain =~ s/\s+/ /g;
	
	print STDERR "\nStrain will be taken as: \"$force_strain\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}tax=\S+/) 
    {
	$force_tax =  shift @ARGV;
	$force_tax =~ s/^-{1,2}tax=\"?//;
	$force_tax =~ s/\"$//;
	$force_tax =~ s/\s+/ /g;
	if ($force_tax !~ m/\.$/) { $force_tax .= "."; }
	
	print STDERR "\nTaxonomy will be taken as: \"$force_tax\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}def=\S+/) 
    {
	$defline =  shift @ARGV;
        $defline =~ s/^-{1,2}def=\"?//;
        $defline =~ s/\"$//;
        $defline =~ s/\s+/ /g;
        if ($defline !~ m/\.$/) { $defline .= "."; }
	
	print STDERR "\nDefinition line will be:\n\"$defline\"\n";
    }
    else
    {
	die "Could not handle $ARGV[0]";
    }
}

($taxid = shift @ARGV) || die $usage;


if ($ARGV[0] && (-d $ARGV[0]))
{
    $dir = shift @ARGV;
}
else
{
    $dir = "$FIG_Config::organisms/$taxid";
}
die "Organism directory $dir does not exist" unless (-d $dir);

if (@ARGV) { die "Unknown args ", join(", ", @ARGV), "\n\n\tusage: $usage\n\n"; }

if ($force_taxonomy)
{
    $taxonomy = $force_taxonomy;
}
else
{
    if (open(TAXONOMY, "<$dir/TAXONOMY"))
    {
	@_ = <TAXONOMY>;  chomp @_;
	$taxonomy =  join("", @_);
	$taxonomy =~ s/\s+/ /go;
	close(TAXONOMY);
    }
    else
    {
	die "could not read-open $dir/TAXONOMY";
    }
}


if ($force_genome)
{
    $genome = $force_genome;
    if ($strain) { $genome .= " $strain"; }
}
else
{
    if (open(GENOME, "<$dir/GENOME"))
    {
	$genome = <GENOME>;
	chomp $genome;
	close(GENOME);
    }
    else
    {
	die "could not read-open $dir/GENOME";
    }
}

if ((not $strain) && ($genome =~ m/^\S+\s+\S+\s+(.*)/))
{
    $strain = $1;
}

if (not $defline)
{
    $defline = $genome;
}


if (open(PROJECT, "<$dir/PROJECT"))
{
    @_ = <PROJECT>;  chomp @_;
    $project =  join("", @_);
    $project =~ s/\s+/ /go;
    close(PROJECT);
}
else
{
    die "could not read-open $dir/PROJECT";
}


use constant FID    =>  0;
use constant LOCUS  =>  1;
use constant CONTIG =>  2;
use constant LEFT   =>  3;
use constant RIGHT  =>  4;
use constant LEN    =>  5;
use constant STRAND =>  6;
use constant TYPE   =>  7;
use constant FUNC   =>  8;

opendir(DIR, $dir) || die "Could not opendir $dir";
@contig_files = map { "$dir/$_" } grep { m/^contigs\d*$/ } readdir(DIR);
closedir(DIR) || die "Could not closedir $dir";
($seq_of,  $len_of)  = &load_fasta(@contig_files);
($peg_seq, $peg_len) = &load_fasta("$dir/Features/peg/fasta");

@tbls = ("$dir/Features/peg/tbl");
if (-s "$dir/Features/rna/tbl")  { push @tbls, "$dir/Features/rna/tbl"; }
$tbl = &load_tbls(@tbls);


use Time::localtime;
$time = localtime;
$date = sprintf "%02d-%3s-%04d"
    , $time->mday
    , (qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC))[$time->mon]
    , (1900+$time->year);

foreach $contig (sort keys %$len_of)
{
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Build up header in the Format Accumulator...
#-------------------------------------------------------------------------------

    formline <<END, $contig, $len_of->{$contig}, $date;
LOCUS       @<<<<<<<<<<<<<<<<<<<@####### bp    DNA     circular BCT @<<<<<<<<<<<
END

    formline <<END, $defline;
DEFINITION  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    formline <<END, $defline;
~~          ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $genome;
ACCESSION   Unknown Unknown
VERSION     Unknown
KEYWORDS    WGS.
SOURCE      @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $genome, $taxonomy;
  ORGANISM  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    formline <<END, $taxonomy;
~~          ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $len_of->{$contig}, $len_of->{$contig};
REFERENCE   1  (bases 1 to @#######)
  AUTHORS   [Insert Names here]
  TITLE     Direct Submission
  JOURNAL   [Insert paper submission information here]
COMMENT     [Insert project information here]
FEATURES             Location/Qualifiers
     source          1..@<<<<<<<
END

    &make_multiline('organism', $genome);
    &make_multiline('mol_type', 'genomic DNA');
    &make_multiline('strain', $strain) if $strain;
    $taxid =~ m/^(\d+)/;
    &make_multiline('db_xref', "taxon:$1");

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Print header and clear Format-Accumulator...
#===============================================================================
    print $^A;  $^A = ""; 
#-------------------------------------------------------------------------------

    
    $features = $tbl->{$contig};
    foreach $feature (@$features)
    {
	if ($feature->[STRAND] eq '+')
	{
	    $locus = "$feature->[LEFT]\.\.$feature->[RIGHT]";
        }
	else
	{
	    $locus = "complement($feature->[LEFT]\.\.$feature->[RIGHT])";
        }
	$ltag = "\"$feature->[FID]\"";
	
	formline <<END, $locus, ($tmp = $ltag);
     gene            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

	if ($feature->[TYPE] eq 'peg')
	{
	    formline <<END, $locus, ($tmp = $ltag);
     CDS             @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

            if ($feature->[FUNC])
	    {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	    &make_multiline('translation', $peg_seq->{$feature->[FID]});
	}
	elsif ($feature->[TYPE] eq 'rRNA')
	{
	    formline <<END, $locus, ($tmp = $ltag);
     rRNA            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
            if ($feature->[FUNC])
	    {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	}
	elsif ($feature->[TYPE] eq 'tRNA')
	{
	    formline <<END, $locus, ($tmp = $ltag);
     tRNA            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
            if ($feature->[FUNC])
	    {
		&make_multiline('product', $feature->[FUNC]);
	    }
    
	    &make_multiline('db_xref', $feature->[FID]);
	}
	else
	{
	    warn "Skipping unknown feature: ", join(", ", @$feature), "\n";
	}

	print $^A;  $^A = "";
    }
    
    &write_contig($contig);
    
    print "//\n";
}


sub make_multiline
{
    my ($tag, $text) = @_;
    
    my $tmp = "/$tag=\"$text\"";
    
    formline <<END, $tmp;
                     ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
	
    formline <<END, $tmp;
~~                   ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    
    return;
}

sub write_contig
{
    my ($contig) = @_;
    
    $tmp = $seq_of->{$contig};
    
    formline <<END;
ORIGIN
END

    $charcount = 1;
    while ($tmp)
    {
	formline <<END, $charcount, $tmp, $tmp, $tmp, $tmp, $tmp, $tmp;
@######## ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<
END
        $charcount += 60;
    }
    
    print $^A;   $^A = "";
    
    return;
}


sub load_fasta
{
    my (@files) = @_;
    my ($file, $id, $seqP, $len);
    
    my $seq_of = {};
    my $len_of = {};
    
    foreach $file (@files)
    {
	print STDERR "Loading $file\n" if $ENV{VERBOSE};
	
	open (FILE, "<$file") or die "could not read-open $file";
	while (($id, $seqP) = &FIG::read_fasta_record(\*FILE))
	{
	    $len = $len_of->{$id} = length($$seqP);
#	    print STDERR "\tSeq $id ($len chars)\n";
	    
	    if (($$seqP =~ tr/acgtACGT//) > 0.9*$len) {
		$$seqP  =~ tr/A-Z/a-z/;
	    } else {
		$$seqP  =~ tr/a-z/A-Z/;
	    }
	    $seq_of->{$id} = $$seqP;
	}
	close(FILE) or die "could not close $file";
    }
    
    return ($seq_of, $len_of);
}

sub load_tbls
{
    my (@files) = @_;
    my ($file, $entry, $fid, $locus, $alias, $contig, $left, $right, $len, $strand, $type, $func);
    my $x;
    my $tbl = {};
    
    foreach $file (@files)
    {
	print STDERR "Loading $file ...\n" if $ENV{VERBOSE};
	
	open(TBL, "<$file") || die "Could not read-open $file";
	while (defined($entry = <TBL>))
	{
	    chomp $entry;
	    
	    ($fid, $locus, $alias) = split /\t/, $entry;
	    $fid  =~ m/^[^\|]+\|\d+\.\d+\.([^\.]+)/;
	    $type =  $1;
	    
	    if ((($contig, $left, $right, $len, $strand) = &from_locus($locus)) 
		&& defined($contig) && $contig
		&& defined($left)   && $left
		&& defined($right)  && $right
		&& defined($len)    && $len
		&& defined($strand) && $strand
		)
	    {
		if (not defined($tbl->{$contig})) { $tbl->{$contig} = []; }
		$x = $tbl->{$contig};
		
		$func = undef;
		if ($type eq 'peg')
		{
		    $func = $fig->function_of($fid);
		}
		elsif ($type eq 'rna')
		{
		    $func = $alias;
		    if ($alias =~ m/tRNA/o) {
			$type = 'tRNA';
		    }
		    elsif ($alias =~ m/ribosomal/io) {
			$type = 'rRNA';
		    }
		    else
		    {
			$type = 'RNA';
		    }
		}
		else
		{
		    warn "$fid has unknown feature type $type";
		    next;
		}
		
		push @$x, [ $fid, $locus, $contig, $left, $right, $len, $strand, $type, $func ];
	    }
	    else
	    {
		warn "INVALID ENTRY in $file:\t$entry\n";
	    }
	}
	close(TBL) || die "Could not close $file";
    }
    
    foreach $contig (keys %$tbl)
    {
	$x  = $tbl->{$contig};
	@$x = sort by_locus @$x;
    }
    
    return $tbl;
}

sub from_locus
{
    my ($locus) = @_;
    
    if ($locus =~ m/^(\S+)_(\d+)_(\d+)$/)
    {
	return ($1
	       , &FIG::min($2, $3)
	       , &FIG::max($2, $3)
	       , (1+abs($3-$2))
	       , (($2 < $3) ? '+' : '-')
	       );
    }
    else
    {
	die "Invalid locus $locus";
    }
    
    return ();
}

sub by_locus
{
    my (undef, undef, $A_contig, $A_left, $A_right, $A_len, $A_strand) = @$a;
    my (undef, undef, $B_contig, $B_left, $B_right, $B_len, $B_strand) = @$b;
    
    return (  ($A_contig cmp $B_contig) 
	   || ($A_left <=> $B_left)
	   || ($B_len  <=> $A_len)
	   || ($A_strand cmp $B_strand)
	   );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3