[Bio] / FigKernelScripts / to_genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/to_genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Fri Sep 10 19:46:55 2010 UTC (9 years, 2 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.6: +1 -1 lines
Fixed bug handling 5S.

# -*- perl -*-

use FIG;
use FIGV;

$usage = "usage: parse_genbank [-genome=genus_species] [-strain=strain] [-tax=Taxonomy] [-def=definition_line] TaxID [Dir] > genbank.output";

while ($ARGV[0] =~ m/^-/)
{
    if    ($ARGV[0] =~ m/^-{1,2}genome=\S+/) {
	$force_genome =  shift @ARGV;
	$force_genome =~ s/^-{1,2}genome=\"?//;
	$force_genome =~ s/\"$//;
	$force_genome =~ s/\s+/ /g;
	
	print STDERR "\nGenus and species will be taken as: \"$force_genome\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}strain=\S+/) {
	$force_strain =  shift @ARGV;
	$force_strain =~ s/^-{1,2}strain=\"?//;
	$force_strain =~ s/\"$//;
	$force_strain =~ s/\s+/ /g;
	
	print STDERR "\nStrain will be taken as: \"$force_strain\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}tax=\S+/) {
	$force_tax =  shift @ARGV;
	$force_tax =~ s/^-{1,2}tax=\"?//;
	$force_tax =~ s/\"$//;
	$force_tax =~ s/\s+/ /g;
	if ($force_tax !~ m/\.$/) { $force_tax .= "."; }
	
	print STDERR "\nTaxonomy will be taken as: \"$force_tax\"\n";
    }
    elsif ($ARGV[0] =~ m/^-{1,2}def=\S+/) {
	$defline =  shift @ARGV;
        $defline =~ s/^-{1,2}def=\"?//;
        $defline =~ s/\"$//;
        $defline =~ s/\s+/ /g;
        if ($defline !~ m/\.$/) { $defline .= "."; }
	
	print STDERR "\nDefinition line will be:\n\"$defline\"\n";
    }
    else {
	die "Could not handle $ARGV[0]";
    }
}

($taxid = shift @ARGV) || die $usage;


if ($ARGV[0] && (-d $ARGV[0])) {
    $dir = shift @ARGV;
    $fig = FIGV->new($dir);
}
else {
    $fig = FIG->new();
    $dir = "$FIG_Config::organisms/$taxid";
}
die "Organism directory $dir does not exist" unless (-d $dir);

if (@ARGV) { die "Unknown args ", join(", ", @ARGV), "\n\n\tusage: $usage\n\n"; }

if ($force_taxonomy) {
    $taxonomy = $force_taxonomy;
}
else {
    if (open(TAXONOMY, "<$dir/TAXONOMY")) {
	@_ = <TAXONOMY>;  chomp @_;
	$taxonomy =  join("", @_);
	$taxonomy =~ s/\s+/ /go;
	close(TAXONOMY);
    }
    else {
	die "could not read-open $dir/TAXONOMY";
    }
}


if ($force_genome) {
    $genome = $force_genome;
    if ($strain) { $genome .= " $strain"; }
}
else {
    if (open(GENOME, "<$dir/GENOME")) {
	$genome = <GENOME>;
	chomp $genome;
	close(GENOME);
    }
    else {
	die "could not read-open $dir/GENOME";
    }
}

if ((not $strain) && ($genome =~ m/^\S+\s+\S+\s+(.*)/)) {
    $strain = $1;
}

if (not $defline) {
    $defline = $genome;
}


if (open(PROJECT, "<$dir/PROJECT")) {
    @_ = <PROJECT>;  chomp @_;
    $project =  join("", @_);
    $project =~ s/\s+/ /go;
    close(PROJECT);
}
else {
    die "could not read-open $dir/PROJECT";
}


use constant FID    =>  0;
use constant LOCUS  =>  1;
use constant CONTIG =>  2;
use constant LEFT   =>  3;
use constant RIGHT  =>  4;
use constant LEN    =>  5;
use constant STRAND =>  6;
use constant TYPE   =>  7;
use constant FUNC   =>  8;

opendir(DIR, $dir) || die "Could not opendir $dir";
@contig_files = map { "$dir/$_" } grep { m/^contigs\d*$/ } readdir(DIR);
closedir(DIR) || die "Could not closedir $dir";
($seq_of,  $len_of)  = &load_fasta(@contig_files);
($peg_seq, $peg_len) = &load_fasta("$dir/Features/peg/fasta");

@tbls = ("$dir/Features/peg/tbl");
if (-s "$dir/Features/rna/tbl")  { push @tbls, "$dir/Features/rna/tbl"; }
$tbl = &load_tbls(@tbls);


use Time::localtime;
$time = localtime;
$date = sprintf "%02d-%3s-%04d"
    , $time->mday
    , (qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC))[$time->mon]
    , (1900+$time->year);

foreach $contig (sort keys %$len_of)
{
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Build up header in the Format Accumulator...
#-------------------------------------------------------------------------------

    formline <<END, $contig, $len_of->{$contig}, $date;
LOCUS       @<<<<<<<<<<<<<<<<<<<@####### bp    DNA     circular BCT @<<<<<<<<<<<
END

    formline <<END, $defline;
DEFINITION  ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    formline <<END, $defline;
~~          ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $genome;
ACCESSION   Unknown Unknown
VERSION     Unknown
KEYWORDS    WGS.
SOURCE      @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $genome, $taxonomy;
  ORGANISM  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    formline <<END, $taxonomy;
~~          ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

    formline <<END, $len_of->{$contig}, $len_of->{$contig};
REFERENCE   1  (bases 1 to @#######)
  AUTHORS   [Insert Names here]
  TITLE     Direct Submission
  JOURNAL   [Insert paper submission information here]
COMMENT     [Insert project information here]
FEATURES             Location/Qualifiers
     source          1..@<<<<<<<
END

    &make_multiline('organism', $genome);
    &make_multiline('mol_type', 'genomic DNA');
    &make_multiline('strain', $strain) if $strain;
    $taxid =~ m/^(\d+)/;
    &make_multiline('db_xref', "taxon:$1");

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Print header and clear Format-Accumulator...
#===============================================================================
    print $^A;  $^A = ""; 
#-------------------------------------------------------------------------------

    
    $features = $tbl->{$contig};
    foreach $feature (@$features) {
	next if $fig->is_deleted();
	
	if ($feature->[STRAND] eq '+') {
	    $locus = "$feature->[LEFT]\.\.$feature->[RIGHT]";
        }
	else {
	    $locus = "complement($feature->[LEFT]\.\.$feature->[RIGHT])";
        }
	$ltag = "\"$feature->[FID]\"";
	
	formline <<END, $locus, ($tmp = $ltag);
     gene            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

        if ($feature->[TYPE] eq 'peg') {
	    formline <<END, $locus, ($tmp = $ltag);
     CDS             @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END

            if ($feature->[FUNC]) {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	    &make_multiline('translation', $peg_seq->{$feature->[FID]});
	}
	elsif ($feature->[TYPE] eq 'rRNA') {
	    formline <<END, $locus, ($tmp = $ltag);
     rRNA            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
            if ($feature->[FUNC]) {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	}
	elsif ($feature->[TYPE] eq 'tRNA') {
	    formline <<END, $locus, ($tmp = $ltag);
     tRNA            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
            if ($feature->[FUNC]) {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	}
	elsif ($feature->[TYPE] eq 'misc_RNA') {
	    formline <<END, $locus, ($tmp = $ltag);
 misc_RNA            @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
                     /locus_tag=^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
            if ($feature->[FUNC]) {
		&make_multiline('product', $feature->[FUNC]);
	    }
	    
	    &make_multiline('db_xref', $feature->[FID]);
	}
	else {
	    warn "Skipping unknown feature: ", join(", ", @$feature), "\n";
	}
	
	print $^A;  $^A = "";
    }
    
    &write_contig($contig);
    
    print "//\n";
}


sub make_multiline {
    my ($tag, $text) = @_;
    
    my $tmp = "/$tag=\"$text\"";
    
    formline <<END, $tmp;
                     ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
	
    formline <<END, $tmp;
~~                   ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
END
    
    return;
}

sub write_contig {
    my ($contig) = @_;
    
    $tmp = $seq_of->{$contig};
    
    formline <<END;
ORIGIN
END

    $charcount = 1;
    while ($tmp)
    {
	formline <<END, $charcount, $tmp, $tmp, $tmp, $tmp, $tmp, $tmp;
@######## ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<< ^<<<<<<<<<
END
        $charcount += 60;
    }
    
    print $^A;   $^A = "";
    
    return;
}


sub load_fasta {
    my (@files) = @_;
    my ($file, $id, $seqP, $len);
    
    my $seq_of = {};
    my $len_of = {};
    
    foreach $file (@files)
    {
	print STDERR "Loading $file\n" if $ENV{VERBOSE};
	
	open (FILE, "<$file") or die "could not read-open $file";
	while (($id, $seqP) = &FIG::read_fasta_record(\*FILE))
	{
	    $len = $len_of->{$id} = length($$seqP);
#	    print STDERR "\tSeq $id ($len chars)\n";
	    
	    if (($$seqP =~ tr/acgtACGT//) > 0.9*$len) {
		$$seqP  =~ tr/A-Z/a-z/;
	    } else {
		$$seqP  =~ tr/a-z/A-Z/;
	    }
	    $seq_of->{$id} = $$seqP;
	}
	close(FILE) or die "could not close $file";
    }
    
    return ($seq_of, $len_of);
}

sub load_tbls {
    my (@files) = @_;
    my ($file, $entry, $fid, $locus, $alias, $contig, $left, $right, $len, $strand, $type, $func);
    my $x;
    my $tbl = {};
    
    foreach $file (@files)
    {
	print STDERR "Loading $file ...\n" if $ENV{VERBOSE};
	
	open(TBL, "<$file") || die "Could not read-open $file";
	while (defined($entry = <TBL>))
	{
	    chomp $entry;
	    
	    ($fid, $locus, $alias) = split /\t/, $entry;
	    $fid  =~ m/^[^\|]+\|\d+\.\d+\.([^\.]+)/;
	    $type =  $1;
	    
	    if ((($contig, $left, $right, $len, $strand) = &from_locus($locus)) 
		&& defined($contig) && $contig
		&& defined($left)   && $left
		&& defined($right)  && $right
		&& defined($len)    && $len
		&& defined($strand) && $strand
		)
	    {
		if (not defined($tbl->{$contig})) { $tbl->{$contig} = []; }
		$x = $tbl->{$contig};
		
		$func = undef;
		if ($type eq 'peg') {
		    $func = $fig->function_of($fid);
		}
		elsif ($type eq 'rna') {
		    $func = $fig->function_of($fid);
		    if (($func !~ m/\S+/o) && $alias) {
			$func = $alias;
		    }
		    
		    if ($func =~ m/tRNA/o) {
			$type = 'tRNA';
		    }
		    elsif (($func =~ m/ribosomal/io) || ($func =~ m/5S\s+RNA/io)) {
			$type = 'rRNA';
		    }
		    else {
			$type = 'misc_RNA';
		    }
		}
		else {
		    warn "$fid has unknown feature type $type";
		    next;
		}
		
		push @$x, [ $fid, $locus, $contig, $left, $right, $len, $strand, $type, $func ];
	    }
	    else {
		warn "INVALID ENTRY in $file:\t$entry\n";
	    }
	}
	close(TBL) || die "Could not close $file";
    }
    
    foreach $contig (keys %$tbl)
    {
	$x  = $tbl->{$contig};
	@$x = sort by_locus @$x;
    }
    
    return $tbl;
}

sub from_locus
{
    my ($locus) = @_;
    
    if ($locus =~ m/^(\S+)_(\d+)_(\d+)$/)
    {
	return ($1
	       , &FIG::min($2, $3)
	       , &FIG::max($2, $3)
	       , (1+abs($3-$2))
	       , (($2 < $3) ? '+' : '-')
	       );
    }
    else {
	die "Invalid locus $locus";
    }
    
    return ();
}

sub by_locus {
    my (undef, undef, $A_contig, $A_left, $A_right, $A_len, $A_strand) = @$a;
    my (undef, undef, $B_contig, $B_left, $B_right, $B_len, $B_strand) = @$b;
    
    return (  ($A_contig cmp $B_contig) 
	   || ($A_left <=> $B_left)
	   || ($B_len  <=> $A_len)
	   || ($A_strand cmp $B_strand)
	   );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3