[Bio] / FortyEightMeta / mg_to_gbk.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_to_gbk.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Wed Apr 29 16:41:51 2009 UTC (10 years, 11 months ago) by dsouza
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.3: +30 -61 lines
set meta status to error if gff required and not found, cleaned out junk

#
# Create the gff and genbank export files from the similarities.
#

use DB_File;
use Data::Dumper;

use JobStage;
use strict;
use FIG;
use FIGV;
use FIG_Config;
use File::Basename;
use File::Copy;
use GenomeMeta;
use Carp 'croak';
use FortyEightMeta::ExpandSims;
use URI::Escape;

my $STAGE = "export_to_genbank";

(@ARGV == 1 or @ARGV == 2) or die "Usage: $0 job-dir [-use_existing_gff]\n";

my $jobdir = shift;
my $use_existing_gff = (@ARGV and $ARGV[0] eq '-use_existing_gff')? 1 : 0;

my $stage = new JobStage('Job48', $STAGE, $jobdir);
$stage or die "Cannot create job for $jobdir\n";

my $job_id = basename($jobdir);
my $job    = $stage->job();
my $genome = $job->genome_id();

my $gff = "$jobdir/proc/$genome.gff";
if ( $use_existing_gff and ! -e $gff ) {
    print "gff file not found for job $jobdir ($gff)\n";
    $stage->log("gff file not found for job $jobdir ($gff)");
    $stage->set_status("error");
    exit;
}

my $meta = $job->meta;

print "Running job! $jobdir\n";

my $blast2gff_exe = "$FIG_Config::bin/blast2gff";
-x $blast2gff_exe or $stage->fatal("Executable $blast2gff_exe missing");

$stage->set_status("in_progress");
$stage->set_running("yes");

my $db_info = db_info();

create_seed_gff($db_info) unless $use_existing_gff;

if ($db_info)
{
    gff_to_genbank($db_info);
}

$stage->set_status("complete");
$stage->set_running("no");

exit(0);

sub db_info {
    #
    # We need to first find the information about the SEED database
    # we used to generate this data.
    #
    my $db_list = $stage->get_metadata("sims.database_list");

    #
    # Find the one named "SEED".
    #
    if (my @s = grep { $_->{name} eq 'SEED'} @$db_list)
    {
	if (@s > 1)
	{
	    $stage->log("Multiple SEED databases were processed, using version $s[0]->{version}");
	}
	
	my $db = $s[0]->{files}->[0];
	return $db;
    }
    else
    {
	$stage->log("No SEED database was processed in this job");
	return 0; 
    }
}
#
# Process the seed blastx results into an organism directory.
#
sub create_seed_gff
{
    my($db) = @_;

    my $sims_dir = "$jobdir/proc/$db->{dir}";
    my $sims     = "$sims_dir.raw";
    consolidate_sims($sims_dir, $sims);
    
    my $gff      = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    #
    # Use the btree-indexed NR from the data directory if possible.
    #

    my $nr    = "$db->{fasta}.btree";
    my $nrlen = "$db->{fasta}-len.btree";
    my @nrarg;
    if (-f $nr)
    {
	@nrarg = ('-nr', $nr);
    }
    if (-f $nrlen)
    {
	push(@nrarg, "-nrlen", $nrlen);
    }
    

    #
    # Verify we have the inputs we need for generating the orgdir.
    #

    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    my $taxonomy = &FIG::file_head("$jobdir/TAXONOMY", 1);
    my $project  = &FIG::file_head("$jobdir/PROJECT", 1);
    chomp $taxonomy;
    chomp $project;

    #
    # Start the process.
    #

    my @args = (-f => $fasta,
		-s => $out_sims,
		-o => $gff,
		-gs => $job->genome_name(),
		-taxonomy => $taxonomy,
		-proj => $project,
		-g => $genome,
		@nrarg,
		$sims);

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $blast2gff_exe } $blast2gff_exe, @args;
	die "Failed: $!: $blast2gff_exe @args";
    }

    print "Started pid=$pid $blast2gff_exe @args\n";

    while (<BOUT>)
    {
	print "Blast2gff: $_";
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $blast2gff_exe @args");
    }

    return $db;
}

#
# Unpack the gff file into the organism directory, do the rest of the post-install cleanup.

#
sub gff_to_genbank
{
    my($db) = @_;
    
    my $genome   = $job->genome_id();
    my $gff      = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    -f $gff or $stage->fatal("gff file $gff does not exist");
    
    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    ## read the fasta file
    my $contigs = read_fasta($fasta) or $stage->fatal("fasta could not be read: '$fasta'");

    # read the gff file
    #
    my $data = read_gff($gff) or $stage->fatal("gff file could not be read: '$gff'");

    # print the data into genbank format file
    my $output = "$jobdir/$genome.gbk";
    print_gbk($output, $data, $contigs) or $stage->fatal("could not print to output: '$output'");
    system("gzip $output");
}
    
sub consolidate_sims
{
    my($dir, $sims) = @_;
    if (! -d $dir)
    {
	$stage->fatal("Sim directory $dir not found");
    }
    
    open(TL, "<$dir/task.list") or $stage->fatal("Cannot open $dir/task.list: $!");
    
    #
    # Open output file.
    #
    open(O, ">$sims") or $stage->fatal("Cannot create $sims: $!");
	
    while (<TL>)
    {
	chomp;
	my ($id, $in, $nr, $flags, $out, $err) = split(/\t/);
	
	if (! -f $out)
	{
	    $stage->fatal("Sims output file $out missing for sims dir $dir");
	}
	
	copy($out, \*O);
    }
    close(TL);
    close(O);
}
    
sub run_pipe
{
    my($outfh, $cmd, @args) = @_;

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $cmd } $cmd, @args;
	die "Failed: $!: $cmd @args";
    }

    print "Started pid=$pid $cmd @args\n";

    while (<BOUT>)
    {
	print $outfh $_;
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $cmd @args");
    }
}

sub makedir
{
    my($dir) = @_;

    if (! -d $dir)
    {
	mkdir($dir, 0777) or $stage->fatal("mkdir $dir failed: $!");
    }
    chmod 0777, $dir;
}

sub touch
{
    my($file) = @_;
    my $fh = new FileHandle(">$file") or $stage->fatal("touch: cannot open $file for writing: $!");
    close($fh);
}

    
#
# Use the C index_sims_file app to create a berkeley db index
# of the sims file.
#

sub index_sims
{
    my($sims, $index_file) = @_;

    my $path = &FIG::find_fig_executable("index_sims_file");

    open(IDX, "$path 0 < $sims |") or die "Cannot open index_sims_file pipe: $!\n";

    my %index;
    my $tied = tie %index, 'DB_File', $index_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;

    $tied or $stage->fatal("Creation of hash $index_file failed: $!\n");

    while (<IDX>)
    {
	chomp;
	my($peg, undef, $seek, $len) = split(/\t/);
	
	$index{$peg} = "$seek,$len";
    }
    close(IDX);
    
    $tied->sync();
    untie %index;
}

sub read_gff{
    my $gff = shift @_;

    my @desired_dbxref = qw(RefSeq_Prot
			    IMG
			    TIGR
			    SP
			    UniProtKB
			    KEGG
			    InterPro
			    PFAM
			    SMART
			    Swiss-Prot
			    );
    my %desired_dbxref = map { $_ => 1} @desired_dbxref;

    open(GFF, "<", $gff) or die "cannot open gff file $gff: $!\n";

    my %dat;
    my $contig;
    while (<GFF>)
    {
	last if /^##FASTA/;
	chomp;
	if (/^#seed/){
	    my ($type, $key, $value) = split (/\t/, $_);
	    $dat{$key} = $value;
	}

	next if /^\s*\#/;
    
	my($contig_id, undef, $what, $start, $end, undef, $strand, undef, $dat) = split(/\t/);
	
	my @parts = split(/;/, uri_unescape($dat));
	my %attr;
	for my $p (@parts)
	{
	    if ($p =~ /^([^=]+)=(.*)/)
	    {
		$attr{$1} = [split(/,/, $2)];
	    }
	}
	
	my @dbxref = map { [ split(/:/) ] } @{$attr{Dbxref}};
	
	my $id = $attr{ID}->[0];

	if ($what eq 'contig')
	{
	    $contig = $attr{Name}->[0];
	    $contig =~ s/^Contig\s+//;
	}
	elsif ($what eq 'cds')
	{
	    #
	    # generate lines with
	    # id contig coords refseq-prot evcode gene-sym descr dbxref kegg subsys weblink struct-desc proteinseq
	    #
	    my $fig_id   = (map { $_->[1] } grep { $_->[0] eq 'FIG_ID' } @dbxref)[0];
	    my $coords   = $strand eq '+' ? "${start}_$end"  : "${end}_$start";
	    my $refseq   = (map { $_->[1] } grep { $_->[0] eq 'RefSeq_Prot' } @dbxref)[0];
	    
	    my $evcode   = join(",", @{$attr{evidence_code}}) if $attr{evidence_code};
	    
	    my $gene_sym = $id;
	    my $descr    = $attr{description}->[0] if $attr{description};
	    my $dbxref   = join(",", map { join(":", @$_) } grep { $desired_dbxref{$_->[0]} } @dbxref);
	    my $kegg     = join(",", map { join(":", @$_) } grep { $_->[0] =~ /^KEGG/} @dbxref);
	    my $weblink;
	    if ($attr{web_id})
	    {
		$weblink = "http://www.nmpdr.org/linkin.cgi?id=$attr{web_id}->[0]";
	    }
	    push (@{$dat{$contig_id}}, [$fig_id, $contig, $coords, $refseq, $evcode, $gene_sym, $descr, $dbxref, $kegg, '', $weblink]);
	}
    }
    
    my $cur_id;
    my $cur_seq;
    my $seqs={};
    while (<GFF>)
    {
	chomp;
	if (/^>(\S+)/)
	{
	    if ($cur_id)
	    {
		$seqs->{$cur_id} = $cur_seq;
	    }
	    $cur_seq = '';
	    $cur_id = $1;
	}
	else
	{
	    s/\s+//g;
	    $cur_seq .= $_;
	}
    }
    if ($cur_id)
    {
	$seqs->{$cur_id} = $cur_seq;
    }
    $dat{"seqs"} = $seqs;
    return \%dat;
}


sub read_fasta {
    my $fasta = shift @_;

    my $seqs = {};
    open (FH, $fasta) or die "cannot open fasta file $fasta: $!\n";

    my $line = <FH>;
    while ($line && ($line =~ /^>(\S+)/))
    {
	my $id = $1;
	my @seq = ();
	while (defined($line = <FH>) && ($line !~ /^>/))
	{
	    $line =~ s/\s//g;
	    push(@seq,$line);
	}
	my $seq = join("",@seq);
    
	$seqs->{$id} = $seq;
    }
    close FH;

    return $seqs;
}

sub print_gbk {
    my ($output, $data,$contigs) = @_;

    open (OUT, ">$output")  or die "cannot open output file $output: $!\n";
    
    foreach my $contig (keys %$contigs){

	my $seq    = $contigs->{$contig};
	my $length = length($seq);

	my $locus      = "LOCUS" . " "x7 . "$contig\t$length bp\tdna\tlinear\tUNK";
	my $definition = "DEFINITION" . " "x2 . "Contig $contig from " . $data->{name};
	my $accession  = "ACCESSION   unknown";
	my $features   = "FEATURES" . " "x13 . "Location/Qualifiers";
	
	# print headers
	print OUT join ("\n", ($locus,$definition,$accession,$features)) . "\n";

	my $source =[];
	## get the source section data
	#
	push (@$source, "1..".$length);    # Location

	my $mol_type = ($data->{mol_type}) ? $data->{mol_type} : "genomic DNA";
	push (@$source, "/mol_type=\"".$mol_type."\"");

	my $md5 = ($data->{md5}) ? $data->{md5} : "";
	push (@$source, "/genome_md5=\"".$md5."\"");

	my $project = ($data->{project}) ? $data->{project} : "";
	push (@$source, "/project=\"".$project."\"");
	
	my $genome_id = ($data->{genome_id}) ? $data->{genome_id} : "";
	push (@$source, "/genome_id=\"".$genome_id."\"");

	my $organism = ($data->{name}) ? $data->{name} : "";
	push (@$source, "/organism=\"".$organism."\"");

	my $taxon;
	if ($data->{taxon}){
	    $taxon = $data->{taxon};
	}
	elsif ($genome_id =~ m/(.*?)\./){
	    $taxon = $1;
	}
	else{
	    $taxon = "";
	}
	push (@$source, "/dbxref=\"taxon: ".$taxon."\"");

	# print the source section
	#
	print OUT " "x5 . "source". " "x10 . join("\n"." "x21, @$source). "\n";
	
	foreach my $cds (@{$data->{$contig}}){
	    my ($start,$end) = split(/_/, $cds->[2]);
	    my $loc;
	    if ($end < $start){
		$loc = "complement(".$end."..".$start.")";
	    }
	    else{
		$loc = $start. "..". $end;
	    }

	    my $translation_id = $cds->[0];
	    my $translation    = format_section("/translation=\"", $data->{seqs}->{$translation_id});
	    my $product        = format_section("/product=\"", $cds->[6]);
	    my $note           = format_section("/Note=\"",$cds->[6]);

	    # parse out the ec number if there is one
	    my $ec;
	    if ($cds->[6] =~ /E\.*C\.*\s+(\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/)
	    {
		$ec = "/ec_number=\"". join (".", $1,$2,$3,$4) . "\"";
	    }

	    my $gene_symbol = "/gene_symbol=\"".$cds->[5]."\"";
	    
	    my $db_xref = "/Dbxref=\"FIG_ID:".$cds->[0]."\"";

	    # print the CDS section
	    #
	    if ($ec){
		print OUT " "x5 . "CDS". " "x13 . "$loc\n". " "x21 . join ("\n"." "x21,  ($translation,$product,$gene_symbol,$ec,$note,$db_xref )) . "\n"; 
	    }
	    else{
		print OUT " "x5 . "CDS". " "x13 . "$loc\n". " "x21 . join ("\n"." "x21,  ($translation,$product,$gene_symbol,$note,$db_xref )) . "\n"; 
	    }		
		
	}

	# print the sequence
	#
	my $formatted_dna = format_dna($seq);
	my $base_count = "BASE COUNT";
	$base_count .= sprintf('%9s', $formatted_dna->{a}) . " a";
	$base_count .= sprintf('%7s', $formatted_dna->{c}) . " c";
	$base_count .= sprintf('%7s', $formatted_dna->{g}) . " g";
	$base_count .= sprintf('%7s', $formatted_dna->{t}) . " t";
	$base_count .= sprintf('%7s', $formatted_dna->{others}) . " others";

	my $origin = "ORIGIN\n".$formatted_dna->{seq} ;
	
	print OUT $base_count . "\n" . $origin . "\n//\n";

    }
    
    close OUT;

}

sub format_dna{
    my ($seq) = @_;

    my $dna_seq  = {'a' => 0, 'c' => 0, 'g' => 0, 't' => 0, 'others' => 0};
    my $dnachunk = [];
    my $count    = 0;
    my $segment  = sprintf ('%9s',  1). " ";
    my $loop     = 0;

    foreach my $dna (split (//, $seq)){
	$dna_seq->{$dna}++;
	$dna_seq->{others}++ if ($dna !~ /[acgt]/);
	
	if ($count <60){
	    $segment .= " " if ($count % 10 == 0 and $count > 0);
	    $segment .= $dna;
	}
	else{
	    push @$dnachunk, $segment;
	    $loop++;
	    $segment  = sprintf '%9s',  1+(60*$loop);
	    $segment .= " $dna";
	    $count    = 0;
	}
	$count++;
    }
    push @$dnachunk, $segment;

    $dna_seq->{seq} = join("\n", @$dnachunk);
		     
    return $dna_seq;
}


sub format_section {
    my ($segment, $string) = @_;
    
    my @segs;

    my $max = 58;
    my $count = length($segment);
    foreach my $aa (split (//, $string), "\""){
	if ($count < $max){
	    $segment .= $aa;
	}
	else{ # ($count >= $max){
	    push @segs, $segment;
	    $segment = $aa;
	    $count   = 0;
	}
	$count++;
    }
    push (@segs, $segment);

    return join ("\n"." "x21, @segs);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3