[Bio] / FigKernelScripts / pubseed_data_export.pl Repository:
ViewVC logotype

View of /FigKernelScripts/pubseed_data_export.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Jan 28 17:46:23 2013 UTC (6 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Data exporter for pubseed.

#
# Export all PubSEED data to the FTP site.
#
# We write to a directory /vol/export/genomes/data.DateStamp
# When the creation of all exports is complete, we swing the symlinks
# /vol/export/genomes/SEED and /vol/export/genomes/genbank to point
# into the latest data.DateStamp directory.
#

use strict;
use Carp::Always;
use POSIX;
use FIG;
use IPC::Run qw(start run finish);
use Proc::ParallelLoop;
use gjoseqlib;
use Data::Dumper;

my %suffix_map = (genbank => 'gbk',
		  gff	  => 'gff',
		  embl	  => 'embl',
		  gtf	  => 'gtf');

my $now = time;

#my $dest_root = "/tmp/genomes";
my $dest_root = "/vol/export/genomes";
my $datestamp = strftime("%Y-%m-%d", localtime $now);

my $data_dir = "data.$datestamp";
my $work_root = "$dest_root/$data_dir";

if (! -d $dest_root)
{
    die "Destination directory $dest_root does not exist\n";
}

if (-d $work_root)
{
    die "Work directory $work_root already exists\n";
}

mkdir $work_root or die "mkdir $work_root failed: $!";

my $genbank_dir = "$work_root/genbank";
my $seed_dir    = "$work_root/SEED";
mkdir "$genbank_dir" or die "mkdir $genbank_dir failed: $!";
mkdir "$seed_dir" or die "mkdir $seed_dir failed: $!";

$FIG_Config::use_singleton = 0;


my $out;
run ["$FIG_Config::bin/fig", "genomes"], '>', \$out;

my @genomes = $out =~ /^(\d+\.\d+)/gm;

pareach \@genomes, sub {
    my($genome) = @_;
    my $fig = new FIG;
    export_genome($fig, $genome, $genbank_dir, $seed_dir);
};

#
# Change symlinks.
#

for my $d (qw(genbank SEED))
{
    unlink("$dest_root/$d") or die "Cannot unlink $d: $!";
}

for my $d (qw(genbank SEED))
{
    symlink("$data_dir/$d", "$dest_root/$d") or die "Cannot symlink $data_dir/$d $dest_root/$d: $!";
}

sub export_genome
{
    my($fig, $genome, $genbank_dir, $seed_dir) = @_;

    my @work = ([genbank => $genbank_dir],
		[gff	 => $seed_dir],
		[embl	 => $seed_dir]);

    for my $item (@work)
    {
	my($type, $dir) = @$item;
	export_with_exporter($fig, $genome, $type, $dir);
    };

#    export_with_exporter($fig, $genome, 'genbank', $genbank_dir);
#    export_with_exporter($fig, $genome, 'gff', $seed_dir);
#    export_with_exporter($fig, $genome, 'embl', $seed_dir);
    export_dna($fig, $genome, $seed_dir);
}

sub export_with_exporter
{
    my($fig, $genome, $type, $dest) = @_;

    my $suffix = $suffix_map{$type};
    my $file = "$dest/$genome.$suffix";

    if (-f $file)
    {
	print STDERR "Removing $file before $type export of $genome\n";
	unlink($file) or die "Cannot unlink $file: $!";
    }
    
    my $data = {
	genome	      => $genome,
	directory     => "$dest/",
	export_format => $type,
	strip_ec      => 0,
    };

    SeedExport::export($fig, $data);

    if (! -f $file)
    {
	die "Export of $genome did not write $file\n";
    }
}

sub export_dna
{
    my($fig, $genome, $dest) = @_;

    my $genome_dir = $fig->organism_directory($genome);
    my $genetic_code = 11;
    if (open(F, "<", "$genome_dir/GENETIC_CODE"))
    {
	my $l = <F>;
	if ($l =~ /^(\d+)/)
	{
	    $genetic_code = $1;
	}
	close(F);
    }

    open(FA, ">", "$dest/$genome.faa") or die "Cannot write $dest/$genome.faa: $!";
    
    #
    # Start exporting DNA using script.
    #
    my @cmd2 = (["$FIG_Config::bin/get_dna",  "$genome_dir/contigs", "$genome_dir/Features/peg/tbl"]);
    push @cmd2, ">", "$dest/$genome.fna";
    my $h2 = start @cmd2;

    #
    # And export translations here.
    #

    my $gs = $fig->genus_species($genome);
    my $feats = $fig->all_features_detailed_fast($genome);
    for my $ent (@$feats)
    {
	my($id, $loc, $aliases, $type, $left, $right, $fn) = @$ent;
	my $trans = $fig->get_translation($id);
	next if $trans eq '';
	print_seq_as_fasta(\*FA, $id, "$fn [$gs]", $trans);
    }
    close(FA);

    if (!$h2->finish)
    {
	die "DNA export for $genome failed with $?: @cmd2\n";
    }
}




package SeedExport;

use strict;
use FIG;
use FIG_Config;
use FIGV;
use URI::Escape;
use Bio::FeatureIO;
use Bio::SeqIO;
use Bio::Seq;
use Bio::Location::Split;
use Bio::Location::Simple;
use Bio::SeqFeature::Generic;
use Bio::SeqFeature::Annotated;
use File::Basename;

sub export {
    my ($fig, $parameters) = @_;
    
    # get parameters
    my $virt_genome_dir = $parameters->{'virtual_genome_directory'};
    my $genome          = $parameters->{'genome'};
    my $directory       = $parameters->{'directory'};
    my $format          = $parameters->{'export_format'};
    my $strip_ec        = $parameters->{'strip_ec'};
    my $filename   	= $parameters->{'filename'};
    
    # set some variables
    my $user = "master:master";
    my $format_ending;
    
    # check format
    if ($format eq "genbank") {
	$format_ending = "gbk";
    } elsif ($format eq "GTF") {
	$format_ending = "gtf";
    } elsif ($format eq "gff") {
	$format = "GTF";
	$format_ending = "gff";
    } elsif ($format eq "embl") {
	$format_ending = "embl";
    } else {
	die "unknown export format: $format\n";
    }
    
    # initialize fig object
    my $virt_genome;
    if ($virt_genome_dir) {
	$fig = new FIGV($virt_genome_dir, undef, $fig);
	$virt_genome = basename($virt_genome_dir);
    };
    
    # check for genus species
    my $gs = $fig->genus_species($genome);
    unless ($gs) {
	return (undef, "Couldn't recognize $genome.");
    }
    
    # get taxonomy id and taxonomy
    my $taxid = $genome;
    $taxid =~ s/\.\d+$//;
    my $taxonomy = $fig->taxonomy_of($genome);
    
    # get project
    if ($virt_genome_dir and $genome eq $virt_genome) {
	open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
    } else {
	open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
    }
    my @project = <PROJECT>;
    chomp(@project);
    close PROJECT;
    map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;
    
    # get md5 checksum
    my $md5 = $fig->genome_md5sum($genome);
    
    # create the variable for the bio object
    my $bio;
    my $bio2;
    my $gff_export;
    
    # get the contigs
    foreach my $contig ($fig->contigs_of($genome)) {
	my $cloc = $contig.'_1_'.$fig->contig_ln($genome, $contig);
	my $seq = $fig->dna_seq($genome, $cloc);
	$seq =~ s/>//;  
	$bio->{$contig} = Bio::Seq->new( -id => $contig, 
					-seq => $seq,
				       );
	$bio->{$contig}->desc("Contig $contig from $gs");
	
	my $feature = Bio::SeqFeature::Generic->new(
						    -start	=> 1,
						    -end	=> $fig->contig_ln($genome, $contig),
						    -tag        => { db_xref     => "taxon: $taxid", 
									 organism   => $gs, 
									 mol_type   => "genomic DNA", 
									 genome_id  => $genome,
									 project    => join(" ", @project),
									 genome_md5 => $md5 },
						    -primary    => "source" );
	$bio->{$contig}->add_SeqFeature($feature);
    }

    # get the functional role name -> GO file
    open(FH, $FIG_Config::data . "/Ontologies/GO/fr2go") or die "could not open fr2go";
    my $fr2go = {};
    while (<FH>) {
	chomp;
	my ($fr, $go) = split /\t/;
	$fr2go->{$fr} = [] unless (exists $fr2go->{$fr});
	push @{$fr2go->{$fr}}, $go;
    }
    close FH;
    
    my @all_features = sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome);
    my $feature_fns = $fig->function_of_bulk(\@all_features);
    my @all_locations = $fig->feature_location_bulk(\@all_features);
    my %all_locations;
    $all_locations{$_->[0]} = $_->[1] foreach @all_locations;
    my $all_aliases = $fig->feature_aliases_bulk(\@all_features);

    # get the pegs
    foreach my $peg (@all_features) {
	my $note;
	# my $func = $fig->function_of($peg, $user);
	my $func = $feature_fns->{$peg};
	
	my %ecs;
	my @gos;
	
	# get EC / GO from role
	if (defined $func) {
	    foreach my $role ($fig->roles_of_function($func)) {
		my ($ec) = ($role =~ /\(EC (\d+\.\d+\.\d+\.\d+)\)/);
		$ecs{$ec} = 1 if ($ec);
		push @gos, @{$fr2go->{$role}} if ($fr2go->{$role});
	    }
	}

	push @{$note->{db_xref}}, "SEED:$peg";
	
	# remove duplicate gos
	my %gos = map { $_ => 1 } @gos if (scalar(@gos));
	
	# add GOs
	push @{$note->{"db_xref"}}, @gos;
	
	# add ECs
	push @{$note->{"EC_number"}}, keys(%ecs);
	
	# get the aliases from principal id
	my $pid = $fig->maps_to_id($peg);
	my @rw_aliases = map { $fig->rewrite_db_xrefs($_->[0]) } $fig->mapped_prot_ids($pid);
	my @aliases;
	foreach my $a (@rw_aliases) {
	    push @{$note->{"db_xref"}}, $a if ( $a );
	}
	
	# get the links
	foreach my $ln ($fig->fid_links($peg, $all_aliases->{$peg})) {
	    my ($db, $id);
	    if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {
		$db = "HarvardClone";
	    } elsif ($ln =~ /(PIRSF)(\d+)/) {
		($db, $id) = ($1, $2);
	    } elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {
		($db, $id) = ($1, $2);
	    }
	    
	    $db =~ s/\://;
	    if (!$db && !$id) {
		print STDERR "Ignored link: $ln\n";
		next;
	    }
	    push @{$note->{"db_xref"}}, "$db:$id";
	}
	
	# add FIG id as a note
	# push @{$note->{"db_xref"}}, "FIG_ID:$peg";
	
	# get the features
	
	my $loc_obj;
#	my @location = $fig->feature_location($peg);
	my @location = split(/,/, $all_locations{$peg});
	my @loc_info;
	my $contig;
	foreach my $loc (@location) {
	    my($start, $stop);
	    $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
	    ($contig, $start, $stop) = ($1, $2, $3);
	    my $original_contig = $contig;
	    my $strand = '+';
	    my $frame = $start % 3;
	    if ($start > $stop) {
		$frame = $stop % 3;
		($start, $stop, $strand) = ($stop, $start, '-');
	    } elsif ($start == $stop) {
		$strand = ".";
		$frame = ".";
	    }

	    push(@loc_info, [$contig, $start, $stop, $strand, $frame]);
	    
	    my $sloc = new Bio::Location::Simple(-start => $start,
						 -end => $stop,
						 -strand => $strand);
	    if (@location == 1)
	    {
		$loc_obj = $sloc;
	    }
	    else
	    {
		$loc_obj = new Bio::Location::Split() if !$loc_obj;
		$loc_obj->add_sub_Location($sloc);
	    }
	}
	
	my $source = "FIG";
	my $type = $fig->ftype($peg);
	my $feature;
	
	# strip EC from function
	my $func_ok = $func;
	if ($strip_ec) {
	    $func_ok =~ s/\s+\(EC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
	    $func_ok =~ s/\s+\(TC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
	}
	
	if ($type eq "peg") {
	    $feature = Bio::SeqFeature::Generic->new(-location => $loc_obj,
						     -primary  => 'CDS',
						     -tag      => {
							 product     => $func_ok,
							 translation => $fig->get_translation($peg),
						     },
						    );
	    
	    foreach my $tagtype (keys %$note) {
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
	    }
	  
	    # work around to get annotations into gff
	    # this is probably still wrong for split locations.
	    $func_ok =~ s/ #.+//;
	    $func_ok =~ s/;/%3B/g;
	    $func_ok =~ s/,/2C/g;
	    $func_ok =~ s/=//g;
	    for my $l (@loc_info)
	    {
	      my $ec = "";
	      my @ecs = ($func =~ /[\(\[]*EC[\s:]?(\d+\.[\d-]+\.[\d-]+\.[\d-]+)[\)\]]*/ig);
	      if (scalar(@ecs)) {
		$ec = ";Ontology_term=".join(',', map { "KEGG_ENZYME:" . $_ } @ecs);
	      }
	      my($contig, $start, $stop, $strand, $frame) = @$l;
	      push @$gff_export, "$contig\t$source\tCDS\t$start\t$stop\t.\t$strand\t$frame\tID=".$peg.";Name=".$func_ok.$ec."\n";
	    }
		
		
	} elsif ($type eq "rna") {
	    my $primary;
	    if ( $func =~ /tRNA/ ) {
		$primary = 'tRNA';
	    } elsif ( $func =~ /(Ribosomal RNA|5S RNA)/ ) {
		$primary = 'rRNA';
	    } else {
		$primary = 'RNA';
	    }
	    
	    $feature = Bio::SeqFeature::Generic->new(-location => $loc_obj,
						     -primary  => $primary,
						     -tag      => {
							 product => $func,
						     },
						     
						    );
	    $func_ok =~ s/ #.+//;
	    $func_ok =~ s/;/%3B/g;
	    $func_ok =~ s/,/2C/g;
	    $func_ok =~ s/=//g;
	    foreach my $tagtype (keys %$note) {
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
		
		# work around to get annotations into gff
		for my $l (@loc_info)
		{
		    my($contig, $start, $stop, $strand, $frame) = @$l;
		    push @$gff_export, "$contig\t$source\t$primary\t$start\t$stop\t.\t$strand\t.\tID=$peg;Name=$func_ok\n";
		}
	    }
	    
	} else {
	    print STDERR "unhandled feature type: $type\n";
	}

	my $bc = $bio->{$contig};
	if (ref($bc))
	{
	    $bc->add_SeqFeature($feature);
	}
	else
	{
	    print STDERR "No contig found for $contig on $feature\n";
	}
    } 


    # generate filename
    if (!$filename)
    {
	$filename = $directory . $genome . "." . $format_ending;
    }
    
    # check for FeatureIO or SeqIO
    if ($format eq "GTF") {
	#my $fio = Bio::FeatureIO->new(-file => ">$filename", -format => "GTF");
	#foreach my $feature (@$bio2) {
	#$fio->write_feature($feature);
	#}
	open (GTF, ">$filename") or die "Cannot open file $filename.";
	print GTF "##gff-version 3\n";
	foreach (@$gff_export) {
	    print GTF $_;
	}
	close(GTF);
	
    } else {
#	my $sio = Bio::SeqIO->new(-file => ">$filename", -format => $format);
	#
	# bioperl writes lowercase dna. We want uppercase for biophython happiness.
	#
	my $sio = Bio::SeqIO->new(-file => "| sed '/^LOCUS/s/dna/DNA/' >$filename", -format => $format);
	foreach my $seq (keys %$bio) {
	    $sio->write_seq($bio->{$seq});
	}
    }
    
    return ($filename, "Output file successfully written.");
}





MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3