[Bio] / FortyEight / SeedExport.pm Repository:
ViewVC logotype

View of /FortyEight/SeedExport.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (download) (as text) (annotate)
Mon Nov 22 22:06:02 2010 UTC (9 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
Changes since 1.12: +5 -1 lines
fix case bug

package SeedExport;

1;

use strict;
use FIG;
use FIG_Config;
use FIGV;
use URI::Escape;
use Bio::FeatureIO;
use Bio::SeqIO;
use Bio::Seq;
use Bio::Location::Split;
use Bio::Location::Simple;
use Bio::SeqFeature::Generic;
use Bio::SeqFeature::Annotated;
use File::Basename;

sub export {
    my ($parameters) = @_;
    
    # get parameters
    my $virt_genome_dir = $parameters->{'virtual_genome_directory'};
    my $genome          = $parameters->{'genome'};
    my $directory       = $parameters->{'directory'};
    my $format          = $parameters->{'export_format'};
    my $strip_ec        = $parameters->{'strip_ec'};
    my $filename   	= $parameters->{'filename'};
    
    # set some variables
    my $user = "master:master";
    my $format_ending;
    
    # check format
    if ($format eq "genbank") {
	$format_ending = "gbk";
    } elsif ($format eq "GTF") {
	$format_ending = "gtf";
    } elsif ($format eq "gff") {
	$format = "GTF";
	$format_ending = "gff";
    } elsif ($format eq "embl") {
	$format_ending = "embl";
    } else {
	die "unknown export format: $format\n";
    }
    
    # initialize fig object
    my $fig;
    my $virt_genome;
    if ($virt_genome_dir) {
	$fig = new FIGV($virt_genome_dir);
	$virt_genome = basename($virt_genome_dir);
    } else {
	$fig = new FIG;
    }
    
    # check for genus species
    my $gs = $fig->genus_species($genome);
    unless ($gs) {
	return (undef, "Couldn't recognize $genome.");
    }
    
    # get taxonomy id and taxonomy
    my $taxid = $genome;
    $taxid =~ s/\.\d+$//;
    my $taxonomy = $fig->taxonomy_of($genome);
    
    # get project
    if ($virt_genome_dir and $genome eq $virt_genome) {
	open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
    } else {
	open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
    }
    my @project = <PROJECT>;
    chomp(@project);
    close PROJECT;
    map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;
    
    # get md5 checksum
    my $md5 = $fig->genome_md5sum($genome);
    
    # create the variable for the bio object
    my $bio;
    my $bio2;
    my $gff_export;
    
    # get the contigs
    foreach my $contig ($fig->contigs_of($genome)) {
	my $cloc = $contig.'_1_'.$fig->contig_ln($genome, $contig);
	my $seq = $fig->dna_seq($genome, $cloc);
	$seq =~ s/>//;  
	$bio->{$contig} = Bio::Seq->new( -id => $contig, 
					-seq => $seq,
				       );
	$bio->{$contig}->desc("Contig $contig from $gs");
	
	my $feature = Bio::SeqFeature::Generic->new(
						    -start	=> 1,
						    -end	=> $fig->contig_ln($genome, $contig),
						    -tag        => { db_xref     => "taxon: $taxid", 
									 organism   => $gs, 
									 mol_type   => "genomic DNA", 
									 genome_id  => $genome,
									 project    => join(" ", @project),
									 genome_md5 => $md5 },
						    -primary    => "source" );
	$bio->{$contig}->add_SeqFeature($feature);
    }

    # get the functional role name -> GO file
    open(FH, $FIG_Config::data . "/Ontologies/GO/fr2go") or die "could not open fr2go";
    my $fr2go = {};
    while (<FH>) {
	chomp;
	my ($fr, $go) = split /\t/;
	$fr2go->{$fr} = [] unless (exists $fr2go->{$fr});
	push @{$fr2go->{$fr}}, $go;
    }
    close FH;
    
    # get the pegs
    foreach my $peg (sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome)) {
	my $note;
	my $func = $fig->function_of($peg, $user);
	
	my %ecs;
	my @gos;
	
	# get EC / GO from role
	if (defined $func) {
	    foreach my $role ($fig->roles_of_function($func)) {
		my ($ec) = ($role =~ /\(EC (\d+\.\d+\.\d+\.\d+)\)/);
		$ecs{$ec} = 1 if ($ec);
		push @gos, @{$fr2go->{$role}} if ($fr2go->{$role});
	    }
	}
	
	# remove duplicate gos
	my %gos = map { $_ => 1 } @gos if (scalar(@gos));
	
	# add GOs
	push @{$note->{"db_xref"}}, @gos;
	
	# add ECs
	push @{$note->{"EC_number"}}, keys(%ecs);
	
	# get the aliases from principal id
	my $pid = $fig->maps_to_id($peg);
	my @rw_aliases = map { $fig->rewrite_db_xrefs($_->[0]) } $fig->mapped_prot_ids($pid);
	my @aliases;
	foreach my $a (@rw_aliases) {
	    push @{$note->{"db_xref"}}, $a if ( $a );
	}
	
	# get the links
	foreach my $ln ($fig->fid_links($peg)) {
	    my ($db, $id);
	    if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {
		$db = "HarvardClone";
	    } elsif ($ln =~ /(PIRSF)(\d+)/) {
		($db, $id) = ($1, $2);
	    } elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {
		($db, $id) = ($1, $2);
	    }
	    
	    $db =~ s/\://;
	    if (!$db && !$id) {
		print STDERR "Ignored link: $ln\n";
		next;
	    }
	    push @{$note->{"db_xref"}}, "$db:$id";
	}
	
	# add FIG id as a note
	# push @{$note->{"db_xref"}}, "FIG_ID:$peg";
	
	# get the features
	
	my $loc_obj;
	my @location = $fig->feature_location($peg);
	my @loc_info;
	my $contig;
	foreach my $loc (@location) {
	    my($start, $stop);
	    $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
	    ($contig, $start, $stop) = ($1, $2, $3);
	    my $original_contig = $contig;
	    my $strand = '+';
	    my $frame = $start % 3;
	    if ($start > $stop) {
		$frame = $stop % 3;
		($start, $stop, $strand) = ($stop, $start, '-');
	    } elsif ($start == $stop) {
		$strand = ".";
		$frame = ".";
	    }

	    push(@loc_info, [$contig, $start, $stop, $strand, $frame]);
	    
	    my $sloc = new Bio::Location::Simple(-start => $start,
						 -end => $stop,
						 -strand => $strand);
	    if (@location == 1)
	    {
		$loc_obj = $sloc;
	    }
	    else
	    {
		$loc_obj = new Bio::Location::Split() if !$loc_obj;
		$loc_obj->add_sub_Location($sloc);
	    }
	}
	
	my $source = "FIG";
	my $type = $fig->ftype($peg);
	my $feature;
	
	# strip EC from function
	my $func_ok = $func;
	if ($strip_ec) {
	    $func_ok =~ s/\s+\(EC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
	    $func_ok =~ s/\s+\(TC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
	}
	
	if ($type eq "peg") {
	    $feature = Bio::SeqFeature::Generic->new(-location => $loc_obj,
						     -primary  => 'CDS',
						     -tag      => {
							 product     => $func_ok,
							 translation => $fig->get_translation($peg),
						     },
						    );
	    
	    foreach my $tagtype (keys %$note) {
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
	    }
	  
	    # work around to get annotations into gff
	    # this is probably still wrong for split locations.
	    $func_ok =~ s/ #.+//;
	    $func_ok =~ s/;/%3B/g;
	    $func_ok =~ s/,/2C/g;
	    $func_ok =~ s/=//g;
	    for my $l (@loc_info)
	    {
	      my $ec = "";
	      my @ecs = ($func =~ /[\(\[]*EC[\s:]?(\d+\.[\d-]+\.[\d-]+\.[\d-]+)[\)\]]*/ig);
	      if (scalar(@ecs)) {
		$ec = ";Ontology_term=".join(',', map { "KEGG_ENZYME:" . $_ } @ecs);
	      }
	      my($contig, $start, $stop, $strand, $frame) = @$l;
	      push @$gff_export, "$contig\t$source\tCDS\t$start\t$stop\t.\t$strand\t$frame\tID=".$peg.";Name=".$func_ok.$ec."\n";
	    }
		
		
	} elsif ($type eq "rna") {
	    my $primary;
	    if ( $func =~ /tRNA/ ) {
		$primary = 'tRNA';
	    } elsif ( $func =~ /(Ribosomal RNA|5S RNA)/ ) {
		$primary = 'rRNA';
	    } else {
		$primary = 'RNA';
	    }
	    
	    $feature = Bio::SeqFeature::Generic->new(-location => $loc_obj,
						     -primary  => $primary,
						     -tag      => {
							 product => $func,
						     },
						     
						    );
	    $func_ok =~ s/ #.+//;
	    $func_ok =~ s/;/%3B/g;
	    $func_ok =~ s/,/2C/g;
	    $func_ok =~ s/=//g;
	    foreach my $tagtype (keys %$note) {
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
		
		# work around to get annotations into gff
		for my $l (@loc_info)
		{
		    my($contig, $start, $stop, $strand, $frame) = @$l;
		    push @$gff_export, "$contig\t$source\t$primary\t$start\t$stop\t.\t$strand\t.\tID=$peg;Name=$func_ok\n";
		}
	    }
	    
	} else {
	    print STDERR "unhandled feature type: $type\n";
	}
      $bio->{$contig}->add_SeqFeature($feature);
    } 


    # generate filename
    if (!$filename)
    {
	$filename = $directory . $genome . "." . $format_ending;
    }
    
    # check for FeatureIO or SeqIO
    if ($format eq "GTF") {
	#my $fio = Bio::FeatureIO->new(-file => ">$filename", -format => "GTF");
	#foreach my $feature (@$bio2) {
	#$fio->write_feature($feature);
	#}
	open (GTF, ">$filename") or die "Cannot open file $filename.";
	print GTF "##gff-version 3\n";
	foreach (@$gff_export) {
	    print GTF $_;
	}
	close(GTF);
	
    } else {
#	my $sio = Bio::SeqIO->new(-file => ">$filename", -format => $format);
	#
	# bioperl writes lowercase dna. We want uppercase for biophython happiness.
	#
	my $sio = Bio::SeqIO->new(-file => "| sed '/^LOCUS/s/dna/DNA/' >$filename", -format => $format);
	foreach my $seq (keys %$bio) {
	    $sio->write_seq($bio->{$seq});
	}
    }
    
    return ($filename, "Output file successfully written.");
}




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3