[Bio] / FortyEight / SeedExport.pm Repository:
ViewVC logotype

View of /FortyEight/SeedExport.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Thu Feb 8 21:11:01 2007 UTC (13 years, 5 months ago) by paarmann
Branch: MAIN
Changes since 1.4: +15 -3 lines
added workaround to export gff with annotations

package SeedExport;

1;

use strict;
use FIG;
use FIG_Config;
use FIGV;
use URI::Escape;
use Bio::FeatureIO;
use Bio::SeqIO;
use Bio::Seq;
use Bio::SeqFeature::Generic;
use Bio::SeqFeature::Annotated;
use File::Basename;

sub export {
    my ($parameters) = @_;

    # get parameters
    my $virt_genome_dir = $parameters->{'virtual_genome_directory'};
    my $genome          = $parameters->{'genome'};
    my $directory       = $parameters->{'directory'};
    my $format          = $parameters->{'export_format'};

    # set some variables
    my $user = "master:master";
    my $format_ending;

    # check format
    if ($format eq "genbank") {
      $format_ending = "gbk";
    } elsif ($format eq "GTF") {
      $format_ending = "gtf";
    } elsif ($format eq "gff") {
      $format = "GTF";
      $format_ending = "gff";
    } elsif ($format eq "embl") {
      $format_ending = "embl";
    } else {
      die "unknown export format: $format\n";
    }

    # initialize fig object
    my $fig;
    my $virt_genome;
    if ($virt_genome_dir) {
	$fig = new FIGV($virt_genome_dir);
	$virt_genome = basename($virt_genome_dir);
    } else {
	$fig = new FIG;
    }

    # check for genus species
    my $gs = $fig->genus_species($genome);
    unless ($gs) {
	return (undef, "Couldn't recognize $genome.");
    }

    # get taxonomy id and taxonomy
    my $taxid = $genome;
    $taxid =~ s/\.\d+$//;
    my $taxonomy = $fig->taxonomy_of($genome);

    # get project
    if ($virt_genome_dir and $genome eq $virt_genome) {
	open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
    } else {
	open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
    }
    my @project = <PROJECT>;
    chomp(@project);
    close PROJECT;
    map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;

    # get md5 checksum
    my $md5 = $fig->genome_md5sum($genome);

    # create the variable for the bio object
    my $bio;
    my $bio2;
    my $gff_export;
    
    # get the contigs
    foreach my $contig ($fig->contigs_of($genome)) {
	my $cloc = $contig.'_1_'.$fig->contig_ln($genome, $contig);
 	my $seq = $fig->dna_seq($genome, $cloc);
	$seq =~ s/>//;
	$bio->{$contig} = Bio::Seq->new(-id => "$contig", seq => $seq);
	$bio->{$contig}->desc("Contig $contig from $gs");
	my $feature = Bio::SeqFeature::Generic->new(
	    -start	=> 1,
	    -end	=> $fig->contig_ln($genome, $contig),
	    -tag        => { dbxref     => "taxon: $taxid", 
			     organism   => $gs, 
			     mol_type   => "genomic DNA", 
			     genome_id  => $genome,
			     project    => join(" ", @project),
			     genome_md5 => $md5 },
	    -primary    => "source" );
	$bio->{$contig}->add_SeqFeature($feature);
    }

    # get the functional role name -> GO file
    open(FH, $FIG_Config::data . "/Ontologies/GO/fr2go") or die "could not open fr2go";
    my $fr2go;
    while (<FH>) {
      chomp;
      my ($fr, $go) = split /\t/;
      $fr2go->{$fr} = $go;
    }
    close FH;

    # get the pegs
    foreach my $peg (sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome)) {
	my $note;
	my $func = $fig->function_of($peg, $user);

	# check for a GO number
	if (exists($fr2go->{$func})) {
	  push @{$note->{"Dbxref"}}, $fr2go->{$func};
	}
	
	# parse out the ec number if there is one
	my $ec;
	if ($func =~ /E\.*C\.*\s+(\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/) {
	    push @{$note->{"EC_number"}}, "$1.$2.$3.$4";
	}

	# check the aliases
	foreach my $alias ($fig->feature_aliases($peg))
	{
	    if ($alias =~ /^NP_/ || $alias =~ /YP_/) {
	      push @{$note->{"Dbxref"}}, "genpept:$alias"
	    }
	    elsif ($alias =~ /gi\|/)
	    {
		$alias =~ s/^gi\|//;
		push @{$note->{"Dbxref"}}, "GI:$alias";
	    }
	    elsif ($alias =~ /^kegg/i)
	    {
		$alias =~ s/kegg\|/KEGG:/i;
		$alias =~ s/^(.*):/$1+/;
		push @{$note->{"Dbxref"}}, $alias
	    }
	    elsif ($alias =~ /^uni/)
	    {
		$alias =~ s/uni\|/UniProt:/;
		push @{$note->{"Dbxref"}}, "UniProtKB/TrEMBL:$alias";
	    }
	    elsif ($alias =~ /^sp\|/)
	    {
		$alias =~ s/sp\|/Swiss-Prot:/;
		push @{$note->{"Dbxref"}}, "UniProtKB/Swiss-Prot:$alias";
	    }
	    elsif ($alias =~ /^[a-zA-Z][a-zA-Z0-9]{2,}\_[a-zA-Z]*\d+$/)
	    {
		# that should be the regexp for a valid locus tag. Starts with a letter,
		# has at least three alphanumerics before the _
		# then has a series of optional letters and ends with numbers
		push @{$note->{"locus_tag"}}, $alias;
	    }
	    elsif ($alias =~ /^.{4,6}$/)
	    {
		push @{$note->{"gene_symbol"}}, $alias;
	    }
	    elsif ($alias =~ /^[a-zA-Z]+\d+$/)
	    {
		push @{$note->{"locus"}}, $alias;
	    }
	    else {
		# don't know what it is so keep it as an alias
		push @{$note->{"Alias"}}, $alias;
	    }
	}

	# get the links
	foreach my $ln ($fig->fid_links($peg))
	{
	    my ($db, $id);
	    if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {
		$db = "HarvardClone";
	    } elsif ($ln =~ /(PIRSF)(\d+)/) {
		($db, $id) = ($1, $2);
	    } elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {
		($db, $id) = ($1, $2);
	    }
	    
	    $db =~ s/\://;
	    if (!$db && !$id) {
		print STDERR "Ignored link: $ln\n";
		next;
	    }
	    push @{$note->{"Dbxref"}}, "$db:$id";
	}
	
	# add FIG id as a note
	push @{$note->{"Dbxref"}}, "FIG_ID:$peg";
	
	# get the features
	my @location = $fig->feature_location($peg);
	foreach my $loc (@location) {
	    $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
	    my ($contig, $start, $stop) = ($1, $2, $3);
	    my $original_contig = $contig;
	    my $strand = '1';
	    my $frame = $start % 3;
	    if ($start > $stop) {
	      $frame = $stop % 3;
	      ($start, $stop, $strand) = ($stop, $start, '-1');
	    }
	    elsif ($start == $stop) {
	      $strand = ".";
	      $frame = ".";
	    }
	    my $source = "FIG";
	    my $type = $fig->ftype($peg);
	    my $feature;
	    if ($type eq "peg") {
		$feature = Bio::SeqFeature::Generic->new(
		    -start    => $start,
		    -end      => $stop,
		    -strand   => $strand,
		    -primary  => 'CDS',
		    -tag      => {
				  product     => $func,
				  translation => $fig->get_translation($peg),
				 },
		    );
		
		foreach my $tagtype (keys %$note) {
		    $feature->add_tag_value($tagtype, @{$note->{$tagtype}});
		}
		
		my $feature2 = Bio::SeqFeature::Annotated->new( -start    => $start,
								-end      => $stop,
								-strand   => $strand,
								-phase    => 0,
								-frame    => $frame,
								-source   => $source,
								-type     => "CDS",
								-seq_id   => $contig );

		push(@$bio2, $feature2);

		# work around to get annotations into gff
		push @$gff_export, "$contig\t$source\tCDS\t$start\t$stop\t.\t$strand\t$frame\t$func\n"

		
	    } elsif ($type eq "rna") {
		$feature = Bio::SeqFeature::Generic->new(
		    -start    => $start,
		    -end      => $stop,
		    -strand   => $strand,
		    -primary  => 'RNA',
		    );
		foreach my $tagtype (keys %$note) {
		    $feature->add_tag_value($tagtype, @{$note->{$tagtype}});
		}
		
	    } else {
		print STDERR "unhandled feature type: $type\n";
	    }
	    
	    $bio->{$contig}->add_SeqFeature($feature);
	} 
    }

    # generate filename
    my $filename = $directory . $genome . "." . $format_ending;
    
    # check for FeatureIO or SeqIO
    if ($format eq "GTF") {
      #my $fio = Bio::FeatureIO->new(-file => ">$filename", -format => "GTF");
      #foreach my $feature (@$bio2) {
      #$fio->write_feature($feature);
      #}
      open (GTF, ">$filename") or die "Cannot open file $filename.";
      print GTF "##gff-version 3\n";
      foreach (@$gff_export) {
	print GTF $_;
      }
      close(GTF);
      
    } else {
      my $sio = Bio::SeqIO->new(-file => ">$filename", -format => $format);
      foreach my $seq (keys %$bio) {
	$sio->write_seq($bio->{$seq});
      }
    }
    
    return ($filename, "Output file successfully written.");
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3