[Bio] / FortyEight / SeedExport.pm Repository:
ViewVC logotype

View of /FortyEight/SeedExport.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (download) (as text) (annotate)
Tue Oct 2 18:01:00 2007 UTC (12 years, 9 months ago) by paarmann
Branch: MAIN
Changes since 1.8: +27 -11 lines
fixed export of rna, weird sourceforge bug commented out

package SeedExport;

1;

use strict;
use FIG;
use FIG_Config;
use FIGV;
use URI::Escape;
use Bio::FeatureIO;
use Bio::SeqIO;
use Bio::Seq;
use Bio::SeqFeature::Generic;
use Bio::SeqFeature::Annotated;
use File::Basename;

sub export {
  my ($parameters) = @_;

  # get parameters
  my $virt_genome_dir = $parameters->{'virtual_genome_directory'};
  my $genome          = $parameters->{'genome'};
  my $directory       = $parameters->{'directory'};
  my $format          = $parameters->{'export_format'};
  my $strip_ec        = $parameters->{'strip_ec'};

  # set some variables
  my $user = "master:master";
  my $format_ending;

  # check format
  if ($format eq "genbank") {
    $format_ending = "gbk";
  } elsif ($format eq "GTF") {
    $format_ending = "gtf";
  } elsif ($format eq "gff") {
    $format = "GTF";
    $format_ending = "gff";
  } elsif ($format eq "embl") {
    $format_ending = "embl";
  } else {
    die "unknown export format: $format\n";
  }

  # initialize fig object
  my $fig;
  my $virt_genome;
  if ($virt_genome_dir) {
    $fig = new FIGV($virt_genome_dir);
    $virt_genome = basename($virt_genome_dir);
  } else {
    $fig = new FIG;
  }

  # check for genus species
  my $gs = $fig->genus_species($genome);
  unless ($gs) {
    return (undef, "Couldn't recognize $genome.");
  }

  # get taxonomy id and taxonomy
  my $taxid = $genome;
  $taxid =~ s/\.\d+$//;
  my $taxonomy = $fig->taxonomy_of($genome);

  # get project
  if ($virt_genome_dir and $genome eq $virt_genome) {
    open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
  } else {
    open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
  }
  my @project = <PROJECT>;
  chomp(@project);
  close PROJECT;
  map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;

  # get md5 checksum
  my $md5 = $fig->genome_md5sum($genome);

  # create the variable for the bio object
  my $bio;
  my $bio2;
  my $gff_export;
    
  # get the contigs
  foreach my $contig ($fig->contigs_of($genome)) {
    my $cloc = $contig.'_1_'.$fig->contig_ln($genome, $contig);
    my $seq = $fig->dna_seq($genome, $cloc);
    $seq =~ s/>//;  
    $bio->{$contig} = Bio::Seq->new( -id => $contig, 
				     -seq => $seq,
				   );
    $bio->{$contig}->desc("Contig $contig from $gs");

    my $feature = Bio::SeqFeature::Generic->new(
						-start	=> 1,
						-end	=> $fig->contig_ln($genome, $contig),
						-tag        => { db_xref     => "taxon: $taxid", 
								 organism   => $gs, 
								 mol_type   => "genomic DNA", 
								 genome_id  => $genome,
								 project    => join(" ", @project),
								 genome_md5 => $md5 },
						-primary    => "source" );
    $bio->{$contig}->add_SeqFeature($feature);
  }

  # get the functional role name -> GO file
  open(FH, $FIG_Config::data . "/Ontologies/GO/fr2go") or die "could not open fr2go";
  my $fr2go = {};
  while (<FH>) {
    chomp;
    my ($fr, $go) = split /\t/;
    $fr2go->{$fr} = [] unless (exists $fr2go->{$fr});
    push @{$fr2go->{$fr}}, $go;
  }
  close FH;

  # get the pegs
  foreach my $peg (sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome)) {
    my $note;
    my $func = $fig->function_of($peg, $user);

    my %ecs;
    my @gos;
      
    # get EC / GO from role
    if (defined $func) {
      foreach my $role ($fig->roles_of_function($func)) {
	my ($ec) = ($role =~ /\(EC (\d+\.\d+\.\d+\.\d+)\)/);
	$ecs{$ec} = 1 if ($ec);
	push @gos, @{$fr2go->{$role}} if ($fr2go->{$role});
      }
    }
      
    # remove duplicate gos
    my %gos = map { $_ => 1 } @gos if (scalar(@gos));
      
    # add GOs
    push @{$note->{"db_xref"}}, @gos;
      
    # add ECs
    push @{$note->{"EC_number"}}, keys(%ecs);
      
    # get the aliases from principal id
    my $pid = $fig->maps_to_id($peg);
    my @rw_aliases = map { $fig->rewrite_db_xrefs($_->[0]) } $fig->mapped_prot_ids($pid);
    my @aliases;
    foreach my $a (@rw_aliases) {
      push @{$note->{"db_xref"}}, $a if ( $a );
    }
      
    # get the links
    foreach my $ln ($fig->fid_links($peg)) {
      my ($db, $id);
      if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {
	$db = "HarvardClone";
      } elsif ($ln =~ /(PIRSF)(\d+)/) {
	($db, $id) = ($1, $2);
      } elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {
	($db, $id) = ($1, $2);
      }
	
      $db =~ s/\://;
      if (!$db && !$id) {
	print STDERR "Ignored link: $ln\n";
	next;
      }
      push @{$note->{"db_xref"}}, "$db:$id";
    }
      
    # add FIG id as a note
    # push @{$note->{"db_xref"}}, "FIG_ID:$peg";
      
    # get the features
    my @location = $fig->feature_location($peg);
    foreach my $loc (@location) {
      $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
      my ($contig, $start, $stop) = ($1, $2, $3);
      my $original_contig = $contig;
      my $strand = '1';
      my $frame = $start % 3;
      if ($start > $stop) {
	$frame = $stop % 3;
	($start, $stop, $strand) = ($stop, $start, '-1');
      } elsif ($start == $stop) {
	$strand = ".";
	$frame = ".";
      }
      my $source = "FIG";
      my $type = $fig->ftype($peg);
      my $feature;

      # strip EC from function
      my $func_ok = $func;
      if ($strip_ec) {
	$func_ok =~ s/\s+\(EC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
	$func_ok =~ s/\s+\(TC \d+\.(\d+|-)\.(\d+|-)\.(\d+|-)\)//g;
      }
      
      if ($type eq "peg") {
	$feature = Bio::SeqFeature::Generic->new(
						 -start    => $start,
						 -end      => $stop,
						 -strand   => $strand,
						 -primary  => 'CDS',
						 -tag      => {
							       product     => $func_ok,
							       translation => $fig->get_translation($peg),
							      },
						);
	  
	foreach my $tagtype (keys %$note) {
	  $feature->add_tag_value($tagtype, @{$note->{$tagtype}});
	}
	  
# 	my $feature2 = Bio::SeqFeature::Annotated->new( -start    => $start,
# 							-end      => $stop,
# 							-strand   => $strand,
# 							-phase    => 0,
# 							-frame    => $frame,
# 							-source   => $source,
# 							-type     => "CDS",
# 							-seq_id   => $contig );
	  
# 	push(@$bio2, $feature2);
	  
	# work around to get annotations into gff
	push @$gff_export, "$contig\t$source\tCDS\t$start\t$stop\t.\t$strand\t$frame\t$func\n"
	    
	    
      } elsif ($type eq "rna") {
	  my $primary;
	  if ( $func =~ /tRNA/ ) {
	      $primary = 'tRNA';
	  } elsif ( $func =~ /(Ribosomal RNA|5S RNA)/ ) {
	      $primary = 'rRNA';
	  } else {
	      $primary = 'RNA';
	  }
	      
	$feature = Bio::SeqFeature::Generic->new(
						 -start    => $start,
						 -end      => $stop,
						 -strand   => $strand,
						 -primary  => $primary,
						 -tag      => {
							       product => $func,
							   },

						);
	foreach my $tagtype (keys %$note) {
	  $feature->add_tag_value($tagtype, @{$note->{$tagtype}});

	# work around to get annotations into gff
	push @$gff_export, "$contig\t$source\t$primary\t$start\t$stop\t.\t$strand\t.\t$func\n"
	}
	  
      } else {
	print STDERR "unhandled feature type: $type\n";
      }
      $bio->{$contig}->add_SeqFeature($feature);
    } 
  }
    
  # generate filename
  my $filename = $directory . $genome . "." . $format_ending;
    
  # check for FeatureIO or SeqIO
  if ($format eq "GTF") {
    #my $fio = Bio::FeatureIO->new(-file => ">$filename", -format => "GTF");
    #foreach my $feature (@$bio2) {
    #$fio->write_feature($feature);
    #}
    open (GTF, ">$filename") or die "Cannot open file $filename.";
    print GTF "##gff-version 3\n";
    foreach (@$gff_export) {
      print GTF $_;
    }
    close(GTF);
      
  } else {
    my $sio = Bio::SeqIO->new(-file => ">$filename", -format => $format);
    foreach my $seq (keys %$bio) {
      $sio->write_seq($bio->{$seq});
    }
  }
    
  return ($filename, "Output file successfully written.");
}




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3