[Bio] / FigKernelScripts / seed2genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/seed2genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Sat Jan 7 20:34:22 2012 UTC (7 years, 10 months ago) by redwards
Branch: MAIN
CVS Tags: rast_rel_2014_0912, mgrast_version_3_2, rast_rel_2014_0729, HEAD
Changes since 1.5: +17 -3 lines
adding SOURCE and taxonomy information

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


# some pod

=head1 SEED2GENBANK CONVERTER


The beginnings of a module to take SEED data and output it in GenBank
format. This will use the BioPerl modules, and so once the data are
loaded onto the sequences, in essence you can output the data in any
format you would like.

=head1 NOTE ON EXTERNAL MODULES
 
I use the following modules:
 
 Bio::SeqIO    - used for input/output for the sequences
 Bio::Species  - for the taxonomy and SOURCE information
 Getopt::Long  - this is used for parsing the command line
 URI::Escape   - this is to escape any text that we use. This is available from http://search.cpan.org/~rse/lcwa-1.0.0/

=head1 BUGS AND CONTACT

Contact Rob Edwards (RobE@thefig.info) with any bugs and/or comments.


=cut



use strict;
use FIG;
use FIGV;
use URI::Escape; # this is used to escape the sequences and comes from http://search.cpan.org/~rse/lcwa-1.0.0/
use Getopt::Long;
use Bio::SeqIO;
use Bio::Seq;
use Bio::Species;
use Bio::SeqFeature::Generic;
use File::Basename;

my $usage=<<EOF;

$0 <OPTIONS>
 -d <genome directory>	    Genome directory to extract from.
 -g <genome number>         Number of the genome to extract. Required.
 -o <output file>           Default is "genomeid.gbk". The string "%g" will be replaced with teh genome id.
 -u <user>                  Default is master:master
 -b <value>		    First base to include (inclusive)
 -e <value>                 Last base to include (inclusive)
 -c contig		    Name of the contig
 -s 			    Output the CDS and protein sequences in FASTA format as well as the whole genome DNA sequence
 -t |trn|pro|cds|gen|all|   Include features in the output. See below.
 -nmpdr			    Include NMPDR specific requirements
 -color			    Color pegs according to the artemis coloring scheme.
 
 Features:

 Features can be any of:
 	trn  :  transcript
 	pro  :  protein
 	cds  :  cds
 	gen  :  gene
 	all  :  trn, pro, cds, gen

 Note at the moment this will probably put out the same information for all of these!!
 
EOF

my ($virt_genome_dir, $allgen, $default_outputf, $user, $beg, $end, $outputfasta, $trn, $nmpdr);
my ($wantedcontig, $colorpegs);
GetOptions(
 'd|dir:s'         => \$virt_genome_dir,
 'g|genome:s'      => \$allgen,
 'o|output:s'      => \$default_outputf,
 'u|user:s'        => \$user,
 'b|begin:i'       => \$beg,
 'e|end:i'         => \$end,
 'c|contig:s'	   => \$wantedcontig,
 's|seq'           => \$outputfasta,
 't|type:s'        => \$trn,    
 'nmpdr'	   => \$nmpdr,
 'color:s'           => \$colorpegs,
);

my $fig;
my $virt_genome;
if ($virt_genome_dir)
{
    $fig = new FIGV($virt_genome_dir);
    $virt_genome = basename($virt_genome_dir);
}
else
{
    $fig = new FIG;
}


die $usage unless $allgen;

my %outputtype;
foreach (split /\,\s*/, $trn) {
 if (/trn/i || /transcript/i) {$outputtype{'transcript'}=1}
 elsif (/pro/i || /protein/i) {$outputtype{'protein'}=1}
 elsif (/cds/i) {$outputtype{'cds'}=1}
 elsif (/gen/i) {$outputtype{'gene'}=1}
 elsif (/all/i) {foreach my $t (qw[transcript protein cds gene]) {$outputtype{$t}=1}}
 else {print STDERR "Don't know what output type |$_| is, so we ignored it\n"}
}
my $fastasequences;

unless ($user) {$user="master:master"}

# set the source depending on if we are making the files for the NMPDR
my $source="The SEED";
$nmpdr ? $source = "NMPDR" : 1;
  
# now process each genome
my $gs;
my  $partialoutput=0;
foreach my $genome (split /\,/, $allgen)
{
    my $outputf;
    if (defined($default_outputf))
    {
	$outputf = $default_outputf;
	$outputf =~ s/%g/$genome/g;
    }
    else
    {
	$outputf = "$genome.gbk";
    }
 if (defined $beg && defined $end) 
 {
  if ($end < $beg) {($end, $beg)=($beg, $end)}
  print STDERR "Only outputting the ", int(($end-$beg)/1000), " kb between $beg and $end\n";
  $partialoutput=1;
  # $outputf =~ s/\.gbk$//;
  # $outputf .= "_" . $beg . "_" . $end . ".gbk";
 }
 
 $gs=$fig->genus_species($genome);
 unless ($gs) {
  print STDERR "Couldn't recognize $genome. Skipped\n";
  next;
 }
 else {print STDERR "Converting $gs ($genome) to GenBank. Output will be in $outputf\n"} 
 
 my $taxid=$genome; $taxid=~s/\.\d+$//;
 my $taxonomy=$fig->taxonomy_of($genome);

 if ($virt_genome_dir and $genome eq $virt_genome)
 {
     open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
 }
 else
 {
     open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
 }
 my @project = <PROJECT>;
 chomp(@project);
 close PROJECT;
 map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;

 my $md5=$fig->genome_md5sum($genome);

 # first get the contigs
 my $bio;
 warn "Processing contigs\n";
 foreach my $contig ($fig->contigs_of($genome))
 {
 	next if ($wantedcontig && ($contig ne $wantedcontig));
	
	my ($from, $to)=(1, $fig->contig_ln($genome, $contig));
	$beg && ($from = $beg);
	$end && ($to   = $end);
	
	my $cloc=$contig.'_'.$from.'_'.$to;
 	my $seq=$fig->dna_seq($genome, $cloc);
	$bio->{$contig}=Bio::Seq->new(-id=>"$cloc", seq=>$seq);
	if ($partialoutput) {
		$bio->{$contig}->desc("Contig $contig from $gs from $from to $to");
	} else {
		$bio->{$contig}->desc($gs);
	}
	# set the species
	my @tax = split /\s*\;\s*/, $taxonomy;
	@tax=reverse @tax;
	my $speciesObj = Bio::Species->new(-classification=>\@tax, -ncbi_taxid=>$taxid);
	$speciesObj->common_name($gs);
	$bio->{$contig}->species($speciesObj);
	my $feature=Bio::SeqFeature::Generic->new(
		-start	=> 1,
		-end	=> ($to-$from)+1,
		-tag    => {
			dbxref=>"taxon: $taxid", 
			organism=>$gs, 
			mol_type=>"genomic DNA", 
			genome_id=>$genome,
			project=>join(" ", @project),
			genome_md5=>$md5,
			},
		-primary => "source",
		);
	$bio->{$contig}->add_SeqFeature($feature);
 }

 warn "Processing features\n";
 foreach my $peg (sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome)) {
  my $note;
  my $func=$fig->function_of($peg, $user);

  # parse out the ec number if there is one
  my $ec;
  if ($func =~ /E\.*C\.*\s+(\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/)
  {
     push @{$note->{"ec_number"}}, "$1.$2.$3.$4";
  }
  
  # if this is for the nmpdr we are required to have a description, so we'll make it hypothetical protein :)
  ($nmpdr && !$func && $peg =~ /peg/) ? ($func = "hypothetical protein") : 1;
  ($nmpdr && $func)  ? push @{$note->{"description"}}, $func : $func ? push @{$note->{"Note"}}, $func : 1;

  
  foreach my $alias ($fig->feature_aliases($peg))
  {
   if ($alias =~ /^NP_/ || $alias =~ /YP_/) {push @{$note->{"Dbxref"}}, "NCBI_genpept:$alias"}
   elsif ($alias =~ /gi\|/)
   {
    $alias =~ s/^gi\|//;
    push @{$note->{"Dbxref"}}, "NCBI_gi:$alias";
   }
   elsif ($alias =~ /^kegg/i)
   {
    $alias =~ s/kegg\|/KEGG:/i;
    $alias =~ s/^(.*):/$1+/;
    push @{$note->{"Dbxref"}}, $alias
   }
   elsif ($alias =~ /^uni/)
   {
    $alias =~ s/uni\|/UniProt:/;
    push @{$note->{"Dbxref"}}, $alias;
   }
   elsif ($alias =~ /^sp\|/)
   {
    $alias =~ s/sp\|/Swiss-Prot:/;
    push @{$note->{"Dbxref"}}, $alias
   }
   elsif ($alias =~ /^[a-zA-Z][a-zA-Z0-9]{2,}\_[a-zA-Z]*\d+$/)
   {
   	# that should be the regexp for a valid locus tag. Starts with a letter, has at least three alphanumerics before the _
	# then has a series of optional letters and ends with numbers
	push @{$note->{"locus_tag"}}, $alias;
   }
   elsif ($alias =~ /^.{4,6}$/)
   {
   	push @{$note->{"gene_symbol"}}, $alias;
   }
   elsif ($alias =~ /^[a-zA-Z]+\d+$/)
   {
   	push @{$note->{"locus"}}, $alias;
   }
   else {
    # don't know what it is so keep it as an alias
    $alias = $alias; # just in case!
    push @{$note->{"Alias"}}, $alias;
   }
  }
 
  # now go through the links and see if there is anything that we should add as a Dbxref.
  # the links are stored like this:
  # <A HREF=http://smart.embl-heidelberg.de/smart/do_annotation.pl?ACC=SM00482>SMART SM00482</A>
  # we will parse out the things between the > and </A>. In this case the SMART SM00482
  # generally these are in the format DB\s+ID, but I have added some special cases
  #
  # Set links_to_keep to true for the link to keep it. Set links to keep to false to ignore it
  
  my %links_to_keep=(
        Pfam            => 1,
        PubMed          => 1,
        Entrez          => 1,
        PIRSF           => 1,
        SMART           => 1,
        InterPro        => 1,
        PDB             => 1,
        UniProt         => 1,
        IEDB            => 1,
        EcoCyc          => 1,
        Colibri         => 1,
	SubtiList	=> 1,
        ATP             => 0,
        Animation       => 0,
        vibrio          => 0,
	HarvardClone	=> 0,
	"E."		=> 0,
    );

  my %warn;
  foreach my $ln ($fig->fid_links($peg))
  {
      my ($db, $id);
      if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {$db="HarvardClone"}
      elsif ($ln =~ /(PIRSF)(\d+)/) {($db, $id)=($1, $2)} # special case for pirsf
      elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {($db, $id)=($1, $2)}
      $db =~ s/\://;
      if (!$db && !$id) {print STDERR "Ignored link: $ln\n"; next}
      if (!defined $links_to_keep{$db} && !$warn{$db}) 
      {
        $warn{$db}=1; 
        print STDERR "Not sure whether to keep or ignore DB: |$db| ID: |$id| $ln\n";
        next
      } #the warn is so we only error once
      if ($links_to_keep{$db}) {push @{$note->{"Dbxref"}}, "$db:$id"}
  }

  # Color the feature
  # Artemis and some other programs allow you to color features. This applies some simple rules to color things
  # Artemis color codes:
  # 1  dark grey
  # 2  red
  # 3  green
  # 4  dark blue
  # 5  default blue
  # 6  pink
  # 7  yellow
  # 8  light green
  # 9  light blue
  # 10 orange
  # 11 brown
  # 12 light pink
  # 13 light grey
  # 14 black
  # 15 red
  # 16 light red
  # 17 very light red
  # 18 white
  # 
 
  if ($colorpegs) {
  	if ($fig->hypo(scalar($fig->function_of($peg)))) {$note->{"color"}->[0] = 13} # set hypotheticals to light grey
  }
  if ($colorpegs eq "phage") {
	  my $fn = scalar($fig->function_of($peg));
	  if ($fn =~ /integrase/i) {$note->{"color"}->[0] = 2}
	  elsif ($fn =~ /phage/i) {$note->{"color"}->[0] = 3}
 }
		
 
  # the LAST thing I am going to add as a note is the FIG id so that I can grep it out easily
  push @{$note->{"Dbxref"}}, "FIG_ID:$peg";
  
  my @location=$fig->feature_location($peg);
  foreach my $loc (@location) {
   $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
   my ($contig, $start, $stop)=($1, $2, $3);
   my $original_contig=$contig;
   my $strand='1';
  
   next if (defined $wantedcontig && ($contig ne $wantedcontig));
   next if (defined $beg && ($start < $beg || $stop < $beg));
   next if (defined $end && ($start > $end || $stop > $end));

   # print STDERR "KEPT: $peg at $loc with $start and $stop because between $beg and $end\n";
   
   if ($start > $stop) {($start, $stop, $strand)=($stop, $start, '-1')}
   elsif ($start == $stop) {$strand="."}
   
   my $type=$fig->ftype($peg);
   my $feature;
   if ($type eq "peg") {
        $beg && ( $start = ($start - $beg)+1 );
	$end && ( $stop  = ($stop  - $beg)+1 ); 
	$end && ($stop > $end) ? ($stop=$end) : 1; # there is an issue if stop-beg+1 > end 
   	$feature=Bio::SeqFeature::Generic->new(
		-start    => $start,
		-end      => $stop,
		-strand   => $strand,
		-primary  => 'CDS',
		-tag  	  => {
			product=>$func,
			translation=>$fig->get_translation($peg),
			},
		);
	
	foreach my $tagtype (keys %$note)
	{
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
	}
   } # end the if type==peg
   elsif ($type eq "rna") {
        $beg && ( $start = ($start - $beg)+1 );
	$end && ( $stop  = ($stop  - $beg)+1 ); 
	($stop > $end) ? ($stop=$end) : 1; # there is an issue if stop-beg+1 > end 
    	$feature=Bio::SeqFeature::Generic->new(
                -start    => $start,
                -end      => $stop,
                -strand   => $strand,
                -primary  => 'RNA',
                );
	foreach my $tagtype (keys %$note)
	{
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
	}
	
   } # end the if type == rna
   else {
    die "Don't know what type: |$type| is";
   }
   
   $bio->{$contig}->add_SeqFeature($feature);
  } # end the foreach $loc (@location)
 } # end the foreach peg/rna

 warn "writing output\n";

    my $out_desc;
    if ($outputf =~ /gz$/)
    {
	$out_desc = "| gzip -c > $outputf";
    }
    else
    {
	$out_desc = ">$outputf";
    }
    warn "Writing output using '$out_desc'\n";
    my $sio=Bio::SeqIO->new(-file => $out_desc, -format=>"genbank");
 foreach my $seq (keys %$bio)
 {
 	$sio->write_seq($bio->{$seq});
 }
}



MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3