[Bio] / FigKernelScripts / seed2genbank.pl Repository:
ViewVC logotype

View of /FigKernelScripts/seed2genbank.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.3 - (download) (as text) (annotate)
Tue Jul 17 20:26:42 2007 UTC (12 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2008_06_18, rast_rel_2008_06_16, rast_rel_2008_12_18, rast_rel_2008_07_21, rast_2008_0924, rast_rel_2008_04_23, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, mgrast_rel_2008_1110, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, rast_rel_2008_11_24, rast_rel_2008_08_07
Changes since 1.2: +37 -8 lines
Tweak outptu file handling, expand '%g' into genome name. Allow diret output into gzip pipe.

# -*- perl -*-
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.

# some pod


The beginnings of a module to take SEED data and output it in GenBank
format. This will use the BioPerl modules, and so once the data are
loaded onto the sequences, in essence you can output the data in any
format you would like.

I use the following modules:
 Bio::SeqIO    - used for input/output for the sequences
 Getopt::Long  - this is used for parsing the command line
 URI::Escape   - this is to escape any text that we use. This is available from http://search.cpan.org/~rse/lcwa-1.0.0/


Contact Rob Edwards (RobE@thefig.info) with any bugs and/or comments.


use strict;
use FIG;
use FIGV;
use URI::Escape; # this is used to escape the sequences and comes from http://search.cpan.org/~rse/lcwa-1.0.0/
use Getopt::Long;
use Bio::SeqIO;
use Bio::Seq;
use Bio::SeqFeature::Generic;
use File::Basename;

my $usage=<<EOF;

 -d <genome directory>	    Genome directory to extract from.
 -g <genome number>         Number of the genome to extract. Required.
 -o <output file>           Default is "genomeid.gbk". The string "%g" will be replaced with teh genome id.
 -u <user>                  Default is master:master
 -b <value>		    First base to include (inclusive)
 -e <value>                 Last base to include (inclusive)
 -s 			    Output the CDS and protein sequences in FASTA format as well as the whole genome DNA sequence
 -t |trn|pro|cds|gen|all|   Include features in the output. See below.
 -nmpdr			    Include NMPDR specific requirements

 Features can be any of:
 	trn  :  transcript
 	pro  :  protein
 	cds  :  cds
 	gen  :  gene
 	all  :  trn, pro, cds, gen

 Note at the moment this will probably put out the same information for all of these!!

my ($virt_genome_dir, $allgen, $default_outputf, $user, $beg, $end, $outputfasta, $trn, $nmpdr);
 'd|dir:s'         => \$virt_genome_dir,
 'g|genome:s'      => \$allgen,
 'o|output:s'      => \$default_outputf,
 'u|user:s'        => \$user,
 'b|begin:i'       => \$beg,
 'e|end:i'         => \$end,
 's|seq'           => \$outputfasta,
 't|type:s'        => \$trn,    
 'nmpdr'	   => \$nmpdr,

my $fig;
my $virt_genome;
if ($virt_genome_dir)
    $fig = new FIGV($virt_genome_dir);
    $virt_genome = basename($virt_genome_dir);
    $fig = new FIG;

die $usage unless $allgen;

my %outputtype;
foreach (split /\,\s*/, $trn) {
 if (/trn/i || /transcript/i) {$outputtype{'transcript'}=1}
 elsif (/pro/i || /protein/i) {$outputtype{'protein'}=1}
 elsif (/cds/i) {$outputtype{'cds'}=1}
 elsif (/gen/i) {$outputtype{'gene'}=1}
 elsif (/all/i) {foreach my $t (qw[transcript protein cds gene]) {$outputtype{$t}=1}}
 else {print STDERR "Don't know what output type |$_| is, so we ignored it\n"}
my $fastasequences;

unless ($user) {$user="master:master"}

# set the source depending on if we are making the files for the NMPDR
my $source="The SEED";
$nmpdr ? $source = "NMPDR" : 1;
# now process each genome
my $gs;
foreach my $genome (split /\,/, $allgen)
    my $outputf;
    if (defined($default_outputf))
	$outputf = $default_outputf;
	$outputf =~ s/%g/$genome/g;
	$outputf = "$genome.gbk";
 if (defined $beg && defined $end) 
  if ($end < $beg) {($end, $beg)=($beg, $end)}
  print STDERR "Only outputting the ", int(($end-$beg)/1000), " kb between $beg and $end\n";
  $outputf .= "_" . $beg . "_" . $end;
 unless ($gs) {
  print STDERR "Couldn't recognize $genome. Skipped\n";
 else {print STDERR "Converting $gs ($genome) to GenBank. Output will be in $outputf\n"} 
 my $taxid=$genome; $taxid=~s/\.\d+$//;
 my $taxonomy=$fig->taxonomy_of($genome);

 if ($virt_genome_dir and $genome eq $virt_genome)
     open(PROJECT, "<$virt_genome_dir/PROJECT") or die "Error opening $virt_genome_dir/PROJECT: $!\n";
     open( PROJECT, "<$FIG_Config::organisms/$genome/PROJECT" ) or die "Error opening $FIG_Config::organisms/$genome/PROJECT: $!\n";
 my @project = <PROJECT>;
 close PROJECT;
 map {s/^\<a href\=\".*?\"\>(.*)\<\/a\>/$1/i} @project;

 my $md5=$fig->genome_md5sum($genome);

 # first get the contigs
 my $bio;
 warn "Processing contigs\n";
 foreach my $contig ($fig->contigs_of($genome))

	my $cloc=$contig.'_1_'.$fig->contig_ln($genome, $contig);
 	my $seq=$fig->dna_seq($genome, $cloc);
	$bio->{$contig}=Bio::Seq->new(-id=>"$contig", seq=>$seq);
	$bio->{$contig}->desc("Contig $contig from $gs");
	my $feature=Bio::SeqFeature::Generic->new(
		-start	=> 1,
		-end	=> $fig->contig_ln($genome, $contig),
		-tag    => {
			dbxref=>"taxon: $taxid", 
			mol_type=>"genomic DNA", 
			project=>join(" ", @project),
		-primary => "source",

 warn "Processing features\n";
 foreach my $peg (sort { &FIG::by_fig_id($a,$b) } $fig->pegs_of($genome), $fig->rnas_of($genome)) {
  my $note;
  my $func=$fig->function_of($peg, $user);

  # parse out the ec number if there is one
  my $ec;
  if ($func =~ /E\.*C\.*\s+(\d+|-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/)
     push @{$note->{"ec_number"}}, "$1.$2.$3.$4";
  # if this is for the nmpdr we are required to have a description, so we'll make it hypothetical protein :)
  ($nmpdr && !$func && $peg =~ /peg/) ? ($func = "hypothetical protein") : 1;
  ($nmpdr && $func)  ? push @{$note->{"description"}}, $func : $func ? push @{$note->{"Note"}}, $func : 1;

  foreach my $alias ($fig->feature_aliases($peg))
   if ($alias =~ /^NP_/ || $alias =~ /YP_/) {push @{$note->{"Dbxref"}}, "NCBI_genpept:$alias"}
   elsif ($alias =~ /gi\|/)
    $alias =~ s/^gi\|//;
    push @{$note->{"Dbxref"}}, "NCBI_gi:$alias";
   elsif ($alias =~ /^kegg/i)
    $alias =~ s/kegg\|/KEGG:/i;
    $alias =~ s/^(.*):/$1+/;
    push @{$note->{"Dbxref"}}, $alias
   elsif ($alias =~ /^uni/)
    $alias =~ s/uni\|/UniProt:/;
    push @{$note->{"Dbxref"}}, $alias;
   elsif ($alias =~ /^sp\|/)
    $alias =~ s/sp\|/Swiss-Prot:/;
    push @{$note->{"Dbxref"}}, $alias
   elsif ($alias =~ /^[a-zA-Z][a-zA-Z0-9]{2,}\_[a-zA-Z]*\d+$/)
   	# that should be the regexp for a valid locus tag. Starts with a letter, has at least three alphanumerics before the _
	# then has a series of optional letters and ends with numbers
	push @{$note->{"locus_tag"}}, $alias;
   elsif ($alias =~ /^.{4,6}$/)
   	push @{$note->{"gene_symbol"}}, $alias;
   elsif ($alias =~ /^[a-zA-Z]+\d+$/)
   	push @{$note->{"locus"}}, $alias;
   else {
    # don't know what it is so keep it as an alias
    $alias = $alias; # just in case!
    push @{$note->{"Alias"}}, $alias;
  # now go through the links and see if there is anything that we should add as a Dbxref.
  # the links are stored like this:
  # <A HREF=http://smart.embl-heidelberg.de/smart/do_annotation.pl?ACC=SM00482>SMART SM00482</A>
  # we will parse out the things between the > and </A>. In this case the SMART SM00482
  # generally these are in the format DB\s+ID, but I have added some special cases
  # Set links_to_keep to true for the link to keep it. Set links to keep to false to ignore it
  my %links_to_keep=(
        Pfam            => 1,
        PubMed          => 1,
        Entrez          => 1,
        PIRSF           => 1,
        SMART           => 1,
        InterPro        => 1,
        PDB             => 1,
        UniProt         => 1,
        IEDB            => 1,
        EcoCyc          => 1,
        Colibri         => 1,
	SubtiList	=> 1,
        ATP             => 0,
        Animation       => 0,
        vibrio          => 0,
	HarvardClone	=> 0,
	"E."		=> 0,

  my %warn;
  foreach my $ln ($fig->fid_links($peg))
      my ($db, $id);
      if ($ln =~ /field0=CATID/ && $ln =~ /query0=(\d+)/ && ($id=$1) && $ln =~ /pcenter=harvard/) {$db="HarvardClone"}
      elsif ($ln =~ /(PIRSF)(\d+)/) {($db, $id)=($1, $2)} # special case for pirsf
      elsif ($ln =~ />(\S+)\s+(\S+.*?)</) {($db, $id)=($1, $2)}
      $db =~ s/\://;
      if (!$db && !$id) {print STDERR "Ignored link: $ln\n"; next}
      if (!defined $links_to_keep{$db} && !$warn{$db}) 
        print STDERR "Not sure whether to keep or ignore DB: |$db| ID: |$id| $ln\n";
      } #the warn is so we only error once
      if ($links_to_keep{$db}) {push @{$note->{"Dbxref"}}, "$db:$id"}

  # the LAST thing I am going to add as a note is the FIG id so that I can grep it out easily
  push @{$note->{"Dbxref"}}, "FIG_ID:$peg";
  my @location=$fig->feature_location($peg);
  foreach my $loc (@location) {
   $loc =~ /^(.*)\_(\d+)\_(\d+)$/;
   my ($contig, $start, $stop)=($1, $2, $3);
   my $original_contig=$contig;
   my $strand='1';
   next if (defined $beg && ($start < $beg || $stop < $beg));
   next if (defined $end && ($start > $end || $stop > $end));
   if ($start > $stop) {($start, $stop, $strand)=($stop, $start, '-1')}
   elsif ($start == $stop) {$strand="."}
   my $type=$fig->ftype($peg);
   my $feature;
   if ($type eq "peg") {
		-start    => $start,
		-end      => $stop,
		-strand   => $strand,
		-primary  => 'CDS',
		-tag  	  => {
	foreach my $tagtype (keys %$note)
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
   } # end the if type==peg
   elsif ($type eq "rna") {
                -start    => $start,
                -end      => $stop,
                -strand   => $strand,
                -primary  => 'RNA',
	foreach my $tagtype (keys %$note)
		$feature->add_tag_value($tagtype, @{$note->{$tagtype}});
   } # end the if type == rna
   else {
    die "Don't know what type: |$type| is";
  } # end the foreach $loc (@location)
 } # end the foreach peg/rna

 warn "writing output\n";

    my $out_desc;
    if ($outputf =~ /gz$/)
	$out_desc = "| gzip -c > $outputf";
	$out_desc = ">$outputf";
    warn "Writing output using '$out_desc'\n";
    my $sio=Bio::SeqIO->new(-file => $out_desc, -format=>"genbank");
 foreach my $seq (keys %$bio)

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3