[Bio] / FigKernelScripts / embl2gff.pl Repository:
ViewVC logotype

View of /FigKernelScripts/embl2gff.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.19: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#__perl__

use URI::Escape;
use Data::Dumper;

my $usage= "Usage:  embl2gff -taxId N -orgVersion N [-extra fileName] [-ensemblVer verString] emblFile1.dat [emblFile2.dat ...]\n";

##
# cmd line args
##

my ($taxId, $orgVerNo, $extraFile,$ensemblVer)=(-1, -1,"extra.txt", "Ensembl-31");
my @fileNames;
while (@ARGV) {
    my $t=shift;

    if ($t eq "-orgVersion") {
	$orgVerNo=shift;
	print "Set org version to $orgVerNo\n";
    } elsif ($t eq "-taxId") {
	$taxId=shift; print "Taxon ID set to $taxId\n";
    } elsif ($t eq "-extra") {
	$extraFile=shift; print "Set extras file to $extraFile\n";
    } elsif ($t eq "-ensemblVer") {
	$ensemblVer=shift; print "Set Ensembl version to $ensemblVer\n";
    } else {
	push( @fileNames, $t ); 
    }
}

if ($orgVerNo eq "-1") { die $usage; }
if ($taxId eq "-1") { die $usage; }

print "Files to process:", &Dumper(\@fileNames),"\n";

##
# Genes can be split over clones. Those clones can end up in
# two EMBL entries. In that case, the EMBL file enters the
# transcripts *twice*, duplicating the translation and we end
# up duplicating the transcript.  When this happens, the location
# info includes a colon (:) as part of scoping information to
# point to the other clone.  We can't just skip anything with a
# :, so when we hit a colon in the location info, we put the transcript
# ID into this hash (colonHash) and let it go through.  If we hit
# another colon, we look in the hash, and if th id is already there,
# we skip it
# 
##

my %colonHash;
my $hasColon = 0;
my %tscriptHash;

##
# Initialize a hash that remaps tag names in db_xrefs.  A
# Xref not in this list will be dropped. 
##

my %dbXrefMap = (
	  #these are from scanning ensembl -31 release
	  #human and common to many
	      RefSeq_dna => 'NCBI_NM',
	      RefSeq_dna_predicted => 'NCBI_NM',
	      RefSeq_peptide => 'NCBI_NP',
	      RefSeq_peptide_predicted => 'NCBI_NP',
	      HUGO => 'HGNC',
	      EntrezGene => 'EntrezGene',
	      UniGene => 'UniGene',
	      "Uniprot/SPTREMBL" => 'Uniprot/SPTREMBL',
	      "Uniprot/SWISSPROT" => 'Uniprot/SWISSPROT',
	      EMBL => 'EMBL',
	      protein_id => 'protein_id',
	      MIM => 'MIM',
	      GO => 'GO',
	      IPI => 'IPI',
	      PDB => 'PDB',
          #Anopheles related
	      Anopheles_symbol => 'Anopheles_symbol',
	      Celera_Gene => 'Celera_Gene',
	      Celera_Pep => 'Celera_Pep',
	      Celera_Trans => 'Celera_Trans',
	      prediction_SPTREMBL => 'prediction_SPTREMBL',
          #worm
	      wormbase_transcript => 'WP',
	      wormpep_id => 'WP',
	  #chicken
	  #chimp
	      Ens_Hs_transcript => 'Ens_Hs_transcript',
	      Ens_Hs_translation => 'Ens_Hs_translation',
	  #fly
	      FlyBaseName_translations => 'FlyBaseName_translations',
	      FlyBaseORFNames => 'FlyBaseORFNames',
	      FlyBaseSynonyms => 'FlyBaseSynonyms',
	      drosophila_translation_id => 'drosophila_translation_id',
	      flybase_polypeptide_id => 'FB',
	      flybase_transcript_id => 'FB',
	  #mouse- are these only in STS?
	      MGD => 'MGD',
	      "Whitehead-MRC_RH" => 'Whitehead-MRC_RH',
	  #rat- some are mostly STS
	      RGD => 'RGD',
	      RGD_NUM => 'RGD_NUM',
	      RH => 'RH',
	      'RH_map.2.2' => 'RH_map.2.2',
	  #yeast
	      SGD => 'SGD_LOCUS',
	  #tetraodon (green puffer)
	      Genoscope_annotated_gene => 'Genoscope_annotated_gene',
	      Genoscope_pred_gene => 'Genoscope_pred_gene',
	      Genoscope_pred_transcript => 'Genoscope_pred_transcript',
          #zebra fish
	      ZFIN => "ZFIN",
	      ZFIN_ID => 'ZFIN_ID'
          );



##
# load up additional alias and attribute
# info that was parsed separately
##

if ( ! $extraFile eq ""  && ! -r( $extraFile) ) {
    die "File with pre-parsed information, $extraFile, not found.\n" 
}

my $extra_info=load_extra("<$extraFile");
#print &Dumper($extra_info),"\n";


# Watch out- this prototype has global vars, like below, that are
# changed in do_file() and also in the foreach file loop below, coupled
# with state in the parser so that you get one big snarled ball of
# interdependent goo.  For heaven's sake.


my $taxonomy = "";  
my $write_header= 1;

my $file_counter = 0;
my $out_file_counter = 0;
my $peg_counter = 0; 
my $thresh= 800*1024*1024;
my $bytes = 10 * $thresh;   #force fake over high water mark 1st time to open files

foreach $file (@fileNames)
{
    
    print "Doing file $file_counter ($file).\n";
    print "  peg_counter=$peg_counter\n";
    
    if ($bytes > $thresh)
    {
	print "  $bytes above high-water of $thresh. Starting new output file.\n";

	if ($file_counter > 0) {
	    "  Concat'd files\n";
	    system `cat Sample_header_$out_file_counter.gff Sample_body_$out_file_counter.gff Sample_seqs_$out_file_counter.gff Sample_contigs_$out_file_counter.gff > Sample_$out_file_counter.gff`;
	}

	$out_file_counter = $out_file_counter + 1;

	open(OUTPUT1,">Sample_header_$out_file_counter.gff");
	open(OUTPUT2,">Sample_body_$out_file_counter.gff");
	open(OUTPUT3,">Sample_seqs_$out_file_counter.gff");
	open(OUTPUT4,">Sample_contigs_$out_file_counter.gff");
	$bytes = 0;
	$write_header = 1;
    }

    ($thisBytes, $peg_counter) = do_file( $file, $peg_counter, 1);
    $write_header = 0;
    $bytes = $bytes + $thisBytes;
    print "  Wrote $thisBytes bytes. Now total=$bytes\n";
    
    $file_counter = $file_counter + 1;
}


system `cat Sample_header_$out_file_counter.gff Sample_body_$out_file_counter.gff Sample_seqs_$out_file_counter.gff Sample_contigs_$out_file_counter.gff > Sample_$out_file_counter.gff`;

exit(0);    


sub do_file {

    my($file, $peg_counter, $debug) = @_;

    my $bytesOut = 0;

    $new_entry = 0;           #says when we move to new CDS
    $biggest = 0;             #used to find start/end of gene
    $smallest = 100000000;    #used to find start/end of gene
    $record_translation = 0;  
    $last_seq_region_name = "";
    $record_next_line = 0;
    my $doing_tax;

    #the exact value of this is used by embl2gff_addmd5
    my $checksumPlaceholder = "checksum_placeholder_xxxxxxxxxxx";

    open(INPUT,$file);
    while ($_ =<INPUT>)
    {
	if ($write_header)
	{ 
	    if ($_ =~ /AC\s+(.*)(\s+)$/){$contig = $1;}
	    if ($_ =~ /OS\s+(.*)/ ){$name = $1;}
	    if ($_ =~ /OC\s+(.*)/ )
	    {
		$temp = $1;
		if ($taxonomy eq "") {$doing_tax=1;}
		if ($doing_tax) {$taxonomy = $taxonomy.$1;}
	    }
	    if ($_ =~ /XX/ )
	    {
		if ($doing_tax) {$doing_tax=0;}
	    }

	    if ($_ =~ /FT\s+source\s+(\d+)..(\d+)/ ){$start = $1; $stop =$2}
	    if ($_ =~ /db_xref="taxon:(.*)"/ )
	    {
		$taxon_id = $1;

		$bytesOut = $bytesOut + length( "#gff-version 3\n");
		$bytesOut = $bytesOut + length( "#seed\tgenome_id\t$taxon_id.$orgVerNo\n");
		$bytesOut = $bytesOut + length( "#seed\ttaxon_id\t$taxon_id\n");
		$bytesOut = $bytesOut + length( "#seed\tname\t$name\n");
		$bytesOut = $bytesOut + length( "#seed\ttaxonomy\t$taxonomy\n");
		$bytesOut = $bytesOut + length( "#seed\tgenome_md5\t$checksumPlaceholder\n");
		$bytesOut = $bytesOut + length( "#seed\tproject\t$ensemblVer\n");

 		print OUTPUT1 "#gff-version 3\n";	 
 		print OUTPUT1 "#seed\tgenome_id\t$taxon_id.$orgVerNo\n";
 		print OUTPUT1 "#seed\ttaxon_id\t$taxon_id\n";
 		print OUTPUT1 "#seed\tname\t$name\n";
 		print OUTPUT1 "#seed\ttaxonomy\t$taxonomy\n";
 		print OUTPUT1 "#seed\tgenome_md5\t$checksumPlaceholder\n";
 		print OUTPUT1 "#seed\tproject\t$ensemblVer\n";
		$write_header=0;
	    }
	}
  

	if ($_ =~ /^AC\s+(.*)/ ){$contig = $1;}
	if ($_ =~ /FT\s+source\s+(\d+)..(\d+)/ )
	{
	    $start = $1; $stop =$2;
	    $seq_region_name = "##sequence-region\t$contig\t$start\t$stop\n";
	    if ($seq_region_name ne $last_seq_region_name)
	    {
		$bytesOut = $bytesOut + length("##sequence-region\t$contig\t$start\t$stop\n");
		print OUTPUT2 "##sequence-region\t$contig\t$start\t$stop\n";
		$last_seq_region_name = $seq_region_name;
	    }
	}
    

	if ($_ =~/FT\s+CDS\s+/) 
	{
	    #if ( !( index($_, ":") eq -1)  ) {
	    #  print "Skipping split CDS $_\n";
	    #} else {
	    if (1) {
		#print "KEEPing CDS $_\n";
		if ($record_translation eq 1){
		    print "ERROR- new CDS before wrote translation $gene $tscript\n";
		}

		$strand ="";
		$new_entry = 1; 
		$peg_counter = $peg_counter + 1;
		$figXref = "FIG_ID:fig|$taxon_id.$orgVerNo.peg.$peg_counter";

		$col9 = "ID=cds."."$peg_counter;Alias=";
		$col9Sep = "";
		$col9Ont = "";
		$col9OntSep = "";
		$col9Xref = uri_escape($figXref);
		$col9XrefSep =",";
		$prot_id = "pro.".$peg_counter;

		if ($debug) { print "\n\n"; }
	    }
	}
    
	if($new_entry)
	{
	    if($_ =~ /(complement\()?(\d+)\..*\.(\d+)/)
	    {
		if($1){$strand ="-";} else{$strand = "+";} 
		if ($2 < $3){ $bigger = $3; $smaller = $2}
		else { $bigger = $2; $smaller = $3}
		if($bigger > $biggest){$biggest = $bigger};
		if($smaller < $smallest){$smallest = $smaller};
	    }    
		

	    if ($_ =~/gene="(.*)"/)
	    {
		if ($debug) { print "$peg_counter\n"; }
		if ($debug) { print "$contig\n"; }
		$gene = $1;
		$col9 = $col9.$col9Sep.uri_escape("EnsemblGene:$gene");
		$col9Sep = ",";
		if ($debug) { print "GENE:$gene\n"; };
	    } 	
        
	    if ($_ =~/protein_id="(.*)"/ )
	    {
		$protId = $1;
		$col9 = $col9.$col9Sep.uri_escape("EnsemblProtein:$protId");
		$col9Sep = ",";
		if ($debug) { print "ProteinId:$protId\n"; }
	    }

	    if ($_ =~/"transcript_id=(.*)"/ )
	    {
		$tscript = $1;
		if ($colonHash{$tscript}) {
		    #already handled this one. bail out
		    $oldContig=$colonHash{$tscript};
		    if ($contig eq $oldContig) {
			print "Repeated occurance for $tscript. AC=$contig. Old=$oldContig";
		    } else  {
			print "Repeated occurance for $tscript. DIFFER AC=$contig. Old=$oldContig";
		    }
		    $peg_counter = $peg_counter - 1;
		    $new_entry=0;
#next
		    next;
		} else {
		    print "First occurance for $tscript. AC=$contig.\n";
		    $colonHash{$tscript}=$contig;
		}

		$col9 = $col9.$col9Sep.uri_escape("EnsemblTranscript:$tscript");
		$col9Sep = ",";
		if ($debug) { print "TSCRIPT:$tscript\n"; }
	    }

		
	    # handle the db_xrefs

	    if ($_ =~/db_xref="(.*)"/)
	    {
		@temp = split(":",$1);
		$oldTag = @temp[0];
		$newTag = $dbXrefMap{$oldTag};
		if ($newTag) {
		    $newValue = join( ":", @temp[1..$#temp]);
		    #GO is goofy because they did GO:GO:1234
		    if ($oldTag eq 'GO') {
			$x = uri_escape("$newValue");
			$col9Ont = $col9Ont.$col9OntSep.$x;
			$col9OntSep=",";
		    } else {
			$x = uri_escape("$newTag:$newValue");
			
			$col9 = $col9.$col9Sep.$x;
			$col9Sep = ",";

			$col9Xref = $col9Xref.$col9XrefSep.$x;
			$col9XrefSep = ",";
		    }
		}
	    }
	    

#	    if ($_ =~ /FT\s+\/translation="(\w+)"/)  #This line is wrong but gets emacs to indent
	    if ($_ =~ /FT\s+\/translation="(\w+)/)
	    {
		$translation = $1;
		$record_translation = 1;
	    }
 
	    if ($record_translation)
	    {
   	        if ($_ =~ /FT\s+\/translation="([\*\w]+)\"$/)
	        {
		    #trans all in one line. already caught the
                    #translation above so don't need to append
		    #but do need to finalize so set to 0:
		    $record_translation = 0;
	        }
		#if ($_ =~ /FT\s+([\*\w]+)/ ) { 
                #    $translation = $translation.$1; 
                #}
                #changed to prevent duplication of last line of seq
		if ($_ =~ /FT\s+([\*\w]+)[^\"]$/ ) { 
                    $translation = $translation.$1; 
                }
	        if ($_ =~/FT\s+([\*\w]+)\"$/ )
		{
		    $translation = $translation.$1;
		    $record_translation = 0;
                }

		if ($_ =~/FT\s+\"$/ )
		{
		    $record_translation = 0;
                }

       
 	        if (!$record_translation)
	        {
		    #add extra info per gene

		    if ($gene && $extra_info->{$gene} )
		    {
			foreach $x (@{$extra_info->{$gene}})
			{
			    #print "  extra gene info $gene -> $x\n";
			    $col9 = $col9.$col9Sep.uri_escape("$x");
			    $col9Sep = ",";
			}
		    }

		    #
		    #add extra info per transcript

		    if ($tscript && $extra_info->{$tscript})
		    {
			foreach $x (@{$extra_info->{$tscript}} )
			{
			    #print "   extra txcript info $tscript -> $x\n";
			    $col9 = $col9.$col9Sep.uri_escape("$x");
			    $col9Sep = ",";
			}
		    }

		    #look for a function

		    my $function;
		    if ($tscript && $extra_info->{$tscript."_function"})
		    {
			#there should only be one entry.
			foreach $fn (@{$extra_info->{$tscript."_function"}} ) {
			    print "   found function [$fn]\n";
			    $function=uri_escape($fn);
			}
		    }


		    $col9 = $col9.";Dbxref=".$col9Xref.";Ontology_term=".$col9Ont.";Note=".$function.";translation_id=$prot_id;";
                    if ($debug) { print "COLLATE\t$peg_counter\t$gene\t$tscript\n" };
		    if ($debug) { print "col9 = [$gene] $col9\n"; }
		    $bytesOut = $bytesOut + length("$contig\tEnsembl\tcds\t$smallest\t$biggest\t.\t$strand\t.\t$col9\n");
		    print OUTPUT2 "$contig\tEnsembl\tcds\t$smallest\t$biggest\t.\t$strand\t.\t$col9\n";
		    $new_entry = 0;
		    $biggest = 0; $smallest = 100000000;
		    $record_translation = 0;
		    if ($debug) { print "final:$translation\n"; }
		    $translation =~ s/\s//g;  
	      
		    $bytesOut = $bytesOut + length( ">$prot_id\n$translation\n" );
		    print OUTPUT3 ">$prot_id\n$translation\n";
		    $translation="";
		    $gene="";
		    $tscript="";
 	        } 
	    } #record translation
       

	} # new entry
      
	if($record_next_line)
	{ 
	    if($_ =~ /(\w+)(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?(\s+)?(\w+)?.*/)
		#if($_ =~ /([GACTNX])(\s+)?([GACTNX])?(\s+)?([GACTNX])?(\s+)?([GACTNX])?(\s+)?([GACTNX])?(\s+)?([GACTNX])?(\s*)(\d+)/)
	    {
		$temp = $1.$3.$5.$7.$9.$11;
		#print OUTPUT4 "$temp\n";
		if($temp =~/(\w+[^0-9])(\d+)$/)
		{
		    $seq = $1;
		    $seq =~ s/[0-9]//g; 
		    #if ($debug) { print "CULPRIT:$2\n"; }
                    $bytesOut=$bytesOut + length("$seq\n");
		    print OUTPUT4 "$seq\n";
		}
		else
		{
		    $temp =~ s/[0-9]//g; 
		    $bytesOut = $bytesOut + length( "$temp\n" );
		    print OUTPUT4 "$temp\n";
                }   
	    }
	    else
	    {
		$record_next_line = 0;
	    }
	}
	if ($_ =~ /^SQ.*/)
	{
	    $bytesOut = $bytesOut + length( ">$contig\n");
	    print OUTPUT4 ">$contig\n"; 
	    $record_next_line = 1;
	}

    } #while input 

    return $bytesOut, $peg_counter;

} #end do_file


sub load_extra {
    # some info is not in the embl file.  load a file with extra information to
    # add to the alias and attribute info that's been parsed already.  The format
    # of each line is
    #
    # key Alias|Attribute text
    #
    # where the pieces are tab separated.  key is typically an ensembl gene id
    # or ensembl transcript id depending upon whether the extra info is to be
    # associated per gene or per transcript.
    #
    # returns a hash from key to text.
    #


    my($fname) = @_;
    my %extra_info;

    open(EXTRA_INPUT,$fname);
    @lines = <EXTRA_INPUT>;
    foreach $_ (@lines)
    {
	chomp $_;
	@temp = split("\t",$_);
	$key = @temp[0];
	$text = @temp[2];
	push( @{$extra_info{$key}},  $text);
    }

    #print &Dumper(\%extra_info),"\n";
    return \%extra_info;
} 


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3