[Bio] / FigKernelScripts / gff2seed.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gff2seed.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.29 - (download) (as text) (annotate)
Fri Apr 3 05:35:03 2009 UTC (10 years, 10 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.28: +2 -2 lines
removing check for refseq ids

# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
# This file is part of the SEED Toolkit.
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.



=head1 gff2seed

Convert a gff3 file format to a SEED directory. This will extract all the data and prepare a SEED directory for your use. The SEED directory is typically created in FIG_Config::temp, and then you use fig add_genome to load the new data.

There is an issue that is basically unique to metagenomes. At the moment there are contigs in the file that do not have any features associated with them. These are lost in converting to the SEED directory. There is no easy way (as yet) to determine what these are, and so you'll just have to deal. Since I'm not sure how much longer we're going to be using this route for the metagenomes, I'm reluctant to put a permanent fix in place.


use strict;
use FIG;
my $fig=new FIG;
use Data::Dumper;
use URI::Escape;

use FigGFF;

my $usage =<<EOF;
gff2seed [-append] [-debug] [-unpack] gff-file [output-dir] [fake genome number]

-debug print out a lot of information
-unpack allow a genome to be extracted to Tmp even if it exists
-append append to files from previous gff2seed for this org
-mapping  write out a directory called Mapping that contains two files. A fasta file of proteins and their original IDs this is useful for getting the correspondance between IDs.
-force  keep going even when there are duplicate IDs. Do not use this for installing since it means there is a serious error in the file

You can add a genome nummber to make that as the genome id


my ($debug, $unpack, $append, $force, $mapping)=(0,0,0,0,0);

while (@ARGV)
    my $arg = $ARGV[0];
    if ($arg eq '-debug')
	$debug = 1;
    elsif ($arg eq '-unpack')
	$unpack = 1;
    elsif ($arg eq '-mapping')
	$mapping = 1;
    elsif ($arg eq '-force')
	$force = 1;
    elsif ($arg eq '-append')
	$append = 1;
    elsif ($arg =~ /^-/)
	die "Unknown argument $arg\n";

my $file = shift or die $usage;
my $location = shift;
my $fakegenome=shift;

if (!$location)
    $location = $FIG_Config::temp;

my $parser=new GFFParser;
my $fob = $parser->parse($file);

-f $file or die "$file does not exist\n";

#print Dumper($fob);

my $genome=$fob->genome_id;
unless ($genome) {$genome=$fakegenome}

# Check the genome number and add a version number. This should be automated in the
# GFF file to ensure that the ID
# is unique, but we will bodge it right now

if ($genome =~ /^(\d+)\.(\d+)$/)
   unless ($unpack || $append) {
       die "Sorry $genome already exists" if ($fig->genus_species($genome));
   else {
       warn("$genome already exists. Do not install. Unpacking only");
elsif ($genome =~ /^(\d+)$/)
    my ($tempgenome, $count)=($1, 1);
    while ($fig->genus_species("$tempgenome.$count")) {$count++}
    die "$genome is not a valid genome ID. Please check it";

my $genomedir = $location . "/" . $genome;
print STDERR "We are writing the genome to $genomedir\n";  
if ($append) {
    # genomedir must already exist from prior run
    opendir( GENOME, $genomedir ) || die print STDERR "Could not open $genomedir. It must exist from a prior run without -append\n";
} else {
    mkdir ($genomedir, 0755) || die "Can't make genome dir $genomedir";
# add the base files we need in the genome direcotory
# Assume that base files created by the first, non-append gff2seed run are
# correct for all further -append invocation.

if (!$append) {
    open(OUT, ">$genomedir/GENOME") || die "Can't open $genomedir/GENOME";
    print OUT $fob->genome_name, "\n";
    close OUT;

    open(OUT, ">$genomedir/PROJECT") || die "Can't open $genomedir/PROJECT";
    print OUT $fob->project, "\n";
    close OUT;
    open(OUT, ">$genomedir/TAXONOMY") || die "Can't open $genomedir/TAXONOMY";
    print OUT $fob->taxonomy, "\n";
    close OUT;

    open(OUT, ">$genomedir/VERSION") || die "Can't open $genomedir/VERSION";
    print OUT $genome, ".", time, "\n";
    close OUT;

# write the DNA sequence contigs. If in append mode, make sure the
# contigs info goes to a *new* file rather than appending.  Otherwise,
# we get  >2GB files for euks.

my $contigCount    = 0;
my $contigFilename = "$genomedir/contigs";
my $max_existing_peg = 0;
if ($append) {

    my $max=1;  #dirty trick. If only files is "contigs" or "contigs1", next is 2.
    my $contig;
    foreach $contig ( grep { $_ =~ /^contigs\d*$/ } readdir( GENOME ) ) {
	if (($contig =~ /^contigs(\d+)$/) && ($1 > $max)) { $max = $1 }
    $max = $max+1;
    if ($debug) {print "Contigs file will be $contigFilename\n"; }

    my $tbl_file = "$genomedir/Features/peg/tbl";
    if (-e $tbl_file) {
	my $entry;
	open(TMP, "<$tbl_file") || die "Could not read-open $tbl_file";
	while (defined($entry = <TMP>)) {
	    if ($entry =~ m/^fig\|\d+\.\d+\.peg\.(\d+)/) {
		$max_existing_peg = &FIG::max($1, $max_existing_peg);
	    else {
		die "Could not parse tbl entry $entry";
	close(TMP) || die "Could not close $tbl_file";

open(OUT, ">$contigFilename") || die "Can't open $genomedir/contigs";
foreach my $contig (@{$fob->contigs}) 
   # in case we have a genbank NM or NC number, remove the version
    ##if ($contig =~ /(\w\w\_\d+)\.\d+/) {$contig=$1}
    print OUT ">$contig\n";
    my $seq=$fob->fasta_data($contig);
    # a sanity check. Why not!
    # this should be turned into a real method in FIG.pm that will account for all variants of DNA sequences
    # bioperl has a guess_alphabet method that we can probably steal. At the moment we just put out a warning
    if (uc($seq) !~ /^[ACGTNX]*$/) {print STDERR "$contig appears to have letters that are not GATCN or X. It was written to contigs anyway, please check\n"}
    $seq =~ s/(.{60})/$1\n/g;	# split to 60 chars per line
    chomp($seq); # remove any potential trailing \n so we only have one per seq
    print OUT "$seq\n";

# create the directory structure for cds and rna. Even in append mode
# we may have to create them because they are rm'd at the end of gff2seed
# when empty after loading.

if ( !-d ("$genomedir/Features") ) {
    mkdir("$genomedir/Features", 0755) || die "Could not make directory Features";

if ( !-d ("$genomedir/Features/peg") ) {
    mkdir("$genomedir/Features/peg", 0755) || die "Could not make directory Features/peg";

if ( !-d ("$genomedir/Features/rna") ) {
    mkdir("$genomedir/Features/rna", 0755) || die "Could not make directory Features/rna";

# open the file handles for pegs and rnas

my $mode;
if ($append) { 
    $mode= ">>";
} else {
    $mode = ">";

my ($pegfeatfh, $pegseqfh, $rnafeatfh, $rnaseqfh, $anno_fh, $assigns_fh);
my ($annomapfh, $fastamapfh);

open($pegfeatfh, "$mode$genomedir/Features/peg/tbl")   || die "Can't open $genomedir/Features/peg/tbl for writing";
open($pegseqfh,  "$mode$genomedir/Features/peg/fasta") || die "Can't open $genomedir/Features/peg/fasta for writing";
open($rnafeatfh, "$mode$genomedir/Features/rna/tbl")   || die "Can't open $genomedir/Features/rna/tbl for writing";
open($rnaseqfh,  "$mode$genomedir/Features/rna/fasta") || die "Can't open $genomedir/Features/rna/fasta for writing";

open($anno_fh,    "$mode$genomedir/annotations")
    or die "Cannot open $genomedir/annotations for writing: $!";
open($assigns_fh, "$mode$genomedir/assigned_functions")
    or die "Cannot open $genomedir/assigned_functions for writing: $!";

if ($mapping) 
    # make a directory with the mappings in it
    mkdir "$genomedir/Mapping", 0755;
    open($annomapfh, "$mode$genomedir/Mapping/assigned_functions") ||die  "Can't open $mode$genomedir/Mapping/assigned_functions";
    open($fastamapfh, "$mode$genomedir/Mapping/fasta") || die "Can't open $mode$genomedir/Mapping/fasta";

my %seen;			# make sure that the ids are unique
foreach my $allfeat ($fob->features_for_genome()) {
    foreach my $feat (@$allfeat) {
	# feat is a feature and has the following tags:
	# ID => identifier
	# score => score
	# source => source db
	# start, end, strand
	# seqid => contig
	# alias : other aliases --- ref to array
	# phase 
	# fig => fig ID
	# type => rna or cds
	# attributes => hash of attributes such as Alias, ID, translation_id (cds), Dbxref
	# Dbxref
	my $featfh; my $seqfh;
	my $id=$feat->ID;
	my ($seqloc, $start, $stop, $strand) = ($feat->seqid, $feat->start, $feat->end, $feat->strand);

        # tigr has a bug where it writes some features where start==stop
        next if ($start == $stop);
	if ($seen{$id}) {
            if ($force) 
                print STDERR "$id was already seen. But we kept going\n";
	        print STDERR "$id was already seen and written\n";

	# this will become the peg, but we need to massage it a little depending on what it is
	my $peg = $id;

	# this will become the sequence but we need to know whether it is protein or DNA or RNA
	my $seq='';
	if ($debug) {print STDERR "Found a feature of type ", $feat->type, " with $peg\n"}
	if ($feat->type eq "cds")
	    $peg =~ s/cds/peg/i; 
	    $peg =~ s/pro/peg/i;
	    $peg =  "fig|".$genome.".".$peg;
	    if ($peg =~ m/^fig\|\d+\.\d+\.peg\.(\d+)/) {
		if ($1 <= $max_existing_peg) {
		    warn "$peg may clobber existing feature";
	    else {
		die "Malformed PEG-ID $peg";
	    my $transid=$feat->{attributes}->{translation_id};
	    if ($transid) { 
	    else {
        #       Don't print a warning anymore since we grab it from the nuc sequence
	#	print STDERR "No translation was found for $peg\n";
	    if ($debug) {print STDERR "\tFound $transid with for it\n"}
	elsif ($feat->type eq "rna" || $feat->type eq "tRNA" || $feat->type eq "rRNA") {
	else {
	    my $type=$feat->type;
	    unless (-e "$genomedir/Features/$type") {
		mkdir "$genomedir/Features/$type", 0755; 
		print STDERR "Addded a directory for features that are $type\n";
	    open($featfh, ">>$genomedir/Features/$type/tbl")   || die "Can't append to Features/$type/tbl";
	    open($seqfh,  ">>$genomedir/Features/$type/fasta") || die "Can't append to Features/$type/fasta";
	unless ($peg) {print "Couldn't find anything to add for $feat\n"; next}
	unless ($featfh && $seqfh) {print STDERR "Couldn't find any file handles for $peg from $feat\n"; next}
	# make sure we have some sequence for the fasta file
	# unless it is a special case like a protein where we have a translation, we just take the fasta sequence
	unless ($seq) {$seq = $fob->fasta_data($id)}
	# finally if we do not have the sequence, we will just get the DNA sequence from the string
	unless ($seq) {
            # correct an off by one error
            $start--; $stop--;
	    # NC_ sequences may or may not have the stupid version number. Ugh.
	    if (!$seq && $seqloc =~ /(NC_\d+)\.\d+$/) {$seq=$fob->fasta_data($1)}
	    $seq=substr($seq, $start, $stop-$start+1); 
            if ($strand eq "-") {$seq =~ tr/AGCTagct/TCGAtcga/; $seq=reverse $seq} # I am sure that there are subroutines for this, but its quicker to add it
            # now we are just going to translate the sequence and be done
            if ($feat->type eq "cds")
                # only translate the proteins, otherwise leave as DNA.
                $seq = $fig->translate($seq, undef, 1);
	# in case we have a genbank NM or NC number, remove the version
	## if ($seqloc =~ /(\w\w\_\d+)\.\d+/) {$seqloc=$1}
	$seq =~ s/(.{60})/$1\n/g; # split to 60 chars per line
	chomp($seq); # remove any potential trailing \n so we only have one per seq
	unless ($seq) {print STDERR "No seq for $id\n"}
	print $seqfh ">$peg\n$seq\n";
        if ($mapping) {print $fastamapfh ">$id\n$seq\n"}
	# figure out the info for tbl
	if ($feat->strand eq "-" || $feat->strand eq "-1") {($start, $stop)=($stop, $start)} # -1 isn't valid but people use it all the time so screw it
	my $location=$seqloc . "_" . $start . "_" . $stop;

	my %features;
	if (ref($feat->Alias) eq "ARRAY")
	    map { $features{$_}++ } @{$feat->Alias};

	# If we see GENE:blah, add "blah" to the aliases, unless it's already there.

	for my $fname (keys %features)
	    if ($fname =~ /^GENE:(\S+)$/)
	my $fig_id;
	my $dbxref = $feat->attributes->{Dbxref};
	if ($dbxref)
	    $dbxref = [$dbxref] unless ref($dbxref);

	    for my $ref (@$dbxref)
		if ($ref =~ /^FIG_ID:(\S+)/)
		    $fig_id = $1;
		my $a = map_dbxref_to_seed_alias($ref);
		if ($a ne $ref)
	my $features = join("\t", sort keys %features);
	print $featfh "$peg\t$location\t$id\t$features\n";
	# Define the assignment:
	#   Increasing precedence to ncbi_annotation, Note, and description.
	#   This really ought to just be decided and set to one of them.

	my $anno  = $feat->attributes->{description};
        if (ref($anno) eq "ARRAY") {$anno=join(" ", @$anno)}
        if ($mapping) {print $annomapfh "$id\t$anno\n"}
	my $anno_source = "gff2Seed";
	if (! $anno ) { $anno = $feat->attributes->{Note} };
	if (! $anno ) {
	    $anno = $feat->attributes->{ncbi_annotation};
	    $anno_source = "ncbi"

	if (ref($anno) eq 'ARRAY')
	    $anno = uri_unescape(join(",", @$anno));
	    $anno = uri_unescape($anno);

	if ($anno and $fig_id)
	    print $assigns_fh "$fig_id\t$anno\n";
	    print $anno_fh join("\n",
				"Set master function to",
				'//') . "\n";

# Purge empty peg or rna dirs.

for my $what ('peg', 'rna')

my $size=-s "$genomedir/Features/$what/fasta";
    if (-s "$genomedir/Features/$what/fasta" == 0 and -s "$genomedir/Features/$what/tbl" == 0)
	print "Removing empty $genomedir/Features/$what\n";
	system("rm -rf $genomedir/Features/$what");


print "\n\n\nThe genome directory has been created in $genomedir\n";
print "You should now run the following command to load your data:\n";
print "		fig master:newgenome add_genome $genomedir (please also add -force -skipnr as required)\n";
print "         PLEASE NOTE: At the moment there is a known issue where sequences with known features are not copied across correctly. You can just copy your source fasta file into the organism directory to replace the contigs file to overcome this. See the gff2seed.pl docs for more information\n";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3