[Bio] / FigKernelScripts / gff2Ontology.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gff2Ontology.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.3: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#__perl__

#########################################################################
# A little misnamed as this only handle's GO at the moment.
#
# Maybe this should just disappear and be internalized by gff2Seed??
#
# Pull the Ontology information from column9 and insert it as
# controlled vocabulary for the relevant PEGs. We assume the
# GFF has already been loaded into the Seed instance.
#
# Running this modifies the attributes tables and files.  It does NOT
# zero anything before running, so do that by hand if needed.
#
# To use make_go_triplet_file you must have GO.terms_ids_obs in the
# FIG/Data/Global/CV directory.
#########################################################################


##
# handle args, vars, etc
##

use strict;
use FIG;
my $fig=new FIG;
use Data::Dumper;
use URI::Escape;

use FigGFF;

my $usage = "gff2Ontology  [-debug] [-make_go_triplet_file] fileName \n" .
    "    -debug print out a lot of information\n" .
    "    -make_go_triplet_file print out triplet-format file with go info to <fileName>\n" .
    "    -go_file <fileName> to use fileName rather than FIG/Data/Global/CV/GO.terms_ids_obs\n" .

    "\n" .
    "For make_go_triplet_file, fileName is the file to write, otherwise it is the GFF file\n" .
    "to process. You'll need GO.terms_ids_obs from GO in the FIG/Data/Global dir to\n" .
    "make the triplet file.\n";


my ($debug, $make_go_triplet_file)=(0, 0);

my $go_terms_file= $FIG_Config::global."/CV/GO.terms_ids_obs";
   #
   # we read this from Data/Global becausee we expect that later
   # this will be part of a chain that downloads the file from
   # GO.  So, this follows Rob's parttern for PIRSF info handling.


while (@ARGV)
{
    my $arg = $ARGV[0];
    if ($arg eq '-debug')
    {
	$debug = 1;
	shift;
    }
    elsif ($arg eq '-make_go_triplet_file')
    {
	$make_go_triplet_file = 1;
	shift;
    }
    elsif ($arg eq '-go_file')
    {
	$go_terms_file=@ARGV[1];
	shift;
	shift;
    }
    elsif ($arg =~ /^-/)
    {
	die "Unknown argument $arg\n";
    }
    else
    {
	last;
    }
}

my $file = shift or die $usage;

##
# Load the GO terminology
##

open(IN,"<$go_terms_file");
if ($make_go_triplet_file) { open(OUT, ">$file") }

if ($debug) { print  "Loading GO terms from $go_terms_file...\n"; }

my %go_terms;
my @lines = <IN>;
my $nGoRead=1;
for my $l (@lines) {
    chomp $l;
    if ($debug && ($nGoRead < 25)) {print "Line= $l\n"};
    my @toks=split('\t', $l);
    if ($toks[0] =~ /^(GO:\S)/)  {
	my $id = $toks[0];
	my $term = $toks[1];
	my $section = $toks[2];
	my $obsolete = 0;
	if ($toks[3]) {
	    $obsolete=1;
	    $term = $term . " OBSOLETE";
	}

	$go_terms{$id} = [$term, $section, $obsolete];
	if ($debug && ($nGoRead < 25)) {print "$id gives  [$term, $section, $obsolete]\n"};
	if ($make_go_triplet_file) {
	    print OUT "GO\t$id\t$term\n";
	}
    }
    $nGoRead = $nGoRead + 1;
}

if ($debug) { print  "Done loading GO terms\n"; }
if ($make_go_triplet_file) {
    close OUT;
    print "Done building triplet file $file from $go_terms_file. Exiting.\n";;
    exit;
}


##
# parse the GFF
##

my $parser=new GFFParser;
my $fob = $parser->parse($file);

-f $file or die "$file does not exist\n";

#print Dumper($fob);

my $genome=$fob->genome_id;

# Make sure the thing has already been loaded and make sure there
# is a version number.

if ($genome =~ /^(\d+)\.(\d+)$/)
{
    die "Sorry $genome not found in this Seed." unless ($fig->genus_species($genome));
}
else
{
    die "TaxonId and version number ($genome) found in the gff file are not Number.number format."
}



my %seen;			# make sure that the ids are unique
foreach my $allfeat ($fob->features_for_genome()) {
    foreach my $feat (@$allfeat) {
	# feat is a feature and has the following tags:
	# ID => identifier
	# score => score
	# source => source db
	# start, end, strand
	# seqid => contig
	# alias : other aliases --- ref to array
	# phase 
	# fig => fig ID
	# type => rna or cds
	# attributes => hash of attributes such as Alias, ID, translation_id (cds), Dbxref
	# Dbxref
	
	
	my $id=$feat->ID;
	
	if ($seen{$id}) {
	    print  "$id was already seen\n";
	    exit(-1);
	}
	$seen{$id}=1;

	# this will become the peg, but we need to massage it a little depending on what it is
	my $peg = $id;

	# this will become the sequence but we need to know whether it is protein or DNA or RNA
	my $seq='';
	if ($debug) {print  "Found a feature of type ", $feat->type, " with $peg\n"}
	if ($feat->type eq "cds")
	{
	    $peg =~ s/cds/peg/i;
	    $peg = "fig|".$genome.".".$peg;
	    
	}
	elsif ($feat->type eq "rna" || $feat->type eq "tRNA" || $feat->type eq "rRNA") {
	    $peg="fig|".$genome.".".$peg;
	}
	else {
	    my $type=$feat->type;
	    if ($debug) {print  "Found a feature of type  $type\n"; }
	    }
	
	unless ($peg) {print  "Couldn't find anything to add for $feat\n"; next}
	
	# figure out the info for tbl
	my %go_ids;
	my $id;

	my $ontology_term = $feat->attributes->{Ontology_term};
	if ($ontology_term) {
	    $ontology_term = [$ontology_term] unless ref($ontology_term);

	    for my $ref (@$ontology_term) {
		if ($ref =~ /^(GO:\S+)/) {
		    $go_ids{$ref}++;
		}
	    }
	}


	#
	if (1) {
	    foreach my $id (sort keys %go_ids) {
		my $term;
		my $section;
		my $obs;
		my $info = $go_terms{$id};
		$term = @$info[0];
		$section=@$info[1];
		$obs = @$info[2];
		#
		# keep even the obsolete terms..can deal with them later but
		# if we drop them on floor now, there'd be nothing to work with.
		#
		#if (! $obs ) {
		#
		if (1 ) {
		    print "$peg has (GO, $id, $term)\n";
		    my $status = $fig->add_cv_term( "master:batch", $peg, "GO", $id, $term);
		    if (!$status) {
			print "$peg- Added (GO, $id, $term)\n";
		    } else {
			print "$peg- Error for (GO, $id, $term):\t$status\n";
		    }

		    
		} else {
		    print "OBSOLETE $peg has (GO, $id, $term) [skipped]\n";
		}
	    }
	}
	

    }
}


print "\n\n\nDone.\n";


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3