[Bio] / FigKernelScripts / gff2tab.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gff2tab.pl

Parent Directory Parent Directory | Revision Log Revision Log

Revision 1.5 - (download) (as text) (annotate)
Fri May 18 13:59:59 2007 UTC (12 years, 6 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +113 -370 lines
Rewritten gff2tab and updated FigGFF POD/methods



=head1 gff2tab

A rewritten gff2tab converter written to take advantage of the GFF3 parser we already have. Thanks Bob.


use strict;
use FigGFF;
use FIG;

my $fig  = new FIG;
my $fgff = GFFParser->new($fig);

my ($brc, $file)=@ARGV;
unless ($brc && $file) {die "$0 <brc name> <filename>\nNote that BRC name will be used as the output directory, and prepended to the ids\n"}

if (-e "$brc/$file.fasta") {print STDERR "Warning: We are appending this data to the existing $brc data files\n"}
else {mkdir $brc, 0755}

my $ofile=$file; $ofile =~ s/^.*\///;
open(FA, ">>$brc/$ofile.fasta") || die "Can't open fasta";
open(AF, ">>$brc/$ofile.assigned_functions") || die "Can't open assigned_functions";

# parse the file using the OO interface
#print STDERR Dumper($fgff), "\n\n\n";
# get the stree structure
my $tree = $fgff->feature_tree;

# now that we have the feature tree, we can look for the features that have descriptions, an also have fasta sequences.

my $proteincount;
foreach my $feat (keys %$tree)
	my ($fasta, $desc);
	# get the descriptions. Note everyone *should* use description to house the functional role, but I'm not 100% sure they are
	# we're going to get all the descriptions, and join them with ' // ' on a single line
	push @$desc, $fgff->feature_index->{$feat}->{'attributes'}->{'description'} if ($fgff->feature_index->{$feat}->{'attributes'}->{'description'});
	$desc=&traverse_attribute_tree($tree->{$feat}, 'description', $desc);

	# this is a bug that I found, some of the files have unescaped commas in their description fields. They shouldn't have, and this fixes it.
	my $checkdesc; my %flatten;
	map {
		if (ref($_) eq "ARRAY")
			$_ = join(",", @$_);
	} @$desc;
	my $description=join(" // ", keys %flatten);
	# now find the fasta sequence in this tree
	my $proteins;
	if ($fgff->fasta_data->{$feat} && &is_protein($fgff->fasta_data->{$feat}, $feat))
		# get the seqid and then the genome id
		my $sid=$fgff->feature_index->{$feat}->{'seqid'};
		my $gid=$fgff->feature_index->{$sid}->{'attributes'}->{'Dbxref'};
		if ($gid =~ /taxon\:(\d+)/) {$gid=$1}
			$gid =~ s/\s+/_/g;
		push @$proteins, [$feat, $fgff->fasta_data->{$feat}, $gid]
	$proteins=&traverse_protein_tree($tree->{$feat}, $proteins);
	if (defined $proteins)
# now print out all the functions and proteins
		foreach my $tple (@$proteins)
			my ($id, $seq, $gid)=@$tple;
			print FA ">",lc($brc), "|", $gid, ".", $id, "\n$seq\n";
			print AF lc($brc), "|", $gid, ".", $id, "\t$description\n";

unless ($proteincount) {print STDERR "WARNING: There were no proteins in $file\n"}

sub traverse_attribute_tree {
	my ($tree, $need, $array)=@_;
	foreach my $k (keys %$tree)
		push @$array, $fgff->feature_index->{$k}->{'attributes'}->{$need} if (defined $fgff->feature_index->{$k}->{'attributes'}->{$need});
		if (ref($tree->{$k}) eq "HASH") {$array=&traverse_attribute_tree($tree->{$k}, $need, $array)}
	return $array;

sub traverse_protein_tree {
	my ($tree, $array)=@_;
	foreach my $k (keys %$tree)
		if ($fgff->fasta_data->{$k} && &is_protein($fgff->fasta_data->{$k}, $k))
			my $sid=$fgff->feature_index->{$k}->{'seqid'};
			#print STDERR Dumper($sid);
			my $gid=$fgff->feature_index->{$sid}->{'attributes'}->{'Dbxref'};
			#print STDERR Dumper($gid);
			if ($gid =~ /taxon\:(\d+)/) {$gid=$1}
				$gid =~ s/\s+/_/g;
			push @$array, [$k, $fgff->fasta_data->{$k}, $gid];
		if (ref($tree->{$k}) eq "HASH") {$array=&traverse_protein_tree($tree->{$k}, $array)}
	return $array;

sub is_protein {
	my ($seq, $id)=@_;
	$seq =~ s/[\*\+\#]$//; # remove stop codons from the end
	if ($seq =~ /[\*\+\#]/) {print STDERR "Sequence $id in $file has stop codons in it (first 30 aa): ", substr($seq, 0, 30), "\n"; return 0}
	$seq =~ s/[ngatc]//ig; # remove dna bases
	(length($seq) > 10) ? return 1 : return 0; # call it protein if there are more than 10 non-dna bases :)

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3