[Bio] / FigKernelScripts / gff2tab.pl Repository:
ViewVC logotype

View of /FigKernelScripts/gff2tab.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Fri May 18 13:59:59 2007 UTC (12 years, 6 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +113 -370 lines
Rewritten gff2tab and updated FigGFF POD/methods

#__perl__
#

=pod

=head1 gff2tab

A rewritten gff2tab converter written to take advantage of the GFF3 parser we already have. Thanks Bob.

=cut

use strict;
use FigGFF;
use FIG;

my $fig  = new FIG;
my $fgff = GFFParser->new($fig);

my ($brc, $file)=@ARGV;
unless ($brc && $file) {die "$0 <brc name> <filename>\nNote that BRC name will be used as the output directory, and prepended to the ids\n"}


if (-e "$brc/$file.fasta") {print STDERR "Warning: We are appending this data to the existing $brc data files\n"}
else {mkdir $brc, 0755}

my $ofile=$file; $ofile =~ s/^.*\///;
open(FA, ">>$brc/$ofile.fasta") || die "Can't open fasta";
open(AF, ">>$brc/$ofile.assigned_functions") || die "Can't open assigned_functions";


# parse the file using the OO interface
$fgff->parse($file);
#print STDERR Dumper($fgff), "\n\n\n";
# get the stree structure
my $tree = $fgff->feature_tree;

# now that we have the feature tree, we can look for the features that have descriptions, an also have fasta sequences.

my $proteincount;
foreach my $feat (keys %$tree)
{
	my ($fasta, $desc);
	# get the descriptions. Note everyone *should* use description to house the functional role, but I'm not 100% sure they are
	# we're going to get all the descriptions, and join them with ' // ' on a single line
	push @$desc, $fgff->feature_index->{$feat}->{'attributes'}->{'description'} if ($fgff->feature_index->{$feat}->{'attributes'}->{'description'});
	$desc=&traverse_attribute_tree($tree->{$feat}, 'description', $desc);

	# this is a bug that I found, some of the files have unescaped commas in their description fields. They shouldn't have, and this fixes it.
	my $checkdesc; my %flatten;
	map {
		if (ref($_) eq "ARRAY")
		{
			$_ = join(",", @$_);
		}
		s/^\s+//;
		s/\s+$//;
		$flatten{$_}=1;
	} @$desc;
	my $description=join(" // ", keys %flatten);
	
	# now find the fasta sequence in this tree
	my $proteins;
	if ($fgff->fasta_data->{$feat} && &is_protein($fgff->fasta_data->{$feat}, $feat))
	{
		# get the seqid and then the genome id
		my $sid=$fgff->feature_index->{$feat}->{'seqid'};
		my $gid=$fgff->feature_index->{$sid}->{'attributes'}->{'Dbxref'};
		if ($gid =~ /taxon\:(\d+)/) {$gid=$1}
		else 
		{
			$gid=$fgff->feature_index->{$sid}->{'attributes'}->{'organism_name'};
			$gid =~ s/\s+/_/g;
		}
		push @$proteins, [$feat, $fgff->fasta_data->{$feat}, $gid]
	}
	$proteins=&traverse_protein_tree($tree->{$feat}, $proteins);
	
	if (defined $proteins)
	{
# now print out all the functions and proteins
		foreach my $tple (@$proteins)
		{
			my ($id, $seq, $gid)=@$tple;
			print FA ">",lc($brc), "|", $gid, ".", $id, "\n$seq\n";
			print AF lc($brc), "|", $gid, ".", $id, "\t$description\n";
			$proteincount++;
		}
	}
}


unless ($proteincount) {print STDERR "WARNING: There were no proteins in $file\n"}


sub traverse_attribute_tree {
	my ($tree, $need, $array)=@_;
	foreach my $k (keys %$tree)
	{
		push @$array, $fgff->feature_index->{$k}->{'attributes'}->{$need} if (defined $fgff->feature_index->{$k}->{'attributes'}->{$need});
		if (ref($tree->{$k}) eq "HASH") {$array=&traverse_attribute_tree($tree->{$k}, $need, $array)}
	}
	return $array;
}

sub traverse_protein_tree {
	my ($tree, $array)=@_;
	foreach my $k (keys %$tree)
	{
		if ($fgff->fasta_data->{$k} && &is_protein($fgff->fasta_data->{$k}, $k))
		{
			my $sid=$fgff->feature_index->{$k}->{'seqid'};
			#print STDERR Dumper($sid);
			my $gid=$fgff->feature_index->{$sid}->{'attributes'}->{'Dbxref'};
			#print STDERR Dumper($gid);
			if ($gid =~ /taxon\:(\d+)/) {$gid=$1}
			else 
			{
				$gid=$fgff->feature_index->{$sid}->{'attributes'}->{'organism_name'};
				$gid =~ s/\s+/_/g;
			}
			push @$array, [$k, $fgff->fasta_data->{$k}, $gid];
		}
		if (ref($tree->{$k}) eq "HASH") {$array=&traverse_protein_tree($tree->{$k}, $array)}
	}
	return $array;
}


sub is_protein {
	my ($seq, $id)=@_;
	$seq =~ s/[\*\+\#]$//; # remove stop codons from the end
	if ($seq =~ /[\*\+\#]/) {print STDERR "Sequence $id in $file has stop codons in it (first 30 aa): ", substr($seq, 0, 30), "\n"; return 0}
	$seq =~ s/[ngatc]//ig; # remove dna bases
	(length($seq) > 10) ? return 1 : return 0; # call it protein if there are more than 10 non-dna bases :)
}
	
	

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3