[Bio] / FigKernelPackages / FigGFF.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FigGFF.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Mar 14 19:17:37 2005 UTC (15 years ago) by olson
Branch: MAIN
Add general-purpose GFF writing stuff.

#
# FIG GFF utilities.
#

#
# A GFFWriter handles the generation of GFF files from SEED data structures.
#


package GFFWriter;

use strict;
use FIG;

use Carp;
use URI::Escape;
use Data::Dumper;

sub new
{
    my($class, $fig, %options) = @_;

    my $default_options = {
	escapespace => 0,
	outputfasta => 1,
	linelength => 60,
    };

    map { $default_options->{$_} = $options{$_} } keys(%options);

    my $self = {
	options => $default_options,
	contig_length_cache => {},
	fig => $fig,
    };

    return bless $self, $class;
}


=head1 gff3_for_feature

Returns the GFF3 information for a given feature.

The return is a pair ($contig_data, $fasta_sequences) that can be passed 
into write_gff3().

$contig_data is a hashref mapping a contig name to a list of GFF3 file lines
for the sequences in that contig.

=cut

sub gff3_for_feature
{
    my($self, $fid, $user) = @_;

    #
    # Options we need to figure out somehow.
    #
    my $options = $self->{options};
    
    my $escapespace = $options->{escapespace};
    my $outputfasta = $options->{outputfasta};
    my %outputtype = (cds => 1,
		      protein => 1,
		      transcript => 1,
		      gene => 1);

    my $fastasequences = '';
    my $contig_data;
    my $linelength = $options->{linelength};

    my $fig = $self->{fig};

    #
    # Do this first to make sure that we really have a feature.
    #
    my @location = $fig->feature_location($fid);
    print Dumper(\@location);
    if (@location == 0 or !defined($location[0]))
    {
	warn "No location found for feature $fid\n";
	return ({}, "");
    }
    
    ###########
    #
    # Begin figuring out the column 9 information about notes and aliases and GO terms
    # All the information is temporarily stored in @alias or @note, and at the end is joined
    # into $allnote
    #
    ###########

    #
    # the notes for the last column
    #
    my $note;
    #
    # all the aliases we are going to use
    #
    my @alias;

    my $func = $fig->function_of($fid, $user);
    if ($func) 
    {
	push @$note, ("Note=\"" . uri_escape($func) . '"');
    }
  
    # now find aliases
    foreach my $alias ($fig->feature_aliases($fid))
    {
	if ($alias =~ /^NP/)
	{
	    push @$note, "Dbxref=\"NCBI_genpept:$alias\"";
	}
	elsif ($alias =~ /gi\|/)
	{
	    $alias =~ s/^gi\|//;
	    push @$note, "Dbxref=\"NCBI_gi:$alias\"";
	}
	elsif ($alias =~ /^kegg/i)
	{
	    $alias =~ s/kegg\|/KEGG:/i;
	    $alias =~ s/^(.*):/$1+/;
	    push @$note, "Dbxref=\"$alias\"";
	}
	elsif ($alias =~ /^uni/)
	{
	    $alias =~ s/uni\|/UniProt:/;
	    # $note = check_go($note, $alias) if ($go);
	    push @$note, "Dbxref=\"$alias\"";
	}
	elsif ($alias =~ /^sp\|/)
	{
	    $alias =~ s/sp\|/Swiss-Prot:/;
	    push @$note, "Dbxref=\"$alias\"";
	}
	else
	{
	    # don't know what it is so keep it as an alias
	    $alias = uri_escape($alias); # just in case!
	    push @alias, $alias;
	}
  }
  
    # now just join all the aliases and put them into @$note so we can add it to the array
    if (@alias)
    {
	push @$note, "Alias=\"". join (",", @alias) . '"';
    }
 
    # the LAST thing I am going to add as a note is the FIG id so that I can grep it out easily
    push @$note, "Dbxref=\"SEED:$fid\"";
  
    # finally join all the notes into a long string that can be added as column 9
    my $allnotes;
    $allnotes = join ";", @$note;
  
    # do we want to convert '%20' to  ' '
    unless ($escapespace)
    {
	$allnotes =~ s/\%20/ /g;
    }
  
    ###########
    #
    # End figuring out the column 9 information about notes and aliases and GO terms
    #
    ###########
 
    #
    # Cache contig  lengths.
    #
    my $len = $self->{contig_length_cache};

    my $genome = $fig->genome_of($fid);
    
    foreach my $loc (@location)
    {
	$loc =~ /^(.*)\_(\d+)\_(\d+)$/;
	my ($contig, $start, $stop) = ($1, $2, $3);
	my $original_contig=$contig;

	#
	# the contig name must be escaped
	#
	$contig = uri_escape($contig);

	#my $contig_key = "$genome:$contig";
	my $contig_key = $contig;

	unless (defined $len->{$contig})
	{
	    $len->{$contig}=$fig->contig_ln($genome, $original_contig);
	}
	my $strand='+';

	#
	# These were bounds-checking for dumping all of a genome.
	#
	#next if (defined $beg && ($start < $beg || $stop < $beg));
	#next if (defined $end && ($start > $end || $stop > $end));
   
	if ($start > $stop)
	{
	    ($start, $stop, $strand)=($stop, $start, '-');
	}
	elsif ($start == $stop)
	{
	    $strand=".";
	}
   
	my $type=$fig->ftype($fid);
   
	if ($type eq "peg")
	{
	    # it is a protein coding gene
	    # create an artificial id that is just the fid.(\d+) information
	    # we will use this to create ids in the form cds.xxx; trn.xxxx; pro.xxx; gen.xxx;
	    $fid =~ /\.peg\.(\d+)/;
	    my $geneid=$1;
    
	    ############## KLUDGE
	    #
	    # At the moment the outputs for transcript, gene, CDS, and pro are all the same.
	    # This is clearly a kludge and wrong, but it will work at the moment
	    #
    
	    # defined some truncations
	    my %trunc=(
		       "transcript"  => "trn",
		       "gene"        => "gen",
		       "protein"     => "pro",
		       "cds"         => "cds",
		      );
	    
	    # SO terms:
	    # transcript: SO:0000673
	    # gene: SO:0000704
	    # cds: SO:0000316
	    # protein:  NOT VALID: should be protein_coding_primary_transcript SO:0000120
	    foreach my $type (keys %outputtype)
	    {
		my ($id, $type)=($trunc{$type} . "." . $geneid, $type);
		# we want to store some sequences to be output
		if ($outputfasta && $type eq "protein")
		{
		    my $addseq = $fig->get_translation($fid);

		    #
		    # the chomp is so that we know for sure to add the line back
		    #
		    $addseq =~ s/(.{$linelength})/$1\n/g;
		    chomp($addseq); 
		    $fastasequences .= ">$id\n$addseq\n";
		}
		if ($outputfasta && $type eq "cds")
		{
		    my $addseq = uc($fig->dna_seq($genome, $fig->feature_location($fid)));
		    $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
		    $fastasequences .= ">$id\n$addseq\n";
		}
		# correct the incorrect sofa term
		if ($type eq "protein")
		{
		    $type = "SO:0000120";
		}

		push (@{$contig_data->{$contig_key}},
		      (join "\t",
		       ($contig, "The SEED", $type, $start, $stop, ".", $strand, ".", "Id=$id;$allnotes")));
	    } # end the foreach my $type
	} # end the if type==peg
	elsif ($type eq "rna")
	{
	    $fid =~ /\.rna\.(\d+)/;
	    my $geneid=$1;
	    #
	    # tRNA is a valid SOFA term == SO:0000253	    
	    #
	    my ($id, $type)=("rna.$geneid", "tRNA"); 
	    if ($outputfasta)
	    {
		my $addseq = $fig->dna_seq($genome, $fig->feature_location($fid));
		$addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
		$fastasequences .= ">$id\n$addseq\n";
	    }
	    push (@{$contig_data->{$contig_key}}, (join "\t", ($contig, "The SEED", $type, $start, $stop, ".", $strand, ".", "Id=$id;$allnotes")));
	} # end the if type == rna
	else
	{
	    die "Don't know what type: |$type| is";
	}
    }
    return ($contig_data, $fastasequences);
}

=head1 write_gff3

Write a set of gff3 per-contig data and fasta sequence data to a file or filehandle.

$genome is the genome these contigs are a part of.
$contig_list is a list of contig-data hashes as returned by gff_for_feature.
$fast_list is a list of fasta data strings.

=cut    

sub write_gff3
{
    my($self, $output, $genome, $contig_list, $fasta_list) = @_;

    my $fig = $self->{fig};

    my $len = $self->{contig_length_cache};
    my $fh;

    my $beg = $self->{options}->{beg};
    my $end = $self->{options}->{end};

    my $close_output;

    if (ref($output))
    {
	$fh = $output;
    }
    else
    {
	open($fh, ">$output") or confess "Cannot open output '$output': $!";
	$close_output = 1;
    }

    #
    # Build a data structure from the list of contigs
    # that has a list of lists of data per contig name.
    # (Do this so we don't copy all of the contig data itself, as it
    # could be quite large).
    #
    my %contigs;

    #
    # iterate over the given list of contig hashes.
    #
    for my $chash (@$contig_list)
    {
	#
	# Then for each contig in the individual contig hashes,
	# add the data list to %contigs.
	#
	for my $contig (keys %$chash)
	{
	    push(@{$contigs{$contig}}, $chash->{$contig});
	}
    }

    foreach my $contig (sort keys %contigs)
    {
	print $fh "##sequence-region\t$contig\t";
	if (defined $beg) {
	    print $fh "$beg\t";
	} else {
	    print $fh "1\t";
	}
	if (defined $end) {
	    print $fh "$end\n";
	} else {
	    print $fh "$len->{$contig}\n";
	}
	for my $list (@{$contigs{$contig}})
	{
	    print $fh join "\n", @$list;
	}
    }
    
    print $fh "##FASTA\n";
    # print out the cds and pro if we need them

    if ($self->{options}->{outputfasta})
    {
	for my $fastasequences (@$fasta_list)
	{
	    print $fh $fastasequences;
	}
    }
 
    my $ll = $self->{options}->{linelength};
    foreach my $contig (sort keys %contigs)
    {
	my $len=$fig->contig_ln($genome, $contig);
	my $dna_seq=$fig->dna_seq($genome, $contig . "_1_". $len);
	if (defined $beg) 
	{
	    unless (defined $end) {
		$end=$len;
	    }
	    $dna_seq = substr($dna_seq, $beg, $end);
	} 
	elsif (defined $end)
	{
	    $beg=1;
	    $dna_seq = substr($dna_seq, $beg, $end);
	}
  
	my $contig=uri_escape($contig);

	$dna_seq =~ s/(.{$ll})/$1\n/g;
	chomp($dna_seq); # just remove the last \n if there is one
	print $fh ">$contig\n$dna_seq\n";
    }

    close($fh) if $close_output;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3