[Bio] / FigKernelPackages / Contigs.pm Repository:
ViewVC logotype

View of /FigKernelPackages/Contigs.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Mar 1 21:08:13 2009 UTC (10 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011
new Contigs.pm and compare_coding.pm

package Contigs;

#
#  Build and manage information about a set of contigs.  They can be
#  supplied as file names (in which case the sequence locations are
#  indexed), or as id/sequence pairs ( [ id, seq ], ... ) or sequence
#  entries ( [ id, def, seq ] ), in which case an index of references
#  is built.  When a sequence on disk is used, it is cached in memory.
#  (This should be replaced by indexing subsequences on the disk, but
#  for now ....)
#
#  New contigs object:
#
#    $contigs = Contigs->new(   $file, ...   )                # files
#    $contigs = Contigs->new( [ $file, ... ] )                # ref to file list
#    $contigs = Contigs->new(   [ $id, $seq ], ...   )        # id-seq pairs
#    $contigs = Contigs->new( [ [ $id, $seq ], ... ] )        # ref to pair list
#    $contigs = Contigs->new(   [ $id, $def, $seq ], ...   )  # id-def-seq triples
#    $contigs = Contigs->new( [ [ $id, $def, $seq ], ... ] )  # ref to triple list
#
#  Contigs access functions:
#
#    @ids      = $contigs->ids( );
#   \@ids      = $contigs->ids( );
#    $length   = $contigs->length( $id );
#    $sequence = $contigs->sequence( $id );
#   \$sequence = $contigs->sequenceP( $id );
#    $subseq   = $contigs->subseq( $id, $beg, $end );
#
#  Test scripts:
#
#  perl -e 'use Contigs; $contigs=Contigs->new("Vibrio/contigs"); print join "\n", $contigs->ids(), ""'
#
#  perl -e 'use Contigs; $contigs=Contigs->new("Vibrio/contigs"); $c="Contig83"; print $contigs->sequence($c), "\n", ${$contigs->sequenceP($c)}, "\n"'
#
#  perl -e 'use Contigs; $contigs=Contigs->new("Vibrio/contigs"); $c="Contig83"; print $contigs->length($c), "\n", $contigs->subseq($c,1,50), "\n", $contigs->subseq($c,50,1), "\n"'
#  1097
#  ggtTTACTCAAAATATTAAATCAGCAGGTCAATTTGAAAGCTACgaagaT
#  AtcttcGTAGCTTTCAAATTGACCTGCTGATTTAATATTTTGAGTAAacc
#
#  perl -e 'use gjoseqlib; use Contigs; $contigs=Contigs->new(read_fasta("Vibrio/contigs")); $c="Contig83"; print $contigs->length($c), "\n", $contigs->subseq($c,1,50), "\n", $contigs->subseq($c,50,1), "\n"'
#  1097
#  ggtTTACTCAAAATATTAAATCAGCAGGTCAATTTGAAAGCTACgaagaT
#  AtcttcGTAGCTTTCAAATTGACCTGCTGATTTAATATTTTGAGTAAacc
#

use strict;
use gjoseqlib;     # read_fasta(), DNA_subseq()
use Data::Dumper;

#
#  $contigs = Contigs->new(   $file, ...   ) 
#  $contigs = Contigs->new( [ $file, ... ] )
#  $contigs = Contigs->new(   [ $id, $seq ], ...   )
#  $contigs = Contigs->new( [ [ $id, $seq ], ... ] )
#

sub new
{
    my $class = shift;
    my $self = {};

    my $type;
    my @files;
    my @seqs;

    #  Scalars are file names
    if    ( ! ref $_[0] )
    {
        @files = grep { -f } @_;
        if ( @files != @_ )
        {
            print STDERR "The following file(s) were not found:\n";
            foreach ( grep { ! -f } @_ ) { print STDERR "   '$_'\n" }
        }
        $type = 'file';
    }

    #  Scalars in an array ref can be file names
    elsif ( @_ == 1
         && ! ref $_[0]->[0]
         && ( @files = grep { -f } @{$_[0]} )
         && ( @files == @{$_[0]} )
          ) { $type = 'file' }

    #  Array refs of scalars can be sequence entries
    elsif ( ! ref $_[0]->[0]
         && ( @seqs = grep { @$_ == 2 || @$_ == 3 } @_ )
         && ( @seqs == @_ )
          ) { $type = 'seqs' }

    #  Arrays of array refs can be sequence entries
    elsif ( ref $_[0]->[0]
         && ( @seqs = grep { @$_ == 2 || @$_ == 3 } @{$_[0]} )
         && ( @seqs = @{$_[0]} )
          ) { $type = 'seqs' }

    else
    {
        print STDERR "Contigs->new called wtih arguments that could no be interpretted as files or sequences.\n";
        print STDERR Dumper( @_ );
        return undef;
    }

    $self->{ type } = $type;

    if    ( $type eq 'file' )
    {
        my ( $c_index, $c_length ) = index_contig_file( @files );
        $self->{ contig_index } = $c_index;
        $self->{ contig_len }   = $c_length;
        $self->{ contigs }      = {};
    }
    elsif ( $type eq 'seqs' )
    {
        my %contigs = map { $_->[0] => $_ } @seqs;
        $self->{ contigs } = \%contigs;

        my %contig_len = map { $_->[0] => length $_->[-1] } @seqs;
        $self->{ contig_len } = \%contig_len;
    }

    bless $self, $class;
}


sub ids
{
    my ( $self ) = @_;
    my @ids = sort { lc $a cmp lc $b } keys %{ $self->{ contig_len } };
    wantarray ? @ids : \@ids;
}


sub length
{
    my ( $self, $contig ) = @_;
    my $len = $self->{ contig_len }->{ $contig };
    print STDERR "Unable to evalute length( '$contig' ).\n" if ! defined $len;
    return $len;
}


sub sequence
{
    my ( $self, $contig ) = @_;

    my $c_data = $self->{ contigs }->{ $contig };

    if ( ! $c_data && $self->{ type } eq 'file' )
    {
        my $c_index = $self->{ contig_index }->{ $contig };
        $c_data = read_contig_from_file( @$c_index ) if $c_index;
        $self->{ contigs }->{ $contig } = $c_data if $c_data;
    }

    if ( ! $c_data )
    {
        print STDERR "Unable to evalute sequence( '$contig' ).\n";
        return undef;
    }

    $c_data->[-1]
}


sub sequenceP
{
    my ( $self, $contig ) = @_;

    my $c_data = $self->{ contigs }->{ $contig };

    if ( ! $c_data && $self->{ type } eq 'file' )
    {
        my $c_index = $self->{ contig_index }->{ $contig };
        $c_data = read_contig_from_file( @$c_index ) if $c_index;
        $self->{ contigs }->{ $contig } = $c_data if $c_data;
    }

    if ( ! $c_data )
    {
        print STDERR "Unable to evalute sequenceP( '$contig' ).\n";
        return undef;
    }

    \$c_data->[-1]
}


sub subseq
{
    my ( $self, $contig, $beg, $end ) = @_;

    my $c_data = $self->{ contigs }->{ $contig };

    if ( ! $c_data && $self->{ type } eq 'file' )
    {
        my $c_index = $self->{ contig_index }->{ $contig };
        $c_data = read_contig_from_file( @$c_index ) if $c_index;
        $self->{ contigs }->{ $contig } = $c_data if $c_data;
    }

    if ( ! $c_data )
    {
        print STDERR "Unable to evalute subseq( '$contig', $beg, $end ).\n";
        return undef;
    }

    DNA_subseq( \$c_data->[-1], $beg, $end )
}


#===============================================================================
#  Internal functions:
#
#
#  ( \%records, \%lengths ) = index_contig_file( $file, ... )
#
#       $record = [ $id, $def, $file, $offset, $seqbytes ];
#
#
#  \@seq_entry = read_contig_from_file( $id, $def, $file, $offset, $seqbytes )
#
#      $seq_entry = [ $id, $def, $seq ]
#
#===============================================================================

sub index_contig_file
{
    my %c_index;
    my %c_length;
    foreach my $file ( @_ )
    {
        my $prev_id = "";
        open GREP, "grep -b '^>' < '$file' |";
        while ( $_ = <GREP> )
        {
            s/^(\d+)\://;
            my $off = $1 + 0;
            my $prev_ind = $c_index{ $prev_id };
            $prev_ind->[4] = $off - $prev_ind->[3] if $prev_ind;

            $off += CORE::length($_);
            chomp;
            my ( $id, $def ) = /^>(\S+)(\s+(\S.*)?)?$/;
            $c_index{ $id } = [ $id, $def, $file, $off, undef ];
            $prev_id = $id;
        }
        close GREP;

        my $prev_ind = $c_index{ $prev_id };
        $prev_ind->[4] = (-s $file) - $prev_ind->[3] if $prev_ind;

        open SIZE, "fasta_size < '$file' |";
        while ( $_ = <SIZE> )
        {
            my ( $size, $id ) = /^\s*(\d+)\s+(\S+)/;
            $c_length{ $id } = $size if $id && $c_index{ $id };
        }
        close SIZE;
    }

    ( \%c_index, \%c_length );
}


sub read_contig_from_file
{
    my ( $id, $def, $file, $offset, $seqbytes ) = @_;

    my $seq;
    open  FILE, "<$file" or return undef;
    seek  FILE, $offset, 0;
    read  FILE, $seq, $seqbytes;
    close FILE;
    $seq =~ tr/ \n\r\t//d;

    [ $id, $def, $seq ]
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3