[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.646, Thu Dec 6 15:44:05 2007 UTC revision 1.647, Wed Jan 2 01:42:50 2008 UTC
# Line 1  Line 1 
1  # -*- perl -*-  # -*- perl -*-
2  ########################################################################  ########################################################################
3  # Copyright (c) 2003-2006 University of Chicago and Fellowship  # Copyright (c) 2003-2007 University of Chicago and Fellowship
4  # for Interpretations of Genomes. All Rights Reserved.  # for Interpretations of Genomes. All Rights Reserved.
5  #  #
6  # This file is part of the SEED Toolkit.  # This file is part of the SEED Toolkit.
# Line 5326  Line 5326 
5326  }  }
5327    
5328    
5329    =head3 next_feature
5330    
5331        my $feature = $fig->next_feature( \%options );
5332    
5333    Locate the next feature (optionally filtered by type) in a contig. The start
5334    position for the search can be defined by supplying genome, contig and
5335    position, or by supplying a feature id.  Feature locations are defined by
5336    their midpoint.  If a fid is supplied with contig and position, the latter
5337    are used to resolve ambiguities in the desired segement of a feature with
5338    a complex location.
5339    
5340    =over 4
5341    
5342    =item options
5343    
5344    Options:
5345    
5346    after =>  $fid
5347    after => \@fids
5348    
5349    Id(s) of features that should preceed the returned feature.  This is a local
5350    operation, and is only meant to resolve features that are otherwise tied in
5351    location.
5352    
5353    contig => $contig
5354    
5355    Name of contig of features.
5356    
5357    exclude =>  $id
5358    exclude => \@ids
5359    
5360    Id(s) of features to exclude.  Note that features listed with the 'after'
5361    option are also excluded (and that is most commonly the desired behavior).
5362    
5363    fid => $fid
5364    
5365    Alternative to supplying a location.  It is possible to supply a fid and
5366    contig and position, which allows disambiguating the desired segment of
5367    a feature with a complex location.
5368    
5369    genome => $genome
5370    
5371    Name of genome of features.
5372    
5373    position => $position
5374    
5375    Feature midpoint must be >= $position.  Note that this can be any multiple
5376    of 1/2.  If the supplied value is negative, the position is taken from the
5377    right end of the contig.
5378    
5379    type =>  $type
5380    type => \@types
5381    
5382    Type(s) of desired feature (default is any type).
5383    
5384    =item RETURN
5385    
5386    Feature id or undef.
5387    
5388    =back
5389    
5390    =cut
5391    
5392    sub next_feature
5393    {
5394        my ( $self, $options ) = @_;
5395        return undef unless ref( $options ) eq 'HASH';
5396    
5397        my $fid      = $options->{ fid };
5398        my $genome   = $options->{ genome };
5399        my $contig   = $options->{ contig };
5400        my $position = $options->{ position };
5401    
5402        if ( ! $genome )
5403        {
5404          return undef unless $fid;
5405          ( $genome ) = $fid =~ /^fig\|(\d+\.\d+)\./;
5406        }
5407    
5408        if ( ! $contig || ! $position )
5409        {
5410          return undef unless $fid;
5411          my ( $region ) = grep { ( ! $contig ) || ( $_->[0] eq $contig ) }
5412                           FIG::boundaries_of_2( scalar $self->feature_location( $fid ) );
5413          my ( $beg, $end );
5414          ( $contig, $beg, $end ) = @{ $region || [] };
5415          return undef unless ( $contig && $beg && $end );
5416          $position = 0.5 * ( $beg + $end );
5417        }
5418    
5419        my $length = $self->contig_ln( $genome, $contig );
5420        return undef unless $length;
5421    
5422        #  Negative position counts from left end of contig
5423    
5424        if ( $position < 0 ) { $position += $length + 1 }
5425        my $pos2 = 2 * $position;
5426    
5427        my $after = $options->{ after };
5428        my @after = ref( $after ) eq 'ARRAY' ?  @$after
5429                  : $after                   ? ( $after )
5430                  : ();
5431        my %after = map { $_ => 1 } @after;
5432        $after{ $fid } = 1 if $fid;
5433    
5434        my $exclude = $options->{ exclude };
5435        my @exclude = ref( $exclude ) eq 'ARRAY' ?  @$exclude
5436                    : $exclude                   ? ( $exclude )
5437                    : ();
5438        my %exclude = map { $after{ $_ } ? () : ( $_ => 1 ) } @exclude;
5439    
5440        my $type  = $options->{ type };
5441        my @types = ref( $type ) eq 'ARRAY' ?  @$type
5442                  : $type                   ? ( $type )
5443                  : ();
5444        my %types = map { $_ => 1 } @types;
5445    
5446        my $rdbH = $self->db_handle;
5447        my $minV = int( $position + 0.5 );  #  Round up if not integer
5448        my $maxV = $minV + 9999;
5449        my $relational_db_response;
5450    
5451        while ( $minV <= $length )
5452        {
5453            my $query = "SELECT id "
5454                      . "FROM features "
5455                      . "WHERE ( minloc <= $maxV ) "
5456                        . "AND ( maxloc >= $minV ) "
5457                        . "AND ( genome = \'$genome\' ) "
5458                        . "AND ( contig = \'$contig\' );";
5459            ( $relational_db_response = $rdbH->SQL( $query ) ) or return undef;
5460    
5461            if ( @$relational_db_response >= 1 )
5462            {
5463                my @tmp = map  { my $loc  = $self->feature_location( $_ );
5464                                 my @loc2 = filtered_location( $loc, $contig, $position, 0.5*$length );
5465                                 my ( $type, $numb ) = $_ =~ /\.([^.]+)\.(\d+)$/;
5466                                 $loc2[0] && ( $loc2[1]+$loc2[2] >= $pos2 ) ? [ $_, @loc2, $type, $numb+0 ] : ()
5467                               }
5468                          grep { my ( $type ) = $_ =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/;
5469                                 $type && !$exclude{$_} && ( !@types || $types{$type} )
5470                               }
5471                          map  { $_->[0] }           # extract the id
5472                          @$relational_db_response;
5473    
5474                if ( @tmp )
5475                {
5476                    #  This sort produces an unambiguous ordering of all features.
5477                    #  In a more general ordering, the contig would also be sorted,
5478                    #  but presently we know that they are all on one contig.
5479                    @tmp = sort {    ($a->[2]+$a->[3]) <=>    ($b->[2]+$b->[3])  # midpoint
5480                               || min($a->[2],$a->[3]) <=> min($b->[2],$b->[3])  # left end
5481                               ||  lc $a->[4]          cmp  lc $b->[4]           # type
5482                               ||     $a->[5]          <=>     $b->[5]           # number
5483                                }
5484                           @tmp;
5485    
5486                    #  Process the 'after' ids:
5487                    my $data;
5488                    foreach ( reverse @tmp )
5489                    {
5490                        last if ( $after{ $_->[0] } );
5491                        $data = $_;
5492                    }
5493    
5494                    #  An id was found.  Make sure that its midpoint was within
5495                    #  the range of coordinates surveyed.  If not, there might be
5496                    #  a smaller feature with a closer midpoint, but which was not
5497                    #  found in the DB query.
5498    
5499                    if ( $data )
5500                    {
5501                        my $mid  = 0.5 * ( $data->[2] + $data->[3] );
5502    
5503                        #======== This is the only return with a defined id ========
5504    
5505                        return $data->[0] if $mid <= $maxV;
5506    
5507                        $maxV = int( $mid + 0.5 );  #  Might have missed a shorter feature
5508                    }
5509                    else  # Nothing yet; look further.
5510                    {
5511                        $minV  = $maxV + 1;
5512                        $maxV += 10000;
5513                    }
5514                }
5515                else  # Nothing yet; look further.
5516                {
5517                    $minV  = $maxV + 1;
5518                    $maxV += 10000;
5519                }
5520            }
5521            else  # Nothing yet; look further.
5522            {
5523                $minV  = $maxV + 1;
5524                $maxV += 10000;
5525            }
5526        }
5527    
5528        return undef;
5529    }
5530    
5531    
5532    =head3 previous_feature
5533    
5534        my $feature = $fig->previous_feature( \%options );
5535    
5536    Locate the previous feature (optionally filtered by type) in a contig. The
5537    start position for the search can be defined by supplying genome, contig and
5538    position, or by supplying a feature id.  Feature locations are defined by
5539    their midpoint.  If a fid is supplied with contig and position, the latter
5540    are used to resolve ambiguities in the desired segement of a feature with
5541    a complex location.
5542    
5543    =over 4
5544    
5545    =item options
5546    
5547    Options:
5548    
5549    before =>  $fid
5550    before => \@fids
5551    
5552    Id(s) of features that should follow the returned feature.  This is a local
5553    operation, and is only meant to resolve features that are otherwise tied in
5554    location.
5555    
5556    contig => $contig
5557    
5558    Name of contig of features.
5559    
5560    exclude =>  $id
5561    exclude => \@ids
5562    
5563    Id(s) of features to exclude.  Note that features listed with the 'before'
5564    option are also excluded (and that is most commonly the desired behavior).
5565    
5566    fid => $fid
5567    
5568    Alternative to supplying a location.  It is possible to supply a fid and
5569    contig and position, which allows disambiguating the desired segment of
5570    a feature with a complex location.
5571    
5572    genome => $genome
5573    
5574    Name of genome of features.
5575    
5576    position => $position
5577    
5578    Feature midpoint must be >= $position.  Note that this can be any multiple
5579    of 1/2.  If the supplied value is negative, the position is taken from the
5580    right end of the contig.
5581    
5582    type =>  $type
5583    type => \@types
5584    
5585    Type(s) of desired feature (default is any type).
5586    
5587    =item RETURN
5588    
5589    Feature id or undef.
5590    
5591    =back
5592    
5593    =cut
5594    
5595    sub previous_feature
5596    {
5597        my ( $self, $options ) = @_;
5598        return undef unless ref( $options ) eq 'HASH';
5599    
5600        my $fid      = $options->{ fid };
5601        my $genome   = $options->{ genome };
5602        my $contig   = $options->{ contig };
5603        my $position = $options->{ position };
5604    
5605        if ( ! $genome )
5606        {
5607          return undef unless $fid;
5608          ( $genome ) = $fid =~ /^fig\|(\d+\.\d+)\./;
5609        }
5610    
5611        if ( ! $contig || ! $position )
5612        {
5613          return undef unless $fid;
5614          my ( $region ) = grep { ( ! $contig ) || ( $_->[0] eq $contig ) }
5615                           FIG::boundaries_of_2( scalar $self->feature_location( $fid ) );
5616          my ( $beg, $end );
5617          ( $contig, $beg, $end ) = @{ $region || [] };
5618          return undef unless ( $contig && $beg && $end );
5619          $position = 0.5 * ( $beg + $end );
5620        }
5621    
5622        my $length = $self->contig_ln( $genome, $contig );
5623        return undef unless $length;
5624    
5625        #  Negative position counts from left end of contig
5626    
5627        if ( $position < 0 ) { $position += $length + 1 }
5628        my $pos2 = 2 * $position;
5629    
5630        my $before = $options->{ before };
5631        my @before = ref( $before ) eq 'ARRAY' ?  @$before
5632                   : $before                   ? ( $before )
5633                   : ();
5634        my %before = map { $_ => 1 } @before;
5635        $before{ $fid } = 1 if $fid;
5636    
5637        my $exclude = $options->{ exclude };
5638        my @exclude = ref( $exclude ) eq 'ARRAY' ?  @$exclude
5639                    : $exclude                   ? ( $exclude )
5640                    : ();
5641        my %exclude = map { $before{ $_ } ? () : ( $_ => 1 ) } @exclude;
5642    
5643        my $type  = $options->{ type };
5644        my @types = ref( $type ) eq 'ARRAY' ?  @$type
5645                  : $type                   ? ( $type )
5646                  : ();
5647        my %types = map { $_ => 1 } @types;
5648    
5649        my $rdbH = $self->db_handle;
5650        my $maxV = int( $position );   #  Round down if not integer
5651        my $minV = $maxV - 9999;
5652        my $relational_db_response;
5653    
5654        while ( $maxV >= 1 )
5655        {
5656            my $query = "SELECT id "
5657                      . "FROM features "
5658                      . "WHERE ( minloc <= $maxV ) "
5659                        . "AND ( maxloc >= $minV ) "
5660                        . "AND ( genome = \'$genome\' ) "
5661                        . "AND ( contig = \'$contig\' );";
5662            ( $relational_db_response = $rdbH->SQL( $query ) ) or return undef;
5663    
5664            if ( @$relational_db_response >= 1 )
5665            {
5666                my @tmp = map  { my $loc  = $self->feature_location( $_ );
5667                                 my @loc2 = filtered_location( $loc, $contig, $position, 0.5*$length );
5668                                 my ( $type, $numb ) = $_ =~ /\.([^.]+)\.(\d+)$/;
5669                                 ( $loc2[0] && ( $loc2[1]+$loc2[2] <= $pos2 ) ) ? [ $_, @loc2, $type, $numb+0 ] : ()
5670                               }
5671                          grep { my ( $type ) = $_ =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/;
5672                                 $type && !$exclude{$_} && ( !@types || $types{$type} )
5673                               }
5674                          map  { $_->[0] }           # extract the id
5675                          @$relational_db_response;
5676    
5677                if ( @tmp )
5678                {
5679                    #  This sort produces an unambiguous ordering of all features.
5680                    #  In a more general ordering, the contig would also be sorted,
5681                    #  but presently we know that they are all on one contig.
5682                    @tmp = sort {    ($a->[2]+$a->[3]) <=>    ($b->[2]+$b->[3])  # midpoint
5683                               || min($a->[2],$a->[3]) <=> min($b->[2],$b->[3])  # left end
5684                               ||  lc $a->[4]          cmp  lc $b->[4]           # type
5685                               ||     $a->[5]          <=>     $b->[5]           # number
5686                                }
5687                           @tmp;
5688                    # if ( 1 ) { print STDERR Dumper( \@tmp ) }
5689    
5690                    #  Process the 'before' ids:
5691                    my $data;
5692                    foreach ( @tmp )
5693                    {
5694                        last if ( $before{ $_->[0] } );
5695                        $data = $_;
5696                    }
5697    
5698                    #  An id was found.  Make sure that its midpoint was within
5699                    #  the range of coordinates surveyed.  If not, there might be
5700                    #  a smaller feature with a closer midpoint, but which was not
5701                    #  found in the DB query.
5702    
5703                    if ( $data )
5704                    {
5705                        my $mid  = 0.5 * ( $data->[2] + $data->[3] );
5706    
5707                        #======== This is the only return with a defined id ========
5708    
5709                        return $data->[0] if $mid >= $minV;
5710    
5711                        $minV = int( $mid );  #  Might have missed a shorter feature
5712                    }
5713                    else  # Nothing yet; look further.
5714                    {
5715                        $maxV  = $minV - 1;
5716                        $minV -= 10000;
5717                    }
5718                }
5719                else  # Nothing yet; look further.
5720                {
5721                    $maxV  = $minV - 1;
5722                    $minV -= 10000;
5723                }
5724            }
5725            else  # Nothing yet; look further.
5726            {
5727                $maxV  = $minV - 1;
5728                $minV -= 10000;
5729            }
5730        }
5731    
5732        return undef;
5733    }
5734    
5735    
5736    #  Process a (possibly complex) location for the most applicable segment(s).
5737    #  Mostly this is for dealing locations that wrap through the origin of a
5738    #  contig.
5739    #
5740    #     ( $contig, $beg, $end ) = filtered_location( $loc, $contig, $focus, $range )
5741    #
5742    sub filtered_location
5743    {
5744        my ( $loc, $contig, $focus, $range ) = @_;
5745        my @regions = grep { $_->[0] eq $contig } boundaries_of_2( $loc );
5746        if ( @regions < 2 ) { return @{ $regions[0] || [] } }
5747    
5748        #  Okay, life was not simple.  Let's see if we can throw out the most
5749        #  distant parts.
5750    
5751        my $min_mid = $focus - $range;
5752        my $max_mid = $focus + $range;
5753        @regions = grep { my $mid = 0.5 * ( $_->[1] + $_->[2] );
5754                          $mid >= $min_mid && $mid <= $max_mid
5755                        }
5756                   @regions;
5757        if ( @regions < 2 ) { return @{ $regions[0] || [] } }
5758    
5759        #  This should be very rare, and returning an empty list would be
5760        #  reasonable.  However, let's fall back to an interval that covers
5761        #  everything remaining.
5762    
5763        my ( $beg, $end ) = ( shift @regions )[1,2];
5764        foreach ( @regions )
5765        {
5766            my ( $b1, $e1 ) = @$_[1,2];
5767            if ( $beg < $end ) { $beg = $b1 if $b1 < $beg; $end = $e1 if $e1 > $end }
5768            else               { $beg = $b1 if $b1 > $beg; $end = $e1 if $e1 < $end }
5769        }
5770    
5771        ( $contig, $beg, $end )
5772    }
5773    
5774    
5775  =head3 genes_in_region  =head3 genes_in_region
5776    
# Line 6479  Line 6924 
6924      return undef;      return undef;
6925  }  }
6926    
6927    
6928    =head3 boundaries_of_2
6929    
6930        \@regions = $fig->boundaries_of_2( $location );
6931    
6932         @regions = $fig->boundaries_of_2( $location );
6933    
6934    Locations can be a list of intervals (contig_beg_end), but the intervals
6935    need not be on a single contig, contiguous, or in a consistent orientation
6936    (e.g., a feature that wraps from the end to the beginning of a genome, or a
6937    trans-spliced protein).  This function defines a region of a gene a sequence
6938    parts of the location that are on the same contig, in the same orientation,
6939    and with end points that progress along the contig in the same direction as
6940    the individual parts.  This function is a generalization of boundaries_of().
6941    The latter function returns undef if the first and last contigs are not
6942    the same, and returns a location spanning nearly the entire contig it the
6943    location spans the origin.
6944    
6945    =over 4
6946    
6947    =item location
6948    
6949    contig1_beg1_end1,contig2_beg2_end2,...
6950    
6951    =item @regions
6952    
6953       ( [contig, beg, end], [contig, beg, end], ... )
6954    
6955    where consecutive location intervals with a common contig, direction, and
6956    consistent direction of progression along the contig are merged.  The vast
6957    majority of genes will be reduced to the a single region, which is that
6958    returned by boundaries_of().  That is, most of the time:
6959    
6960        boundaries_of( $loc )
6961    
6962    is the same as
6963    
6964        @{ boundaries_of_2( $loc )->[0] || [] }
6965    
6966    =back
6967    
6968    =cut
6969    
6970    sub boundaries_of_2
6971    {
6972        if ( UNIVERSAL::isa( $_[0], __PACKAGE__ ) ) { shift }
6973    
6974        my ( $loc ) = @_;
6975        my @parts = grep { $_->[0] && $_->[1] && $_->[2] }
6976                    map  { [ /^(.+)_(\d+)_(\d+)$/ ] }
6977                    split /,/, $loc;
6978        @parts or return wantarray ? () : [];
6979    
6980        my ( $contig, $beg, $end ) = @{ shift @parts };
6981        my $dir = ( $end <=> $beg );
6982        my @regions;
6983        foreach ( @parts )
6984        {
6985            my ( $c1, $b1, $e1 ) = @$_;
6986            my $d1 = ( $e1 <=> $b1 );
6987            if ( ( $c1 eq $contig ) && ( $d1 == $dir ) && ( ( $e1 <=> $end ) == $dir ) )
6988            {
6989                $end = $e1;
6990            }
6991            else
6992            {
6993                push @regions, [ $contig, $beg, $end ];
6994                ( $contig, $beg, $end, $dir ) = ( $c1, $b1, $e1, $d1 );
6995            }
6996        }
6997        push @regions, [ $contig, $beg, $end ];
6998        wantarray() ? @regions : \@regions;
6999    }
7000    
7001    
7002  =head3 all_features_detailed  =head3 all_features_detailed
7003    
7004      my $featureList = $fig->all_features_detailed($genome);      my $featureList = $fig->all_features_detailed($genome);

Legend:
Removed from v.1.646  
changed lines
  Added in v.1.647

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3