[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.140, Sun Aug 22 21:36:42 2004 UTC revision 1.141, Mon Aug 23 03:34:32 2004 UTC
# Line 2051  Line 2051 
2051      return ([],$l,$u);      return ([],$l,$u);
2052  }  }
2053    
2054    
2055    #=============================================================================
2056    #  Using the following version is better, but it brings out a very annoying
2057    #  issue with some genomes. It already exists in the current code (above)
2058    #  for some genes in some genomes.  For example, visit fig|70601.1.peg.1.
2059    #  This is true for any genome that has a feature that crosses the origin.
2060    #  The root of the problem lies in boundaries_of.  I am working on a fix that
2061    #  replaces boundaries_of with a more sophisticated function.  When it is
2062    #  all done, genes_in_retion should behave as desired.  -- GJO, Aug. 22, 2004
2063    #=============================================================================
2064    #
2065    #  =pod
2066    #
2067    #  =head1 genes_in_region
2068    #
2069    #  usage: ( $features_in_region, $min_coord, $max_coord )
2070    #           = $fig->genes_in_region( $genome, $contig, $beg, $end )
2071    #
2072    #  It is often important to be able to find the genes that occur in a specific
2073    #  region on a chromosome.  This routine is designed to provide this information.
2074    #  It returns all genes that overlap the region ( $genome, $contig, $beg, $end ).
2075    #  $min_coord is set to the minimum coordinate of the returned genes (which may
2076    #  preceed the given region), and $max_coord the maximum coordinate.  Because
2077    #  the database is indexed by the leftmost and rightmost coordinates of each
2078    #  feature, the function makes no assumption about the length of the feature, but
2079    #  it can (and probably will) miss features spanning multiple contigs.
2080    #
2081    #  =cut
2082    #
2083    #
2084    #  sub genes_in_region {
2085    #      my ( $self, $genome, $contig, $beg, $end ) = @_;
2086    #      my ( $x, $db_response, $feature_id, $b1, $e1, @tmp, @bounds );
2087    #      my ( $min_coord, $max_coord );
2088    #
2089    #      my @features = ();
2090    #      my $rdbH = $self->db_handle;
2091    #
2092    #      if ( ( $db_response = $rdbH->SQL( "SELECT id
2093    #                                           FROM features
2094    #                                          WHERE ( contig = '$contig' )
2095    #                                            AND ( genome = '$genome' )
2096    #                                            AND ( minloc <= $end )
2097    #                                            AND ( maxloc >= $beg );"
2098    #                                      )
2099    #           )
2100    #        && ( @$db_response > 0 )
2101    #         )
2102    #      {
2103    #          #  The sort is unnecessary, but provides a consistent ordering
2104    #
2105    #       @tmp = sort { (  $a->[1]             cmp   $b->[1] )             # contig
2106    #                  || ( ($a->[2] + $a->[3] ) <=> ( $b->[2] + $b->[3] ) ) # midpoint
2107    #                      }
2108    #              map  { $feature_id = $_->[0];
2109    #                     ( ( ! $self->is_deleted_fid( $feature_id ) )       # not deleted
2110    #                       && ( $x = $self->feature_location( $feature_id ) )  # and has location
2111    #                       && ( ( @bounds = boundaries_of( $x ) ) == 3 )       # and has bounds
2112    #                        ) ? [ $feature_id, @bounds ] : ()
2113    #                      } @$db_response;
2114    #
2115    #       ( $min_coord, $max_coord ) = ( 10000000000, 0 );
2116    #
2117    #       foreach $x ( @tmp )
2118    #       {
2119    #           ( $feature_id, undef, $b1, $e1 ) = @$x;
2120    #           push @features, $feature_id;
2121    #           my ( $min, $max ) = ( $b1 <= $e1 ) ? ( $b1, $e1 ) : ( $e1, $b1 );
2122    #           ( $min_coord <= $min ) || ( $min_coord = $min );
2123    #           ( $max_coord >= $max ) || ( $max_coord = $max );
2124    #       }
2125    #      }
2126    #
2127    #      return ( @features ) ? ( [ @features ], $min_coord, $max_coord )
2128    #                           : ( [], undef, undef );
2129    #  }
2130    
2131    #  These will be part of the fix to genes_in_region.  -- GJO
2132    
2133    =pod
2134    
2135    =head1 regions_spanned
2136    
2137    usage: ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc )
2138    
2139    The location of a feature in a scalar context is
2140    
2141        contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each segment]
2142    
2143    This routine takes as input a fig location and reduces it to one or more
2144    regions spanned by the gene.  Unlike boundaries_of, regions_spanned handles
2145    wrapping through the orgin, features split over contigs and exons that are
2146    not ordered nicely along the chromosome (ugly but true).
2147    
2148    =cut
2149    
2150    sub regions_spanned {
2151        shift if UNIVERSAL::isa( $_[0], __PACKAGE__ );
2152        my( $location ) = ( @_ == 1 ) ? $_[0] : $_[1];
2153        defined( $location ) || return undef;
2154    
2155        my @regions = ();
2156    
2157        my ( $cur_contig, $cur_beg, $cur_end, $cur_dir );
2158        my ( $contig, $beg, $end, $dir );
2159        my @segs = split( /\s*,\s*/, $location );  # should not have space, but ...
2160        @segs || return undef;
2161    
2162        #  Process the first segment
2163    
2164        my $seg = shift @segs;
2165        ( ( $cur_contig, $cur_beg, $cur_end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) )
2166           || return undef;
2167        $cur_dir = ( $cur_end >= $cur_beg ) ? 1 : -1;
2168    
2169        foreach $seg ( @segs ) {
2170            ( ( $contig, $beg, $end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || next;
2171            $dir = ( $end >= $beg ) ? 1 : -1;
2172    
2173            #  Is this a continuation?  Update end
2174    
2175            if ( ( $contig eq $cur_contig )
2176              && ( $dir == $cur_dir )
2177              && ( ( ( $dir > 0 ) && ( $end > $cur_end ) )
2178                || ( ( $dir < 0 ) && ( $end < $cur_end ) ) )
2179               )
2180            {
2181                $cur_end = $end;
2182            }
2183    
2184            #  Not a continuation.  Report previous and update current.
2185    
2186            else
2187            {
2188                push @regions, [ $cur_contig, $cur_beg, $cur_end ];
2189                ( $cur_contig, $cur_beg, $cur_end, $cur_dir )
2190                   = ( $contig, $beg, $end, $dir );
2191            }
2192        }
2193    
2194        #  There should alwasy be a valid, unreported region.
2195    
2196        push @regions, [ $cur_contig, $cur_beg, $cur_end ];
2197    
2198        return wantarray ? @regions : \@regions;
2199    }
2200    
2201    
2202    =pod
2203    
2204    =head1 filter_regions
2205    
2206    usage:  @regions = filter_regions( $contig, $min, $max,  @regions )
2207           \@regions = filter_regions( $contig, $min, $max,  @regions )
2208            @regions = filter_regions( $contig, $min, $max, \@regions )
2209           \@regions = filter_regions( $contig, $min, $max, \@regions )
2210    
2211    This function provides a simple filter for extracting a list of genome regions
2212    for those that overlap a particular interval.  Region definitions correspond
2213    to those produced by regions_spanned.  That is, [ contig, beg, end ].  In the
2214    function call, either $contig or $min and $max can be undefined (permitting
2215    anything).
2216    
2217    =cut
2218    
2219    sub filter_regions {
2220        my ( $contig, $min, $max, @regions ) = @_;
2221    
2222        @regions || return ();
2223        ( ref( $regions[0] ) eq "ARRAY" ) || return undef;
2224    
2225        #  Is it a region list, or a reference to a region list?
2226    
2227        if ( ref( $regions[0]->[0] ) eq "ARRAY" ) { @regions = @{ $regions[0] } }
2228    
2229        if ( ! defined( $contig ) )
2230        {
2231            ( defined( $min ) && defined( $max ) ) || return undef;
2232        }
2233        else       # with a defined contig name, allow undefined range
2234        {
2235            defined( $min ) || ( $min =          1 );
2236            defined( $max ) || ( $max = 1000000000 );
2237        }
2238        ( $min <= $max ) || return ();
2239    
2240        my ( $c, $b, $e );
2241        my @filtered = grep { ( @$_ >= 3 )               #  Allow extra fields?
2242                           && ( ( $c, $b, $e ) = @$_ )
2243                           && ( ( ! defined( $contig ) ) || ( $c eq $contig ) )
2244                           && ( ( $e >= $b ) || ( ( $b, $e ) = ( $e, $b ) ) )
2245                           && ( ( $b <= $max ) && ( $e >= $min ) )
2246                            } @regions;
2247    
2248        return wantarray ? @filtered : \@filtered;
2249    }
2250    
2251    
2252  sub close_genes {  sub close_genes {
2253      my($self,$fid,$dist) = @_;      my($self,$fid,$dist) = @_;
2254    
# Line 3904  Line 4102 
4102          foreach $entry (@$relational_db_response)          foreach $entry (@$relational_db_response)
4103          {          {
4104              ($fid,$when,$fileno,$seek,$len) = @$entry;              ($fid,$when,$fileno,$seek,$len) = @$entry;
4105              if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->id_deleted_fid($fid)))              if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid)))
4106              {              {
4107                  $ann = $self->read_annotation($fileno,$seek,$len);                  $ann = $self->read_annotation($fileno,$seek,$len);
4108    

Legend:
Removed from v.1.140  
changed lines
  Added in v.1.141

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3