[Bio] / FigKernelPackages / PickFeatureSet.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/PickFeatureSet.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : overbeek 1.1 package PickFeatureSet;
2 :    
3 :     use Data::Dumper;
4 :     use Carp;
5 :    
6 :     # Candidates are the features we are considering. The idea is to pick
7 :     # an optimal set.
8 :     #
9 :     # Scored features look like
10 :     #
11 :     # $scored_fid = [ $type, $location, $score, \%extra ]
12 :     #
13 :    
14 :    
15 :     sub pick_feature_set {
16 :     my($candidates,$opt) = @_;
17 :    
18 :     if (ref($candidates) ne 'ARRAY')
19 :     {
20 :     confess 'You need to pass candidates as an array';
21 :     }
22 :     $opt ||= {};
23 :    
24 :     # Score the spacing between two orfs:
25 :    
26 :     my $max_sp_scr = $opt->{max_sp_scr} ||= 3.0; # peak bonus for optimal spacing
27 :     my $max_overlap = $opt->{max_overlap} ||= 60; # maximum overlapping nt
28 :     my $sp_decay = $opt->{sp_decay} ||= 100; # decay constant for too great a spacing
29 :    
30 :     $opt->{min_scr} = 5 if ! defined($opt->{min_scr});
31 :     my $min_scr = $opt->{min_scr}; # we just do not like things less than this scr
32 :    
33 :     # $orf = [ $type, $id, $loc, $scr, $contig, $dir, $l, $r ]
34 :    
35 :     my @ftrs = sort { $a->[2] cmp $b->[2] || $a->[4] <=> $b->[4] } # sort by contig and left end
36 :     grep { ( $_->[5] - $_->[4] > $max_overlap ) # Exclude very short ...
37 :     || ( $_->[1] > 10 ) # .... unless high score
38 :     }
39 :     map { my ( $type, $loc, $scr, $extra ) = @$_; # extra is a hash of extra stuff
40 :     my ( $c, $b, $e ) = &boundaries_of($loc);
41 :     my $dir = ( $e <=> $b );
42 :     my ( $l, $r ) = ( $b < $e ) ? ( $b, $e ) : ( $e, $b );
43 :     [ $_, $scr, $c, $dir, $l, $r ]
44 :     }
45 :     grep { $_->[2] >= $min_scr }
46 :     @$candidates;
47 :    
48 :     my @called = ();
49 :     my @current = ();
50 :     my $c0 = '';
51 :    
52 :     foreach my $ftr ( @ftrs )
53 :     {
54 :     my ( $data, $scr, $c, $dir, $l, $r ) = @$ftr;
55 :     if ( $c ne $c0 )
56 :     {
57 :     record( \@called, \@current );
58 :     @current = ( [ undef, 0, $c, 1, -100000, -99999, undef, 0 ] );
59 :     $c0 = $c;
60 :     }
61 :    
62 :     # Find best total score so far that is completed to the left of $l:
63 :    
64 :     my ( $best_ttl ) = sort { $b <=> $a }
65 :     map { $_->[7] }
66 :     grep { $_->[5] < $l }
67 :     @current;
68 :    
69 :     # Remove current end points that are to the left of $l, and with a
70 :     # score less than $best_ttl + $max_sp_scr (these can never be productively
71 :     # extended):
72 :    
73 :     if ( $best_ttl )
74 :     {
75 :     @current = grep { $_->[5] >= $l
76 :     || $_->[7] + $max_sp_scr >= $best_ttl
77 :     }
78 :     @current;
79 :     }
80 :    
81 :     # Find scores for adding to each possible predacessor. Record as:
82 :     # [ [ $type, $loc, $scr, $extra ], $scr, $contig, $dir, $l, $r, $prefix, $ttl_scr ]
83 :     # 0 1 2 3 4 5 6 7
84 :    
85 :     my ( $place ) = sort { $b->[7] <=> $a->[7] } # A high-scoring position
86 :     map { my $sp_scr = spacer_score( $_->[3], $_->[5], $dir, $l, $max_sp_scr, $max_overlap, $sp_decay );
87 :     [ @$ftr, $_, $_->[7] + $sp_scr + $scr ];
88 :     }
89 :     grep { $_->[5] - $max_overlap <= $ftr->[4] } # excess overlap?
90 :     @current;
91 :    
92 :     push @current, $place;
93 :     }
94 :    
95 :     &record( \@called, \@current );
96 :    
97 :     # Print type, id, location, score
98 :    
99 :     my @to_return = map { $_->[0] } @called;
100 :     return wantarray ? @to_return : \@to_return;
101 :     }
102 :    
103 :     #===============================================================================
104 :     # Subroutines below
105 :     #===============================================================================
106 :    
107 :     sub boundaries_of {
108 :     my($loc) = @_;
109 :    
110 :     my @locs = split(/,/,$loc);
111 :     my($contig,$beg) = ($locs[0] =~ /^(\S+)_(\d+)_\d+$/);
112 :     $locs[-1] =~ /_(\d+)$/;
113 :     return ($contig,$beg,$1);
114 :     }
115 :    
116 :     # Find the last member of the chain of orfs that led to the best score:
117 :    
118 :     sub record
119 :     {
120 :     my ( $called, $current ) = @_;
121 :     my ( $best_set ) = sort { $b->[7] <=> $a->[7] } @$current;
122 :     record1( $called, $best_set );
123 :     }
124 :    
125 :     # Backtrach through chain of orfs that led to best total score:
126 :    
127 :     sub record1
128 :     {
129 :     my ( $called, $tail ) = @_;
130 :     return if ! $tail->[0]; # End of chain (well, start actually )
131 :     record1( $called, $tail->[6] ) if $tail->[6];
132 :     push @$called, $tail;
133 :     }
134 :    
135 :    
136 :     # Calculate score for following
137 :     sub spacer_score
138 :     {
139 :     my ( $fr1, $r1, $fr2, $l2, $max_sp_scr, $max_overlap, $sp_decay ) = @_;
140 :     my $ideal = $fr1 > 0 ? ( $fr2 > 0 ? 5 : 100 ) # same : converge
141 :     : ( $fr2 > 0 ? 100 : 5 ); # diverge : same
142 :     my $x = $r1 - $l2 + 1 + $ideal; # amount less than ideal space
143 :     $max_sp_scr * ( ( $x >= 0 ) ? 1 - $x / ( $max_overlap + $ideal )
144 :     : exp( $x / $sp_decay )
145 :     );
146 :     }
147 :    
148 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3