[Bio] / FigKernelPackages / ComparedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ComparedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : paczian 1.1 package ComparedRegions;
2 :    
3 :     1;
4 :    
5 :     use strict;
6 :     use warnings;
7 :    
8 :     use FIG;
9 :     use FIGV;
10 : parrello 1.3 use Tracer;
11 : paczian 1.1
12 :     sub get_compared_regions {
13 :     my ($params) = @_;
14 :    
15 :     # check parameters
16 :     my $fig = $params->{fig} || return undef;
17 :     my $peg = $params->{id} || return undef;
18 :     my $is_sprout = $params->{is_sprout} || 0;
19 :     my $region_size = $params->{region_size} || 16000;
20 :     my $number_of_genomes = $params->{number_of_genomes} || 5;
21 :    
22 :     # initialize data variable
23 :     my $data = [];
24 :    
25 :     # get the n pegs closest to the one passed
26 :     my @closest_pegs = &get_closest_pegs($fig, $peg, $is_sprout, $number_of_genomes);
27 :     unshift(@closest_pegs, $peg);
28 :    
29 :     # iterate over the returned pegs
30 :     foreach my $peg (@closest_pegs) {
31 :     my $loc = $fig->feature_location($peg);
32 :     my ($contig,$beg,$end) = $fig->boundaries_of($loc);
33 :     if ($contig && $beg && $end) {
34 :     my $mid = int(($beg + $end) / 2);
35 :     my $min = int($mid - ($region_size / 2));
36 :     my $max = int($mid + ($region_size / 2));
37 :     my $features = [];
38 :     my $feature_ids;
39 :     ($feature_ids, undef, undef) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
40 :     foreach my $fid (@$feature_ids) {
41 :     my $floc = $fig->feature_location($fid);
42 :     my ($contig1,$beg1,$end1) = $fig->boundaries_of($fig->feature_location($fid));
43 :     $beg1 = &in_bounds($min,$max,$beg1);
44 :     $end1 = &in_bounds($min,$max,$end1);
45 :     push(@$features, { 'start' => $beg1,
46 :     'stop' => $end1,
47 :     'id' => $fid });
48 :     }
49 :     push(@$data, { 'features' => $features,
50 :     'contig' => $contig,
51 :     'offset' => $mid - 8000 });
52 :     }
53 :     }
54 :    
55 :     # return the data
56 :     return $data;
57 :     }
58 :    
59 : dsouza 1.4 sub get_scan_for_matches_pattern {
60 : dsouza 1.2 my($params) = @_;
61 : dsouza 1.4 # get scan_for_matches pattern from the 'fasta' file
62 : dsouza 1.2
63 : dsouza 1.4 # parse input file name
64 : dsouza 1.2 my $file = $params->{'file'} || return undef;
65 : dsouza 1.4 if ($file =~ /^([a-z0-9]+)$/ or $file =~ /^tmp_([a-z0-9]+)\.cache$/) {
66 :     $file = 'tmp_' . $1 . '.fasta';
67 :     } else {
68 :     return undef;
69 : dsouza 1.2 }
70 :    
71 : dsouza 1.4 # add path
72 :     $file = "$FIG_Config::temp/$file";
73 :     # check if file exists
74 : dsouza 1.2 (-e $file) || return undef;
75 :    
76 : dsouza 1.4 my $pattern = '';
77 :     my $line;
78 :    
79 : dsouza 1.2 open(PAT, "<$file") or die "could not open file '$file': $!";
80 : dsouza 1.4 while (defined($line = <PAT>)) {
81 :     #chomp $line;
82 : dsouza 1.2 $pattern .= $line;
83 :     }
84 :     close(PAT) or die "could not close file '$file': $!";
85 :    
86 :     return $pattern;
87 :     }
88 :    
89 : dsouza 1.4 sub get_scan_for_matches_hits {
90 : dsouza 1.2 my($params) = @_;
91 :     # get Bruce's hit locations
92 :    
93 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
94 :     my $file = $params->{'file'} || return undef;
95 : dsouza 1.2
96 : dsouza 1.4 # parse input file name
97 :     if ($file =~ /^[a-z0-9]+$/) {
98 : dsouza 1.2 $file = 'tmp_' . $file . '.cache';
99 :     }
100 : dsouza 1.4
101 :     # add path
102 :     $file = "$FIG_Config::temp/$file";
103 :     # check if file exists
104 : dsouza 1.2 (-e $file) || return undef;
105 :    
106 :     my($line, %locations);
107 :     open(HITS, "<$file") or die "could not open file '$file': $!";
108 : dsouza 1.4 $line = <HITS>; # ignore column header line
109 :     while (defined($line = <HITS>)) {
110 : dsouza 1.2 my($hit) = split(/\s+/, $line);
111 :    
112 : dsouza 1.4 if ($hit =~ /^fig\|(\S+)\.peg\.(\d+)_(\d+)\+(\d+)$/) {
113 : dsouza 1.2 # output from protein sequence scan
114 :     my($org_id, $peg_num, $offset, $ln) = ($1, $2, $3, $4);
115 :    
116 :     my $fid = 'fig|' . $org_id . '.peg.' . $peg_num;
117 :     my $loc = $fig->feature_location($fid);
118 :    
119 : dsouza 1.4 if (not $fig->is_deleted_fid($fid))
120 :     {
121 :     my($contig,$beg,$end) = $fig->boundaries_of($loc);
122 :    
123 :     my($hit_beg, $hit_end);
124 :    
125 :     if ( $beg <= $end ) {
126 :     $hit_beg = $beg + ($offset * 3);
127 :     $hit_end = $hit_beg + ($ln * 3);
128 :     } else {
129 :     $hit_beg = $beg - ($offset * 3);
130 :     $hit_end = $hit_beg - ($ln * 3);
131 :     }
132 :    
133 :     push @{$locations{$org_id}{$contig}}, [$hit_beg, $hit_end];
134 : dsouza 1.2 }
135 :     } else {
136 :     # output from DNA scan
137 : dsouza 1.4 # this will need to be cleaned up depending on the format Bruce puts the output in
138 : dsouza 1.2 my($org_id, $loc) = split(":", $hit);
139 :     my($contig, $beg, $end) = $fig->boundaries_of($fig->feature_location($loc));
140 :     push @{$locations{$org_id}{$contig}}, [$beg, $end];
141 :     }
142 :     }
143 :     close(HITS) or die "could not close file '$file': $!";
144 :     return \%locations;
145 :     }
146 :    
147 : dsouza 1.4 sub add_hits_in_regions {
148 :     my($fig, $regions, $patscan_hits) = @_;
149 :     # find scan_for_matches hits (from $patscan_hits) which overlap the regions
150 : dsouza 1.2 # containing the closest pegs (from $peg_regions)
151 :    
152 : dsouza 1.4 # should probably sort regions and perform search intelligently
153 :     foreach my $org_id (keys %$regions) {
154 :     foreach my $contig (keys %{$regions->{$org_id}}) {
155 :     if (exists $patscan_hits->{$org_id} and exists $patscan_hits->{$org_id}{$contig}) {
156 :     foreach my $region (@{$regions->{$org_id}{$contig}}) {
157 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
158 :     # build up list of hits which overlap the region
159 : dsouza 1.2 my $overlapping_hits = [];
160 : dsouza 1.4 foreach my $hit (@{$patscan_hits->{$org_id}{$contig}}) {
161 : dsouza 1.2 my($hit_beg, $hit_end) = @$hit;
162 : dsouza 1.4 if ($fig->between($reg_beg, $hit_beg, $reg_end) or $fig->between($reg_beg, $hit_end, $reg_end)) {
163 : dsouza 1.2 push @$overlapping_hits, [$hit_beg, $hit_end];
164 :     }
165 :     }
166 :    
167 : dsouza 1.4 if (@$overlapping_hits) {
168 :     # add the overlapping hits in the region
169 :     $region->{'hits'} = $overlapping_hits;
170 : dsouza 1.2 }
171 :     }
172 :     }
173 :     }
174 :     }
175 :    
176 : dsouza 1.4 return $regions;
177 : dsouza 1.2 }
178 :    
179 : dsouza 1.4 sub add_features_in_regions {
180 : dsouza 1.2 my($fig, $regions) = @_;
181 : dsouza 1.4 # add features in region to data structure containing closest pegs
182 :     foreach my $org_id (keys %$regions) {
183 :     foreach my $contig (keys %{$regions->{$org_id}}) {
184 :     foreach my $region (@{$regions->{$org_id}{$contig}}) {
185 :     # get closest peg and boundaries of region
186 :     my $peg = $region->{'peg'};
187 :     my $min = $region->{'reg_beg'};
188 :     my $max = $region->{'reg_end'};
189 : dsouza 1.2
190 : dsouza 1.4 # get list of features which fall in this region
191 :     my($feature_ids) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
192 :    
193 :     # make up a list of all these features
194 : dsouza 1.2 my $features = [];
195 :     foreach my $fid (@$feature_ids) {
196 :     my $floc = $fig->feature_location($fid);
197 :     my ($contig1,$beg1,$end1) = $fig->boundaries_of($fig->feature_location($fid));
198 :     $beg1 = &in_bounds($min,$max,$beg1);
199 :     $end1 = &in_bounds($min,$max,$end1);
200 : dsouza 1.4 push(@$features, { 'feat_beg' => $beg1,
201 :     'feat_end' => $end1,
202 :     'feat_id' => $fid });
203 : dsouza 1.2 }
204 :    
205 : dsouza 1.4 # add the feature data to the region
206 : dsouza 1.2 $region->{'features'} = $features;
207 :     }
208 :     }
209 :     }
210 :    
211 :     return $regions;
212 :     }
213 :    
214 : dsouza 1.4 sub sort_regions {
215 :     my($fig, $regions) = @_;
216 :    
217 :     # sort regions based on location on the contig, regardless of strand
218 :     # i.e. sort on the 'left-most' location of the region on the contig
219 :     foreach my $org_id (keys %$regions) {
220 :     foreach my $contig (keys %{$regions->{$org_id}}) {
221 :     my $contig_regions = $regions->{$org_id}{$contig};
222 :    
223 :     if (@$contig_regions > 1) {
224 :     my @sorted_regions = map {$_->[0]}
225 :     sort {$a->[1] <=> $b->[1]}
226 :     map {[$_, $fig->min($_->{'reg_beg'}, $_->{'reg_end'})]}
227 :     @$contig_regions;
228 :    
229 :     $regions->{$org_id}{$contig} = \@sorted_regions;
230 : dsouza 1.2 }
231 :     }
232 :     }
233 :    
234 : dsouza 1.4 return $regions;
235 : dsouza 1.2 }
236 :    
237 : dsouza 1.4 sub collapse_regions {
238 :     my($fig, $regions, $window_size) = @_;
239 :     # collapse regions which display regions which show basically the same information
240 :     # if two (or more) regions:
241 :     # a. overlap by more than half the region size, and
242 :     # b. the pattern match is located entirely within the overlap,
243 :     # only one of the regions will be retained for display.
244 :     # since the regions are centred on the 'close' peg, this should give a reasonable
245 :     # neighborhood.
246 :     # if regions get collapsed, the region displayed is the 'left-most' on the contig.
247 :    
248 :     my $half_region = 0.5 * $window_size;
249 :    
250 :     foreach my $org_id (keys %$regions) {
251 :     foreach my $contig (keys %{$regions->{$org_id}}) {
252 :     my $contig_regions = $regions->{$org_id}{$contig};
253 :    
254 :     if (@$contig_regions > 1) {
255 :     # hash for index of regions which should not be displayed
256 :     my %discard;
257 :     for (my $i = 0; $i < @$contig_regions; $i++) {
258 :     for (my $j = ($i+1); $j < @$contig_regions; $j++) {
259 :     my $overlap = &regions_overlap($contig_regions->[$i], $contig_regions->[$j]);
260 :    
261 :     if ($overlap and
262 :     ($overlap > $half_region) and
263 :     &all_hits_overlap_region($contig_regions->[$i], $contig_regions->[$j]{'hits'}))
264 :     {
265 :     # region i display contains all relevant details from region j,
266 :     # region j can be discarded.
267 :    
268 :     $discard{$j} = 1;
269 :     }
270 :     else
271 :     {
272 :     # region i display does not contain all relevant details from region j,
273 :     # region j needs to be kept.
274 :     # next value of $i needs to be $j
275 :     $i = $j - 1;
276 :     # set $j so that we exit second loop
277 :     $j = @$contig_regions;
278 :     }
279 :     }
280 :     }
281 :    
282 :     my @keep = grep {not exists $discard{$_}} (0..$#{$contig_regions});
283 :     $regions->{$org_id}{$contig} = [@$contig_regions[@keep]];
284 :     }
285 :     }
286 :     }
287 :     return $regions;
288 :     }
289 :    
290 :     sub all_hits_overlap_region {
291 :     my($region, $hits) = @_;
292 :     # Check whether ALL hits overlap the region, if yes return 1, if no return 0
293 :     # Return 1 if no hits present
294 :     # This will have problems for hits which are longer than the region displayed
295 :    
296 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
297 : dsouza 1.2
298 : dsouza 1.4 foreach my $hit (@$hits) {
299 :     my($hit_beg, $hit_end) = sort {$a <=> $b} @$hit;
300 :     # get length of hit
301 :     my $hit_ln = $hit_end - $hit_beg + 1;
302 :     if (&overlap($reg_beg, $reg_end, $hit_beg, $hit_end) < $hit_ln) {
303 :     # overlap is less than length of hit
304 :     return 0;
305 :     }
306 :     }
307 :    
308 :     # all hits overlap the region
309 :     return 1;
310 :     }
311 :    
312 :     sub regions_overlap {
313 :     my($r1, $r2) = @_;
314 :     # return overlap in bp between two regions
315 :     return &overlap($r1->{'reg_beg'}, $r1->{'reg_end'}, $r2->{'reg_beg'}, $r2->{'reg_end'});
316 :     }
317 :    
318 :     sub overlap {
319 :     my($x1, $x2, $y1, $y2) = @_;
320 :     # return 1 if regions [$x1,$x2] and [$y1,$y2] overlap, otherwise return 0
321 :    
322 :     my $overlap = 0;
323 :     ($x1, $x2) = sort {$a <=> $b} ($x1, $x2);
324 :     ($y1, $y2) = sort {$a <=> $b} ($y1, $y2);
325 :    
326 :     if (($x2 < $y1) or ($y2 < $x1)) {
327 :     $overlap = 0;
328 :     } elsif ($x1 <= $y1) {
329 :     if ($y2 <= $x2) {
330 :     $overlap = $y2 - $y1 + 1;
331 :     } else {
332 :     $overlap = $x2 - $y1 + 1;
333 :     }
334 :     } elsif ($y1 <= $x1) {
335 :     if ($x2 <= $y2) {
336 :     $overlap = $x2 - $x1 + 1;
337 :     } else {
338 :     $overlap = $y2 - $x1 + 1;
339 :     }
340 :     }
341 :    
342 :     return $overlap;
343 :     }
344 :    
345 :     sub closest_peg_regions {
346 :     my($params) = @_;
347 :    
348 : dsouza 1.2 # check parameters
349 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
350 :     my $closest_pegs = $params->{'closest_pegs'} || return undef;
351 :     my $window_size = $params->{'window_size'} || return undef;
352 :     my $half_region = int($window_size/2);
353 :    
354 : dsouza 1.2 # initialize data variable
355 : dsouza 1.4 my $regions = {};
356 : dsouza 1.2
357 : dsouza 1.4 # iterate over the pegs
358 :     foreach my $peg (@$closest_pegs) {
359 :     # get location of peg
360 : dsouza 1.2 my $loc = $fig->feature_location($peg);
361 : dsouza 1.4 my($peg_contig,$peg_beg,$peg_end) = $fig->boundaries_of($loc);
362 :     if ($peg_contig && $peg_beg && $peg_end) {
363 :     # get organism ID
364 : dsouza 1.2 my $org_id;
365 : dsouza 1.4 if ($peg =~ /^fig\|([^\|]+)\|/) {
366 : dsouza 1.2 $org_id = $1;
367 :     } else {
368 :     my $org_name = $fig->org_of($peg);
369 :     $org_id = $fig->orgid_of_orgname($org_name);
370 :     }
371 : dsouza 1.4
372 :     # find mid-point of peg
373 :     my $mid = int(($peg_beg + $peg_end)/2);
374 : dsouza 1.2
375 : dsouza 1.4 # add peg, peg location and region location to hash
376 :     push @{ $regions->{$org_id}{$peg_contig} }, {'peg' => $peg,
377 :     'peg_beg' => $peg_beg,
378 :     'peg_end' => $peg_end,
379 :     'reg_beg' => ($mid - $half_region),
380 :     'reg_end' => ($mid + $half_region)};
381 : dsouza 1.2 }
382 :     }
383 :    
384 : dsouza 1.4 # return regions data
385 :     return $regions;
386 : dsouza 1.2 }
387 :    
388 : paczian 1.1 sub get_closest_pegs {
389 : dsouza 1.4 my ($params) = @_;
390 :     # returns the n closest pegs, sorted by taxonomy
391 : paczian 1.1
392 : dsouza 1.4 # check parameters
393 :     my $fig = $params->{'fig'} || return undef;
394 :     my $peg = $params->{'id'} || return undef;
395 :     my $is_sprout = $params->{'is_sprout'} || 0;
396 :     my $n = $params->{'number_of_genomes'} || 50000; # return ALL closest pegs
397 :    
398 :     # get the n pegs closest to the one passed
399 : paczian 1.1
400 : dsouza 1.4 my($id2,$d,$peg2,$i);
401 :    
402 :     my @closest;
403 :     if ($is_sprout) {
404 :     @closest = map { $_->[0] } sort { $a->[1] <=> $b->[1] } $fig->bbhs($peg, 1.0e-10);
405 :     } else {
406 :     @closest = map { $id2 = $_->id2; ($id2 =~ /^fig\|/) ? $id2 : () } $fig->sims($peg,&FIG::max(20,$n*4),1.0e-20,"fig",&FIG::max(20,$n*4));
407 :     }
408 :    
409 :     if (@closest >= ($n-1)) {
410 :     $#closest = $n-2 ;
411 :     }
412 :     my %closest = map { $_ => 1 } @closest;
413 :    
414 :     # there are dragons flying around...
415 :     my @pinned_to = grep { ($_ ne $peg) && (! $closest{$_}) } $fig->in_pch_pin_with($peg);
416 :     my $g1 = $fig->genome_of($peg);
417 :     @pinned_to = map {$_->[1] } sort { $a->[0] <=> $b->[0] } map { $peg2 = $_; $d = $fig->crude_estimate_of_distance($g1,$fig->genome_of($peg2)); [$d,$peg2] } @pinned_to;
418 :    
419 :     if (@closest == ($n-1)) {
420 :     $#closest = ($n - 2) - &FIG::min(scalar @pinned_to,int($n/2));
421 :     for ($i=0; ($i < @pinned_to) && (@closest < ($n-1)); $i++) {
422 :     if (! $closest{$pinned_to[$i]}) {
423 :     $closest{$pinned_to[$i]} = 1;
424 :     push(@closest,$pinned_to[$i]);
425 :     }
426 :     }
427 :     }
428 :    
429 :     if ($fig->possibly_truncated($peg)) {
430 :     push(@closest, &possible_extensions($fig, $peg, \@closest));
431 : paczian 1.1 }
432 : dsouza 1.4 @closest = $fig->sort_fids_by_taxonomy(@closest);
433 :    
434 :     unshift(@closest, $peg);
435 :     return \@closest;
436 : paczian 1.1 }
437 :    
438 :     sub in_bounds {
439 :     my($min,$max,$x) = @_;
440 :    
441 :     if ($x < $min) { return $min }
442 :     elsif ($x > $max) { return $max }
443 :     else { return $x }
444 :     }
445 :    
446 :     sub possible_extensions {
447 :     my($fig, $peg,$closest_pegs) = @_;
448 :     my($g,$sim,$id2,$peg1,%poss);
449 :    
450 :     $g = &FIG::genome_of($peg);
451 :    
452 :     foreach $peg1 (@$closest_pegs) {
453 : dsouza 1.4 if ($g ne &genome_of($peg1)) {
454 :     foreach $sim ($fig->sims($peg1,500,1.0e-5,"all")) {
455 :     $id2 = $sim->id2;
456 :     if (($id2 ne $peg) && ($id2 =~ /^fig\|$g\./) && $fig->possibly_truncated($id2)) {
457 :     $poss{$id2} = 1;
458 :     }
459 :     }
460 : paczian 1.1 }
461 :     }
462 :     return keys(%poss);
463 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3