[Bio] / FigKernelPackages / ComparedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ComparedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : paczian 1.1 package ComparedRegions;
2 :    
3 :     1;
4 :    
5 :     use strict;
6 :     use warnings;
7 :    
8 : parrello 1.3 use Tracer;
9 : parrello 1.5 use BasicLocation;
10 : paczian 1.1
11 :     sub get_compared_regions {
12 :     my ($params) = @_;
13 :    
14 :     # check parameters
15 :     my $fig = $params->{fig} || return undef;
16 :     my $peg = $params->{id} || return undef;
17 :     my $is_sprout = $params->{is_sprout} || 0;
18 :     my $region_size = $params->{region_size} || 16000;
19 :     my $number_of_genomes = $params->{number_of_genomes} || 5;
20 :    
21 :     # initialize data variable
22 :     my $data = [];
23 :    
24 :     # get the n pegs closest to the one passed
25 :     my @closest_pegs = &get_closest_pegs($fig, $peg, $is_sprout, $number_of_genomes);
26 :     unshift(@closest_pegs, $peg);
27 :    
28 :     # iterate over the returned pegs
29 :     foreach my $peg (@closest_pegs) {
30 :     my $loc = $fig->feature_location($peg);
31 : parrello 1.5 my $genome = $fig->genome_of($peg);
32 : paczian 1.1 my ($contig,$beg,$end) = $fig->boundaries_of($loc);
33 :     if ($contig && $beg && $end) {
34 :     my $mid = int(($beg + $end) / 2);
35 :     my $min = int($mid - ($region_size / 2));
36 :     my $max = int($mid + ($region_size / 2));
37 :     my $features = [];
38 : parrello 1.5 my $feature_ids = $fig->all_features_detailed($genome, $min, $max, $contig);
39 :     # "feature_ids" now contains a list of tuples. Each tuple consists of nine
40 :     # elements: (0) the feature ID, (1) the feature location (as a comma-delimited
41 :     # list of location specifiers), (2) the feature aliases (as a comma-delimited
42 :     # list of named aliases), (3) the feature type, (4) the leftmost index of the
43 :     # feature's leftmost location, (5) the rightmost index of the feature's
44 :     # rightmost location, (6) the current functional assignment, (7) the user who
45 :     # made the assignment, and (8) the quality of the assignment (which is usually a space).
46 :     foreach my $featureTuple (@$feature_ids) {
47 :     my $floc = $featureTuple->[1];
48 :     my ($contig1,$beg1,$end1) = $fig->boundaries_of($floc);
49 : paczian 1.1 $beg1 = &in_bounds($min,$max,$beg1);
50 :     $end1 = &in_bounds($min,$max,$end1);
51 :     push(@$features, { 'start' => $beg1,
52 :     'stop' => $end1,
53 : parrello 1.5 'id' => $featureTuple->[0] });
54 : paczian 1.1 }
55 :     push(@$data, { 'features' => $features,
56 :     'contig' => $contig,
57 :     'offset' => $mid - 8000 });
58 :     }
59 :     }
60 :    
61 :     # return the data
62 :     return $data;
63 :     }
64 :    
65 : dsouza 1.4 sub get_scan_for_matches_pattern {
66 : dsouza 1.2 my($params) = @_;
67 : dsouza 1.4 # get scan_for_matches pattern from the 'fasta' file
68 : dsouza 1.2
69 : dsouza 1.4 # parse input file name
70 : dsouza 1.2 my $file = $params->{'file'} || return undef;
71 : dsouza 1.4 if ($file =~ /^([a-z0-9]+)$/ or $file =~ /^tmp_([a-z0-9]+)\.cache$/) {
72 :     $file = 'tmp_' . $1 . '.fasta';
73 :     } else {
74 :     return undef;
75 : dsouza 1.2 }
76 :    
77 : dsouza 1.4 # add path
78 :     $file = "$FIG_Config::temp/$file";
79 :     # check if file exists
80 : dsouza 1.2 (-e $file) || return undef;
81 :    
82 : dsouza 1.4 my $pattern = '';
83 :     my $line;
84 :    
85 : dsouza 1.2 open(PAT, "<$file") or die "could not open file '$file': $!";
86 : dsouza 1.4 while (defined($line = <PAT>)) {
87 :     #chomp $line;
88 : dsouza 1.2 $pattern .= $line;
89 :     }
90 :     close(PAT) or die "could not close file '$file': $!";
91 :    
92 :     return $pattern;
93 :     }
94 :    
95 : dsouza 1.4 sub get_scan_for_matches_hits {
96 : dsouza 1.2 my($params) = @_;
97 :     # get Bruce's hit locations
98 :    
99 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
100 :     my $file = $params->{'file'} || return undef;
101 : dsouza 1.2
102 : dsouza 1.4 # parse input file name
103 :     if ($file =~ /^[a-z0-9]+$/) {
104 : dsouza 1.2 $file = 'tmp_' . $file . '.cache';
105 :     }
106 : dsouza 1.4
107 :     # add path
108 :     $file = "$FIG_Config::temp/$file";
109 :     # check if file exists
110 : dsouza 1.2 (-e $file) || return undef;
111 :     my($line, %locations);
112 :     open(HITS, "<$file") or die "could not open file '$file': $!";
113 : dsouza 1.4 $line = <HITS>; # ignore column header line
114 :     while (defined($line = <HITS>)) {
115 : dsouza 1.2 my($hit) = split(/\s+/, $line);
116 : parrello 1.5 Trace("Hit string is $hit.") if T(3);
117 :     my $locObject = BasicLocation->new($hit);
118 :     my $object = $locObject->Contig;
119 :     if ($object =~ /^fig\|/) {
120 : dsouza 1.2 # output from protein sequence scan
121 : parrello 1.5 my($org_id, $offset, $ln) = ($fig->genome_of($object), $locObject->Begin, $locObject->Length);
122 : dsouza 1.2
123 : parrello 1.5 my $fid = $object;
124 : dsouza 1.2 my $loc = $fig->feature_location($fid);
125 :    
126 : dsouza 1.4 if (not $fig->is_deleted_fid($fid))
127 :     {
128 :     my($contig,$beg,$end) = $fig->boundaries_of($loc);
129 :    
130 :     my($hit_beg, $hit_end);
131 :    
132 :     if ( $beg <= $end ) {
133 :     $hit_beg = $beg + ($offset * 3);
134 :     $hit_end = $hit_beg + ($ln * 3);
135 :     } else {
136 :     $hit_beg = $beg - ($offset * 3);
137 :     $hit_end = $hit_beg - ($ln * 3);
138 :     }
139 :    
140 :     push @{$locations{$org_id}{$contig}}, [$hit_beg, $hit_end];
141 : dsouza 1.2 }
142 :     } else {
143 :     # output from DNA scan
144 : dsouza 1.4 # this will need to be cleaned up depending on the format Bruce puts the output in
145 : dsouza 1.2 my($org_id, $loc) = split(":", $hit);
146 :     my($contig, $beg, $end) = $fig->boundaries_of($fig->feature_location($loc));
147 :     push @{$locations{$org_id}{$contig}}, [$beg, $end];
148 :     }
149 :     }
150 :     close(HITS) or die "could not close file '$file': $!";
151 :     return \%locations;
152 :     }
153 :    
154 : dsouza 1.4 sub add_hits_in_regions {
155 :     my($fig, $regions, $patscan_hits) = @_;
156 :     # find scan_for_matches hits (from $patscan_hits) which overlap the regions
157 : dsouza 1.2 # containing the closest pegs (from $peg_regions)
158 :    
159 : dsouza 1.4 # should probably sort regions and perform search intelligently
160 :     foreach my $org_id (keys %$regions) {
161 :     foreach my $contig (keys %{$regions->{$org_id}}) {
162 :     if (exists $patscan_hits->{$org_id} and exists $patscan_hits->{$org_id}{$contig}) {
163 :     foreach my $region (@{$regions->{$org_id}{$contig}}) {
164 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
165 :     # build up list of hits which overlap the region
166 : dsouza 1.2 my $overlapping_hits = [];
167 : dsouza 1.4 foreach my $hit (@{$patscan_hits->{$org_id}{$contig}}) {
168 : dsouza 1.2 my($hit_beg, $hit_end) = @$hit;
169 : parrello 1.5 if (FIG::between($reg_beg, $hit_beg, $reg_end) or FIG::between($reg_beg, $hit_end, $reg_end)) {
170 : dsouza 1.2 push @$overlapping_hits, [$hit_beg, $hit_end];
171 :     }
172 :     }
173 :    
174 : dsouza 1.4 if (@$overlapping_hits) {
175 :     # add the overlapping hits in the region
176 :     $region->{'hits'} = $overlapping_hits;
177 : dsouza 1.2 }
178 :     }
179 :     }
180 :     }
181 :     }
182 :    
183 : dsouza 1.4 return $regions;
184 : dsouza 1.2 }
185 :    
186 : dsouza 1.4 sub add_features_in_regions {
187 : dsouza 1.2 my($fig, $regions) = @_;
188 : dsouza 1.4 # add features in region to data structure containing closest pegs
189 : parrello 1.5 my @regionList = keys %$regions;
190 :     Trace("Looking for features in " . scalar(@regionList) . " regions.") if T(3);
191 :     foreach my $org_id (@regionList) {
192 :     Trace("Processing genome $org_id.") if T(3);
193 : dsouza 1.4 foreach my $contig (keys %{$regions->{$org_id}}) {
194 : parrello 1.5 Trace("Processing contig $contig.") if T(3);
195 : dsouza 1.4 foreach my $region (@{$regions->{$org_id}{$contig}}) {
196 :     # get closest peg and boundaries of region
197 :     my $peg = $region->{'peg'};
198 :     my $min = $region->{'reg_beg'};
199 :     my $max = $region->{'reg_end'};
200 : parrello 1.5 Trace("Region of interest is with $peg from $min to $max.") if T(3);
201 : dsouza 1.4 # get list of features which fall in this region
202 :     my($feature_ids) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
203 :    
204 :     # make up a list of all these features
205 : dsouza 1.2 my $features = [];
206 :     foreach my $fid (@$feature_ids) {
207 :     my $floc = $fig->feature_location($fid);
208 : parrello 1.5 my ($contig1,$beg1,$end1) = $fig->boundaries_of($floc);
209 : dsouza 1.2 $beg1 = &in_bounds($min,$max,$beg1);
210 :     $end1 = &in_bounds($min,$max,$end1);
211 : dsouza 1.4 push(@$features, { 'feat_beg' => $beg1,
212 :     'feat_end' => $end1,
213 :     'feat_id' => $fid });
214 : dsouza 1.2 }
215 : parrello 1.5 Trace(scalar(@$features) . " features found in region of interest.") if T(3);
216 : dsouza 1.4 # add the feature data to the region
217 : dsouza 1.2 $region->{'features'} = $features;
218 :     }
219 :     }
220 :     }
221 : parrello 1.5 Trace("Completed features-in-regions process.") if T(3);
222 : dsouza 1.2 return $regions;
223 :     }
224 :    
225 : dsouza 1.4 sub sort_regions {
226 :     my($fig, $regions) = @_;
227 :    
228 :     # sort regions based on location on the contig, regardless of strand
229 :     # i.e. sort on the 'left-most' location of the region on the contig
230 :     foreach my $org_id (keys %$regions) {
231 :     foreach my $contig (keys %{$regions->{$org_id}}) {
232 :     my $contig_regions = $regions->{$org_id}{$contig};
233 :    
234 :     if (@$contig_regions > 1) {
235 :     my @sorted_regions = map {$_->[0]}
236 :     sort {$a->[1] <=> $b->[1]}
237 :     map {[$_, $fig->min($_->{'reg_beg'}, $_->{'reg_end'})]}
238 :     @$contig_regions;
239 :    
240 :     $regions->{$org_id}{$contig} = \@sorted_regions;
241 : dsouza 1.2 }
242 :     }
243 :     }
244 :    
245 : dsouza 1.4 return $regions;
246 : dsouza 1.2 }
247 :    
248 : dsouza 1.4 sub collapse_regions {
249 :     my($fig, $regions, $window_size) = @_;
250 :     # collapse regions which display regions which show basically the same information
251 :     # if two (or more) regions:
252 :     # a. overlap by more than half the region size, and
253 :     # b. the pattern match is located entirely within the overlap,
254 :     # only one of the regions will be retained for display.
255 :     # since the regions are centred on the 'close' peg, this should give a reasonable
256 :     # neighborhood.
257 :     # if regions get collapsed, the region displayed is the 'left-most' on the contig.
258 :    
259 :     my $half_region = 0.5 * $window_size;
260 :    
261 :     foreach my $org_id (keys %$regions) {
262 :     foreach my $contig (keys %{$regions->{$org_id}}) {
263 :     my $contig_regions = $regions->{$org_id}{$contig};
264 :    
265 :     if (@$contig_regions > 1) {
266 :     # hash for index of regions which should not be displayed
267 :     my %discard;
268 :     for (my $i = 0; $i < @$contig_regions; $i++) {
269 :     for (my $j = ($i+1); $j < @$contig_regions; $j++) {
270 :     my $overlap = &regions_overlap($contig_regions->[$i], $contig_regions->[$j]);
271 :    
272 :     if ($overlap and
273 :     ($overlap > $half_region) and
274 :     &all_hits_overlap_region($contig_regions->[$i], $contig_regions->[$j]{'hits'}))
275 :     {
276 :     # region i display contains all relevant details from region j,
277 :     # region j can be discarded.
278 :    
279 :     $discard{$j} = 1;
280 :     }
281 :     else
282 :     {
283 :     # region i display does not contain all relevant details from region j,
284 :     # region j needs to be kept.
285 :     # next value of $i needs to be $j
286 :     $i = $j - 1;
287 :     # set $j so that we exit second loop
288 :     $j = @$contig_regions;
289 :     }
290 :     }
291 :     }
292 :    
293 :     my @keep = grep {not exists $discard{$_}} (0..$#{$contig_regions});
294 :     $regions->{$org_id}{$contig} = [@$contig_regions[@keep]];
295 :     }
296 :     }
297 :     }
298 :     return $regions;
299 :     }
300 :    
301 :     sub all_hits_overlap_region {
302 :     my($region, $hits) = @_;
303 :     # Check whether ALL hits overlap the region, if yes return 1, if no return 0
304 :     # Return 1 if no hits present
305 :     # This will have problems for hits which are longer than the region displayed
306 :    
307 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
308 : dsouza 1.2
309 : dsouza 1.4 foreach my $hit (@$hits) {
310 :     my($hit_beg, $hit_end) = sort {$a <=> $b} @$hit;
311 :     # get length of hit
312 :     my $hit_ln = $hit_end - $hit_beg + 1;
313 :     if (&overlap($reg_beg, $reg_end, $hit_beg, $hit_end) < $hit_ln) {
314 :     # overlap is less than length of hit
315 :     return 0;
316 :     }
317 :     }
318 :    
319 :     # all hits overlap the region
320 :     return 1;
321 :     }
322 :    
323 :     sub regions_overlap {
324 :     my($r1, $r2) = @_;
325 :     # return overlap in bp between two regions
326 :     return &overlap($r1->{'reg_beg'}, $r1->{'reg_end'}, $r2->{'reg_beg'}, $r2->{'reg_end'});
327 :     }
328 :    
329 :     sub overlap {
330 :     my($x1, $x2, $y1, $y2) = @_;
331 :     # return 1 if regions [$x1,$x2] and [$y1,$y2] overlap, otherwise return 0
332 :    
333 :     my $overlap = 0;
334 :     ($x1, $x2) = sort {$a <=> $b} ($x1, $x2);
335 :     ($y1, $y2) = sort {$a <=> $b} ($y1, $y2);
336 :    
337 :     if (($x2 < $y1) or ($y2 < $x1)) {
338 :     $overlap = 0;
339 :     } elsif ($x1 <= $y1) {
340 :     if ($y2 <= $x2) {
341 :     $overlap = $y2 - $y1 + 1;
342 :     } else {
343 :     $overlap = $x2 - $y1 + 1;
344 :     }
345 :     } elsif ($y1 <= $x1) {
346 :     if ($x2 <= $y2) {
347 :     $overlap = $x2 - $x1 + 1;
348 :     } else {
349 :     $overlap = $y2 - $x1 + 1;
350 :     }
351 :     }
352 :    
353 :     return $overlap;
354 :     }
355 :    
356 :     sub closest_peg_regions {
357 :     my($params) = @_;
358 :    
359 : dsouza 1.2 # check parameters
360 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
361 :     my $closest_pegs = $params->{'closest_pegs'} || return undef;
362 :     my $window_size = $params->{'window_size'} || return undef;
363 :     my $half_region = int($window_size/2);
364 :    
365 : dsouza 1.2 # initialize data variable
366 : dsouza 1.4 my $regions = {};
367 : parrello 1.5 Trace(scalar(@$closest_pegs) . " closest pegs coming in.") if T(3);
368 : dsouza 1.4 # iterate over the pegs
369 :     foreach my $peg (@$closest_pegs) {
370 :     # get location of peg
371 : dsouza 1.2 my $loc = $fig->feature_location($peg);
372 : dsouza 1.4 my($peg_contig,$peg_beg,$peg_end) = $fig->boundaries_of($loc);
373 :     if ($peg_contig && $peg_beg && $peg_end) {
374 :     # get organism ID
375 : dsouza 1.2 my $org_id;
376 : dsouza 1.4 if ($peg =~ /^fig\|([^\|]+)\|/) {
377 : dsouza 1.2 $org_id = $1;
378 :     } else {
379 :     my $org_name = $fig->org_of($peg);
380 :     $org_id = $fig->orgid_of_orgname($org_name);
381 :     }
382 : dsouza 1.4
383 :     # find mid-point of peg
384 :     my $mid = int(($peg_beg + $peg_end)/2);
385 : dsouza 1.2
386 : dsouza 1.4 # add peg, peg location and region location to hash
387 :     push @{ $regions->{$org_id}{$peg_contig} }, {'peg' => $peg,
388 :     'peg_beg' => $peg_beg,
389 :     'peg_end' => $peg_end,
390 :     'reg_beg' => ($mid - $half_region),
391 :     'reg_end' => ($mid + $half_region)};
392 : dsouza 1.2 }
393 :     }
394 :    
395 : parrello 1.5 Trace(scalar(keys %$regions) . " regions found.") if T(3);
396 : dsouza 1.4 # return regions data
397 :     return $regions;
398 : dsouza 1.2 }
399 :    
400 : paczian 1.1 sub get_closest_pegs {
401 : dsouza 1.4 my ($params) = @_;
402 :     # returns the n closest pegs, sorted by taxonomy
403 :     # check parameters
404 :     my $fig = $params->{'fig'} || return undef;
405 :     my $peg = $params->{'id'} || return undef;
406 :     my $n = $params->{'number_of_genomes'} || 50000; # return ALL closest pegs
407 : parrello 1.5 Trace("Looking for closest peg to $peg.") if T(3);
408 : dsouza 1.4
409 :     # get the n pegs closest to the one passed
410 : paczian 1.1
411 : dsouza 1.4 my($id2,$d,$peg2,$i);
412 :    
413 :     my @closest;
414 : parrello 1.5 @closest = map { $id2 = $_->id2; ($id2 =~ /^fig\|/) ? $id2 : () } $fig->sims($peg,&FIG::max(20,$n*4),1.0e-20,"fig",&FIG::max(20,$n*4));
415 : dsouza 1.4
416 :     if (@closest >= ($n-1)) {
417 :     $#closest = $n-2 ;
418 :     }
419 :     my %closest = map { $_ => 1 } @closest;
420 :    
421 :     # there are dragons flying around...
422 :     my @pinned_to = grep { ($_ ne $peg) && (! $closest{$_}) } $fig->in_pch_pin_with($peg);
423 :     my $g1 = $fig->genome_of($peg);
424 :     @pinned_to = map {$_->[1] } sort { $a->[0] <=> $b->[0] } map { $peg2 = $_; $d = $fig->crude_estimate_of_distance($g1,$fig->genome_of($peg2)); [$d,$peg2] } @pinned_to;
425 :    
426 :     if (@closest == ($n-1)) {
427 :     $#closest = ($n - 2) - &FIG::min(scalar @pinned_to,int($n/2));
428 :     for ($i=0; ($i < @pinned_to) && (@closest < ($n-1)); $i++) {
429 :     if (! $closest{$pinned_to[$i]}) {
430 :     $closest{$pinned_to[$i]} = 1;
431 :     push(@closest,$pinned_to[$i]);
432 :     }
433 :     }
434 :     }
435 :    
436 :     if ($fig->possibly_truncated($peg)) {
437 :     push(@closest, &possible_extensions($fig, $peg, \@closest));
438 : paczian 1.1 }
439 : dsouza 1.4 @closest = $fig->sort_fids_by_taxonomy(@closest);
440 :    
441 :     unshift(@closest, $peg);
442 : parrello 1.5 Trace("Returning " . scalar(@closest) . " close pegs.") if T(3);
443 : dsouza 1.4 return \@closest;
444 : paczian 1.1 }
445 :    
446 :     sub in_bounds {
447 :     my($min,$max,$x) = @_;
448 :    
449 :     if ($x < $min) { return $min }
450 :     elsif ($x > $max) { return $max }
451 :     else { return $x }
452 :     }
453 :    
454 :     sub possible_extensions {
455 :     my($fig, $peg,$closest_pegs) = @_;
456 :     my($g,$sim,$id2,$peg1,%poss);
457 :    
458 :     $g = &FIG::genome_of($peg);
459 :    
460 :     foreach $peg1 (@$closest_pegs) {
461 : dsouza 1.4 if ($g ne &genome_of($peg1)) {
462 :     foreach $sim ($fig->sims($peg1,500,1.0e-5,"all")) {
463 :     $id2 = $sim->id2;
464 :     if (($id2 ne $peg) && ($id2 =~ /^fig\|$g\./) && $fig->possibly_truncated($id2)) {
465 :     $poss{$id2} = 1;
466 :     }
467 :     }
468 : paczian 1.1 }
469 :     }
470 :     return keys(%poss);
471 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3