[Bio] / FigKernelPackages / ComparedRegions.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ComparedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : paczian 1.1 package ComparedRegions;
2 :    
3 :     1;
4 :    
5 :     use strict;
6 :     use warnings;
7 :    
8 : parrello 1.3 use Tracer;
9 : parrello 1.5 use BasicLocation;
10 : paczian 1.1
11 :     sub get_compared_regions {
12 :     my ($params) = @_;
13 :    
14 :     # check parameters
15 :     my $fig = $params->{fig} || return undef;
16 :     my $peg = $params->{id} || return undef;
17 :     my $is_sprout = $params->{is_sprout} || 0;
18 :     my $region_size = $params->{region_size} || 16000;
19 :     my $number_of_genomes = $params->{number_of_genomes} || 5;
20 :    
21 :     # initialize data variable
22 :     my $data = [];
23 :    
24 :     # get the n pegs closest to the one passed
25 :     my @closest_pegs = &get_closest_pegs($fig, $peg, $is_sprout, $number_of_genomes);
26 :     unshift(@closest_pegs, $peg);
27 :    
28 :     # iterate over the returned pegs
29 :     foreach my $peg (@closest_pegs) {
30 :     my $loc = $fig->feature_location($peg);
31 : parrello 1.5 my $genome = $fig->genome_of($peg);
32 : paczian 1.1 my ($contig,$beg,$end) = $fig->boundaries_of($loc);
33 :     if ($contig && $beg && $end) {
34 :     my $mid = int(($beg + $end) / 2);
35 :     my $min = int($mid - ($region_size / 2));
36 :     my $max = int($mid + ($region_size / 2));
37 :     my $features = [];
38 : parrello 1.5 my $feature_ids = $fig->all_features_detailed($genome, $min, $max, $contig);
39 :     # "feature_ids" now contains a list of tuples. Each tuple consists of nine
40 :     # elements: (0) the feature ID, (1) the feature location (as a comma-delimited
41 :     # list of location specifiers), (2) the feature aliases (as a comma-delimited
42 :     # list of named aliases), (3) the feature type, (4) the leftmost index of the
43 :     # feature's leftmost location, (5) the rightmost index of the feature's
44 :     # rightmost location, (6) the current functional assignment, (7) the user who
45 :     # made the assignment, and (8) the quality of the assignment (which is usually a space).
46 :     foreach my $featureTuple (@$feature_ids) {
47 :     my $floc = $featureTuple->[1];
48 :     my ($contig1,$beg1,$end1) = $fig->boundaries_of($floc);
49 : paczian 1.1 $beg1 = &in_bounds($min,$max,$beg1);
50 :     $end1 = &in_bounds($min,$max,$end1);
51 :     push(@$features, { 'start' => $beg1,
52 :     'stop' => $end1,
53 : parrello 1.5 'id' => $featureTuple->[0] });
54 : paczian 1.1 }
55 :     push(@$data, { 'features' => $features,
56 :     'contig' => $contig,
57 :     'offset' => $mid - 8000 });
58 :     }
59 :     }
60 :    
61 :     # return the data
62 :     return $data;
63 :     }
64 :    
65 : dsouza 1.4 sub get_scan_for_matches_pattern {
66 : dsouza 1.2 my($params) = @_;
67 : dsouza 1.4 # get scan_for_matches pattern from the 'fasta' file
68 : dsouza 1.2
69 : dsouza 1.4 # parse input file name
70 : dsouza 1.2 my $file = $params->{'file'} || return undef;
71 : dsouza 1.4 if ($file =~ /^([a-z0-9]+)$/ or $file =~ /^tmp_([a-z0-9]+)\.cache$/) {
72 :     $file = 'tmp_' . $1 . '.fasta';
73 :     } else {
74 :     return undef;
75 : dsouza 1.2 }
76 :    
77 : dsouza 1.4 # add path
78 :     $file = "$FIG_Config::temp/$file";
79 :     # check if file exists
80 : dsouza 1.2 (-e $file) || return undef;
81 :    
82 : dsouza 1.4 my $pattern = '';
83 :     my $line;
84 :    
85 : dsouza 1.2 open(PAT, "<$file") or die "could not open file '$file': $!";
86 : dsouza 1.4 while (defined($line = <PAT>)) {
87 :     #chomp $line;
88 : dsouza 1.2 $pattern .= $line;
89 :     }
90 :     close(PAT) or die "could not close file '$file': $!";
91 :    
92 :     return $pattern;
93 :     }
94 :    
95 : dsouza 1.4 sub get_scan_for_matches_hits {
96 : dsouza 1.2 my($params) = @_;
97 :     # get Bruce's hit locations
98 :    
99 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
100 :     my $file = $params->{'file'} || return undef;
101 : dsouza 1.2
102 : dsouza 1.4 # parse input file name
103 :     if ($file =~ /^[a-z0-9]+$/) {
104 : dsouza 1.2 $file = 'tmp_' . $file . '.cache';
105 :     }
106 : dsouza 1.4
107 :     # add path
108 :     $file = "$FIG_Config::temp/$file";
109 :     # check if file exists
110 : dsouza 1.2 (-e $file) || return undef;
111 :     my($line, %locations);
112 : parrello 1.6 Open(\*HITS, "<$file");
113 : dsouza 1.4 $line = <HITS>; # ignore column header line
114 :     while (defined($line = <HITS>)) {
115 : dsouza 1.2 my($hit) = split(/\s+/, $line);
116 : parrello 1.5 Trace("Hit string is $hit.") if T(3);
117 :     my $locObject = BasicLocation->new($hit);
118 :     my $object = $locObject->Contig;
119 :     if ($object =~ /^fig\|/) {
120 : dsouza 1.2 # output from protein sequence scan
121 : parrello 1.5 my($org_id, $offset, $ln) = ($fig->genome_of($object), $locObject->Begin, $locObject->Length);
122 : dsouza 1.2
123 : parrello 1.5 my $fid = $object;
124 : dsouza 1.2 my $loc = $fig->feature_location($fid);
125 :    
126 : dsouza 1.4 if (not $fig->is_deleted_fid($fid))
127 :     {
128 :     my($contig,$beg,$end) = $fig->boundaries_of($loc);
129 :    
130 :     my($hit_beg, $hit_end);
131 :    
132 :     if ( $beg <= $end ) {
133 :     $hit_beg = $beg + ($offset * 3);
134 :     $hit_end = $hit_beg + ($ln * 3);
135 :     } else {
136 :     $hit_beg = $beg - ($offset * 3);
137 :     $hit_end = $hit_beg - ($ln * 3);
138 :     }
139 :    
140 :     push @{$locations{$org_id}{$contig}}, [$hit_beg, $hit_end];
141 : dsouza 1.2 }
142 :     } else {
143 :     # output from DNA scan
144 : dsouza 1.4 # this will need to be cleaned up depending on the format Bruce puts the output in
145 : parrello 1.6 my($org_id) = split(":", $hit);
146 :     my($contig, $beg, $end) = ($locObject->Contig, $locObject->Begin, $locObject->EndPoint);
147 : dsouza 1.2 push @{$locations{$org_id}{$contig}}, [$beg, $end];
148 :     }
149 :     }
150 : parrello 1.6 close(HITS) or Confess("could not close file '$file': $!");
151 : dsouza 1.2 return \%locations;
152 :     }
153 :    
154 : dsouza 1.4 sub add_hits_in_regions {
155 :     my($fig, $regions, $patscan_hits) = @_;
156 :     # find scan_for_matches hits (from $patscan_hits) which overlap the regions
157 : dsouza 1.2 # containing the closest pegs (from $peg_regions)
158 : parrello 1.6 my @regionKeys = keys %$regions;
159 :     Trace("Adding hits for " . scalar(@regionKeys) . " regions.") if T(3);
160 :     # Keep in here a list of the organisms that had hits.
161 :     my %hitOrgs = ();
162 : dsouza 1.4 # should probably sort regions and perform search intelligently
163 : parrello 1.6 foreach my $org_id (@regionKeys) {
164 :     Trace("Finding hits for $org_id.") if T(3);
165 : dsouza 1.4 foreach my $contig (keys %{$regions->{$org_id}}) {
166 : parrello 1.6 Trace("Finding hits for $contig.") if T(3);
167 : dsouza 1.4 if (exists $patscan_hits->{$org_id} and exists $patscan_hits->{$org_id}{$contig}) {
168 : parrello 1.6 Trace("Hits found on $contig.") if T(3);
169 :     $hitOrgs{$org_id}{$contig} = 1;
170 : dsouza 1.4 foreach my $region (@{$regions->{$org_id}{$contig}}) {
171 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
172 :     # build up list of hits which overlap the region
173 : dsouza 1.2 my $overlapping_hits = [];
174 : dsouza 1.4 foreach my $hit (@{$patscan_hits->{$org_id}{$contig}}) {
175 : dsouza 1.2 my($hit_beg, $hit_end) = @$hit;
176 : parrello 1.5 if (FIG::between($reg_beg, $hit_beg, $reg_end) or FIG::between($reg_beg, $hit_end, $reg_end)) {
177 : dsouza 1.2 push @$overlapping_hits, [$hit_beg, $hit_end];
178 :     }
179 :     }
180 : parrello 1.6 Trace(scalar(@$overlapping_hits) . " overlapping hits found between $reg_beg and $reg_end.") if T(3);
181 : dsouza 1.4 if (@$overlapping_hits) {
182 :     # add the overlapping hits in the region
183 :     $region->{'hits'} = $overlapping_hits;
184 : dsouza 1.2 }
185 :     }
186 :     }
187 :     }
188 :     }
189 : parrello 1.6 # Now delete the unused genomes and contigs.
190 :     for my $org_id (@regionKeys) {
191 :     if (! exists $hitOrgs{$org_id}) {
192 :     delete $regions->{$org_id};
193 :     } else {
194 :     my $contigHash = $regions->{$org_id};
195 :     my @contigKeys = keys %{$contigHash};
196 :     for my $contig (@contigKeys) {
197 :     if (! exists $hitOrgs{$org_id}->{$contig}) {
198 :     delete $contigHash->{$contig};
199 :     }
200 :     }
201 :     }
202 :     }
203 : dsouza 1.4 return $regions;
204 : dsouza 1.2 }
205 :    
206 : dsouza 1.4 sub add_features_in_regions {
207 : dsouza 1.2 my($fig, $regions) = @_;
208 : dsouza 1.4 # add features in region to data structure containing closest pegs
209 : parrello 1.5 my @regionList = keys %$regions;
210 :     Trace("Looking for features in " . scalar(@regionList) . " regions.") if T(3);
211 :     foreach my $org_id (@regionList) {
212 :     Trace("Processing genome $org_id.") if T(3);
213 : dsouza 1.4 foreach my $contig (keys %{$regions->{$org_id}}) {
214 : parrello 1.5 Trace("Processing contig $contig.") if T(3);
215 : dsouza 1.4 foreach my $region (@{$regions->{$org_id}{$contig}}) {
216 :     # get closest peg and boundaries of region
217 :     my $peg = $region->{'peg'};
218 :     my $min = $region->{'reg_beg'};
219 :     my $max = $region->{'reg_end'};
220 : parrello 1.5 Trace("Region of interest is with $peg from $min to $max.") if T(3);
221 : dsouza 1.4 # get list of features which fall in this region
222 :     my($feature_ids) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
223 :    
224 :     # make up a list of all these features
225 : dsouza 1.2 my $features = [];
226 :     foreach my $fid (@$feature_ids) {
227 :     my $floc = $fig->feature_location($fid);
228 : parrello 1.5 my ($contig1,$beg1,$end1) = $fig->boundaries_of($floc);
229 : dsouza 1.2 $beg1 = &in_bounds($min,$max,$beg1);
230 :     $end1 = &in_bounds($min,$max,$end1);
231 : dsouza 1.4 push(@$features, { 'feat_beg' => $beg1,
232 :     'feat_end' => $end1,
233 :     'feat_id' => $fid });
234 : dsouza 1.2 }
235 : parrello 1.5 Trace(scalar(@$features) . " features found in region of interest.") if T(3);
236 : dsouza 1.4 # add the feature data to the region
237 : dsouza 1.2 $region->{'features'} = $features;
238 :     }
239 :     }
240 :     }
241 : parrello 1.5 Trace("Completed features-in-regions process.") if T(3);
242 : dsouza 1.2 return $regions;
243 :     }
244 :    
245 : dsouza 1.4 sub sort_regions {
246 :     my($fig, $regions) = @_;
247 :    
248 :     # sort regions based on location on the contig, regardless of strand
249 :     # i.e. sort on the 'left-most' location of the region on the contig
250 :     foreach my $org_id (keys %$regions) {
251 :     foreach my $contig (keys %{$regions->{$org_id}}) {
252 :     my $contig_regions = $regions->{$org_id}{$contig};
253 :    
254 :     if (@$contig_regions > 1) {
255 :     my @sorted_regions = map {$_->[0]}
256 :     sort {$a->[1] <=> $b->[1]}
257 :     map {[$_, $fig->min($_->{'reg_beg'}, $_->{'reg_end'})]}
258 :     @$contig_regions;
259 :    
260 :     $regions->{$org_id}{$contig} = \@sorted_regions;
261 : dsouza 1.2 }
262 :     }
263 :     }
264 :    
265 : dsouza 1.4 return $regions;
266 : dsouza 1.2 }
267 :    
268 : dsouza 1.4 sub collapse_regions {
269 :     my($fig, $regions, $window_size) = @_;
270 :     # collapse regions which display regions which show basically the same information
271 :     # if two (or more) regions:
272 :     # a. overlap by more than half the region size, and
273 :     # b. the pattern match is located entirely within the overlap,
274 :     # only one of the regions will be retained for display.
275 :     # since the regions are centred on the 'close' peg, this should give a reasonable
276 :     # neighborhood.
277 :     # if regions get collapsed, the region displayed is the 'left-most' on the contig.
278 :    
279 :     my $half_region = 0.5 * $window_size;
280 :    
281 :     foreach my $org_id (keys %$regions) {
282 :     foreach my $contig (keys %{$regions->{$org_id}}) {
283 :     my $contig_regions = $regions->{$org_id}{$contig};
284 :    
285 :     if (@$contig_regions > 1) {
286 :     # hash for index of regions which should not be displayed
287 :     my %discard;
288 :     for (my $i = 0; $i < @$contig_regions; $i++) {
289 :     for (my $j = ($i+1); $j < @$contig_regions; $j++) {
290 :     my $overlap = &regions_overlap($contig_regions->[$i], $contig_regions->[$j]);
291 :    
292 :     if ($overlap and
293 :     ($overlap > $half_region) and
294 :     &all_hits_overlap_region($contig_regions->[$i], $contig_regions->[$j]{'hits'}))
295 :     {
296 :     # region i display contains all relevant details from region j,
297 :     # region j can be discarded.
298 :    
299 :     $discard{$j} = 1;
300 :     }
301 :     else
302 :     {
303 :     # region i display does not contain all relevant details from region j,
304 :     # region j needs to be kept.
305 :     # next value of $i needs to be $j
306 :     $i = $j - 1;
307 :     # set $j so that we exit second loop
308 :     $j = @$contig_regions;
309 :     }
310 :     }
311 :     }
312 :    
313 :     my @keep = grep {not exists $discard{$_}} (0..$#{$contig_regions});
314 :     $regions->{$org_id}{$contig} = [@$contig_regions[@keep]];
315 :     }
316 :     }
317 :     }
318 :     return $regions;
319 :     }
320 :    
321 :     sub all_hits_overlap_region {
322 :     my($region, $hits) = @_;
323 :     # Check whether ALL hits overlap the region, if yes return 1, if no return 0
324 :     # Return 1 if no hits present
325 :     # This will have problems for hits which are longer than the region displayed
326 :    
327 :     my($reg_beg, $reg_end) = ($region->{'reg_beg'}, $region->{'reg_end'});
328 : dsouza 1.2
329 : dsouza 1.4 foreach my $hit (@$hits) {
330 :     my($hit_beg, $hit_end) = sort {$a <=> $b} @$hit;
331 :     # get length of hit
332 :     my $hit_ln = $hit_end - $hit_beg + 1;
333 :     if (&overlap($reg_beg, $reg_end, $hit_beg, $hit_end) < $hit_ln) {
334 :     # overlap is less than length of hit
335 :     return 0;
336 :     }
337 :     }
338 :    
339 :     # all hits overlap the region
340 :     return 1;
341 :     }
342 :    
343 :     sub regions_overlap {
344 :     my($r1, $r2) = @_;
345 :     # return overlap in bp between two regions
346 :     return &overlap($r1->{'reg_beg'}, $r1->{'reg_end'}, $r2->{'reg_beg'}, $r2->{'reg_end'});
347 :     }
348 :    
349 :     sub overlap {
350 :     my($x1, $x2, $y1, $y2) = @_;
351 :     # return 1 if regions [$x1,$x2] and [$y1,$y2] overlap, otherwise return 0
352 :    
353 :     my $overlap = 0;
354 :     ($x1, $x2) = sort {$a <=> $b} ($x1, $x2);
355 :     ($y1, $y2) = sort {$a <=> $b} ($y1, $y2);
356 :    
357 :     if (($x2 < $y1) or ($y2 < $x1)) {
358 :     $overlap = 0;
359 :     } elsif ($x1 <= $y1) {
360 :     if ($y2 <= $x2) {
361 :     $overlap = $y2 - $y1 + 1;
362 :     } else {
363 :     $overlap = $x2 - $y1 + 1;
364 :     }
365 :     } elsif ($y1 <= $x1) {
366 :     if ($x2 <= $y2) {
367 :     $overlap = $x2 - $x1 + 1;
368 :     } else {
369 :     $overlap = $y2 - $x1 + 1;
370 :     }
371 :     }
372 :    
373 :     return $overlap;
374 :     }
375 :    
376 :     sub closest_peg_regions {
377 :     my($params) = @_;
378 :    
379 : dsouza 1.2 # check parameters
380 : dsouza 1.4 my $fig = $params->{'fig'} || return undef;
381 :     my $closest_pegs = $params->{'closest_pegs'} || return undef;
382 :     my $window_size = $params->{'window_size'} || return undef;
383 :     my $half_region = int($window_size/2);
384 :    
385 : dsouza 1.2 # initialize data variable
386 : dsouza 1.4 my $regions = {};
387 : parrello 1.5 Trace(scalar(@$closest_pegs) . " closest pegs coming in.") if T(3);
388 : dsouza 1.4 # iterate over the pegs
389 :     foreach my $peg (@$closest_pegs) {
390 :     # get location of peg
391 : dsouza 1.2 my $loc = $fig->feature_location($peg);
392 : dsouza 1.4 my($peg_contig,$peg_beg,$peg_end) = $fig->boundaries_of($loc);
393 :     if ($peg_contig && $peg_beg && $peg_end) {
394 :     # get organism ID
395 : dsouza 1.2 my $org_id;
396 : dsouza 1.4 if ($peg =~ /^fig\|([^\|]+)\|/) {
397 : dsouza 1.2 $org_id = $1;
398 :     } else {
399 :     my $org_name = $fig->org_of($peg);
400 :     $org_id = $fig->orgid_of_orgname($org_name);
401 :     }
402 : dsouza 1.4
403 :     # find mid-point of peg
404 :     my $mid = int(($peg_beg + $peg_end)/2);
405 : dsouza 1.2
406 : dsouza 1.4 # add peg, peg location and region location to hash
407 :     push @{ $regions->{$org_id}{$peg_contig} }, {'peg' => $peg,
408 :     'peg_beg' => $peg_beg,
409 :     'peg_end' => $peg_end,
410 :     'reg_beg' => ($mid - $half_region),
411 :     'reg_end' => ($mid + $half_region)};
412 : dsouza 1.2 }
413 :     }
414 :    
415 : parrello 1.5 Trace(scalar(keys %$regions) . " regions found.") if T(3);
416 : dsouza 1.4 # return regions data
417 :     return $regions;
418 : dsouza 1.2 }
419 :    
420 : paczian 1.1 sub get_closest_pegs {
421 : dsouza 1.4 my ($params) = @_;
422 :     # returns the n closest pegs, sorted by taxonomy
423 :     # check parameters
424 :     my $fig = $params->{'fig'} || return undef;
425 :     my $peg = $params->{'id'} || return undef;
426 :     my $n = $params->{'number_of_genomes'} || 50000; # return ALL closest pegs
427 : parrello 1.6 Trace("Looking for closest peg to $peg in $n genomes.") if T(3);
428 :     # Create a hash of legal genomes.
429 :     my %genomeIDs = map { $_ => 1 } $fig->genomes();
430 : dsouza 1.4
431 :     # get the n pegs closest to the one passed
432 : paczian 1.1
433 : dsouza 1.4 my($id2,$d,$peg2,$i);
434 :    
435 :     my @closest;
436 : parrello 1.5 @closest = map { $id2 = $_->id2; ($id2 =~ /^fig\|/) ? $id2 : () } $fig->sims($peg,&FIG::max(20,$n*4),1.0e-20,"fig",&FIG::max(20,$n*4));
437 : dsouza 1.4
438 :     if (@closest >= ($n-1)) {
439 :     $#closest = $n-2 ;
440 :     }
441 :     my %closest = map { $_ => 1 } @closest;
442 :    
443 : parrello 1.6 my $g1 = $fig->genome_of($peg);
444 : dsouza 1.4 # there are dragons flying around...
445 : parrello 1.6 Trace("Checking pins.") if T(3);
446 :     my @pinned_to = grep { ($_ ne $peg) && (! $closest{$_}) && $genomeIDs{$fig->genome_of($_)} } $fig->in_pch_pin_with($peg);
447 :     Trace("Mapping by distance.") if T(3);
448 : dsouza 1.4 @pinned_to = map {$_->[1] } sort { $a->[0] <=> $b->[0] } map { $peg2 = $_; $d = $fig->crude_estimate_of_distance($g1,$fig->genome_of($peg2)); [$d,$peg2] } @pinned_to;
449 : parrello 1.6 Trace("computing pin stuff.") if T(3);
450 : dsouza 1.4 if (@closest == ($n-1)) {
451 :     $#closest = ($n - 2) - &FIG::min(scalar @pinned_to,int($n/2));
452 :     for ($i=0; ($i < @pinned_to) && (@closest < ($n-1)); $i++) {
453 :     if (! $closest{$pinned_to[$i]}) {
454 :     $closest{$pinned_to[$i]} = 1;
455 :     push(@closest,$pinned_to[$i]);
456 :     }
457 :     }
458 :     }
459 : parrello 1.6 # Trace("Checking for extensions.") if T(3);
460 :     # if ($fig->possibly_truncated($peg)) {
461 :     # push(@closest, &possible_extensions($fig, $peg, \@closest));
462 :     # }
463 :     Trace("Sorting by taxonomy.") if T(3);
464 : dsouza 1.4 @closest = $fig->sort_fids_by_taxonomy(@closest);
465 :    
466 :     unshift(@closest, $peg);
467 : parrello 1.5 Trace("Returning " . scalar(@closest) . " close pegs.") if T(3);
468 : dsouza 1.4 return \@closest;
469 : paczian 1.1 }
470 :    
471 :     sub in_bounds {
472 :     my($min,$max,$x) = @_;
473 :    
474 :     if ($x < $min) { return $min }
475 :     elsif ($x > $max) { return $max }
476 :     else { return $x }
477 :     }
478 :    
479 :     sub possible_extensions {
480 :     my($fig, $peg,$closest_pegs) = @_;
481 :     my($g,$sim,$id2,$peg1,%poss);
482 :    
483 :     $g = &FIG::genome_of($peg);
484 :    
485 :     foreach $peg1 (@$closest_pegs) {
486 : parrello 1.6 if ($g ne $fig->genome_of($peg1)) {
487 : dsouza 1.4 foreach $sim ($fig->sims($peg1,500,1.0e-5,"all")) {
488 :     $id2 = $sim->id2;
489 :     if (($id2 ne $peg) && ($id2 =~ /^fig\|$g\./) && $fig->possibly_truncated($id2)) {
490 :     $poss{$id2} = 1;
491 :     }
492 :     }
493 : paczian 1.1 }
494 :     }
495 :     return keys(%poss);
496 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3