[Bio] / FigKernelPackages / ComparedRegions.pm Repository:
ViewVC logotype

View of /FigKernelPackages/ComparedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Tue Jun 19 18:52:40 2007 UTC (12 years, 11 months ago) by dsouza
Branch: MAIN
Changes since 1.1: +204 -0 lines
Added routines to handle scan_for_matches hits. MD

package ComparedRegions;

1;

use strict;
use warnings;

use FIG;
use FIGV;

sub get_compared_regions {
  my ($params) = @_;

  # check parameters
  my $fig = $params->{fig} || return undef;
  my $peg = $params->{id} || return undef;
  my $is_sprout = $params->{is_sprout} || 0;
  my $region_size = $params->{region_size} || 16000;
  my $number_of_genomes = $params->{number_of_genomes} || 5;

  # initialize data variable
  my $data = [];

  # get the n pegs closest to the one passed
  my @closest_pegs = &get_closest_pegs($fig, $peg, $is_sprout, $number_of_genomes);
  unshift(@closest_pegs, $peg);
  
  # iterate over the returned pegs
  foreach my $peg (@closest_pegs) { 
    my $loc = $fig->feature_location($peg);
    my ($contig,$beg,$end) = $fig->boundaries_of($loc);
    if ($contig && $beg && $end) {
      my $mid = int(($beg + $end) / 2);
      my $min = int($mid - ($region_size / 2));
      my $max = int($mid + ($region_size / 2));
      my $features = [];
      my $feature_ids;
      ($feature_ids, undef, undef) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
      foreach my $fid (@$feature_ids) {
	my $floc = $fig->feature_location($fid);
	my ($contig1,$beg1,$end1) = $fig->boundaries_of($fig->feature_location($fid));
	$beg1 = &in_bounds($min,$max,$beg1);
	$end1 = &in_bounds($min,$max,$end1);
	push(@$features, { 'start' => $beg1,
			   'stop' => $end1,
			   'id' => $fid });
      }
      push(@$data, { 'features' => $features,
		     'contig' => $contig,
		     'offset' => $mid - 8000 });
    }
  }

  # return the data
  return $data;
}

sub get_pattern {
    my($params) = @_;
    # get pattern

    my $pattern = '';
    my $line;

    my $file = $params->{'file'} || return undef;

    if ( $file =~ /^([a-z0-9]+)$/ ) {
	$file = 'tmp_' . $file . '.fasta';
    } elsif ( $file =~ /^(tmp_[a-z0-9]+)\.cache$/ ) {
	$file = $1 . '.fasta';
    }

    # add path -- need to check if temp directory is correct
    $file    = "$FIG_Config::temp/$file";
    (-e $file) || return undef;

    open(PAT, "<$file") or die "could not open file '$file': $!";
    while ( defined($line = <HITS>) ) {
	chomp $line;
	$pattern .= $line;
    }
    close(PAT) or die "could not close file '$file': $!";

    return $pattern;
}

sub get_marked_locations {
    my($params) = @_;
    # get Bruce's hit locations

    my $fig  = $params->{'fig'} || return undef;
    # my $file = $params->{'file_name'} || return undef;
    my $file = $params->{'file'} || 'tmp_19857906a0d1ea9aa7ce1a91a77d11b9.cache';

    if ( $file =~ /^[a-z0-9]+$/ ) {
	$file = 'tmp_' . $file . '.cache';
    }
    # add path -- need to check if temp directory is correct
    $file    = "$FIG_Config::temp/$file";
    (-e $file) || return undef;

    my($line, %locations);
    open(HITS, "<$file") or die "could not open file '$file': $!";
    $line = <HITS>;  # column header line
    while ( defined($line = <HITS>) ) {
	my($hit) = split(/\s+/, $line);

	if ( $hit =~ /^fig\|(\S+)\.peg\.(\d+)_(\d+)\+(\d+)$/ ) {
	    # output from protein sequence scan
	    my($org_id, $peg_num, $offset, $ln) = ($1, $2, $3, $4);

	    my $fid = 'fig|' . $org_id . '.peg.' . $peg_num;
	    my $loc = $fig->feature_location($fid);
	    my($contig,$beg,$end) = $fig->boundaries_of($loc);

	    my($hit_beg, $hit_end);

	    if ( $beg <= $end ) {
		$hit_beg = $beg + ($offset * 3);
		$hit_end = $hit_beg + ($ln * 3);
	    } else {
		$hit_beg = $beg - ($offset * 3);
		$hit_end = $hit_beg - ($ln * 3);
	    }

	    push @{$locations{$org_id}{$contig}}, [$hit_beg, $hit_end];
	} else {
	    # output from DNA scan
	    # this will need to be cleaned up -- check Bruce's file format
	    my($org_id, $loc) = split(":", $hit);
	    my($contig, $beg, $end) = $fig->boundaries_of($fig->feature_location($loc));
	    push @{$locations{$org_id}{$contig}}, [$beg, $end];
	}
    }
    close(HITS) or die "could not close file '$file': $!";
    return \%locations;
}

sub get_hits_in_regions {
    my($fig, $peg_regions, $hit_loc) = @_;
    # find scan_for_matches hits (from $hit_loc) which overlap the regions 
    # containing the closest pegs (from $peg_regions)
    
    my %regions;
    
    # should probably sort hits & regions and perform search intelligently
    foreach my $org_id ( keys %$peg_regions ) {
	foreach my $contig ( keys %{$peg_regions->{$org_id}} ) {
	    if ( exists $hit_loc->{$org_id} and exists $hit_loc->{$org_id}{$contig} ) {
		foreach my $rec ( @{$peg_regions->{$org_id}{$contig}} ) {
		    my($reg_beg, $reg_end, $peg) = @$rec;
		    my $overlapping_hits = [];
		    foreach my $hit ( @{$hit_loc->{$org_id}{$contig}} ) {
			my($hit_beg, $hit_end) = @$hit;
			if ( $fig->between($reg_beg, $hit_beg, $reg_end) or $fig->between($reg_beg, $hit_end, $reg_end) ) {
			    push @$overlapping_hits, [$hit_beg, $hit_end];
			}
		    }

		    if ( @$overlapping_hits ) {
			push @{$regions{$org_id}{$contig}}, {'id'    => $peg,
							     'start' => $reg_beg,
							     'stop'  => $reg_end,
							     'hits'  => $overlapping_hits};
		    }
		}
	    }
	}
    }
    
    return \%regions;
}

sub get_features_in_regions {
    my($fig, $regions) = @_;
    # add features in region to data structure containing closest pegs, with hits in defined window
    foreach my $org_id ( keys %$regions ) {
	foreach my $contig ( keys %{$regions->{$org_id}} ) {
	    foreach my $region ( @{$regions->{$org_id}{$contig}} ) {
		my $peg = $region->{'id'};
		my $min = $region->{'start'};
		my $max = $region->{'stop'};

		my $features = [];
		my $feature_ids;
		($feature_ids, undef, undef) = $fig->genes_in_region($fig->genome_of($peg),$contig,$min,$max);
		foreach my $fid (@$feature_ids) {
		    my $floc = $fig->feature_location($fid);
		    my ($contig1,$beg1,$end1) = $fig->boundaries_of($fig->feature_location($fid));
		    $beg1 = &in_bounds($min,$max,$beg1);
		    $end1 = &in_bounds($min,$max,$end1);
		    push(@$features, { 'start' => $beg1,
				       'stop' => $end1,
				       'id' => $fid });
		}

		$region->{'features'} = $features;
	    }
	    
	}
    }

    return $regions;
}

sub expand_region {
    my($loc, $region_size) = @_;
    my $half_region = int($region_size/2);
    
    foreach my $org_id ( keys %$loc ) {
	foreach my $contig ( keys %{$loc->{$org_id}} ) {
	    foreach my $rec ( @{$loc->{$org_id}{$contig}} ) {
		my($beg, $end, $peg) = @$rec;
		my $mid = int(($beg + $end)/2);
		$rec = [($mid - $half_region), ($mid + $half_region), $peg];
	    }
	}
    }

    return $loc;
}

sub get_closest_pegs_with_locs {
    my ($params) = @_;
    # return the locations of the closest pegs
    
    # check parameters
    my $fig = $params->{fig} || return undef;
    my $peg = $params->{id}  || return undef;
    my $is_sprout = $params->{is_sprout} || 0;
    my $number_of_genomes = $params->{number_of_genomes} || 50000; # return ALL closest pegs
    
    # initialize data variable
    my $locations = {};
    
    # get the n pegs closest to the one passed
    my @closest_pegs = &get_closest_pegs($fig, $peg, $is_sprout, $number_of_genomes);
    unshift(@closest_pegs, $peg);
    
    # iterate over the returned pegs
    foreach my $peg (@closest_pegs) { 
	my $loc = $fig->feature_location($peg);
	my ($contig,$beg,$end) = $fig->boundaries_of($loc);
	if ($contig && $beg && $end) {
	    my $org_id;
	    if ( $peg =~ /^fig\|([^\|]+)\|/ ) {
		$org_id = $1;
	    } else {
		my $org_name = $fig->org_of($peg);
		$org_id      = $fig->orgid_of_orgname($org_name);
	    }
		
	    # add peg and location
	    push @{ $locations->{$org_id}{$contig} }, [$beg, $end, $peg];
	}
    }
	
    # return the locations of the closest pegs
    return $locations;
}

# returns the n closest pegs, sorted by taxonomy
sub get_closest_pegs {
  my ($fig, $peg, $is_sprout, $n) = @_;

  my($id2,$d,$peg2,$i);

  my @closest;
  if ($is_sprout) {
    @closest = map { $_->[0] } sort { $a->[1] <=> $b->[1] } $fig->bbhs($peg, 1.0e-10);
  } else {
    @closest = map { $id2 = $_->id2; ($id2 =~ /^fig\|/) ? $id2 : () } $fig->sims($peg,&FIG::max(20,$n*4),1.0e-20,"fig",&FIG::max(20,$n*4));
  }

  if (@closest >= ($n-1)) { 
    $#closest = $n-2 ;
  }
  my %closest = map { $_ => 1 } @closest;
  
  # there are dragons flying around...
  my @pinned_to = grep { ($_ ne $peg) && (! $closest{$_}) } $fig->in_pch_pin_with($peg);
  my $g1 = $fig->genome_of($peg);
  @pinned_to = map {$_->[1] } sort { $a->[0] <=> $b->[0] } map { $peg2 = $_; $d = $fig->crude_estimate_of_distance($g1,$fig->genome_of($peg2)); [$d,$peg2] } @pinned_to;
  
  if (@closest == ($n-1)) {
    $#closest = ($n - 2) - &FIG::min(scalar @pinned_to,int($n/2));
    for ($i=0; ($i < @pinned_to) && (@closest < ($n-1)); $i++) {
      if (! $closest{$pinned_to[$i]}) {
	$closest{$pinned_to[$i]} = 1;
	push(@closest,$pinned_to[$i]);
      }
    }
  }

  if ($fig->possibly_truncated($peg)) {
    push(@closest, &possible_extensions($fig, $peg, \@closest));
  }
  @closest = $fig->sort_fids_by_taxonomy(@closest);

  return @closest;

}

sub in_bounds {
    my($min,$max,$x) = @_;

    if     ($x < $min)     { return $min }
    elsif  ($x > $max)     { return $max }
    else                   { return $x   }
}

sub possible_extensions {
  my($fig, $peg,$closest_pegs) = @_;
  my($g,$sim,$id2,$peg1,%poss);
  
  $g = &FIG::genome_of($peg);
  
  foreach $peg1 (@$closest_pegs) {
    if ($g ne &genome_of($peg1)) {
      foreach $sim ($fig->sims($peg1,500,1.0e-5,"all")) {
	$id2 = $sim->id2;
	if (($id2 ne $peg) && ($id2 =~ /^fig\|$g\./) && $fig->possibly_truncated($id2)) {
	  $poss{$id2} = 1;
	}
      }
    }
  }
  return keys(%poss);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3