[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

View of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Feb 5 21:04:36 2008 UTC (12 years, 1 month ago) by dsouza
Branch: MAIN
Changes since 1.4: +51 -7 lines
1. Added FigFam name to feature data (for PEGs in FigFams).
2. Added support for user selection of region genomes.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################


package PinnedRegions;

use strict;
use warnings;

use FIG;
use FigFams;
use FIG_Config;

use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );

sub pinned_regions {
    my($fig, $pin_desc, $fast_color, $sims_from, $map_sz) = @_;

    # Get list of pegs required by the description in $pin_desc
    my $pinned_pegs = &expand_peg_list($fig, $pin_desc);

    # Get regions around pinned pegs -- boundaries, features, etc.
    my $regions = &define_regions($fig, $map_sz, $pinned_pegs);

    # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
    # are pinned (by similarity or PCH) to the input PEG
    $regions = &filter_regions_1($pin_desc, $regions);

    # Get information for each feature -- location, strand, function etc.
    my $feature_data = &feature_data($fig, $regions);

    &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
    &add_figfams($fig, $feature_data);
    &add_subsystem_data($fig, $pin_desc, $feature_data);
    
    # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
    &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);

    # Filter out regions which have only a single PEG (the pinned one) colored.
#    $regions = &filter_regions_2($pin_desc, $regions, $feature_data);

    # Add feature data to the regions to make the final maps
    my $maps = &make_maps($fig, $regions, $feature_data);

    return $maps;
}

sub expand_peg_list {
    my($fig, $pin_desc) = @_;
    my @pegs = ();

    # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
    my %ok_genome_id = map {$_ => 1} $fig->genomes;

    # If the user has selected the genomes to be displayed, filter out all other genomes
    if ( @{ $pin_desc->{'show_genomes'} } )
    {
	# create new %ok_genome_id hash from intersection of selected genomes and legitimate genome IDs
	%ok_genome_id = map {$_ => 1} grep {$ok_genome_id{$_}} @{ $pin_desc->{'show_genomes'} };
    }

    if ( @{ $pin_desc->{'pegs'} } > 1 )
    {
	# Input already contains list of pegs, no need for expansion
	@pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
    }
    else
    {
	# Get PCH pinned pegs
	my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);

	$pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);

	# Get similarity pinned pegs
	my $sim_pegs        = &sim_pegs($fig, $pin_desc, \%ok_genome_id);

	# Remove PCH PEGs (if any) from similarity pinned PEG list
	my %seen  = map {$_ => 1} @$pch_pinned_pegs;
	$sim_pegs = [grep {! $seen{$_}} @$sim_pegs];

	$sim_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_sims'}, $sim_pegs);

	@pegs = ();

	# Get input peg
	my $peg = $pin_desc->{'pegs'}[0];
	if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
	{
	    # First add input peg, then sort by phylogeny
	    @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
	}
	else 
	{
	    # Sort by phylogenetic distance or organism name
	    my $g1 = $fig->genome_of($peg);
	    @pegs  = map {$_->[0] } 
	               sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]} 
	                 map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
	    # Add input peg at front of list
	    unshift @pegs, $peg;
	}
     }

    return \@pegs
}

sub pch_pinned_pegs {
    my($fig, $pin_desc, $ok_genome_id) = @_;

    my @pegs = ();

    if ( $pin_desc->{'n_pch_pins'} > 0 )
    {
	# Get input peg
	my $peg = $pin_desc->{'pegs'}[0];

	# Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
	@pegs = grep {$ok_genome_id->{$fig->genome_of($_)}} 
	          grep {$_ ne $peg} 
                    map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
    }

    return \@pegs;
}

sub sim_pegs {
    my($fig, $pin_desc, $ok_genome_id) = @_;

    # Return a list of PEGS similar to the input PEG, with evalue less than or equal to 
    # the user defined similarity cutoff.
    # If the number of sims requested is 0, return the empty list 

    my $n_sims     = $pin_desc->{'n_sims'};
    my $sim_cutoff = $pin_desc->{'sim_cutoff'};
    my @pegs = ();
    
    if ( $n_sims > 0 )
    {
	my $peg = $pin_desc->{'pegs'}[0];
	my %seen;
 	@pegs = grep {$ok_genome_id->{$fig->genome_of($_)}} 
	          grep {! $seen{$_}++}
	            map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");

	my $n_pegs = @pegs;
    }

    return \@pegs;
}

sub select_pegs {
    my($fig, $pin_desc, $n_pegs, $pegs) = @_;

    if ( $n_pegs == 0 )
    {
	return [];
    }

    if ( $pin_desc->{'collapse_close_genomes'} == 1 )
    {
	# The ordering of the PEGs done here is in order to =select= the correct set, the ordering for the
	# display is done later.

	# To collapse the PEG list, we want to:
	# a. Return at most $n_pegs PEGs, 
	# b. include a representative PEG from every genus, subject to a, and 
	# c. include a representative PEG from each genus-species, subject to a and b.
	# Individual strains will get represented by a single PEG, chosen arbitrarily.

	# The PEG selection is defined by the order of the PEGs. This could be done beter.
	
	my(%seen, @unique_genus, @unique_genus_species);
	
	foreach my $peg ( @$pegs )
	{
	    my $org = $fig->org_of($peg);
	    # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
	    
	    if ( $org )
	    {
		# Use only genus+species to drop strain information
		my($genus, $species) = split(/\s+/, $org);
		my $gs = "$genus $species";

		if ( not $seen{$genus}++ ) 
		{
		    # First PEG from this genus, add it to @unique_genus.
		    # Mark the genus+species as seen.
		    # A subsequent PEG with the same genus and species will be dropped.
		    # A subsequent PEG with the same genus but different species will be added to @unique_genus_species.
		    $seen{$gs}++;
		    push @unique_genus, $peg;
		} 
		elsif ( not $seen{$gs}++ ) 
		{
		    # First PEG from this genus+species, add it to @unique_genus_species.
		    push @unique_genus_species, $peg;
		}
	    }
	}

	# Keep the unique_genus PEGS at the top, followed by the unique_genus_species
	$pegs = [@unique_genus, @unique_genus_species];
    }

    # Truncate list if necessary
    if ( $n_pegs < @$pegs )
    {
	$#{ $pegs } = $n_pegs - 1;
    }
    
    return $pegs;
}

sub filter_regions_1 {
    my($pin_desc, $regions) = @_;
    my $new_regions;

    # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
    # are pinned (by similarity or PCH) to the input PEG
    # Note that all overlaps are NOT eliminated

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Input pin description is for a single input peg
	my %seen;
	
	foreach my $region ( @$regions )
	{
	    my $pinned_peg = $region->{'pinned_peg'};
	    
	    # If the pinned peg from a region has already been 'seen', don't
	    # add the region into the @$new_regions array
	    
	    if ( not $seen{$pinned_peg} )
	    {
		push @$new_regions, $region;
		
		my $fids = $region->{'features'};
		foreach my $fid ( @$fids )
		{
		    $seen{$fid} = 1;
		}
	    }
	}
    }
    else
    {
 	# input PEGs is a list -- keep all of them
 	$new_regions = $regions;
    }

    return $new_regions;
}

sub filter_regions_2 {
    my($pin_desc, $regions, $feature_data) = @_;
    my $new_regions;

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Input is single peg
	my $input_peg = $pin_desc->{'pegs'}[0];

	foreach my $region ( @$regions )
	{
	    my $fids = $region->{'features'};
	    my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
	    
	    if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
	    {
		# Regions should be displayed only if:
		# a. more than one PEG is in a 'colored set' OR
		# b. the pinned PEG is the input PEG
		push @$new_regions, $region;
	    }
	}
    }
    else
    {
	# Input is a list of pegs -- keep all of the regions
	$new_regions = $regions
    }

    return $new_regions;
}

sub make_maps {
    my($fig, $regions, $feature_data) = @_;

    foreach my $region ( @$regions )
    {
	my $features = [];
	my $fids = $region->{'features'};

	foreach my $fid ( @$fids )
	{
	    push @$features, $feature_data->{$fid};
	}

	$region->{'features'} = $features;
    }

    return $regions;
}

sub define_regions {
    my($fig, $map_sz, $pinned_pegs) = @_;

    # Define the 'region' around the pinned peg, get features, and store some region information
    my $regions = [];
    my $half_map_sz = int($map_sz/2);

    foreach my $peg ( @$pinned_pegs )
    {
	my $genome = $fig->genome_of($peg);
	my $loc    = $fig->feature_location($peg);
	my($contig, $beg, $end) = $fig->boundaries_of($loc);

	my $region_mid = int(($beg + $end)/2);
	my $region_beg = $region_mid - $half_map_sz;
	my $region_end = $region_mid + $half_map_sz;

	my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);

	my $region = {};
	$region->{'genome_id'} = $fig->genome_of($peg);
	$region->{'org_name'}  = $fig->genus_species($region->{'genome_id'});
	$region->{'contig'}    = $contig;
	$region->{'beg'}       = $region_beg;
	$region->{'mid'}       = $region_mid;
	$region->{'end'}       = $region_end;
	$region->{'features'}  = $fids;
	$region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
	$region->{'pinned_peg'} = $peg;

	push @$regions, $region;
    }

    return $regions;
}

sub feature_data {
    my($fig, $regions) = @_;
    my %feature_data;

    foreach my $region ( @$regions )
    {
	my $region_mid = $region->{'mid'};
	my $fids       = $region->{'features'};

	foreach my $fid ( @$fids )
	{
	    # Get feature data if this feature has not been seen before, this avoids repeating
	    #  this step when pegs occur in multiple regions
	    if ( not exists $feature_data{$fid} )
	    {
		$feature_data{$fid} = &feature_entry($fig, $fid, $region_mid);
	    }
	}
    }

    return \%feature_data;
}

sub feature_entry {
    my($fig, $fid, $region_mid) = @_;

    my $type                = $fig->ftype($fid);
    my $loc                 = $fig->feature_location($fid);
    my($contig, $beg, $end) = $fig->boundaries_of($loc);
    my($left, $right)       = sort {$a <=> $b} ($beg, $end);
    my $size                = $right - $left + 1;
    my $strand              = ($beg <= $end)? '+' : '-';
    my $offset              = int(($left + $right)/2) - $region_mid;
    my $offset_beg          = $left  - $region_mid;
    my $offset_end          = $right - $region_mid;
    my $func                = scalar $fig->function_of($fid) || '';

    return {
	     'fid'        => $fid,
	     'type'       => $type,
	     'contig'     => $contig,
	     'beg'        => $beg,
	     'end'        => $end,
	     'size'       => $size,
	     'strand'     => $strand,
	     'offset'     => $offset,
	     'offset_beg' => $offset_beg,
	     'offset_end' => $offset_end,
	     'function'   => $func,
	 };
}

sub add_functional_coupling {
    my($fig, $pin_desc, $regions, $feature_data) = @_;

    my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;

    foreach my $region ( @$regions )
    {
	my $pin  = $region->{'pinned_peg'};
	my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };

	foreach my $couple ( $fig->coupled_to_batch(@pegs) )
	{
	    my($peg1, $peg2, $sc) = @$couple;
	    
	    if ( $peg1 eq $pin and $sc >= $fc_cutoff )
	    {
		$feature_data->{$peg2}{'fc_score'} = $sc;
	    }
	}
    }
}

sub add_figfams {
    my($fig, $feature_data) = @_;

    my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;

    my $figfam_data = $FIG_Config::FigfamsData;
    my $figfams     = new FigFams($fig, $figfam_data);

    my $figfam          = $figfams->families_containing_peg_bulk(\@pegs);
    my $family_function = $figfams->family_functions();
    
    foreach my $fid  ( keys %$feature_data )
    {
	if ( $figfam->{$fid} )
	{
	    $feature_data->{$fid}{'figfam'} = $figfam->{$fid} . ": " . $family_function->{$figfam->{$fid}};
	}
    }
}

sub add_subsystem_data {
    my($fig, $pin_desc, $feature_data) = @_;

    # Get subsystem_information for =all= pegs
    my %peg_to_ss = $fig->subsystems_for_pegs([grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data]);
    
    # Count number of occurences of each subsystem
    my %ss_count;
    foreach my $ss ( map {$_->[0]} map {@$_} values %peg_to_ss )
    {
	$ss_count{$ss}++;
    }

    # Sort subsystems based on count and create hash with unique index for each subsystem where 
    # lower indices correspond to higher number of occurences of the subsystem.
    my %ss_index;
    my $index = 0;
    foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
    {
	$ss_index{$ss} = ++$index;
    }

    # Add subsystem information for pegs in subsystems
    foreach my $fid ( keys %peg_to_ss )
    {
	my @subsystems = ();
	foreach my $rec ( @{ $peg_to_ss{$fid} } )
	{
	    if ( @$rec )
	    {
		push @subsystems, $rec->[0];
	    }
	}

	if ( @subsystems )
	{
	    $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} map {$_->[0] =~ s/_/ /g; $_} map {[$_, $ss_index{$_}]} @subsystems];
	}
    }
}

sub color_pegs {
    my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;

    # Run blastp and get connected pegs
    my $blast_hit = {};
    my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'}; 

    if ( $sims_from eq 'blast' )
    {
	$blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
    }
    else
    {
	$blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
    }

    # Assign set numbers to pegs based on blast scores
    my $color_sets = &partition_pegs($blast_hit);

    # Sort peg sets to that a) larger sets have smaller set numbers, and
    #                       b) sets closer to the pinned peg have smaller set numbers
    $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);

    # Add color set number into $feature_data 
    for (my $i = 0; $i < @$color_sets; $i++)
    {
	# Use an index that starts at 1 (matches with coloring index)
	my $n = $i + 1;
	foreach my $peg ( @{ $color_sets->[$i] } )
	{
	    $feature_data->{$peg}{'set_number'} = $n;
	    
	    # If this is the pinned set, set the 'color' to 'red'
	    if ( $n == 1 )
	    {
		$feature_data->{$peg}{'color'} = 'red';
	    }
	}
    }
}

sub sort_color_sets {
    my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;

    # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
    # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
    # set.
    # If the input peg is not found, the returned value is an reference to an array containing the input peg.
    my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
    
    my $pinned_color_set;
    if ( $set =~ /^\d+$/ )
    {
	# Splice out the color set containing the input peg
	($pinned_color_set) = splice(@$color_sets,$set,1);
    }
    else
    {
	$pinned_color_set = $set;
    }

    # Add offset (summed distance from 
    my @color_sets = map {[$_, &offset($_, $feature_data)]}  @$color_sets;
    # Sort the remaining color sets based on:
    # a. size (number of pegs) and
    # b. offset from mid-point of region
    @$color_sets = map {$_->[0]} 
                     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]} 
#                     sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}} 
                       map {[$_, &offset($_, $feature_data)]}  @$color_sets;

    # Add the set of pinned pegs at the front of the returned list so that it gets colored red
    return [$pinned_color_set, @$color_sets];
}

sub pinned_set {
    my($pin_desc, $pinned_pegs, $color_sets) = @_;

    # Returns an integer if the input is a peg-list or if the input peg is in a set.
    # Returns the input peg (as an array reference) if the input peg is not in a set.

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Get input peg if it exists -- the set containing this peg should be colored red
	my $peg = $pin_desc->{'pegs'}[0];

	# Iterate through the color sets until you find the set containing the input peg
	for (my $i = 0; $i < @$color_sets; $i++)
	{
	    foreach my $peg2 ( @{ $color_sets->[$i] } )
	    {
		if ( $peg2 eq $peg )
		{
		    # Return the set index
		    return $i;
		}
	    }
	}

	# The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
	return [$peg];
    }
    else
    {
	# Input is a peg-list, which may be split into multiple sets -- the largest will be called the 
	# "pinned" set.
	my($i, $max_count);
	my %pinned = map {$_ => 1} @$pinned_pegs;

	for (my $j = 0; $j < @$color_sets; $j++)
	{
	    # count how many pinned pegs are found in each set
	    my $count = scalar grep {$pinned{$_}} @{ $color_sets->[$j] };

	    if ( $max_count < $count )
	    {
		# Keep track of the set having the largest number of pinned pegs
		$max_count = $count;
		$i = $j;
	    }
	}

	return $i;
    }
}

sub offset {
    my($set, $feature_data) = @_;

    my $offset;
    foreach my $peg ( @$set )
    {
	$offset += abs($feature_data->{$peg}{'offset'});
    }
    
    return sprintf("%.2f", $offset/@$set);
}

sub partition_pegs {
    my($blast_hit) = @_;

    my %seen;
    my $sets = [];

    # Iterate through the pegs with blast hits in arbitrary order
    foreach my $peg1 ( keys %$blast_hit )
    {
	# If it has not been 'seen' (and placed in a set) use it as the seed for a set
	if ( not $seen{$peg1} )
	{
	    my $set = [$peg1];

	    # Mark $peg1 as seen
	    $seen{$peg1} = 1;

	    # Grow set through transitive closure -- note that @$set keeps growing
	    for (my $i = 0; $i < @$set; $i++)
	    {
		$peg1 = $set->[$i];

		# Iterate through blast hits
		foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
		{
		    # If $peg2 has not been seen, put it in the set, and mark it as seen
		    if ( not exists $seen{$peg2} )
		    {
			push @$set, $peg2;
			$seen{$peg2} = 1;
		    }
		}
	    }

	    # Add the new set to the set of sets
	    push @$sets, $set;
	}
    }

    return $sets;
}

sub blast_hits {
    my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
    my($t0, $dt);

    # Set blast environment variables
    $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
    if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }

    # Get amino acid sequences
    my $sequences = &get_peg_sequences($fig, $feature_data);

    # Write the entire set of sequences to a fasta file
    my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
    &write_seqs_to_file($sequences, $all_seqs_file);

    # Run formatdb
    &formatdb_file($fig, $all_seqs_file);

    # If $fast_color == 0, the complete blast of all vs. all sequences is performed
    # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
    # is blasted against all sequences. If any of the hits are better than a given cutoff
    # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this 
    # peg will not yield additional information for forming the peg sets. These 'done' pegs
    # are then omitted from the blast when the sequences from the region they belong to 
    # is blasted.
    my %blast_hit;
    if ( $fast_color )
    {
	my $cutoff_2 = $color_sim_cutoff * 1e-20;
	my %done_with;

	foreach my $region ( @$regions )
	{
	    # Iterate through each region
	    my $fids = $region->{'features'};

	    # Build up a hash of peg sequences which are not 'done_with'
	    my %region_seqs;
	    foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
	    {
		if ( $sequences->{$peg} and not $done_with{$peg} )
		{
		    $region_seqs{$peg} = $sequences->{$peg};
		}
	    }

	    if ( scalar keys %region_seqs )
	    {
		# Write the region sequences to a file
		my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
		&write_seqs_to_file(\%region_seqs, $region_seqs_file);
		
		# BLAST the region sequences against all other sequences
		my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
		
		# Build up hash of blast hits
		foreach my $hit ( @$hits )
		{
		    my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
		
		    if ( $peg1 ne $peg2 )
		    {
			$blast_hit{$peg1}{$peg2} = 1;
			$blast_hit{$peg2}{$peg1} = 1;

			# If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so 
			# that it will not be blasted with it's region sequences
			if ( $sc <= $cutoff_2 )
			{
			    $done_with{$peg2} = 1;
			}
		    }
		}
	    }
	}
    }
    else
    {
	# BLAST sequence file against itself
	my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);

	# Build up hash of blast hits
	foreach my $hit ( @$hits )
	{
	    my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];

	    if ( $peg1 ne $peg2 )
	    {
		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
	    }
	}
    }

    return \%blast_hit;
}

sub blast_hits_from_sims {
    my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;

    my %blast_hit;

    my $maxN       = 2000;
    my $maxP       = $color_sim_cutoff;
    my $select     = 'fig';
    my $max_expand = '';
    my $filters    = '';    

    # Create a hash of all pegs
    my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;

    if ( $fast_color == 1 )
    {
 	my $cutoff_2 = $color_sim_cutoff * 1e-20;
 	my %done_with;

	# Iterate through each peg
	foreach my $peg1 ( keys %region_peg )
	{
	    # Skip the 'done_with' pegs
	    next if ($done_with{$peg1});

	    my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);

	    foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
	    {
		my($peg2, $sc) = @$sim[1,10];

		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
		
		# If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so 
		# that it will not be blasted with it's region sequences
		if ( $sc <= $cutoff_2 )
		{
		    $done_with{$peg2} = 1;
		}
	    }
	}
    }
    else
    {
	# Iterate through each peg
	foreach my $peg1 ( keys %region_peg )
	{
	    my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);

	    foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
	    {
		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
	    }
	}
    }

    return \%blast_hit;
}

sub blast_files {
    my($fig, $input, $database, $cutoff) = @_;

    my $cmd  = "$FIG_Config::ext_bin/blastall";
    my @args = ('-p', 'blastp',  '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
    my @blast_out = $fig->run_gathering_output($cmd, @args);

    return \@blast_out;
}

sub formatdb_file {
    my($fig, $file) = @_;

    my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
    $fig->run($cmd);
}

sub write_seqs_to_file {
    my($seq, $fasta_file) = @_;

    open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
    foreach my $peg ( keys %$seq )
    {
	print FASTA ">$peg\n$seq->{$peg}\n";
    }
    close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
}

sub get_peg_sequences {
    my($fig, $feature_data) = @_;
    my %sequences;

    # Iterate over each peg
    foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data )
    {
	my $seq = $fig->get_translation($peg);

	if ( $seq )
	{
	    $sequences{$peg} = $seq;
	}
	else
	{
	    print STDERR "could not get sqeuence for $peg\n";
	}
    }

    return \%sequences;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3