[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

View of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Wed Feb 20 19:23:58 2008 UTC (12 years, 1 month ago) by dsouza
Branch: MAIN
Changes since 1.6: +15 -1 lines
PEG popup changed so that FigFam ID does not get repeated (was twice) for
FigFams which are not based on subsystems. This can be removed if the
FigFam function format gets changed.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################


package PinnedRegions;

use strict;
use warnings;

use FIG;
use FigFams;
use FIG_Config;

use Time::HiRes qw( usleep ualarm gettimeofday tv_interval );

sub pinned_regions {
    my($fig, $pin_desc, $fast_color, $sims_from, $map_sz) = @_;

    # Get list of pegs required by the description in $pin_desc
    my $pinned_pegs = &expand_peg_list($fig, $pin_desc);

    # Get regions around pinned pegs -- boundaries, features, etc.
    my $regions = &define_regions($fig, $map_sz, $pinned_pegs);

    # Filter out overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
    # are pinned (by similarity or PCH) to the input PEG
    $regions = &filter_regions_1($pin_desc, $regions);

    # Get information for each feature -- location, strand, function etc.
    my $feature_data = &feature_data($fig, $regions);

    &add_functional_coupling($fig, $pin_desc, $regions, $feature_data);
    &add_figfams($fig, $feature_data);
    &add_subsystem_data($fig, $pin_desc, $feature_data);
    
    # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
    &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);

    # Filter out regions which have only a single PEG (the pinned one) colored.
#    $regions = &filter_regions_2($pin_desc, $regions, $feature_data);

    # Add feature data to the regions to make the final maps
    my $maps = &make_maps($fig, $regions, $feature_data);

    return $maps;
}

sub expand_peg_list {
    my($fig, $pin_desc) = @_;
    my @pegs = ();

    # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
    my %ok_genome_id = map {$_ => 1} $fig->genomes;

    # If the user has selected the genomes to be displayed, filter out all other genomes
    if ( @{ $pin_desc->{'show_genomes'} } )
    {
	# create new %ok_genome_id hash from intersection of selected genomes and legitimate genome IDs
	%ok_genome_id = map {$_ => 1} grep {$ok_genome_id{$_}} @{ $pin_desc->{'show_genomes'} };
    }

    if ( @{ $pin_desc->{'pegs'} } > 1 )
    {
	# Input already contains list of pegs, no need for expansion
	@pegs = grep {$ok_genome_id{$fig->genome_of($_)}} @{ $pin_desc->{'pegs'} };
    }
    else
    {
	# Get PCH pinned pegs
	my $pch_pinned_pegs = &pch_pinned_pegs($fig, $pin_desc, \%ok_genome_id);

	$pch_pinned_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_pch_pins'}, $pch_pinned_pegs);

	# Get similarity pinned pegs
	my $sim_pegs        = &sim_pegs($fig, $pin_desc, \%ok_genome_id);

	# Remove PCH PEGs (if any) from similarity pinned PEG list
	my %seen  = map {$_ => 1} @$pch_pinned_pegs;
	$sim_pegs = [grep {! $seen{$_}} @$sim_pegs];

	$sim_pegs = &select_pegs($fig, $pin_desc, $pin_desc->{'n_sims'}, $sim_pegs);

	@pegs = ();

	# Get input peg
	my $peg = $pin_desc->{'pegs'}[0];
	if ( $pin_desc->{'sort_by'} eq 'phylogeny' )
	{
	    # First add input peg, then sort by phylogeny
	    @pegs = $fig->sort_fids_by_taxonomy($peg, @$sim_pegs, @$pch_pinned_pegs);
	}
	else 
	{
	    # Sort by phylogenetic distance or organism name
	    my $g1 = $fig->genome_of($peg);
	    @pegs  = map {$_->[0] } 
	               sort {$a->[1] <=> $b->[1] or $a->[2] cmp $b->[2]} 
	                 map {[$_, $fig->crude_estimate_of_distance($g1,$fig->genome_of($_)), $fig->org_of($_)]} @$sim_pegs, @$pch_pinned_pegs;
	    # Add input peg at front of list
	    unshift @pegs, $peg;
	}
     }

    return \@pegs
}

sub pch_pinned_pegs {
    my($fig, $pin_desc, $ok_genome_id) = @_;

    my @pegs = ();

    if ( $pin_desc->{'n_pch_pins'} > 0 )
    {
	# Get input peg
	my $peg = $pin_desc->{'pegs'}[0];

	# Get pegs in pch pin with input peg, and filter out deleted or non-NMPDR organisms
	@pegs = grep {$ok_genome_id->{$fig->genome_of($_)}} 
	          grep {$_ ne $peg} 
                    map {$_->[0]} $fig->in_pch_pin_with_and_evidence($peg);
    }

    return \@pegs;
}

sub sim_pegs {
    my($fig, $pin_desc, $ok_genome_id) = @_;

    # Return a list of PEGS similar to the input PEG, with evalue less than or equal to 
    # the user defined similarity cutoff.
    # If the number of sims requested is 0, return the empty list 

    my $n_sims     = $pin_desc->{'n_sims'};
    my $sim_cutoff = $pin_desc->{'sim_cutoff'};
    my @pegs = ();
    
    if ( $n_sims > 0 )
    {
	my $peg = $pin_desc->{'pegs'}[0];
	my %seen;
 	@pegs = grep {$ok_genome_id->{$fig->genome_of($_)}} 
	          grep {! $seen{$_}++}
	            map {$_->[1]} $fig->sims($peg, 10000, $sim_cutoff, "fig");

	my $n_pegs = @pegs;
    }

    return \@pegs;
}

sub select_pegs {
    my($fig, $pin_desc, $n_pegs, $pegs) = @_;

    if ( $n_pegs == 0 )
    {
	return [];
    }

    if ( $pin_desc->{'collapse_close_genomes'} == 1 )
    {
	# The ordering of the PEGs done here is in order to =select= the correct set, the ordering for the
	# display is done later.

	# To collapse the PEG list, we want to:
	# a. Return at most $n_pegs PEGs, 
	# b. include a representative PEG from every genus, subject to a, and 
	# c. include a representative PEG from each genus-species, subject to a and b.
	# Individual strains will get represented by a single PEG, chosen arbitrarily.

	# The PEG selection is defined by the order of the PEGs. This could be done beter.
	
	my(%seen, @unique_genus, @unique_genus_species);
	
	foreach my $peg ( @$pegs )
	{
	    my $org = $fig->org_of($peg);
	    # 'org_of' returns undef for deleted PEGs, these need to be omitted from the peg list returned
	    
	    if ( $org )
	    {
		# Use only genus+species to drop strain information
		my($genus, $species) = split(/\s+/, $org);
		my $gs = "$genus $species";

		if ( not $seen{$genus}++ ) 
		{
		    # First PEG from this genus, add it to @unique_genus.
		    # Mark the genus+species as seen.
		    # A subsequent PEG with the same genus and species will be dropped.
		    # A subsequent PEG with the same genus but different species will be added to @unique_genus_species.
		    $seen{$gs}++;
		    push @unique_genus, $peg;
		} 
		elsif ( not $seen{$gs}++ ) 
		{
		    # First PEG from this genus+species, add it to @unique_genus_species.
		    push @unique_genus_species, $peg;
		}
	    }
	}

	# Keep the unique_genus PEGS at the top, followed by the unique_genus_species
	$pegs = [@unique_genus, @unique_genus_species];
    }

    # Truncate list if necessary
    if ( $n_pegs < @$pegs )
    {
	$#{ $pegs } = $n_pegs - 1;
    }
    
    return $pegs;
}

sub filter_regions_1 {
    my($pin_desc, $regions) = @_;
    my $new_regions;

    # Filter out some overlapping regions caused by gene fusions, frame shifts etc. where multiple PEGs
    # are pinned (by similarity or PCH) to the input PEG
    # Note that all overlaps are NOT eliminated

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Input pin description is for a single input peg
	my %seen;
	
	foreach my $region ( @$regions )
	{
	    my $pinned_peg = $region->{'pinned_peg'};
	    
	    # If the pinned peg from a region has already been 'seen', don't
	    # add the region into the @$new_regions array
	    
	    if ( not $seen{$pinned_peg} )
	    {
		push @$new_regions, $region;
		
		my $fids = $region->{'features'};
		foreach my $fid ( @$fids )
		{
		    $seen{$fid} = 1;
		}
	    }
	}
    }
    else
    {
 	# input PEGs is a list -- keep all of them
 	$new_regions = $regions;
    }

    return $new_regions;
}

sub filter_regions_2 {
    my($pin_desc, $regions, $feature_data) = @_;
    my $new_regions;

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Input is single peg
	my $input_peg = $pin_desc->{'pegs'}[0];

	foreach my $region ( @$regions )
	{
	    my $fids = $region->{'features'};
	    my $n_in_sets = grep {$feature_data->{$_}{'set_number'}} @$fids;
	    
	    if ( $n_in_sets > 1 or $region->{'pinned_peg'} eq $input_peg )
	    {
		# Regions should be displayed only if:
		# a. more than one PEG is in a 'colored set' OR
		# b. the pinned PEG is the input PEG
		push @$new_regions, $region;
	    }
	}
    }
    else
    {
	# Input is a list of pegs -- keep all of the regions
	$new_regions = $regions
    }

    return $new_regions;
}

sub make_maps {
    my($fig, $regions, $feature_data) = @_;

    foreach my $region ( @$regions )
    {
	my $features = [];
	my $fids = $region->{'features'};

	foreach my $fid ( @$fids )
	{
	    push @$features, $feature_data->{$fid};
	}

	$region->{'features'} = $features;
    }

    return $regions;
}

sub define_regions {
    my($fig, $map_sz, $pinned_pegs) = @_;

    # Define the 'region' around the pinned peg, get features, and store some region information
    my $regions = [];
    my $half_map_sz = int($map_sz/2);

    foreach my $peg ( @$pinned_pegs )
    {
	my $genome = $fig->genome_of($peg);
	my $loc    = $fig->feature_location($peg);
	my($contig, $beg, $end) = $fig->boundaries_of($loc);

	my $region_mid = int(($beg + $end)/2);
	my $region_beg = $region_mid - $half_map_sz;
	my $region_end = $region_mid + $half_map_sz;

	my($fids) = $fig->genes_in_region($genome, $contig, $region_beg, $region_end);

	my $region = {};
	$region->{'genome_id'} = $fig->genome_of($peg);
	$region->{'org_name'}  = $fig->genus_species($region->{'genome_id'});
	$region->{'contig'}    = $contig;
	$region->{'beg'}       = $region_beg;
	$region->{'mid'}       = $region_mid;
	$region->{'end'}       = $region_end;
	$region->{'features'}  = $fids;
	$region->{'pinned_peg_strand'} = ($beg <= $end)? '+' : '-';
	$region->{'pinned_peg'} = $peg;

	push @$regions, $region;
    }

    return $regions;
}

sub feature_data {
    my($fig, $regions) = @_;
    my %feature_data;

    foreach my $region ( @$regions )
    {
	my $region_mid = $region->{'mid'};
	my $fids       = $region->{'features'};

	foreach my $fid ( @$fids )
	{
	    # Get feature data if this feature has not been seen before, this avoids repeating
	    #  this step when pegs occur in multiple regions
	    if ( not exists $feature_data{$fid} )
	    {
		$feature_data{$fid} = &feature_entry($fig, $fid, $region_mid);
	    }
	}
    }

    return \%feature_data;
}

sub feature_entry {
    my($fig, $fid, $region_mid) = @_;

    my $type                = $fig->ftype($fid);
    my $loc                 = $fig->feature_location($fid);
    my($contig, $beg, $end) = $fig->boundaries_of($loc);
    my($left, $right)       = sort {$a <=> $b} ($beg, $end);
    my $size                = $right - $left + 1;
    my $strand              = ($beg <= $end)? '+' : '-';
    my $offset              = int(($left + $right)/2) - $region_mid;
    my $offset_beg          = $left  - $region_mid;
    my $offset_end          = $right - $region_mid;
    my $func                = scalar $fig->function_of($fid) || '';

    return {
	     'fid'        => $fid,
	     'type'       => $type,
	     'contig'     => $contig,
	     'beg'        => $beg,
	     'end'        => $end,
	     'size'       => $size,
	     'strand'     => $strand,
	     'offset'     => $offset,
	     'offset_beg' => $offset_beg,
	     'offset_end' => $offset_end,
	     'function'   => $func,
	 };
}

sub add_functional_coupling {
    my($fig, $pin_desc, $regions, $feature_data) = @_;

    my $fc_cutoff = defined($pin_desc->{'fc_cutoff'})? $pin_desc->{'fc_cutoff'} : 4;

    foreach my $region ( @$regions )
    {
	my $pin  = $region->{'pinned_peg'};
	my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} @{ $region->{'features'} };

	foreach my $couple ( $fig->coupled_to_batch(@pegs) )
	{
	    my($peg1, $peg2, $sc) = @$couple;
	    
	    if ( $peg1 eq $pin and $sc >= $fc_cutoff )
	    {
		$feature_data->{$peg2}{'fc_score'} = $sc;
	    }
	}
    }
}

sub add_figfams {
    my($fig, $feature_data) = @_;

    # Get FigFams directory from config file
    my $figfam_dir = $FIG_Config::FigfamsData;

    # Check if FigFams directory is defined and exists on current machine
    if ( defined($figfam_dir) and (-d $figfam_dir) )
    {
	# Get all PEG IDs
	my @pegs            = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
	# Get $figfams object
	my $figfams         = new FigFams($fig, $figfam_dir);
	# Get FigFam family ID for @pegs
	my $figfam          = $figfams->families_containing_peg_bulk(\@pegs);
	# Get hash of FigFam ID to family function
	my $family_function = $figfams->family_functions();

	# Some FigFams (the ones that are not subsystem related) have the FigFam ID in the family function.
	# This results in the popup displaying the ID twice, so one should be removed.
	
	my %figfam_text;
	foreach my $fid  ( keys %$feature_data )
	{
	    if ( $figfam->{$fid} )
	    {
		my $figfam_id = $figfam->{$fid};
		if ( ! exists $figfam_text{$figfam_id} ) {
		    if ( $family_function->{$figfam_id} =~ /^FIG\d+/ ) {
			$figfam_text{$figfam_id} = $family_function->{$figfam_id};
		    } else {
			$figfam_text{$figfam_id} = $figfam_id . ': ' . $family_function->{$figfam_id};
		    }
		}

		# Add FigFam information to hash -- to go into the popup text
		$feature_data->{$fid}{'figfam'} = $figfam_text{$figfam_id};
#->{$fid} . ": " . $family_function->{$figfam->{$fid}};
	    }
	}
    }
}

sub add_subsystem_data {
    my($fig, $pin_desc, $feature_data) = @_;

    # Get subsystem_information for =all= pegs
    my %peg_to_ss = $fig->subsystems_for_pegs([grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data]);
    
    # Count number of occurences of each subsystem
    my %ss_count;
    foreach my $ss ( map {$_->[0]} map {@$_} values %peg_to_ss )
    {
	$ss_count{$ss}++;
    }

    # Sort subsystems based on count and create hash with unique index for each subsystem where 
    # lower indices correspond to higher number of occurences of the subsystem.
    my %ss_index;
    my $index = 0;
    foreach my $ss ( sort {$ss_count{$b} <=> $ss_count{$a} or $a cmp $b} keys %ss_count )
    {
	$ss_index{$ss} = ++$index;
    }

    # Add subsystem information for pegs in subsystems
    foreach my $fid ( keys %peg_to_ss )
    {
	my @subsystems = ();
	foreach my $rec ( @{ $peg_to_ss{$fid} } )
	{
	    if ( @$rec )
	    {
		push @subsystems, $rec->[0];
	    }
	}

	if ( @subsystems )
	{
	    $feature_data->{$fid}{'subsystems'} = [sort {$a->[1] <=> $b->[1]} map {$_->[0] =~ s/_/ /g; $_} map {[$_, $ss_index{$_}]} @subsystems];
	}
    }
}

sub color_pegs {
    my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;

    # Run blastp and get connected pegs
    my $blast_hit = {};
    my $color_sim_cutoff = $pin_desc->{'color_sim_cutoff'}; 

    if ( $sims_from eq 'blast' )
    {
	$blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
    }
    else
    {
	$blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
    }

    # Assign set numbers to pegs based on blast scores
    my $color_sets = &partition_pegs($blast_hit);

    # Sort peg sets to that a) larger sets have smaller set numbers, and
    #                       b) sets closer to the pinned peg have smaller set numbers
    $color_sets = &sort_color_sets($pin_desc, $pinned_pegs, $color_sets, $feature_data);

    # Add color set number into $feature_data 
    for (my $i = 0; $i < @$color_sets; $i++)
    {
	# Use an index that starts at 1 (matches with coloring index)
	my $n = $i + 1;
	foreach my $peg ( @{ $color_sets->[$i] } )
	{
	    $feature_data->{$peg}{'set_number'} = $n;
	    
	    # If this is the pinned set, set the 'color' to 'red'
	    if ( $n == 1 )
	    {
		$feature_data->{$peg}{'color'} = 'red';
	    }
	}
    }
}

sub sort_color_sets {
    my($pin_desc, $pinned_pegs, $color_sets, $feature_data) = @_;

    # Find the color set containing the set of pinned pegs, i.e. the set containing the input peg.
    # If the input peg is found (or if input is a list of pegs, returned value is the array index for the
    # set.
    # If the input peg is not found, the returned value is an reference to an array containing the input peg.
    my $set = &pinned_set($pin_desc, $pinned_pegs, $color_sets);
    
    my $pinned_color_set;
    if ( $set =~ /^\d+$/ )
    {
	# Splice out the color set containing the input peg
	($pinned_color_set) = splice(@$color_sets,$set,1);
    }
    else
    {
	$pinned_color_set = $set;
    }

    # Add offset (summed distance from 
    my @color_sets = map {[$_, &offset($_, $feature_data)]}  @$color_sets;
    # Sort the remaining color sets based on:
    # a. size (number of pegs) and
    # b. offset from mid-point of region
    @$color_sets = map {$_->[0]} 
                     sort {@{$b->[0]} <=> @{$a->[0]} or $a->[1] <=> $b->[1]} 
#                     sort {$a->[1] <=> $b->[1] or @{$b->[0]} <=> @{$a->[0]}} 
                       map {[$_, &offset($_, $feature_data)]}  @$color_sets;

    # Add the set of pinned pegs at the front of the returned list so that it gets colored red
    return [$pinned_color_set, @$color_sets];
}

sub pinned_set {
    my($pin_desc, $pinned_pegs, $color_sets) = @_;

    # Returns an integer if the input is a peg-list or if the input peg is in a set.
    # Returns the input peg (as an array reference) if the input peg is not in a set.

    if ( @{ $pin_desc->{'pegs'} } == 1 )
    {
	# Get input peg if it exists -- the set containing this peg should be colored red
	my $peg = $pin_desc->{'pegs'}[0];

	# Iterate through the color sets until you find the set containing the input peg
	for (my $i = 0; $i < @$color_sets; $i++)
	{
	    foreach my $peg2 ( @{ $color_sets->[$i] } )
	    {
		if ( $peg2 eq $peg )
		{
		    # Return the set index
		    return $i;
		}
	    }
	}

	# The input peg may not be in a set if there is only one region or the coloring cutoff is very stringent.
	return [$peg];
    }
    else
    {
	# Input is a peg-list, which may be split into multiple sets -- the largest will be called the 
	# "pinned" set.
	my($i, $max_count);
	my %pinned = map {$_ => 1} @$pinned_pegs;

	for (my $j = 0; $j < @$color_sets; $j++)
	{
	    # count how many pinned pegs are found in each set
	    my $count = scalar grep {$pinned{$_}} @{ $color_sets->[$j] };

	    if ( $max_count < $count )
	    {
		# Keep track of the set having the largest number of pinned pegs
		$max_count = $count;
		$i = $j;
	    }
	}

	return $i;
    }
}

sub offset {
    my($set, $feature_data) = @_;

    my $offset;
    foreach my $peg ( @$set )
    {
	$offset += abs($feature_data->{$peg}{'offset'});
    }
    
    return sprintf("%.2f", $offset/@$set);
}

sub partition_pegs {
    my($blast_hit) = @_;

    my %seen;
    my $sets = [];

    # Iterate through the pegs with blast hits in arbitrary order
    foreach my $peg1 ( keys %$blast_hit )
    {
	# If it has not been 'seen' (and placed in a set) use it as the seed for a set
	if ( not $seen{$peg1} )
	{
	    my $set = [$peg1];

	    # Mark $peg1 as seen
	    $seen{$peg1} = 1;

	    # Grow set through transitive closure -- note that @$set keeps growing
	    for (my $i = 0; $i < @$set; $i++)
	    {
		$peg1 = $set->[$i];

		# Iterate through blast hits
		foreach my $peg2 ( keys %{ $blast_hit->{$peg1} } )
		{
		    # If $peg2 has not been seen, put it in the set, and mark it as seen
		    if ( not exists $seen{$peg2} )
		    {
			push @$set, $peg2;
			$seen{$peg2} = 1;
		    }
		}
	    }

	    # Add the new set to the set of sets
	    push @$sets, $set;
	}
    }

    return $sets;
}

sub blast_hits {
    my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
    my($t0, $dt);

    # Set blast environment variables
    $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
    if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }

    # Get amino acid sequences
    my $sequences = &get_peg_sequences($fig, $feature_data);

    # Write the entire set of sequences to a fasta file
    my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
    &write_seqs_to_file($sequences, $all_seqs_file);

    # Run formatdb
    &formatdb_file($fig, $all_seqs_file);

    # If $fast_color == 0, the complete blast of all vs. all sequences is performed
    # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
    # is blasted against all sequences. If any of the hits are better than a given cutoff
    # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this 
    # peg will not yield additional information for forming the peg sets. These 'done' pegs
    # are then omitted from the blast when the sequences from the region they belong to 
    # is blasted.
    my %blast_hit;
    if ( $fast_color )
    {
	my $cutoff_2 = $color_sim_cutoff * 1e-20;
	my %done_with;

	foreach my $region ( @$regions )
	{
	    # Iterate through each region
	    my $fids = $region->{'features'};

	    # Build up a hash of peg sequences which are not 'done_with'
	    my %region_seqs;
	    foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
	    {
		if ( $sequences->{$peg} and not $done_with{$peg} )
		{
		    $region_seqs{$peg} = $sequences->{$peg};
		}
	    }

	    if ( scalar keys %region_seqs )
	    {
		# Write the region sequences to a file
		my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
		&write_seqs_to_file(\%region_seqs, $region_seqs_file);
		
		# BLAST the region sequences against all other sequences
		my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
		
		# Build up hash of blast hits
		foreach my $hit ( @$hits )
		{
		    my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
		
		    if ( $peg1 ne $peg2 )
		    {
			$blast_hit{$peg1}{$peg2} = 1;
			$blast_hit{$peg2}{$peg1} = 1;

			# If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so 
			# that it will not be blasted with it's region sequences
			if ( $sc <= $cutoff_2 )
			{
			    $done_with{$peg2} = 1;
			}
		    }
		}
	    }
	}
    }
    else
    {
	# BLAST sequence file against itself
	my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);

	# Build up hash of blast hits
	foreach my $hit ( @$hits )
	{
	    my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];

	    if ( $peg1 ne $peg2 )
	    {
		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
	    }
	}
    }

    return \%blast_hit;
}

sub blast_hits_from_sims {
    my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;

    my %blast_hit;

    my $maxN       = 2000;
    my $maxP       = $color_sim_cutoff;
    my $select     = 'fig';
    my $max_expand = '';
    my $filters    = '';    

    # Create a hash of all pegs
    my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;

    if ( $fast_color == 1 )
    {
 	my $cutoff_2 = $color_sim_cutoff * 1e-20;
 	my %done_with;

	# Iterate through each peg
	foreach my $peg1 ( keys %region_peg )
	{
	    # Skip the 'done_with' pegs
	    next if ($done_with{$peg1});

	    my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);

	    foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
	    {
		my($peg2, $sc) = @$sim[1,10];

		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
		
		# If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so 
		# that it will not be blasted with it's region sequences
		if ( $sc <= $cutoff_2 )
		{
		    $done_with{$peg2} = 1;
		}
	    }
	}
    }
    else
    {
	# Iterate through each peg
	foreach my $peg1 ( keys %region_peg )
	{
	    my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);

	    foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
	    {
		$blast_hit{$peg1}{$peg2} = 1;
		$blast_hit{$peg2}{$peg1} = 1;
	    }
	}
    }

    return \%blast_hit;
}

sub blast_files {
    my($fig, $input, $database, $cutoff) = @_;

    my $cmd  = "$FIG_Config::ext_bin/blastall";
    my @args = ('-p', 'blastp',  '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
    my @blast_out = $fig->run_gathering_output($cmd, @args);

    return \@blast_out;
}

sub formatdb_file {
    my($fig, $file) = @_;

    my $cmd = "$FIG_Config::ext_bin/formatdb -i $file -p T";
    $fig->run($cmd);
}

sub write_seqs_to_file {
    my($seq, $fasta_file) = @_;

    open(FASTA, ">$fasta_file") or die "could not create FASTA file '$fasta_file': $!";
    foreach my $peg ( keys %$seq )
    {
	print FASTA ">$peg\n$seq->{$peg}\n";
    }
    close(FASTA) or die "could not close file FASTA file '$fasta_file': $!";
}

sub get_peg_sequences {
    my($fig, $feature_data) = @_;
    my %sequences;

    # Iterate over each peg
    foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data )
    {
	my $seq = $fig->get_translation($peg);

	if ( $seq )
	{
	    $sequences{$peg} = $seq;
	}
	else
	{
	    print STDERR "could not get sqeuence for $peg\n";
	}
    }

    return \%sequences;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3