[Bio] / FigKernelScripts / find_neighbors_by_sim.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_neighbors_by_sim.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Mon Feb 18 22:32:20 2008 UTC (12 years, 1 month ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.3: +2 -2 lines
Add -no-duplicates support for build_anno_clearinghouse and build_nr.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

#
# !!!!!!!!!
#
# This is experimental code, not meant for production use. Think of it as an example.
#


use FIG;
my $fig = new FIG;

use ToCall;
use NewGenome;

my $usage = "usage: find_neighbors_by_sim To_Call_Dir NR peg.syn found [old] > closest.N.genomes";

if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
    print STDERR "\n   $usage\n\n";
    exit (0);
}

my ($to_call_dir, $NR, $pegsyn, $found);

# Now pick up the normal parameters
(
 ($to_call_dir = shift @ARGV) && 
 ($NR           = shift @ARGV) &&
 ($pegsyn           = shift @ARGV)  &&
 ($found           = shift @ARGV) 
)
    || die "\n   $usage\n\n";

### This is a bit complex.  Normally, one uses this program to call genes
# in a newly-sequenced genome.  In this case the functionality of NewGenome.pm
# is needed.  To call genes in an existing genome (to, for example, see how well
# we are doing) you would use ToCall.pm.

my $to_call;
if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
    $to_call = new ToCall($to_call_dir);
}
else {
    $to_call = new NewGenome($to_call_dir);
}

open(FOUND, ">>$found") or die "cannot open $found: $!";

#
# Don't call this in this version - assume it was already done?
# Might not if we explicitly mark fragments
#
#$to_call->possible_orfs;

use strict;

my @pegs;

for my $fid ($to_call->get_fids_for_type('orf'))
{
    if ($to_call->get_feature_length($fid) > 900)
    {
	push(@pegs, $fid);
	last if @pegs >= 10;
    }
}

my $n = @pegs;
warn "Found $n\n";

if ($n == 0)
{
    die "Could not find any features of sufficient length\n";
}

my $fa = "$FIG_Config::temp/tmp.fa.$$";
open(TMP, ">$fa") or die "cannot write $fa: $!\n";

for my $fid (@pegs)
{
    my $seq = $to_call->get_feature_sequence($fid);
    &FIG::display_id_and_seq($fid, \$seq, \*TMP);
}

close(TMP);
warn "wrote $fa\n";

my $blastout = "$FIG_Config::temp/tmp.blast.$$";
my $blastexp = "$FIG_Config::temp/tmp.blastexp.$$";
my @cmd = ("$FIG_Config::ext_bin/blastall",
	   -p => 'blastp',
	   -d => $NR,
	   -o => $blastout,
	   -i => $fa,
	   -m => 8,
	   -e => 1.0e-5,
	   '-FF');
my @cmd = ("cp", "tmp.blast.4056", $blastout);
warn "Blast: @cmd\n";
my $rc = system(@cmd);
if ($rc != 0)
{
    die "Blast failed with rc=$rc: @cmd\n";
}

my $rc = system("$FIG_Config::bin/expand_sims", $pegsyn, $blastout, $blastexp);
$rc == 0 or die "Error in expansion from $blastout to $blastexp\n";

open(BL, "<$blastexp") or die "cannot open output $blastexp: $!\n";

my(%gstat);
my %sims;

while (<BL>)
{
    chomp;
    my(@sim) = split(/\t/);
    my($id1, $id2, $iden) = @sim;

    if ($id2 =~ /^fig\|(\d+\.\d+)/)
    {
	my $g2 = $1;

	if ($iden > 70)
	{
	    my $sim = bless \@sim, 'Sim';
	    push(@{$sims{$id1}}, $sim);
	    $gstat{$g2}->{1}++;
	}
	else
	{
	    $gstat{$g2}->{0}++;
	}
    }
}
close(BL);

for my $g (sort keys %gstat)
{
    my $sim = $gstat{$g}->{1} - $gstat{$g}->{0};
    if ($sim > 0)
    {
	print "$g\t$sim\n";
    }
}

for my $id (keys %sims)
{
    my $orf = NewGenome::ORF->new($to_call, $id);
    my @sims = sort { $b->psc <=> $a->psc or $b->iden <=> $a->iden } @{$sims{$id}};
    warn "Found sims \n" . join("\t\n", @sims) . "\n";
    my $fun_id = $sims[0]->id2;
    my $fun = $fig->function_of($fun_id);
    warn "Found func $fun for $fun_id\n";
    my $fid = $orf->promote_to_peg(\@sims, $fun);
    print FOUND join("\t", $fid, "SIM", $fun), "\n";
}
close(FOUND);
   
$to_call->export_features;

# unlink($blastout);
# unlink($fa);

exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3