[Bio] / FigKernelScripts / update_sims2.pl Repository:
ViewVC logotype

View of /FigKernelScripts/update_sims2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Mon Sep 20 20:26:26 2010 UTC (9 years, 6 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.5: +67 -34 lines
updated code

use strict;
use FIG;
use File::Basename;
use Proc::ParallelLoop;

my $usage = "usage: update_sims nr peg.synonyms MaxSz InDir OutDir SortedNewSims IdsToDelete unused";

my($nr, $added,$deleted,$new_sims,@files,$file,$full_file,$hdr,%in,$sim,$syns,$maxsz);
my(@sims,$curr,$currQ,$x,%new,$remapped,@flds,%remapped,$new);
my($in_dir,$out_dir, $sorted_sims, $to_delete);

@ARGV == 6 or @ARGV == 7 or @ARGV == 8 or die $usage;

$nr      = shift @ARGV;
$syns    = shift @ARGV;
$maxsz   = shift @ARGV;
$in_dir  = shift @ARGV;
$out_dir = shift @ARGV;
$sorted_sims = shift @ARGV;
$to_delete = shift @ARGV;
my $unused = shift @ARGV;

#
# Read the NR so we can eliminate sims for sequences that
# do not occur in the NR.
#
print "Reading NR $nr\n";
open(NR,"<$nr") || die "could not open $nr";

my %nr_seqs;
while (<NR>)
{
    if ($_ =~ /^>(\S+)/)
    {
	$nr_seqs{$1}++;
    }
}
close(NR);

-f $syns or die "Synonyms file $syns not found\n";

my $unused_fh;
if ($unused)
{
    open($unused_fh, ">$unused") or die "cannot open unused file $unused: $!\n";
}

my %deleted;
if ($to_delete)
{
    open(DEL, "<$to_delete") or die "Cannot open delete file to_delete: $!";
    while (<DEL>)
    {
	if (/(\S+)/)
	{
	    $deleted{$1}++;
	}
    }
    close(DEL);
}

if (-d $in_dir)
{
    opendir(SIMS,$in_dir)
	|| die "could not open $in_dir";
    @files = map { "$in_dir/$_" }  grep { $_ !~ /^\./ } readdir(SIMS);
    closedir(SIMS);
}
elsif (open(SFILE, "<$in_dir"))
{
    @files = <SFILE>;
    chomp @files;
    close(SFILE);
}
else
{
    die "Bad sims input $in_dir\n";
}

#-d $out_dir and die "Output directory $out_dir already exists\n";


-d $out_dir || mkdir($out_dir,0777) || die "could not make $out_dir";

#
# Read the sorted sims, maintaining an index of peg -> seek & length.
#

open(S, "<$sorted_sims") or die "Cannot open $sorted_sims: $!\n";
$| = 1;

my %idx;
my $offset = tell(S);
my $line = <S>;

my $curr;
my $next_offset;
while (defined($line) and $line =~ /^(\S+)/)
{
    print "." if $. % 100000 == 0;
    $curr = $1;
    while (defined($line) and
	   $line =~ /^(\S+)/ and
	   $1 eq $curr)
    {
	$next_offset = tell(S);
	$line = <S>;
    }
    my $len = $next_offset - $offset;
    $idx{$curr} = [$offset, $len];

    $offset = $next_offset;
}
print "\n";
#
# Outer: Loop over the existing sims files.
#
foreach $file (sort @files)
{
#pareach \@files, sub {
#    my $file = shift;
    print STDERR "processing $file\n";

    open(OLD,"<$file")
        || die "could not open $file";

    #
    # Shortcut if the old file is empty; don't want to pay the price
    # of starting reduce_sims.
    #
    my $base = basename($file);

    my @stat = stat(OLD);
    if (@stat and $stat[7] == 0)
    {
	open(NEW, ">$out_dir/$base");
	close(NEW);
	close(OLD);
	next;
    }

    open(NEW,"| reduce_sims $syns $maxsz > $out_dir/$base")
        || die "could not open $out_dir/$base";

    #
    # Rescan the old sims file, adding the new sims.
    #
    $sim = <OLD>;
    while (defined($sim))
    {
        if ($sim =~ /^(\S+)\t(\S+)/)
        {
	    if (!(sim_ok($1) && sim_ok($2)))
	    {
		$sim = <OLD>;
		next;
	    }
            $curr = $1;
            $currQ = quotemeta $curr;
            @sims = ();
            while (defined($sim) && ($sim =~ /^$currQ\s/))
            {
                if ($sim =~ /^$currQ\t(\S+)/)
                {
		    my $ok = sim_ok($1);
                    push(@sims,$sim) if $ok;
                }
                $sim = <OLD>;
            }

	    if (my $where = $idx{$curr})
	    {
		my($seek, $len) = @$where;
		seek(S, $seek, 0);
		my $buf;
		read(S, $buf, $len);
		my @new = split(/\n/, $buf);

#		print "Got sims=@sims\n new=$buf\n";

		#
		# we have new sims available for this id1.
		#
		# Create a list of tuples [id1, id2, rest of simdata] where tup[10] is the e-score
		# Sort by e-score
		# Select the highest score id2 if there are multiple id2s.
		# and map back to tab-separated list to print.
		#
                my %seen;
                @sims = map  { join("\t",@$_) . "\n" }
		grep { if (! $seen{$_->[1]}) { $seen{$_->[1]} = 1; 1 } else { 0 } }
		sort {
                                ($a->[10] <=> $b->[10])
				}
		map {  chomp; [split(/\t/,$_)] }
		(@sims,@new);

		#
		# Remove the index entry for $curr. That way we can print them out
		# at the end of the process, if we have any left over.
		#
		delete $idx{$curr};
            }
            print NEW join("",@sims) or die "$0 cannot write to $out_dir/$base: $!";
        }
        else
        {
            $sim = <OLD>;
        }
    }
    close(NEW);
    close(OLD);
}

#
# Create a new output file containing the sims that
# we didn't use.
#


# if ($unused_fh)
# {
#     while(my($curr, $where) = each(%idx))
#     {
# 	my($seek, $len) = @$where;
# 	seek(S, $seek, 0);
# 	my $buf;
# 	read(S, $buf, $len);
# 	my @new = split(/\n/, $buf);
	
# 	#
# 	# we have new sims available for this id1.
# 	#
# 	# Create a list of tuples [id1, id2, rest of simdata] where tup[10] is the e-score
# 	# Sort by e-score
# 	# Select the highest score id2 if there are multiple id2s.
# 	# and map back to tab-separated list to print.
# 	#
# 	my %seen;
# 	@sims = map  { join("\t",@$_) . "\n" }
# 		grep { if (! $seen{$_->[1]}) { $seen{$_->[1]} = 1; 1 } else { 0 } }
# 		sort {
#                                 ($a->[10] <=> $b->[10])
# 				}
# 		map {  chomp; [split(/\t/,$_)] }
# 		@new;
# 	print $unused_fh $_ for @sims;
#     }
# }

foreach $new_sims (@ARGV)
{
    &FIG::run("cp $new_sims $out_dir");
}

sub sim_ok
{
    my($id1) = @_;
	
    return !$deleted{$id1} && exists($nr_seqs{$id1});
}
sub rev_sim {
    my($sim) = @_;

    chomp $sim;
    my($id1,$id2,$iden,$ali_ln,$mismatches,$gaps,$b1,$e1,$b2,$e2,$psc,$bsc,$ln1,$ln2) = split(/\t/,$sim);
    return join("\t",($id2,$id1,$iden,$ali_ln,$mismatches,$gaps,$b2,$e2,$b1,$e1,$psc,$bsc,$ln2,$ln1)) . "\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3