[Bio] / FigKernelScripts / update_sims.pl Repository:
ViewVC logotype

View of /FigKernelScripts/update_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (download) (as text) (annotate)
Mon Dec 5 18:56:38 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.12: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use FIG;

my $usage = "usage: update_sims peg.synonyms MaxSz InDir OutDir NewSims1 NewSims2 ...";

my($added,$deleted,$new_sims,@files,$file,$full_file,$hdr,%in,$sim,$syns,$maxsz);
my(%sims_for,@sims,$curr,$currQ,$x,%new,$remapped,@flds,%remapped,$new);
my($in_dir,$out_dir);

(
 ($syns    = shift @ARGV) &&
 ($maxsz   = shift @ARGV) &&
 ($in_dir  = shift @ARGV) &&
 ($out_dir = shift @ARGV) &&
 (@ARGV > 0)
)
    || die $usage;

opendir(SIMS,$in_dir)
    || die "could not open $in_dir";
@files = grep { $_ !~ /^\./ } readdir(SIMS);
closedir(SIMS);


mkdir($out_dir,0777) || die "could not make $out_dir";

#
# Outer: Loop over the existing sims files.
#
foreach $file (@files)
{
    print STDERR "processing $file\n";
    undef %in;
    open(OLD,"<$in_dir/$file")
        || die "could not open $in_dir/$file";

    #
    # Ingest IDs.
    #
    while (defined($_ = <OLD>))
    {
        if ($_ =~ /^(\S+)/)
        {
            $in{$1} = 1;
        }
    }
    close(OLD);
#   print STDERR "    registered ids in $file\n";

    undef %new;

    #
    # Inner: Loop over new sims files.
    #
    foreach $new_sims (@ARGV)
    {
        open(NEWSIMS,"<$new_sims")
            || die "could not open $new_sims";
        while (defined($sim = <NEWSIMS>))
        {
	    #
	    # If id2 is present in the old sims file, reverse the current
	    # sim and add it to the list of sims for id2.
	    #
            if (($sim =~ /^\S+\t(\S+)/) && $in{$1})
            {
                push(@{$sims_for{$1}},&rev_sim($sim));
            }
        }
        close(NEWSIMS);
    }
#   print STDERR "    saved flips\n";

    open(OLD,"<$in_dir/$file")
        || die "could not open $in_dir/$file";
    open(NEW,"| reduce_sims $syns $maxsz > $out_dir/$file")
        || die "could not open $out_dir/$file";

    #
    # Rescan the old sims file, adding the new sims.
    #
    $sim = <OLD>;
    while (defined($sim))
    {
        if ($sim =~ /^(\S+)\t/)
        {
            $curr = $1;
            $currQ = quotemeta $curr;
            @sims = ();
            while (defined($sim) && ($sim =~ /^$currQ\s/))
            {
                if ($sim =~ /^$currQ\t(\S+)/)
                {
                    push(@sims,$sim);
                }
                $sim = <OLD>;
            }

            if ($x = $sims_for{$curr})
            {
		#
		# we have new sims available for this id1.
		#
		# Create a list of tuples [id1, id2, rest of simdata] where tup[10] is the e-score
		# Sort by e-score
		# Select the highest score id2 if there are multiple id2s.
		# and map back to tab-separated list to print.
		#
                my %seen;
                @sims = map  { join("\t",@$_) . "\n" }
		grep { if (! $seen{$_->[1]}) { $seen{$_->[1]} = 1; 1 } else { 0 } }
		sort {
                                ($a->[10] <=> $b->[10])
				}
		map {  chomp; [split(/\t/,$_)] }
		(@sims,@$x);
            }
            print NEW join("",@sims);
        }
        else
        {
            $sim = <OLD>;
        }
    }
    close(NEW);
    close(OLD);
    undef %sims_for;
}

foreach $new_sims (@ARGV)
{
    &FIG::run("cp $new_sims $out_dir");
}

sub rev_sim {
    my($sim) = @_;

    chomp $sim;
    my($id1,$id2,$iden,$ali_ln,$mismatches,$gaps,$b1,$e1,$b2,$e2,$psc,$bsc,$ln1,$ln2) = split(/\t/,$sim);
    return join("\t",($id2,$id1,$iden,$ali_ln,$mismatches,$gaps,$b2,$e2,$b1,$e1,$psc,$bsc,$ln2,$ln1)) . "\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3