[Bio] / FigKernelScripts / update_sims.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/update_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (view) (download) (as text)

1 : olson 1.13 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 : efrank 1.1 use strict;
19 :     use FIG;
20 :    
21 : overbeek 1.10 my $usage = "usage: update_sims peg.synonyms MaxSz InDir OutDir NewSims1 NewSims2 ...";
22 : efrank 1.1
23 : overbeek 1.10 my($added,$deleted,$new_sims,@files,$file,$full_file,$hdr,%in,$sim,$syns,$maxsz);
24 : efrank 1.1 my(%sims_for,@sims,$curr,$currQ,$x,%new,$remapped,@flds,%remapped,$new);
25 : overbeek 1.9 my($in_dir,$out_dir);
26 : efrank 1.1
27 :     (
28 : overbeek 1.10 ($syns = shift @ARGV) &&
29 :     ($maxsz = shift @ARGV) &&
30 : overbeek 1.9 ($in_dir = shift @ARGV) &&
31 :     ($out_dir = shift @ARGV) &&
32 : overbeek 1.2 (@ARGV > 0)
33 : efrank 1.1 )
34 :     || die $usage;
35 :    
36 : overbeek 1.9 opendir(SIMS,$in_dir)
37 :     || die "could not open $in_dir";
38 : efrank 1.1 @files = grep { $_ !~ /^\./ } readdir(SIMS);
39 :     closedir(SIMS);
40 :    
41 : overbeek 1.9
42 :     mkdir($out_dir,0777) || die "could not make $out_dir";
43 :    
44 : overbeek 1.12 #
45 :     # Outer: Loop over the existing sims files.
46 :     #
47 : efrank 1.1 foreach $file (@files)
48 :     {
49 :     print STDERR "processing $file\n";
50 :     undef %in;
51 : overbeek 1.9 open(OLD,"<$in_dir/$file")
52 :     || die "could not open $in_dir/$file";
53 : overbeek 1.12
54 :     #
55 :     # Ingest IDs.
56 :     #
57 : efrank 1.1 while (defined($_ = <OLD>))
58 :     {
59 : overbeek 1.9 if ($_ =~ /^(\S+)/)
60 :     {
61 :     $in{$1} = 1;
62 :     }
63 : efrank 1.1 }
64 :     close(OLD);
65 : overbeek 1.7 # print STDERR " registered ids in $file\n";
66 : efrank 1.1
67 :     undef %new;
68 : overbeek 1.12
69 :     #
70 :     # Inner: Loop over new sims files.
71 :     #
72 : overbeek 1.2 foreach $new_sims (@ARGV)
73 : efrank 1.1 {
74 : overbeek 1.9 open(NEWSIMS,"<$new_sims")
75 :     || die "could not open $new_sims";
76 :     while (defined($sim = <NEWSIMS>))
77 :     {
78 : overbeek 1.12 #
79 :     # If id2 is present in the old sims file, reverse the current
80 :     # sim and add it to the list of sims for id2.
81 :     #
82 : overbeek 1.9 if (($sim =~ /^\S+\t(\S+)/) && $in{$1})
83 :     {
84 :     push(@{$sims_for{$1}},&rev_sim($sim));
85 :     }
86 :     }
87 :     close(NEWSIMS);
88 : efrank 1.1 }
89 : overbeek 1.7 # print STDERR " saved flips\n";
90 : efrank 1.1
91 : overbeek 1.9 open(OLD,"<$in_dir/$file")
92 :     || die "could not open $in_dir/$file";
93 : overbeek 1.10 open(NEW,"| reduce_sims $syns $maxsz > $out_dir/$file")
94 : overbeek 1.9 || die "could not open $out_dir/$file";
95 : overbeek 1.12
96 :     #
97 :     # Rescan the old sims file, adding the new sims.
98 :     #
99 : efrank 1.1 $sim = <OLD>;
100 : overbeek 1.3 while (defined($sim))
101 : efrank 1.1 {
102 : overbeek 1.9 if ($sim =~ /^(\S+)\t/)
103 :     {
104 :     $curr = $1;
105 :     $currQ = quotemeta $curr;
106 :     @sims = ();
107 :     while (defined($sim) && ($sim =~ /^$currQ\s/))
108 :     {
109 :     if ($sim =~ /^$currQ\t(\S+)/)
110 :     {
111 :     push(@sims,$sim);
112 :     }
113 :     $sim = <OLD>;
114 :     }
115 :    
116 :     if ($x = $sims_for{$curr})
117 :     {
118 : overbeek 1.12 #
119 :     # we have new sims available for this id1.
120 :     #
121 :     # Create a list of tuples [id1, id2, rest of simdata] where tup[10] is the e-score
122 :     # Sort by e-score
123 :     # Select the highest score id2 if there are multiple id2s.
124 :     # and map back to tab-separated list to print.
125 :     #
126 : overbeek 1.9 my %seen;
127 :     @sims = map { join("\t",@$_) . "\n" }
128 :     grep { if (! $seen{$_->[1]}) { $seen{$_->[1]} = 1; 1 } else { 0 } }
129 :     sort {
130 :     ($a->[10] <=> $b->[10])
131 :     }
132 :     map { chomp; [split(/\t/,$_)] }
133 :     (@sims,@$x);
134 :     }
135 :     print NEW join("",@sims);
136 :     }
137 :     else
138 :     {
139 :     $sim = <OLD>;
140 :     }
141 : efrank 1.1 }
142 :     close(NEW);
143 :     close(OLD);
144 :     undef %sims_for;
145 :     }
146 : overbeek 1.2
147 : overbeek 1.9 foreach $new_sims (@ARGV)
148 : overbeek 1.2 {
149 : overbeek 1.11 &FIG::run("cp $new_sims $out_dir");
150 : overbeek 1.2 }
151 : efrank 1.1
152 :     sub rev_sim {
153 :     my($sim) = @_;
154 : overbeek 1.9
155 : overbeek 1.4 chomp $sim;
156 : efrank 1.1 my($id1,$id2,$iden,$ali_ln,$mismatches,$gaps,$b1,$e1,$b2,$e2,$psc,$bsc,$ln1,$ln2) = split(/\t/,$sim);
157 :     return join("\t",($id2,$id1,$iden,$ali_ln,$mismatches,$gaps,$b2,$e2,$b1,$e1,$psc,$bsc,$ln2,$ln1)) . "\n";
158 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3