[Bio] / FigKernelScripts / index_translations.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/index_translations.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (view) (download) (as text)

1 : efrank 1.1 # -*- perl -*-
2 : olson 1.15 #
3 : golsen 1.18 # Copyright (c) 2003-2008 University of Chicago and Fellowship
4 : olson 1.15 # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : golsen 1.19 #
20 :     # usage: index_translations # index genomes and NR proteins
21 :     # index_translations -n # as above, but include Global/nr
22 :     # index_translations G1 G2 G3 ... # index specified genomes, external
23 :     # # databases (e.g., KEGG), or nr
24 :     # # (for SEED Global/nr).
25 :     #
26 : golsen 1.10 # If there are arguments, then the database is NOT reinitialized, and only the
27 :     # designated new set of organisnms is indexed.
28 : efrank 1.1
29 :     use FIG;
30 : parrello 1.13 use Tracer;
31 :    
32 : efrank 1.1 my $fig = new FIG;
33 :    
34 : parrello 1.13 my $dbf = $fig->db_handle;
35 : golsen 1.10
36 :     my $fig_nr_dir = "$FIG_Config::data/NR";
37 : golsen 1.19 my $fig_nr_file = "$FIG_Config::global/nr";
38 : golsen 1.10 my $fig_org_dir = "$FIG_Config::organisms";
39 :     my $fig_tmp_dir = "$FIG_Config::temp";
40 :     my $seeks_file = "$fig_tmp_dir/translations_seeks.$$";
41 :    
42 : golsen 1.19 my $nr_flag = ( @ARGV && $ARGV[0] eq '-n' ) ? shift : '';
43 :    
44 : parrello 1.13 my $mode = (@ARGV == 0 ? 'all' : 'some');
45 :    
46 : golsen 1.10 #
47 :     # Build list of files to be processed
48 :     #
49 :    
50 : efrank 1.1 my @to_process = ();
51 :    
52 : parrello 1.13 if ( @ARGV == 0 ) {
53 : golsen 1.16 Trace("Reading non-redundancy directory.") if T(2);
54 : olson 1.14 opendir( NR, $fig_nr_dir ) || Confess("Could not open directory $fig_nr_dir.");
55 : golsen 1.10 my @dirs = map { "$fig_nr_dir/$_" } grep { $_ !~ /^\./ } readdir( NR );
56 :     closedir( NR );
57 : efrank 1.1
58 :     my $dir;
59 : parrello 1.13 foreach $dir ( @dirs ) {
60 : golsen 1.16 opendir( SUBDIR, $dir) || Confess("Could not open $dir");
61 :     my @files = grep { $_ !~ /^\./ } readdir( SUBDIR );
62 :     closedir( SUBDIR );
63 :     Trace("Examining $dir.") if T(3);
64 :     if ( @files == 1 ) {
65 :     push( @to_process, "$dir/$files[0]" );
66 :     } elsif ( ( my @tmp = grep { $_ =~ /fasta/ } @files ) == 1 ) {
67 :     push( @to_process, "$dir/$tmp[0]" );
68 :     } else {
69 : olson 1.23 push(@to_process, map { "$dir/$_" } grep { $_ =~ /fasta$/ && $_ ne 'md5.fasta' } @files);
70 : golsen 1.16 }
71 : efrank 1.1 }
72 :    
73 : golsen 1.10 my $fasta;
74 :     push @to_process, map { $fasta = "$fig_org_dir/$_/Features/peg/fasta";
75 :     -s $fasta ? $fasta : ()
76 :     } $fig->genomes;
77 : golsen 1.19
78 :     # push @to_process, $fig_nr_file if -r $fig_nr_file;
79 : parrello 1.13 } else {
80 : golsen 1.19 # Expand the idea of special cases from organisms to external files in
81 :     # the NR directory, and even the SEED nr database, Global/nr:
82 : golsen 1.10 my $fasta;
83 : golsen 1.19 @to_process = map { -s "$fig_org_dir/$_/Features/peg/fasta" ? "$fig_org_dir/$_/Features/peg/fasta"
84 :     : -s "$fig_nr_dir/$_/fasta" ? "$fig_nr_dir/$_/fasta"
85 :     : ( $_ eq 'nr' ) && -s $fig_nr_file ? $fig_nr_file
86 :     : ()
87 : golsen 1.10 } @ARGV;
88 :     }
89 : efrank 1.1
90 : golsen 1.10 #
91 :     # The Bulk of the Work: Index all the protein sequence files
92 :     #
93 :     # We must decide if we can do this with a faster, external program, or if
94 :     # we must do it in perl. If "index_translation_files" does not exist, the
95 :     # open will fail. If it succeeds, read the version number (-v option) and
96 :     # make sure that it is 2.00 (after stripping the new line). If this all
97 :     # works, then we will assume that this program can be called and given a
98 :     # list of protein sequence files to index.
99 :     #
100 : parrello 1.13 Trace("Checking translation method.") if T(2);
101 : golsen 1.16
102 : golsen 1.10 my ( $v, $protfilelist );
103 : overbeek 1.22 if ( open VERSION_PIPE, "$FIG_Config::bin/index_translation_files -v |"
104 : golsen 1.10 and $v = <VERSION_PIPE>
105 :     and close VERSION_PIPE
106 :     and chomp $v
107 :     and $v >= 2 and $v < 3
108 : parrello 1.13 ) {
109 : golsen 1.10 #
110 :     # Write the list of files to be indexed
111 :     #
112 :    
113 :     $protfilelist = "$fig_tmp_dir/translations_file_list.$$";
114 : parrello 1.13 Open(\*FILELIST, ">$protfilelist");
115 : golsen 1.10
116 :     my ( $file, $fileno );
117 : parrello 1.13 foreach $file ( @to_process ) {
118 : golsen 1.16 if ( $fileno = $fig->file2N( $file ) ) { print FILELIST "$fileno\t$file\n" }
119 : golsen 1.10 }
120 :     close( FILELIST );
121 :     }
122 :    
123 :     #
124 :     # It is now time to try to do the indexing. If that works, then we stick
125 :     # with this route. Otherwise we can still fall back to doing it in perl.
126 :     #
127 :     # Find the protein seeks, saving them in $seeks_file.
128 :     #
129 : golsen 1.16 # index_translation_files max_ids max_id_len [cksum_suffix_len (D=64)] < file_list > seek_size_and_cksum_info
130 : golsen 1.10 #
131 : parrello 1.13 Trace("Indexing files.") if T(2);
132 : golsen 1.16
133 : olson 1.23 my $max_id_per_file = 50_000_000; # Allocates id memory & hash slots
134 : golsen 1.17 my $max_id_len = 64; # Truncates and continues with log to STDERR
135 :     my $cksum_suffix_len = 64; # Locate same protein suffix
136 :    
137 : golsen 1.10 if ( $protfilelist
138 : overbeek 1.22 and system( "$FIG_Config::bin/index_translation_files $max_id_per_file $max_id_len $cksum_suffix_len < $protfilelist > $seeks_file" ) == 0
139 : golsen 1.16 )
140 :     {
141 : golsen 1.10 unlink( $protfilelist );
142 : golsen 1.16 }
143 :     else
144 :     {
145 :     #
146 :     # If that failed, do it with perl subroutine (without checksums)
147 :     #
148 : golsen 1.10 if ( $protfilelist && -e $protfilelist ) { unlink( $protfilelist ) }
149 :    
150 : golsen 1.18 index_translation_files( $fig, $seeks_file, @to_process );
151 : golsen 1.10 }
152 :    
153 :     #
154 : golsen 1.16 # Remove old translation seeks from database
155 : golsen 1.10 #
156 : parrello 1.13 my @fileNumbers = ();
157 : golsen 1.16 if ($mode eq 'some')
158 :     {
159 :     push @fileNumbers, map { $fig->file2N($_) } @to_process;
160 : efrank 1.1 }
161 :    
162 : parrello 1.13 # $table, $flds, $xflds, $fileName, $genomes
163 : golsen 1.16
164 :     $fig->reload_table( $mode, "protein_sequence_seeks",
165 : olson 1.21 "id varchar(64), fileno INTEGER, seek INTEGER, len INTEGER, "
166 : golsen 1.16 . "slen INTEGER, cksum INTEGER, sufcksum INTEGER",
167 : olson 1.24 { trans_id_ix => "id",
168 :     trans_cksum_ix => "cksum",
169 :     trans_fileno_ix => 'fileno',
170 :     trans_sufcksum_ix => "sufcksum" },
171 : golsen 1.16 $seeks_file, \@fileNumbers, 'fileno');
172 : golsen 1.10 unlink( $seeks_file );
173 :    
174 :     undef $fig;
175 : parrello 1.13 Trace("Translation indexing complete.") if T(2);
176 : golsen 1.10
177 : golsen 1.18 # Add MD5 index for each indexed genome
178 :    
179 :     system( "index_translations_MD5" . ( @ARGV ? join( ' ', '', @ARGV ) : '' ) );
180 :    
181 : golsen 1.10 exit;
182 : efrank 1.1
183 : golsen 1.10
184 :     #=============================================================================
185 :     # This is the perl and unix utilities version. For now, it does not do
186 : parrello 1.13 # sequence cksums due to the cost in perl.
187 : golsen 1.10 #=============================================================================
188 :    
189 : golsen 1.18 sub index_translation_files {
190 : golsen 1.10 my $fig = shift;
191 :     my $seeks_file = shift;
192 :    
193 : olson 1.11 #
194 : olson 1.14 # Test for solaris or linux and tac, since tail -r doesn't work there.
195 : olson 1.11 #
196 :    
197 :     my $rev_cmd = "tail -r";
198 : olson 1.23 my $osname = `uname -o`;
199 :     chomp $osname;
200 :     if ($FIG_Config::arch =~ /^linux/ or $FIG_Config::arch =~ /^solaris/ or
201 :     $osname =~ /linux/i)
202 : olson 1.14 {
203 : olson 1.23 for my $try (qw(/usr/bin/tac /bin/tac)) {
204 :     if (-x $try) {
205 :     $rev_cmd = $try;
206 :     last;
207 :     }
208 :     }
209 :     } elsif ($FIG_Config::arch =~ /^win/) {
210 :     $rev_cmd = "tac - ";
211 : olson 1.11 }
212 :    
213 : golsen 1.10 my ( $file, $fileno, $seek1, $seek2, $id, $last_id, $start_seek, $ln, $slen );
214 : parrello 1.13 foreach $file ( @_ ) {
215 : golsen 1.16 Trace("Indexing $file.") if T(3);
216 :     #
217 :     # Reverse each index (tail -r) and use sort -su to keep last version
218 :     # This requires opening a separate pipe for each input file (so that
219 :     # sort does not need to handle the concatenation of all the input).
220 :     #
221 :     if ( open( TRANS, "<$file" ) ) {
222 :     open( SEEKS, "| $rev_cmd | sort -su -k 1,1 >> $seeks_file" ) || die "aborted";
223 :     $fileno = $fig->file2N( $file );
224 :     $seek1 = tell TRANS;
225 :     while ( defined( $_ = <TRANS> ) ) {
226 :     $seek2 = tell TRANS;
227 :     if ( $_ =~ /^>(\S+)/ ) {
228 :     $id = $1;
229 :     $id =~ s/^([^|]+\|[^|]+)\|.*$/$1/;
230 :     if ( $last_id ) {
231 :     $ln = $seek1 - $start_seek;
232 :     if ( ( $ln > 10 ) && ( $slen > 10 ) ) {
233 :     print SEEKS "$last_id\t$fileno\t$start_seek\t$ln\t$slen\t0\t0\n";
234 :     }
235 :     # else
236 :     # {
237 :     # print STDERR "$last_id not loaded: ln=$ln slen=$slen\n";
238 :     # }
239 :     }
240 :     $last_id = $id;
241 :     $start_seek = $seek2;
242 :     $slen = 0;
243 :     } else {
244 :     $_ =~ s/\s+//g;
245 :     $slen += length( $_ );
246 :     }
247 :     $seek1 = $seek2;
248 :     }
249 :    
250 :     if ( $last_id ) {
251 :     $ln = $seek1 - $start_seek;
252 :     if ( ( $ln > 10 ) && ( $slen > 10 ) ) {
253 :     print SEEKS "$last_id\t$fileno\t$start_seek\t$ln\t$slen\t0\t0\n";
254 :     }
255 :     # else
256 :     # {
257 :     # print STDERR "$last_id not loaded: ln=$ln slen=$slen\n";
258 :     # }
259 :     }
260 :    
261 :     close( SEEKS );
262 :     close( TRANS );
263 :     } else {
264 :     print STDERR "*** could not open $file - ignoring its translations\n";
265 :     }
266 : overbeek 1.5 }
267 :     }
268 : efrank 1.1
269 :    
270 : golsen 1.10 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3