[Bio] / FortyEightMeta / load_sim_data_noncached.pl Repository:
ViewVC logotype

Annotation of /FortyEightMeta/load_sim_data_noncached.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : olson 1.1 #!/usr/bin/perl
2 :    
3 :     #
4 :     # Load sims data into the relational database for use in the frontend analysis software.
5 :     #
6 :     # We assume the incoming data is grouped by id1 (the fragment id), and within the groups
7 :     # it is sorted in order of increasing e-value. The data as returned from the underlying
8 :     # blast runs should have these properties.
9 :     #
10 :     # We analyze each group of hits in turn, assigning ranks within the group for
11 :     # the e-value ordering and the percent-identity ordering. We also look up
12 :     # the id2 values for each hit in order to assign a taxonomic identifier for the
13 :     # hit.
14 :     #
15 :     # For FIG hits, we expand the id2 into a fig| identifier if it is an xxx identifier, and look up
16 :     # a current subsystem membership for the id. If it has one, map the ss name into
17 :     # the "tax string" for that subsystem.
18 :     #
19 :    
20 :     use strict;
21 :     use DBI;
22 :     use Data::Dumper;
23 :     use FIG;
24 :     use FIG_Config;
25 :     use DB_File;
26 :     use File::Basename;
27 :     use Cwd 'abs_path';
28 :    
29 :     my $mysql = "$FIG_Config::ext_bin/mysql";
30 :     my $mysql_sock = "/var/run/mysqld/mysqld.sock";
31 :     my $dbname = "bobtest2";
32 :    
33 :     my $metagenome_data = "/vol/metagenome-48-hour/Data";
34 :     my $seed_data = "/vol/seed-data-anno-mirror/Data.Jan3/Organisms";
35 :     my $pegsyn = "/vol/48-hour/Data/peg.synonyms.index.t";
36 :    
37 :     my $dbh = DBI->connect("dbi:mysql:$dbname;mysql_socket=$mysql_sock;mysql_local_infile=1", "olson");
38 :    
39 :     my $jobdir = abs_path(".");
40 :    
41 :     my $job = basename($jobdir);
42 :    
43 :    
44 :     my $fig = new FIG;
45 :     my $ss_lookup = $fig->db_handle->{_dbh}->prepare(qq(SELECT subsystem FROM subsystem_index WHERE protein = ?));
46 :    
47 :     my $tbl = create_table($job);
48 :     print "Created $tbl\n";
49 :    
50 :     my $fifo = "/tmp/load.fifo.$$";
51 :    
52 :     if (system('mkfifo', $fifo) != 0)
53 :     {
54 :     die "error creating FIFO\n";
55 :     }
56 :    
57 :     my $cmdfile = "/tmp/load.cmd.$$";
58 :     open(C, ">$cmdfile") or die "cannot write $cmdfile: $!";
59 :     print C "LOAD DATA LOCAL INFILE '$fifo' INTO TABLE $tbl FIELDS ESCAPED BY '' ENCLOSED BY ''\n";
60 :     close(C);
61 :    
62 :     #my $load_file = "/tmp/load_rdp.$$";
63 :     #open(L, ">$load_file") or die "cannot write $load_file: $!";
64 :    
65 :     my $load_pid = fork;
66 :    
67 :     if ($load_pid == 0)
68 :     {
69 :     my $cmd = "$mysql --local-infile -S $mysql_sock $dbname < $cmdfile";
70 :     print "Child running $cmd\n";
71 :     my $rc = system($cmd);
72 :     print "Child finished running $cmd with rc=$rc\n";
73 :     exit;
74 :     }
75 :     sleep(1);
76 :    
77 :     open(L, ">$fifo") or die "Cannot write to fifo $fifo: $!\n";
78 :    
79 :     print "Started child $load_pid\n";
80 :    
81 :     my $l10 = 10 / log(10);
82 :     my ($maxlen1, $maxlen2, $maxlen3);
83 :    
84 :     #
85 :     # sim-block columns to rank by.
86 :     # column index is as pushed into the @block in the sim processing loops.
87 :     # third column is 1 for ascending, -1 for descending.
88 :     #
89 :     my @rank = ([1, 'rank_iden', -1],
90 :     [6, 'rank_logpsc', 1],
91 :     );
92 :    
93 :     #load_seed("t");
94 :    
95 :     if (1)
96 :     {
97 :     load_seed("$jobdir/proc/sims.seed.raw");
98 :    
99 :     load_rdp('gg', "$jobdir/proc/sims.gg.raw");
100 :     load_rdp('ssu', "$jobdir/proc/sims.ssu.raw");
101 :     load_rdp('lsu', "$jobdir/proc/sims.lsu.raw");
102 :     }
103 :     close(L);
104 :    
105 :     print "Waiting for child $load_pid to finish\n";
106 :     my $proc = waitpid($load_pid, 0);
107 :     print "Child $proc finished with retcode $?\n";
108 :     exit;
109 :    
110 :     #my $tbl = create_table($job, $maxlen1, $maxlen2, $maxlen3);
111 :    
112 :     #print "Loading into table $tbl\n";
113 :     #my $n = $dbh->do(qq(LOAD DATA INFILE '$load_file' INTO TABLE $tbl FIELDS ESCAPED BY '' ENCLOSED BY ''));
114 :     #print "load returns $n\n";
115 :    
116 :     sub load_rdp
117 :     {
118 :     my($db, $file) = @_;
119 :    
120 :     my $ident = get_db_ident($db);
121 :    
122 :     my $lookup_sth = $dbh->prepare(qq(SELECT tax_str
123 :     FROM rdp_to_tax
124 :     WHERE dbid = $ident AND seq_num = ?));
125 :    
126 :     open(F, "<$file") or die "cannot open $file: $!";
127 :    
128 :     print "Load $file\n";
129 :    
130 :     my $last_id;
131 :     my @block;
132 :     while (<F>)
133 :     {
134 :     chomp;
135 :     my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);
136 :     $lookup_sth->execute($id2);
137 :     my $res = $lookup_sth->fetchrow_arrayref();
138 :     if ($res)
139 :     {
140 :     my $tstr = $res->[0];
141 :    
142 :     update_maxlen($id1, $id2, $tstr);
143 :    
144 :     if ($id1 ne $last_id)
145 :     {
146 :     handle_block($ident, $last_id, \@block) if @block;
147 :     @block = ();
148 :     $last_id = $id1;
149 :     }
150 :     my(@g) = split(/:/, $tstr, 4);
151 :     my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
152 :     push(@block, [$id2, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2]]);
153 :     }
154 :     }
155 :     handle_block($ident, $last_id, \@block);
156 :     close(F);
157 :     }
158 :    
159 :     #
160 :     # Process a block of sims from the same id1.
161 :     #
162 :     # We sort by each of the fields we are interested in ranking on
163 :     # (pct identity, bitscore, pscore) and store that rank as well.
164 :     #
165 :     # We actually don't have to sort on bitscore/pscore since we assume
166 :     # our data comes in that way. Those ranks are just the indexes of the
167 :     # rows in the block.
168 :     #
169 :     sub handle_block
170 :     {
171 :     my($dbid, $id1, $block) = @_;
172 :    
173 :     # return if @$block < 10 || $block->[0]->[1] > 90;
174 :    
175 :     my @r;
176 :     for my $rent (@rank)
177 :     {
178 :     my($col_idx, $col_name, $direction) = @$rent;
179 :    
180 :     # print "@$rent\n";
181 :     my @s = sort { $direction * ($a->[$col_idx] <=> $b->[$col_idx]) } @$block;
182 :     map { push(@{$s[$_]}, $_) } 0..$#s;
183 :    
184 :     # for my $i (0..$#s)
185 :     # {
186 :     # print join("\t", $i, @{$s[$i]}, @{$block->[$i]}), "\n";
187 :     # }
188 :     }
189 :    
190 :     for my $i (0..$#$block)
191 :     {
192 :     print L join("\t", $dbid, $id1, @{$block->[$i]}), "\n";
193 :     }
194 :     }
195 :    
196 :     sub load_seed
197 :     {
198 :     my($file) = @_;
199 :    
200 :     my $ident = get_db_ident('seed');
201 :    
202 :     my $lookup_sth = $dbh->prepare(qq(SELECT tax_str
203 :     FROM rdp_to_tax
204 :     WHERE dbid = $ident AND seq_num = ?));
205 :     my $ss_lookup_tax = $dbh->prepare(qq(SELECT tax_str
206 :     FROM ss_to_tax
207 :     WHERE name = ?));
208 :    
209 :     open(F, "<$file") or die "cannot open $file: $!";
210 :     print "load $file\n";
211 :    
212 :     my %ps;
213 :     my $tied = tie %ps, 'DB_File', $pegsyn, O_RDONLY, 0, $DB_BTREE;
214 :     $tied or die "cannot tie $pegsyn: $!";
215 :    
216 :     my $last_id;
217 :     my @block;
218 :     while (<F>)
219 :     {
220 :     chomp;
221 :     my($id1, $id2, $iden, $ali, $mismatch, $gaps, $b1, $e1, $b2, $e2, $psc, $bsc) = split(/\t/);
222 :    
223 :     my $xid = $id2;
224 :     if ($id2 =~ /^xxx/)
225 :     {
226 :     $xid = $ps{$id2};
227 :     }
228 :     if ($xid =~ /(fig\|(\d+\.\d+)\.peg\.\d+)/)
229 :     {
230 :     my $peg = $1;
231 :     my $genome = $2;
232 :     # print "Mapped $id2 to $xid to $peg to $genome\n";
233 :    
234 :     my($ss, $ss_tax);
235 :    
236 :     $ss_lookup->execute($peg);
237 :     my $sres = $ss_lookup->fetchrow_arrayref();
238 :     if ($sres)
239 :     {
240 :     $ss = $sres->[0];
241 :     $ss_lookup_tax->execute($ss);
242 :     my $ssres = $ss_lookup_tax->fetchrow_arrayref();
243 :     $ss_tax = $ssres->[0] if $ssres;
244 :     # print "Got ss $ss $ss_tax\n";
245 :     }
246 :    
247 :     $lookup_sth->execute($genome);
248 :     my $res = $lookup_sth->fetchrow_arrayref();
249 :     if ($res)
250 :     {
251 :     my $tstr = $res->[0];
252 :     update_maxlen($id1, $peg, $tstr);
253 :    
254 :     if ($id1 ne $last_id)
255 :     {
256 :     handle_block($ident, $last_id, \@block);
257 :     @block = ();
258 :     $last_id = $id1;
259 :     }
260 :    
261 :     my $lpsc = $psc == 0.0 ? -3500 : int($l10 * log($psc));
262 :     my(@g) = split(/:/, $tstr, 4);
263 :     my(@s) = split(/:/, $ss_tax, 4);
264 :     push(@block, [$peg, $iden, $ali, $b1, $e1, $b2, $e2, $lpsc, $bsc, $tstr, @g[0..2], $ss_tax, @s[0..2]]);
265 :     }
266 :     }
267 :     # last if $. > 100;
268 :     }
269 :     handle_block($ident, $last_id, \@block);
270 :     close(F);
271 :     }
272 :    
273 :    
274 :     sub get_db_ident
275 :     {
276 :     my($str) = @_;
277 :     my $res = $dbh->selectcol_arrayref(qq(SELECT dbid
278 :     FROM db_type
279 :     WHERE name = ?), undef, $str);
280 :     if (@$res)
281 :     {
282 :     return $res->[0];
283 :     }
284 :    
285 :     $dbh->do(qq(INSERT INTO db_type (name) VALUES (?)), undef, $str);
286 :     my $id = $dbh->{mysql_insertid};
287 :     return $id;
288 :     }
289 :    
290 :     #
291 :     # Create a sims data table for the given job.
292 :     #
293 :     sub create_table
294 :     {
295 :     my($job) = @_;
296 :    
297 :     my $tbl_name = "tax_sim_${job}_a";
298 :    
299 :     $dbh->do(qq(DROP TABLE IF EXISTS $tbl_name));
300 :     my $qry = qq(CREATE TABLE $tbl_name
301 :     (
302 :     dbid TINYINT,
303 :     id1 CHAR(32),
304 :     id2 CHAR(32),
305 :     iden TINYINT UNSIGNED,
306 :     ali_ln SMALLINT UNSIGNED,
307 :     b1 SMALLINT UNSIGNED,
308 :     e1 SMALLINT UNSIGNED,
309 :     b2 SMALLINT UNSIGNED,
310 :     e2 SMALLINT UNSIGNED,
311 :     logpsc SMALLINT,
312 :     bsc SMALLINT UNSIGNED,
313 :     tax_str CHAR(160),
314 :     tax_group_1 CHAR(2),
315 :     tax_group_2 CHAR(2),
316 :     tax_group_3 CHAR(2),
317 :     ss_str CHAR(12),
318 :     ss_group_1 CHAR(2),
319 :     ss_group_2 CHAR(2),
320 :     ss_group_3 CHAR(2),
321 :     rank_iden TINYINT UNSIGNED,
322 :     rank_psc TINYINT UNSIGNED,
323 :     KEY(id1),
324 :     KEY(id2),
325 :     KEY(iden),
326 :     KEY(ali_ln),
327 :     KEY(logpsc),
328 :     KEY(bsc),
329 :     KEY(rank_iden),
330 :     KEY(rank_psc),
331 :     KEY(tax_str),
332 :     KEY(tax_group_1, tax_group_2, tax_group_3),
333 :     KEY(ss_str),
334 :     KEY(ss_group_1, ss_group_2, ss_group_3)
335 :     ) ENGINE = InnoDB;
336 :     );
337 :    
338 :     print $qry;
339 :    
340 :     $dbh->do($qry);
341 :    
342 :     return $tbl_name;
343 :     }
344 :    
345 :     sub update_maxlen
346 :     {
347 :     my $l1 = length($_[0]);
348 :     my $l2 = length($_[1]);
349 :     my $l3 = length($_[2]);
350 :     $maxlen1 = $l1 if $l1 > $maxlen1;
351 :     $maxlen2 = $l2 if $l2 > $maxlen2;
352 :     $maxlen3 = $l3 if $l3 > $maxlen3;
353 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3