[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.122, Mon Jun 28 20:11:14 2004 UTC revision 1.123, Thu Jul 1 21:17:53 2004 UTC
# Line 27  Line 27 
27  use Data::Dumper;  use Data::Dumper;
28  use Time::Local;  use Time::Local;
29  use File::Spec;  use File::Spec;
30    use File::Copy;
31  #  #
32  # Try to load the RPC stuff; it might fail on older versions of the software.  # Try to load the RPC stuff; it might fail on older versions of the software.
33  #  #
# Line 208  Line 208 
208    
209  usage: create_sim_askfor_pool()  usage: create_sim_askfor_pool()
210    
211  Creates an askfor pool, a snapshot of the current NR and  Creates an askfor pool, a snapshot of the current NR and similarity
212  similarity queue. Zeros out the old queue. We also create a  queue. Zeros out the old queue.
213  queue index, a file of records  
214    The askfor pool needs to keep track of which sequences need to be
215      offset    length    assigned_time    status  calculated, which have been handed out, etc. To simplify this task we
216    chunk the sequences into fairly small numbers (10-20 sequences) and
217  offset and length are 8 bytes, assigned time is a 4-byte integer  allocate work on a per-chunk basis. We make use of the relational
218  holding seconds-since-the-epoch of when this chunk was assigned, or  database to keep track of chunk status as well as the seek locations
219  0 if unassigned. Status is a 4-byte integer containing status information.  into the file of sequence data. The initial creation of the pool
220    involves indexing the sequence data with seek offsets and lengths and
221    populating the sim_askfor_index table with this information and with
222    initial status information.
223    
224  =cut  =cut
225    
226  sub create_sim_askfor_pool  sub create_sim_askfor_pool
227  {  {
228      my($self) = @_;      my($self, $chunk_size) = @_;
229    
230        $chunk_size = 15 unless $chunk_size =~ /^\d+$/;
231    
232      my $pool_dir = "$FIG_Config::global/sim_pools";      my $pool_dir = "$FIG_Config::global/sim_pools";
233      &verify_dir($pool_dir);      &verify_dir($pool_dir);
# Line 240  Line 245 
245          while (<$toc>)          while (<$toc>)
246          {          {
247              chomp;              chomp;
248              print STDERR "Have toc entry  $_\n";              # print STDERR "Have toc entry  $_\n";
249              my ($idx, $time, $str) = split(/\s+/, $_, 3);              my ($idx, $time, $str) = split(/\s+/, $_, 3);
250    
251              $num = max($num, $idx);              $num = max($num, $idx);
# Line 253  Line 258 
258      print $toc "$num ", time(), " New toc entry\n";      print $toc "$num ", time(), " New toc entry\n";
259      close($toc);      close($toc);
260    
261      my $cpool_dir = sprintf "$pool_dir/%04d", $num;      my $cpool_id = sprintf "%04d", $num;
262        my $cpool_dir = "$pool_dir/$cpool_id";
263    
264      #      #
265      # All set, create the directory for this pool.      # All set, create the directory for this pool.
# Line 268  Line 274 
274      #      #
275    
276      eval {      eval {
   
277          my $sim_q = "$FIG_Config::global/queued_similarities";          my $sim_q = "$FIG_Config::global/queued_similarities";
278    
279          &run("cp $sim_q $cpool_dir/q");          copy("$sim_q", "$cpool_dir/q");
280            copy("$FIG_Config::data/Global/nr", "$cpool_dir/nr");
281    
282          open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n";          open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n";
283          close(F);          close(F);
284        };
285    
286        unlink("$pool_dir/lockfile");
287        close($lock);
288    
289        #
290        # We've created our pool; we can now run the formatdb and
291        # extract the sequences for the blast run.
292        #
293        my $child_pid = $self->run_in_background(sub {
294            #
295            # Need to close db or there's all sorts of trouble.
296            #
297    
298            my $cmd = "$FIG_Config::ext_bin/formatdb -i $cpool_dir/nr -p T -l $cpool_dir/formatdb.log";
299            print "Will run '$cmd'\n";
300            &run($cmd);
301            print "finished. Logfile:\n";
302            print &FIG::file_read("$cpool_dir/formatdb.log");
303            unlink("$cpool_dir/formatdb.pid");
304         });
305        print "Running formatdb in background job $child_pid\n";
306        open(FPID, ">$cpool_dir/formatdb.pid");
307        print FPID "$child_pid\n";
308        close(FPID);
309    
310        my $db = $self->db_handle();
311        if (!$db->table_exists("sim_queue"))
312        {
313            $db->create_table(tbl => "sim_queue",
314                              flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " .
315                              "assigned BOOL, finished BOOL, output_file varchar(255), " .
316                              "assignment_expires INTEGER, worker_info varchar(255)"
317                             );
318        }
319    
320        #
321        # Write the fasta input file. Keep track of how many have been written,
322        # and write seek info into the database as appropriate.
323          #          #
324          # Create the index  
325        open(my $seq_fh, ">$cpool_dir/fasta.in");
326    
327        my($chunk_idx, $chunk_begin, $seq_idx);
328    
329        $chunk_idx = 0;
330        $chunk_begin = 0;
331        $seq_idx = 0;
332    
333        my(@seeks);
334    
335        open(my $q_fh, "<$cpool_dir/q");
336        while (my $id = <$q_fh>)
337        {
338            chomp $id;
339    
340            my $seq = $self->get_translation($id);
341    
342          #          #
343      };          # check if we're at the beginning of a chunk
344            #
345    
346      unlink("$pool_dir/lockfile");          print $seq_fh ">$id\n$seq\n";
     close($lock);  
347    
348            #
349            # Check if we're at the end of a chunk
350            #
351    
352            if ((($seq_idx + 1) % $chunk_size) == 0)
353            {
354                my $chunk_end = tell($seq_fh);
355                my $chunk_len = $chunk_end - $chunk_begin;
356    
357                push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]);
358                $chunk_idx++;
359                $chunk_begin = $chunk_end;
360            }
361            $seq_idx++;
362        }
363    
364        if ((($seq_idx) % $chunk_size) != 0)
365        {
366            my $chunk_end = tell($seq_fh);
367            my $chunk_len = $chunk_end - $chunk_begin;
368    
369            push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]);
370    
371            $chunk_idx++;
372            $chunk_begin = $chunk_end;
373        }
374    
375        close($q_fh);
376        close($seq_fh);
377    
378        print "Write seqs\n";
379    
380        for my $seek (@seeks)
381        {
382            my($cpool_id, $chunk_idx, $chunk_begin, $chunk_len) = @$seek;
383    
384            $db->SQL("insert into sim_queue (qid, chunk_id, seek, len, assigned, finished) " .
385                     "values('$cpool_id', $chunk_idx, $chunk_begin, $chunk_len, FALSE, FALSE)");
386        }
387    
388        return $cpool_id;
389    }
390    
391    =pod
392    
393    =head1 get_sim_queue
394    
395    usage: get_sim_queue($pool_id, $all_sims)
396    
397    Returns the sims in the given pool. If $all_sims is true, return the entire queue. Otherwise,
398    just return the sims awaiting processing.
399    
400    =cut
401    
402    sub get_sim_queue
403    {
404        my($self, $pool_id, $all_sims) = @_;
405    }
406    
407    =pod
408    
409    =head1 get_active_sim_pools
410    
411    usage: get_active_sim_pools()
412    
413    Return a list of the pool IDs for the sim processing queues that have entries awaiting
414    computation.
415    
416    =cut
417    
418    sub get_active_sim_pools
419    {
420        my($self) = @_;
421    
422        my $dbh = $self->db_handle();
423    
424        my $res = $dbh->SQL("select distinct qid from sim_queue where not finished");
425        return undef unless $res;
426    
427        return map { $_->[0] } @$res;
428    }
429    
430    =pod
431    
432    =head1 get_sim_pool_info
433    
434    usage: get_sim_pool_info($pool_id)
435    
436    Return information about the given sim pool. Return value
437    is a list ($total_entries, $n_finished, $n_assigned, $n_unassigned)
438    
439    =cut
440    
441    sub get_sim_pool_info
442    {
443        my($self, $pool_id) = @_;
444        my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned);
445    
446        $dbh = $self->db_handle();
447    
448        $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id'");
449        $total_entries = $res->[0]->[0];
450    
451        $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and finished");
452        $n_finished = $res->[0]->[0];
453    
454        $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and assigned and not finished");
455        $n_assigned = $res->[0]->[0];
456    
457        $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and not finished and not assigned");
458        $n_unassigned = $res->[0]->[0];
459    
460        return ($total_entries, $n_finished, $n_assigned, $n_unassigned);
461  }  }
462    
463  =pod  =pod
# Line 295  Line 468 
468    
469  Returns a chunk of $n_seqs of work.  Returns a chunk of $n_seqs of work.
470    
471    From Ross, about how sims are processed:
472    
473    Here is how I process them:
474    
475    
476        bash$ cd /Volumes/seed/olson/Sims/June22.out
477        bash$ for i in really*
478        > do
479        > cat  < $i >> /Volumes/laptop/new.sims
480        > done
481    
482    
483    Then, I need to "reformat" them by adding two columns to each one
484     and split the result into files of about 3M each. This I do using
485    
486    reduce_sims /Volumes/laptop/NR/NewNR/peg.synonyms.june21  300 < /Volumes/laptop/new.sims |
487        reformat_sims /Volumes/laptop/NR/NewNR/checked.nr.june21   > /Volumes/laptop/reformated.sims
488    rm /Volumes/laptop/new.sims
489    split_sims /Volumes/laptop/NewSims sims.june24  reformated.sims
490    rm reformated.sims
491    
492    
493  =cut  =cut
494  sub get_sim_chunk  sub get_sim_chunk
495  {  {
# Line 3641  Line 3836 
3836  sub load_all {  sub load_all {
3837      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
3838    
3839      &run("load_peg_mapping");      print STDERR "\nLoading SEED data\n\n";
3840      &run("index_contigs");  
3841      &run("compute_genome_counts");      my @packages = qw(load_peg_mapping
3842      &run("load_features");                        index_contigs
3843      &run("index_sims");                        compute_genome_counts
3844      &run("index_translations");                        load_features
3845      &run("add_assertions_of_function");                        index_sims
3846      &run("load_protein_families");                        index_translations
3847      &run("load_external_orgs");                        add_assertions_of_function
3848      &run("load_chromosomal_clusters");                        load_protein_families
3849      &run("load_pch_pins");                        load_external_orgs
3850      &run("index_neighborhoods");                        load_chromosomal_clusters
3851      &run("index_annotations");                        load_pch_pins
3852      &run("load_ec_names");                        index_neighborhoods
3853      &run("init_maps");                        index_annotations
3854      &run("load_kegg");                        load_ec_names
3855      &run("load_distances");                        init_maps
3856      &run("make_indexes");                        load_kegg
3857      &run("format_peg_dbs");                        load_distances
3858      &run("load_links");                        make_indexes
3859      &run("index_subsystems");                        format_peg_dbs
3860      &run("load_bbhs");                        load_links
3861                          index_subsystems
3862                          load_bbhs);
3863    
3864        my $pn = @packages;
3865        for my $i (0..@packages - 1)
3866        {
3867            my $i1 = $i + 1;
3868            my $pkg = $packages[$i];
3869    
3870    
3871            print "Running $pkg ($i1 of $pn)\n";
3872    
3873            &run($pkg);
3874        }
3875        print "\n\nLoad complete.\n\n";
3876  }  }
3877    
3878  ################################# Automated Assignments  ####################################  ################################# Automated Assignments  ####################################
# Line 6134  Line 6344 
6344    
6345  sub run_in_background  sub run_in_background
6346  {  {
6347      my($self, $coderef) = @_;      my($self, $coderef, $close_fds) = @_;
6348    
6349      if (ref($coderef) ne "CODE")      if (ref($coderef) ne "CODE")
6350      {      {
# Line 6160  Line 6370 
6370    
6371          POSIX::setsid();          POSIX::setsid();
6372    
6373            my $d = $self->db_handle();
6374            if ($d)
6375            {
6376                my $dbh = $d->{_dbh};
6377                $dbh->{InactiveDestroy} = 1;
6378            }
6379    
6380            if ($close_fds)
6381            {
6382                for (my $fd = 3; $fd < 32; $fd++)
6383                {
6384                    POSIX::close($fd);
6385                }
6386            }
6387    
6388          my $my_job_dir = "$job_dir/$$";          my $my_job_dir = "$job_dir/$$";
6389          verify_dir($my_job_dir);          verify_dir($my_job_dir);
6390    
# Line 6212  Line 6437 
6437          }          }
6438          close($stat);          close($stat);
6439    
6440            #
6441            # We need to undef this, otherwise the DBrtns destructor
6442            # will do an explicit dbh->disconnect, which will undo any
6443            # effect of the InactiveDestroy set above.
6444            #
6445    
6446            my $d = $self->db_handle();
6447            if ($d)
6448            {
6449                delete $d->{_dbh};
6450            }
6451    
6452          exit;          exit;
6453      }      }
6454    

Legend:
Removed from v.1.122  
changed lines
  Added in v.1.123

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3