[Bio] / FigKernelPackages / P2P.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/P2P.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.20, Thu Jan 6 04:37:54 2005 UTC revision 1.21, Fri Jan 7 18:54:45 2005 UTC
# Line 36  Line 36 
36  our $ns_p2p = "http://thefig.info/schemas/p2p_update";  our $ns_p2p = "http://thefig.info/schemas/p2p_update";
37  our $ns_relay = "http://thefig.info/schemas/p2p_relay";  our $ns_relay = "http://thefig.info/schemas/p2p_relay";
38    
39    my $peg_batch_size = 1000;
40    my $anno_batch_size = 1000;
41    my $assign_batch_size = 1000;
42    
43  =pod  =pod
44    
45  =head1 perform_update($peer)  =head1 perform_update($peer)
# Line 90  Line 94 
94      }      }
95    
96      #      #
97      # We have  the information now to begin the update process. Retrieve the pegs.      # peg_mapping is the local mapping from remote->local peg. This might
98      #      # be replacable by peg_cache from above.
   
     $ret = $peer->get_pegs($session, 0, $num_pegs);  
   
     if (!$ret or ref($ret) ne "ARRAY")  
     {  
         die "perform_update: get_pegs failed\n";  
     }  
   
     my($peg_list, $genome_list) = @$ret;  
   
     #  
     # Walk the peg-list to and generate @pegs_to_finalize.  
     #  
   
     my(%peg_mapping, %genome_map );  
   
     for my $peg_info (@$peg_list)  
     {  
         my($key, $peg, @rest) = @$peg_info;  
   
         if ($key eq 'peg')  
         {  
             #  
             # Peg id is directly usable.  
             #  
             $peg_mapping{$peg} = $peg;  
         }  
         elsif ($key eq 'peg_info')  
         {  
             #  
             # Peg id not directly usable. See if we have it in the cache.  
             #  
   
             if ((my $cached = $peg_cache{$peg}) ne "")  
             {  
                 #  
                 # Cool, we've cached the result. Use it.  
                 #  
   
                 $peg_mapping{$peg} = $cached;  
                 warn "Found cached mapping $peg => $cached\n";  
                 next;  
             }  
   
             my($alias_list, $genome_id) = @rest;  
   
             for my $alias (@$alias_list)  
             {  
                 my $mapped = $fig->by_alias($alias);  
                 if ($mapped)  
                 {  
                     print "$peg maps to $mapped via $alias\n";  
                     $peg_mapping{$peg}= $mapped;  
                     $peg_cache{$peg} = $mapped;  
                     last;  
                 }  
             }  
   
             #  
             # If we didn't succeed in mapping by alias,  
             # stash this in the list of pegs to be mapped by  
             # genome.  
             #  
   
             if (!defined($peg_mapping{$peg}))  
             {  
                 push(@{$genome_map{$genome_id}}, $peg);  
                 print "$peg did not map\n";  
             }  
         }  
     }  
   
     $cache_handle->sync();  
   
     #  
     # finished first pass. Now go over the per-genome mappings that need to be made.  
     #  
     # $genome_map{$genome_id} is a list of pegs that reside on that genome.  
     # the pegs and genome id are both target-based identifiers.  
     #  
   
     my @finalize_req = ();  
     my %local_genome;  
   
     for my $genome_info (@$genome_list)  
     {  
         my($genome, $n_contigs, $n_nucs, $cksum) = @$genome_info;  
   
         next unless defined($genome_map{$genome});  
   
         #  
         # Determine if we have a local genome installed that matches precisely the  
         # genome on the target side.  
         #  
         my $my_genome = $fig->find_genome_by_content($genome, $n_contigs, $n_nucs, $cksum);  
   
         my $pegs = $genome_map{$genome};  
   
         if ($my_genome)  
         {  
             #  
             # We do have such a local genome. Generate a peg_genome request to  
             # get the location information from the target side.  
             #  
             # Also remember the local genome mapping for this peg.  
             #  
   
             print "$genome mapped to $my_genome\n";  
             for my $peg (@$pegs)  
             {  
                 push(@finalize_req, ['peg_genome', $peg]);  
                 $local_genome{$peg} = $my_genome;  
             }  
   
         }  
         else  
         {  
             #  
             # We don't have such a genome. We need to retrieve the  
             # sequence data in order to finish mapping.  
             #  
             push(@finalize_req, map { ['peg_unknown', $_] } @$pegs);  
         }  
     }  
   
     #  
     # If we need to finalize, make the call.  
     if (@finalize_req)  
     {  
         # print Dumper(\@finalize_req);  
         $ret = $peer->finalize_pegs($session, \@finalize_req);  
   
         if (!$ret or ref($ret) ne "ARRAY")  
         {  
             die "perform_update: finalize_pegs failed\n";  
         }  
   
         #  
         # The return is a list of either location entries or  
         # sequence data. Attempt to finish up the mapping.  
         #  
   
         my(%sought, %sought_seq);  
   
   
         my $dbh = $fig->db_handle();  
         for my $entry (@$ret)  
         {  
             my($what, $peg, @rest) = @$entry;  
   
             if ($what eq "peg_loc")  
             {  
                 my($strand, $start, $end, $cksum, $seq) = @rest;  
   
                 #  
                 # We have a contig location. Try to find a matching contig  
                 # here, and see if it maps to something.  
99                  #                  #
100        my %peg_mapping;
101    
                 my $my_genome = $local_genome{$peg};  
                 my $local_contig = $fig->find_contig_with_checksum($my_genome, $cksum);  
                 if ($local_contig)  
                 {  
                     #  
                     # Now look up the local peg. We match on the end location; depending on the strand  
                     # the feature is on, we want to look at either minloc or maxloc.  
                     #  
   
                     my $whichloc = $strand eq '-' ? "minloc" : "maxloc";  
   
                     my $res = $dbh->SQL(qq!SELECT id from features  
                                            WHERE $whichloc = $end and genome = '$my_genome' and  
                                            contig = '$local_contig'  
                                         !);  
   
                     if ($res and @$res > 0)  
                     {  
                         my(@ids) = map { $_->[0] } @$res;  
                         my $id = $ids[0];  
                         $peg_mapping{$peg} = $id;  
                         $peg_cache{$peg} = $id;  
                         print "Mapped $peg to $id via contigs\n";  
                         if (@$res > 1)  
                         {  
                             warn "Multiple mappings found for $peg: @ids\n";  
                         }  
                     }  
                     else  
                     {  
                         print "failed: $peg  $my_genome and contig $local_contig start=$start end=$end strand=$strand\n";  
                         $sought{$peg}++;  
                         $sought_seq{$peg} = $seq;  
                     }  
                 }  
                 else  
                 {  
                     print "Mapping failed for $my_genome checksum $cksum\n";  
                     $sought{$peg}++;  
                     $sought_seq{$peg} = $seq;  
                 }  
             }  
             elsif ($what eq "peg_seq")  
             {  
                 my($seq) = @rest;  
   
                 $sought{$peg}++;  
                 $sought_seq{$peg} = $seq;  
             }  
         }  
102    
103          #          #
104          # Now see if we need to do a tough search.      # We have  the information now to begin the update process. Retrieve the pegs.
105          #          #
106    
107          if (keys(%sought) > 0 and !$skip_tough_search)      _compute_peg_mapping($fig, $peer, $session, $num_pegs, \%peg_mapping, \%peg_cache, $cache_handle,
108          {                           $skip_tough_search);
             my %trans;  
   
             print "Starting tough search\n";  
109    
             $fig->tough_search(undef, \%sought_seq, \%trans, \%sought);  
             print "Tough search translated: \n";  
             while (my($tpeg, $ttrans) = each(%trans))  
             {  
                 print "  $tpeg -> $ttrans\n";  
                 $peg_mapping{$tpeg} = $ttrans;  
                 $peg_cache{$tpeg} = $ttrans;  
             }  
         }  
     }  
110      $cache_handle->sync();      $cache_handle->sync();
111      untie %peg_cache;      untie %peg_cache;
112    
113      #      #
     # Retrieve the assignments.  
     #  
   
     my $assignments = $peer->get_assignments($session, 0, $num_assignments);  
   
     #  
     # Retrieve the annotations, and generate a list of mapped annotations.  
     #  
   
     my $annos = $peer->get_annotations($session, 0, $num_annos);  
   
     #  
114      # Create a list of locally-mapped annotations on a per-genome      # Create a list of locally-mapped annotations on a per-genome
115      # basis.      # basis.
116      #      #
# Line 353  Line 126 
126      #      #
127      my %genome_assignments;      my %genome_assignments;
128    
129    
130    
131        #
132        # Retrieve the annotations, and generate a list of mapped annotations.
133        #
134    
135        for (my $anno_start = 0; $anno_start < $num_annos; $anno_start += $anno_batch_size)
136        {
137            my $anno_req_len = $num_annos - $anno_start;
138            $anno_req_len = $anno_batch_size if $anno_req_len > $anno_batch_size;
139    
140            print "Retrieve $anno_req_len annos at $anno_start\n";
141    
142            my $annos = $peer->get_annotations($session, $anno_start, $anno_req_len);
143    
144      for my $anno (@$annos)      for my $anno (@$annos)
145      {      {
146          my($his_id, $ts, $author, $anno) = @$anno;          my($his_id, $ts, $author, $anno) = @$anno;
# Line 364  Line 152 
152    
153          push(@{$genome_annos{$genome}}, [$my_id, $ts, $author, $anno]);          push(@{$genome_annos{$genome}}, [$my_id, $ts, $author, $anno]);
154      }      }
155        }
156    
157      #      #
158      # Do the same for the assignments      # Do the same for the assignments
# Line 371  Line 160 
160    
161      # print Dumper($assignments);      # print Dumper($assignments);
162    
163    
164        for (my $assign_start = 0; $assign_start < $num_assignments; $assign_start += $assign_batch_size)
165        {
166            my $assign_req_len = $num_assignments - $assign_start;
167            $assign_req_len = $assign_batch_size if $assign_req_len > $assign_batch_size;
168    
169            print "Retrieve $assign_req_len assigns at $assign_start\n";
170    
171            my $assignments = $peer->get_assignments($session, $assign_start, $assign_req_len);
172    
173      for my $assign (@$assignments)      for my $assign (@$assignments)
174      {      {
175          my($his_id, $ts, $author, $func) = @$assign;          my($his_id, $ts, $author, $func) = @$assign;
# Line 381  Line 180 
180          my $genome = $fig->genome_of($my_id);          my $genome = $fig->genome_of($my_id);
181    
182          $genome_assignments{$genome}->{$my_id} =  [$my_id, $ts, $author, $func];          $genome_assignments{$genome}->{$my_id} =  [$my_id, $ts, $author, $func];
183            }
   
184      }      }
185    
186      # print Dumper(\%genome_annos);      # print Dumper(\%genome_annos);
# Line 525  Line 323 
323                  my $old = $fig->function_of($peg, 'master');                  my $old = $fig->function_of($peg, 'master');
324                  print $old_assignments "$peg\t$old\n";                  print $old_assignments "$peg\t$old\n";
325    
326                    if ($old ne $func)
327                    {
328                  print "Assign $peg $func\n";                  print "Assign $peg $func\n";
329                  $fig->assign_function($peg, 'master', $func);                  $fig->assign_function($peg, 'master', $func);
330              }              }
331          }          }
332            }
333    
334          open(my $outfh, ">$anno_file") or die "Cannot open new annotation file $anno_file: $!\n";          open(my $outfh, ">$anno_file") or die "Cannot open new annotation file $anno_file: $!\n";
335    
# Line 548  Line 349 
349              {              {
350                  print $outfh "$txt\n//\n";                  print $outfh "$txt\n//\n";
351                  $last = $txt;                  $last = $txt;
352                  print "Inst $ann->[0] $ann->[1] $ann->[2]\n";                  # print "Inst $ann->[0] $ann->[1] $ann->[2]\n";
353                  $inst++;                  $inst++;
354              }              }
355              else              else
356              {              {
357                  print "Dup $ann->[0] $ann->[1] $ann->[2]\n";                  # print "Dup $ann->[0] $ann->[1] $ann->[2]\n";
358                  $dup++;                  $dup++;
359              }              }
360          }          }
# Line 567  Line 368 
368      close($old_assignments);      close($old_assignments);
369  }  }
370    
371    #
372    # Compute the peg mapping for a session.
373    #
374    # $fig          Active FIG instance
375    # $peer         P2P peer for this session.
376    # $session      P2P session ID
377    # $peg_mapping  Hash ref for the remote -> local PEG mapping
378    # $peg_cache    Hash ref for the persistent remote -> local PEG mapping cache db.
379    # $cache_handle DB_File handle corresponding to $peg_cache.
380    #
381    sub _compute_peg_mapping
382    {
383        my($fig, $peer, $session, $num_pegs, $peg_mapping, $peg_cache, $cache_handle, $skip_tough_search) = @_;
384    
385        #
386        # genome_map is a hash mapping from target genome id to a list of
387        # pegs on the target. This is used to construct a finalize_pegs request after
388        # the first phase of peg mapping.
389        #
390    
391        my %genome_map;
392    
393        #
394        # target_genome_info is a hash mapping from target genome
395        # identifier to the target-side information on the genome -
396        # number of contigs, number of nucleotides, checksum.
397        #
398        # We accumulate it here across possibly multiple batches of
399        # peg retrievals in order to create a single  finalization
400        # list.
401        #
402    
403        my %target_genome_info;
404    
405        #
406        # For very large transfers, we need to batch the peg processing.
407        #
408    
409        for (my $peg_start = 0; $peg_start < $num_pegs; $peg_start += $peg_batch_size)
410        {
411            my $peg_req_len = $num_pegs - $peg_start;
412            $peg_req_len = $peg_batch_size if $peg_req_len > $peg_batch_size;
413    
414            print "Getting $peg_req_len pegs at $peg_start\n";
415            my $ret = $peer->get_pegs($session, $peg_start, $peg_req_len);
416    
417            if (!$ret or ref($ret) ne "ARRAY")
418            {
419                die "perform_update: get_pegs failed\n";
420            }
421    
422            my($peg_list, $genome_list) = @$ret;
423    
424            for my $gent (@$genome_list)
425            {
426                $target_genome_info{$gent->[0]} = $gent;
427            }
428    
429            _compute_peg_mapping_batch($fig, $peer, $session, $peg_mapping, $peg_cache, $cache_handle,
430                                       $peg_list, \%genome_map);
431        }
432    
433        #
434        # We have finished first pass. Now go over the per-genome mappings that need to be made.
435        #
436        # $genome_map{$genome_id} is a list of pegs that reside on that genome.
437        # The pegs and genome id are both target-based identifiers.
438        #
439        # %target_genome_info defines the list of genome information we have on the remote
440        # side.
441        #
442        # We build a request to be passed to finalize_pegs. Each entry in the request is either
443        # ['peg_genome', $peg] which means that we have a genome that corresponds to the
444        # genome the peg is in. We can attempt to map via contig locations.
445        #
446        # If that is not the case,  we pass a request entry of ['peg_unknown', $peg]
447        # which will result in the sequence data being returned.
448        #
449    
450        my @finalize_req = ();
451    
452        #
453        # local_genome maps a target peg identifier to the local genome id it translates to.
454        #
455        my %local_genome;
456    
457        for my $genome (keys(%target_genome_info))
458        {
459            my($tg, $n_contigs, $n_nucs, $cksum) = @{$target_genome_info{$genome}};
460    
461            $tg eq $genome or die "Invalid entry in target_genome_info for $genome => $tg, $n_contigs, $n_nucs, $cksum";
462    
463            #
464            # Don't bother unless we have any pegs to look up.
465            #
466            next unless defined($genome_map{$genome});
467    
468            #
469            # Determine if we have a local genome installed that matches precisely the
470            # genome on the target side.
471            #
472            my $my_genome = $fig->find_genome_by_content($genome, $n_contigs, $n_nucs, $cksum);
473    
474            my $pegs = $genome_map{$genome};
475    
476            if ($my_genome)
477            {
478                #
479                # We do have such a local genome. Generate a peg_genome request to
480                # get the location information from the target side.
481                #
482                # Also remember the local genome mapping for this peg.
483                #
484    
485                print "$genome mapped to $my_genome\n";
486                for my $peg (@$pegs)
487                {
488                    push(@finalize_req, ['peg_genome', $peg]);
489                    $local_genome{$peg} = $my_genome;
490                }
491    
492            }
493            else
494            {
495                #
496                # We don't have such a genome. We need to retrieve the
497                # sequence data in order to finish mapping.
498                #
499                push(@finalize_req, map { ['peg_unknown', $_] } @$pegs);
500            }
501        }
502    
503        #
504        # We've built our finalization request. Handle it (possibly with batching here too).
505        #
506    
507        _process_finalization_request($fig, $peer, $session, $peg_mapping, $peg_cache, $cache_handle,
508                                     \%local_genome, \@finalize_req, $skip_tough_search);
509    
510    }
511    
512    #
513    # Process one batch of PEGs.
514    #
515    # Same args as _compute_peg_mapping, with the addition of:
516    #
517    #       $peg_list       List of pegs to be processed
518    #       $genome_map     Hash maintaining list of genomes with their pegs.
519    #       $target_genome_info     Hash maintaining overall list of target-side genome information.
520    #
521    sub _compute_peg_mapping_batch
522    {
523        my($fig, $peer, $session, $peg_mapping, $peg_cache, $cache_handle,
524           $peg_list, $genome_map, $target_genome_info) = @_;
525    
526        #
527        # Walk the list of pegs as returned from get_pegs() and determine what has to
528        # be done.
529        #
530        # If the entry is ['peg', $peg], we can use the peg ID as is.
531        #
532        # If the entry is ['peg_info', $peg, $alias_list, $genome], the peg
533        # has the given aliases, and is in the given genome.
534        #
535        for my $peg_info (@$peg_list)
536        {
537            my($key, $peg, @rest) = @$peg_info;
538    
539            if ($key eq 'peg')
540            {
541                #
542                # Peg id is directly usable.
543                #
544                $peg_mapping->{$peg} = $peg;
545            }
546            elsif ($key eq 'peg_info')
547            {
548                #
549                # Peg id not directly usable. See if we have it in the cache.
550                #
551    
552                if ((my $cached = $peg_cache->{$peg}) ne "")
553                {
554                    #
555                    # Cool, we've cached the result. Use it.
556                    #
557    
558                    $peg_mapping->{$peg} = $cached;
559                    # warn "Found cached mapping $peg => $cached\n";
560                    next;
561                }
562    
563                #
564                # It is not cached. Attempt to resolve by means of alias IDs.
565                #
566    
567                my($alias_list, $genome_id) = @rest;
568    
569                for my $alias (@$alias_list)
570                {
571                    my $mapped = $fig->by_alias($alias);
572                    if ($mapped)
573                    {
574                        print "$peg maps to $mapped via $alias\n";
575                        $peg_mapping->{$peg}= $mapped;
576                        $peg_cache->{$peg} = $mapped;
577                        last;
578                    }
579                }
580    
581                #
582                # If we weren't able to resolve by ID,
583                # add to %genome_map as a PEG that will need
584                # to be resolved by means of contig location.
585                #
586    
587                if (!defined($peg_mapping->{$peg}))
588                {
589                    push(@{$genome_map->{$genome_id}}, $peg);
590                    print "$peg did not map on first pass\n";
591                }
592            }
593        }
594    
595        #
596        # Flush the cache to write out any computed mappings.
597        #
598        $cache_handle->sync();
599    
600    }
601    
602    sub _process_finalization_request
603    {
604        my($fig, $peer, $session, $peg_mapping, $peg_cache, $cache_handle,
605           $local_genome, $finalize_req, $skip_tough_search) = @_;
606    
607        #
608        # Immediately return unless there's something to do.
609        #
610        return unless ref($finalize_req) and @$finalize_req > 0;
611    
612        my $fin_batch_size = 50;
613    
614        while (@$finalize_req > 0)
615        {
616            my @req = splice(@$finalize_req, 0, $fin_batch_size);
617    
618            print "Invoking finalize_pegs on ", int(@req), " pegs\n";
619            my $ret = $peer->finalize_pegs($session, \@req);
620    
621            if (!$ret or ref($ret) ne "ARRAY")
622            {
623                die "perform_update: finalize_pegs failed\n";
624            }
625    
626            #
627            # The return is a list of either location entries or
628            # sequence data. Attempt to finish up the mapping.
629            #
630    
631            my(%sought, %sought_seq);
632    
633    
634            my $dbh = $fig->db_handle();
635            for my $entry (@$ret)
636            {
637                my($what, $peg, @rest) = @$entry;
638    
639                if ($what eq "peg_loc")
640                {
641                    my($strand, $start, $end, $cksum, $seq) = @rest;
642    
643                    #
644                    # We have a contig location. Try to find a matching contig
645                    # here, and see if it maps to something.
646                    #
647    
648                    my $my_genome = $local_genome->{$peg};
649                    my $local_contig = $fig->find_contig_with_checksum($my_genome, $cksum);
650                    if ($local_contig)
651                    {
652                        #
653                        # Now look up the local peg. We match on the end location; depending on the strand
654                        # the feature is on, we want to look at either minloc or maxloc.
655                        #
656    
657                        my $whichloc = $strand eq '-' ? "minloc" : "maxloc";
658    
659                        my $res = $dbh->SQL(qq!SELECT id from features
660                                               WHERE $whichloc = $end and genome = '$my_genome' and
661                                               contig = '$local_contig'
662                                            !);
663    
664                        if ($res and @$res > 0)
665                        {
666                            my(@ids) = map { $_->[0] } @$res;
667                            my $id = $ids[0];
668                            $peg_mapping->{$peg} = $id;
669                            $peg_cache->{$peg} = $id;
670                            print "Mapped $peg to $id via contigs\n";
671                            if (@$res > 1)
672                            {
673                                warn "Multiple mappings found for $peg: @ids\n";
674                            }
675                        }
676                        else
677                        {
678                            print "failed: $peg  $my_genome and contig $local_contig start=$start end=$end strand=$strand\n";
679                            $sought{$peg}++;
680                            $sought_seq{$peg} = $seq;
681                        }
682                    }
683                    else
684                    {
685                        print "Mapping failed for $my_genome checksum $cksum\n";
686                        $sought{$peg}++;
687                        $sought_seq{$peg} = $seq;
688                    }
689                }
690                elsif ($what eq "peg_seq")
691                {
692                    my($seq) = @rest;
693    
694                    $sought{$peg}++;
695                    $sought_seq{$peg} = $seq;
696                }
697            }
698    
699            #
700            # Now see if we need to do a tough search.
701            #
702    
703            if (keys(%sought) > 0 and !$skip_tough_search)
704            {
705                my %trans;
706    
707                print "Starting tough search\n";
708    
709                $fig->tough_search(undef, \%sought_seq, \%trans, \%sought);
710                print "Tough search translated: \n";
711                while (my($tpeg, $ttrans) = each(%trans))
712                {
713                    print "  $tpeg -> $ttrans\n";
714                    $peg_mapping->{$tpeg} = $ttrans;
715                    $peg_cache->{$tpeg} = $ttrans;
716                }
717            }
718        }
719    }
720    
721  #############  #############
722  #  #

Legend:
Removed from v.1.20  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3