[Bio] / FigKernelPackages / ServerThing.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/ServerThing.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.33, Wed Dec 16 20:50:31 2009 UTC revision 1.40, Tue Mar 16 20:24:05 2010 UTC
# Line 33  Line 33 
33      # Set up tracing. We never do CGI tracing here; the only question is whether      # Set up tracing. We never do CGI tracing here; the only question is whether
34      # or not the caller passed in a tracing key. If he didn't, we use the server      # or not the caller passed in a tracing key. If he didn't, we use the server
35      # name.      # name.
     warn "Key parameter is \"$key\". Server name is \"$serverName\".\n"; ##HACK  
36      ETracing($key || $serverName, destType => 'APPEND', level => '0 ServerThing');      ETracing($key || $serverName, destType => 'APPEND', level => '0 ServerThing');
37      # Turn off YAML compression, which causes problems with some of our hash keys.      # Turn off YAML compression, which causes problems with some of our hash keys.
38      $YAML::CompressSeries = 0;      $YAML::CompressSeries = 0;
# Line 200  Line 199 
199  sub GetIdList {  sub GetIdList {
200      # Get the parameters.      # Get the parameters.
201      my ($name, $args, $optional) = @_;      my ($name, $args, $optional) = @_;
202        # Declare the return variable.
203        my $retVal;
204      # Check the argument format.      # Check the argument format.
205      if (ref $args ne 'HASH') {      if (! defined $args && $optional) {
206            # Here there are no parameters, but the arguments are optional so it's
207            # okay.
208            $retVal = [];
209        } elsif (ref $args ne 'HASH') {
210            # Here we have an invalid parameter structure.
211          Confess("No '$name' parameter present.");          Confess("No '$name' parameter present.");
212      }      } else {
213      # Try to get the IDs from the argument structure.          # Here we have a hash with potential parameters in it. Try to get the
214      my $retVal = $args->{$name};          # IDs from the argument structure.
215            $retVal = $args->{$name};
216      # Was a member found?      # Was a member found?
217      if (! defined $retVal) {      if (! defined $retVal) {
218          # No. If we're optional, return an empty list; otherwise throw an error.          # No. If we're optional, return an empty list; otherwise throw an error.
# Line 215  Line 222 
222              Confess("No '$name' parameter found.");              Confess("No '$name' parameter found.");
223          }          }
224      } else {      } else {
225          # Here we found something. Get the parameter type. We was a list reference.              # Here we found something. Get the parameter type. We want a list reference.
226          # If it's a scalar, we'll convert it to a singleton list. If it's anything          # If it's a scalar, we'll convert it to a singleton list. If it's anything
227          # else, it's an error.          # else, it's an error.
228          my $type = ref $retVal;          my $type = ref $retVal;
# Line 225  Line 232 
232              Confess("The '$name' parameter must be a list.");              Confess("The '$name' parameter must be a list.");
233          }          }
234      }      }
235        }
236      # Return the result.      # Return the result.
237      return $retVal;      return $retVal;
238  }  }
# Line 284  Line 292 
292  }  }
293    
294    
295    =head2 Gene Correspondence File Methods
296    
297    These methods relate to gene correspondence files, which are generated by the
298    L<svr_corresponding_genes.pl> script. Correspondence files are cached in the
299    organism cache (I<$FIG_Config::orgCache>) directory. Eventually they will be
300    copied into the organism directories themselves. At that point, the code below
301    will be modified to check the organism directories first and use the cache
302    directory if no file is found there.
303    
304    A gene correspondence file contains correspondences from a source genome to a
305    target genome. Most such correspondences are bidirectional best hits. A unidirectional
306    best hit may exist from the source genome to the target genome or in the reverse
307    direction from the targtet genome to the source genome. The cache directory itself
308    is divided into subdirectories by organism. The subdirectory has the source genome
309    name and the files themselves are named by the target genome.
310    
311    Some of the files are invalid and will be erased when they are found. A file is
312    considered invalid if it has a non-numeric value in a numeric column or if it
313    does not have any unidirectional hits from the target genome to the source
314    genome.
315    
316    The process of managing the correspondence files is tricky and dangerous because
317    of the possibility of race conditions. It can take several minutes to generate a
318    file, and if two processes try to generate the same file at the same time we need
319    to make sure they don't step on each other.
320    
321    In stored files, the source genome ID is always lexically lower than the target
322    genome ID. If a correspondence in the reverse direction is desired, the converse
323    file is found and the contents flipped automatically as they are read. So, the
324    correspondence from B<360108.3> to B<100226.1> would be found in a file with the
325    name B<360108.3> in the directory for B<100226.1>. Since this file actually has
326    B<100226.1> as the source and B<360108.3> as the target, the columns are
327    re-ordered and the arrows reversed before the file contents are passed to the
328    caller.
329    
330    =head4 Gene Correspondence List
331    
332    A gene correspondence file contains 18 columns. These are usually packaged as
333    a reference to list of lists. Each sub-list has the following format.
334    
335    =over 4
336    
337    =item 0
338    
339    The ID of a PEG in genome 1.
340    
341    =item 1
342    
343    The ID of a PEG in genome 2 that is our best estimate of a "corresponding gene".
344    
345    =item 2
346    
347    Count of the number of pairs of matching genes were found in the context.
348    
349    =item 3
350    
351    Pairs of corresponding genes from the contexts.
352    
353    =item 4
354    
355    The function of the gene in genome 1.
356    
357    =item 5
358    
359    The function of the gene in genome 2.
360    
361    =item 6
362    
363    Comma-separated list of aliases for the gene in genome 1 (any protein with an
364    identical sequence is considered an alias, whether or not it is actually the
365    name of the same gene in the same genome).
366    
367    =item 7
368    
369    Comma-separated list of aliases for the gene in genome 2 (any protein with an
370    identical sequence is considered an alias, whether or not it is actually the
371    name of the same gene in the same genome).
372    
373    =item 8
374    
375    Bi-directional best hits will contain "<=>" in this column; otherwise, "->" will appear.
376    
377    =item 9
378    
379    Percent identity over the region of the detected match.
380    
381    =item 10
382    
383    The P-score for the detected match.
384    
385    =item 11
386    
387    Beginning match coordinate in the protein encoded by the gene in genome 1.
388    
389    =item 12
390    
391    Ending match coordinate in the protein encoded by the gene in genome 1.
392    
393    =item 13
394    
395    Length of the protein encoded by the gene in genome 1.
396    
397    =item 14
398    
399    Beginning match coordinate in the protein encoded by the gene in genome 2.
400    
401    =item 15
402    
403    Ending match coordinate in the protein encoded by the gene in genome 2.
404    
405    =item 16
406    
407    Length of the protein encoded by the gene in genome 2.
408    
409    =item 17
410    
411    Bit score for the match. Divide by the length of the longer PEG to get
412    what we often refer to as a "normalized bit score".
413    
414    =back
415    
416    In the actual files, there will also be reverse correspondences indicated by a
417    back-arrow ("<-") in item (8). The output returned by the servers, however,
418    is filtered so that only forward correspondences occur. If a converse file
419    is used, the columns are re-ordered and the arrows reversed so that it looks
420    correct.
421    
422    =cut
423    
424    # hash for reversing the arrows
425    use constant ARROW_FLIP => { '->' => '<-', '<=>' => '<=>', '<-' => '->' };
426    # list of columns that contain numeric values that need to be validated
427    use constant NUM_COLS => [2,9,10,11,12,13,14,15,16,17];
428    
429    =head3 CheckForGeneCorrespondenceFile
430    
431        my ($fileName, $converse) = ServerThing::CheckForGeneCorrespondenceFile($genome1, $genome2);
432    
433    Try to find a gene correspondence file for the specified genome pairing. If the
434    file exists, its name and an indication of whether or not it is in the correct
435    direction will be returned.
436    
437    =over 4
438    
439    =item genome1
440    
441    Source genome for the desired correspondence.
442    
443    =item genome2
444    
445    Target genome for the desired correspondence.
446    
447    =item RETURN
448    
449    Returns a two-element list. The first element is the name of the file containing the
450    correspondence, or C<undef> if the file does not exist. The second element is TRUE
451    if the correspondence would be forward or FALSE if the file needs to be flipped.
452    
453    =back
454    
455    =cut
456    
457    sub CheckForGeneCorrespondenceFile {
458        # Get the parameters.
459        my ($genome1, $genome2) = @_;
460        # Declare the return variables.
461        my ($fileName, $converse);
462        # Determine the ordering of the genome IDs.
463        my ($corrFileName, $genomeA, $genomeB) = ComputeCorrespondenceFileName($genome1, $genome2);
464        $converse = ($genomeA ne $genome1);
465        # Look for a file containing the desired correspondence. (The code to check for a
466        # pre-computed file in the organism directories is currently turned off, because
467        # these files are all currently invalid.)
468        my $testFileName = "$FIG_Config::organisms/$genomeA/CorrToReferenceGenomes/$genomeB";
469        if (0 && -f $testFileName) {
470            # Use the pre-computed file.
471            Trace("Using pre-computed file $fileName for genome correspondence.") if T(3);
472            $fileName = $testFileName;
473        } elsif (-f $corrFileName) {
474            $fileName = $corrFileName;
475            Trace("Using cached file $fileName for genome correspondence.") if T(3);
476        }
477        # Return the result.
478        return ($fileName, $converse);
479    }
480    
481    
482    =head3 ComputeCorrespondenceFileName
483    
484        my ($fileName, $genomeA, $genomeB) = ServerThing::ComputeCorrespondenceFileName($genome1, $genome2);
485    
486    Compute the name to be given to a genome correspondence file in the organism cache
487    and return the source and target genomes that would be in it.
488    
489    =over 4
490    
491    =item genome1
492    
493    Source genome for the desired correspondence.
494    
495    =item genome2
496    
497    Target genome for the desired correspondence.
498    
499    =item RETURN
500    
501    Returns a three-element list. The first element is the name of the file to contain the
502    correspondence, the second element is the name of the genome that would act as the
503    source genome in the file, and the third element is the name of the genome that would
504    act as the target genome in the file.
505    
506    =back
507    
508    =cut
509    
510    sub ComputeCorrespondenceFileName {
511        # Get the parameters.
512        my ($genome1, $genome2) = @_;
513        # Declare the return variables.
514        my ($fileName, $genomeA, $genomeB);
515        # Determine the ordering of the genome IDs.
516        if ($genome1 lt $genome2) {
517            ($genomeA, $genomeB) = ($genome1, $genome2);
518        } else {
519            ($genomeA, $genomeB) = ($genome2, $genome1);
520        }
521        # Insure the source organism has a subdirectory in the organism cache.
522        my $orgDir = "$FIG_Config::orgCache/$genomeA";
523        Tracer::Insure($orgDir, 0777);
524        # Compute the name of the correspondence file for the appropriate target genome.
525        $fileName = "$orgDir/$genomeB";
526        # Return the results.
527        return ($fileName, $genomeA, $genomeB);
528    }
529    
530    
531    =head3 CreateGeneCorrespondenceFile
532    
533        my ($fileName, $converse) = ServerThing::CheckForGeneCorrespondenceFile($genome1, $genome2);
534    
535    Create a new gene correspondence file in the organism cache for the specified
536    genome correspondence. The name of the new file will be returned along with
537    an indicator of whether or not it is in the correct direction.
538    
539    =over 4
540    
541    =item genome1
542    
543    Source genome for the desired correspondence.
544    
545    =item genome2
546    
547    Target genome for the desired correspondence.
548    
549    =item RETURN
550    
551    Returns a two-element list. The first element is the name of the file containing the
552    correspondence, or C<undef> if an error occurred. The second element is TRUE
553    if the correspondence would be forward or FALSE if the file needs to be flipped.
554    
555    =back
556    
557    =cut
558    
559    sub CreateGeneCorrespondenceFile {
560        # Get the parameters.
561        my ($genome1, $genome2) = @_;
562        # Declare the return variables.
563        my ($fileName, $converse);
564        # Compute the ultimate name for the correspondence file.
565        my ($corrFileName, $genomeA, $genomeB) = ComputeCorrespondenceFileName($genome1, $genome2);
566        $converse = ($genome1 ne $genomeA);
567        # Generate a temporary file name in the same directory. We'll build the temporary
568        # file and then rename it when we're done.
569        my $tempFileName = "$corrFileName.$$.tmp";
570        # This will be set to FALSE if we detect an error.
571        my $fileOK = 1;
572        # The file handles will be put in here.
573        my ($ih, $oh);
574        # Protect from errors.
575        eval {
576            # Open the temporary file for output.
577            $oh = Open(undef, ">$tempFileName");
578            # Open a pipe to get the correspondence data.
579            $ih = Open(undef, "$FIG_Config::bin/svr_corresponding_genes -u localhost $genomeA $genomeB |");
580            Trace("Creating correspondence file for $genomeA to $genomeB in temporary file $tempFileName.") if T(3);
581            # Copy the pipe date into the temporary file.
582            while (! eof $ih) {
583                my $line = <$ih>;
584                print $oh $line;
585            }
586            # Close both files. If the close fails we need to know: it means there was a pipe
587            # error.
588            $fileOK &&= close $ih;
589            $fileOK &&= close $oh;
590        };
591        if ($@) {
592            # Here a fatal error of some sort occurred. We need to force the files closed.
593            close $ih if $ih;
594            close $oh if $oh;
595        } elsif ($fileOK) {
596            # Here everything worked. Try to rename the temporary file to the real
597            # file name.
598            if (rename $tempFileName, $corrFileName) {
599                # Everything is ok, fix the permissions and return the file name.
600                chmod 0664, $corrFileName;
601                $fileName = $corrFileName;
602                Trace("Created correspondence file $fileName.") if T(3);
603            }
604        }
605        # If the temporary file exists, delete it.
606        if (-f $tempFileName) {
607            unlink $tempFileName;
608        }
609        # Return the results.
610        return ($fileName, $converse);
611    }
612    
613    
614    =head3 ReadGeneCorrespondenceFile
615    
616        my $list = ServerThing::ReadGeneCorrespondenceFile($fileName, $converse, $all);
617    
618    Return the contents of the specified gene correspondence file in the form of
619    a list of lists, with backward correspondences filtered out. If the file is
620    for the converse of the desired correspondence, the columns will be reordered
621    automatically so that it looks as if the file were designed for the proper
622    direction.
623    
624    =over 4
625    
626    =item fileName
627    
628    The name of the gene correspondence file to read.
629    
630    =item converse (optional)
631    
632    TRUE if the file is for the converse of the desired correspondence, else FALSE.
633    If TRUE, the file columns will be reorderd automatically. The default is FALSE,
634    meaning we want to use the file as it appears on disk.
635    
636    =item all (optional)
637    
638    TRUE if backward unidirectional correspondences should be included in the output.
639    The default is FALSE, in which case only forward and bidirectional correspondences
640    are included.
641    
642    =item RETURN
643    
644    Returns a L</Gene Correspondence List> in the form of a reference to a list of lists.
645    If the file's contents are invalid or an error occurs, an undefined value will be
646    returned.
647    
648    =back
649    
650    =cut
651    
652    sub ReadGeneCorrespondenceFile {
653        # Get the parameters.
654        my ($fileName, $converse, $all) = @_;
655        # Declare the return variable. We will only put something in here if we are
656        # completely successful.
657        my $retVal;
658        # This value will be set to 1 if an error is detected.
659        my $error = 0;
660        # Try to open the file.
661        my $ih;
662        Trace("Reading correspondence file $fileName.") if T(3);
663        if (! open $ih, "<$fileName") {
664            # Here the open failed, so we have an error.
665            Trace("Failed to open gene correspondence file $fileName: $!") if T(3);
666            $error = 1;
667        }
668        # The gene correspondence list will be built in here.
669        my @corrList;
670        # This variable will be set to TRUE if we find a reverse correspondence somewhere
671        # in the file. Not finding one is an error.
672        my $reverseFound = 0;
673        # Loop until we hit the end of the file or an error occurs. We must check the error
674        # first in case the file handle failed to open.
675        while (! $error && ! eof $ih) {
676            # Get the current line.
677            my @row = Tracer::GetLine($ih);
678            # Get the correspondence direction and check for a reverse arrow.
679            $reverseFound = 1 if ($row[8] eq '<-');
680            # If we're in converse mode, reformat the line.
681            if ($converse) {
682                ReverseGeneCorrespondenceRow(\@row);
683            }
684            # Validate the row.
685            if (ValidateGeneCorrespondenceRow(\@row)) {
686                Trace("Invalid row $. found in correspondence file $fileName.") if T(3);
687                $error = 1;
688            }
689            # If this row is in the correct direction, keep it.
690            if ($all || $row[8] ne '<-') {
691                push @corrList, \@row;
692            }
693        }
694        # Close the input file.
695        close $ih;
696        # If we have no errors and we found a reverse arrow, keep the result.
697        if (! $error) {
698            if ($reverseFound) {
699                $retVal = \@corrList;
700            } else {
701                Trace("No reverse arrow found in correspondence file $fileName.") if T(3);
702            }
703        }
704        # Return the result (if any).
705        return $retVal;
706    }
707    
708    =head3 ReverseGeneCorrespondenceRow
709    
710        ServerThing::ReverseGeneCorrespondenceRow($row)
711    
712    Convert a gene correspondence row to represent the converse correspondence. The
713    elements in the row will be reordered to represent a correspondence from the
714    target genome to the source genome.
715    
716    =over 4
717    
718    =item row
719    
720    Reference to a list containing a single row from a L</Gene Correspondence List>.
721    
722    =back
723    
724    =cut
725    
726    sub ReverseGeneCorrespondenceRow {
727        # Get the parameters.
728        my ($row) = @_;
729        # Flip the row in place.
730        ($row->[1], $row->[0], $row->[2], $row->[3], $row->[5], $row->[4], $row->[7],
731         $row->[6], ARROW_FLIP->{$row->[8]}, $row->[9], $row->[10], $row->[14],
732         $row->[15], $row->[16], $row->[11], $row->[12], $row->[13], $row->[17]) = @$row;
733    }
734    
735    =head3 ValidateGeneCorrespondenceRow
736    
737        my $errorCount = ServerThing::ValidateGeneCorrespondenceRow($row);
738    
739    Validate a gene correspondence row. The numeric fields are checked to insure they
740    are numeric and the source and target gene IDs are validated. The return value will
741    indicate the number of errors found.
742    
743    =over 4
744    
745    =item row
746    
747    Reference to a list containing a single row from a L</Gene Correspondence List>.
748    
749    =item RETURN
750    
751    Returns the number of errors found in the row. A return of C<0> indicates the row
752    is valid.
753    
754    =back
755    
756    =cut
757    
758    sub ValidateGeneCorrespondenceRow {
759        # Get the parameters.
760        my ($row, $genome1, $genome2) = @_;
761        # Denote no errors have been found so far.
762        my $retVal = 0;
763        # Check for non-numeric values in the number columns.
764        for my $col (@{NUM_COLS()}) {
765            unless ($row->[$col] =~ /^-?\d+\.?\d*(?:e[+-]?\d+)?$/) {
766                $retVal++;
767            }
768        }
769        # Check the gene IDs.
770        for my $col (0, 1) {
771            unless ($row->[$col] =~ /^fig\|\d+\.\d+\.\w+\.\d+$/) {
772                $retVal++;
773            }
774        }
775        # Verify the arrow.
776        unless (exists ARROW_FLIP->{$row->[8]}) {
777            $retVal++;
778        }
779        # Return the error count.
780        return $retVal;
781    }
782    
783    
784  =head2 Internal Utility Methods  =head2 Internal Utility Methods
785    
786  The methods in this section are used internally by this package.  The methods in this section are used internally by this package.

Legend:
Removed from v.1.33  
changed lines
  Added in v.1.40

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3