[Bio] / Sprout / BioWords.pm Repository:
ViewVC logotype

Diff of /Sprout/BioWords.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Sep 16 18:37:03 2008 UTC revision 1.3, Sat Oct 18 16:20:59 2008 UTC
# Line 215  Line 215 
215      # Get the parameters.      # Get the parameters.
216      my ($class, %options) = @_;      my ($class, %options) = @_;
217      # Get the options.      # Get the options.
218      my $exceptionOption = $options{exceptions} || "$FIG_Config::sproutData/exceptions.txt";      my $exceptionOption = $options{exceptions} || "$FIG_Config::sproutData/Exceptions.txt";
219      my $stopOption = $options{stops} || "$FIG_Config::sproutData/stopwords.txt";      my $stopOption = $options{stops} || "$FIG_Config::sproutData/StopWords.txt";
220      my $vowels = $options{vowels} || VOWELS;      my $vowels = $options{vowels} || VOWELS;
221      my $letters = $options{letters} || LETTERS;      my $letters = $options{letters} || LETTERS;
222      my $digits = $options{digits} || DIGITS;      my $digits = $options{digits} || DIGITS;
# Line 284  Line 284 
284  sub Stop {  sub Stop {
285      # Get the parameters.      # Get the parameters.
286      my ($self, $word) = @_;      my ($self, $word) = @_;
287        Trace("$word is a stop word.") if T(4);
288      # Store the stop word.      # Store the stop word.
289      $self->{cache}->{$word} = {stem => EMPTY, phonex => EMPTY, count => 0 };      $self->{cache}->{$word} = {stem => EMPTY, phonex => EMPTY, count => 0 };
290  }  }
# Line 327  Line 328 
328      $self->{cache}->{$word} = { stem => $stem, phonex => $phonex, count => $realCount };      $self->{cache}->{$word} = { stem => $stem, phonex => $phonex, count => $realCount };
329  }  }
330    
   
   
331  =head3 Split  =head3 Split
332    
333      my @words = $bio->Split($string);      my @words = $bio->Split($string);
# Line 368  Line 367 
367      # an undefined input is treated as a null string (which saves us from compiler warnings).      # an undefined input is treated as a null string (which saves us from compiler warnings).
368      my $lowered = (defined($string) ? lc $string : "");      my $lowered = (defined($string) ? lc $string : "");
369      $lowered =~ s/\s+/ /sg;      $lowered =~ s/\s+/ /sg;
370        # Connect the TC prefix to TC numbers.
371        $lowered =~ s/TC ((?:\d+|-)(?:\.(?:\d+|-)){3})/TC_$1/g;
372      # Trim the leading space (if any).      # Trim the leading space (if any).
373      $lowered =~ s/^ //;      $lowered =~ s/^ //;
374      # Fix the periods in EC and TC numbers. Note here we are insisting on real      # Fix the periods in EC and TC numbers. Note here we are insisting on real
# Line 522  Line 523 
523      # Get the word in lower case and compute its length.      # Get the word in lower case and compute its length.
524      my $lowered = lc $word;      my $lowered = lc $word;
525      my $len = length $lowered;      my $len = length $lowered;
526        Trace("Processing \"$lowered\".") if T(4);
527      # Check to see what type of word it is.      # Check to see what type of word it is.
528      if ($lowered =~ /[^$self->{WORD}]/) {      if ($lowered =~ /[^$self->{WORD}]/) {
529          # It's delimiters. Return it unchanged and don't record it.          # It's delimiters. Return it unchanged and don't record it.
# Line 529  Line 531 
531      } elsif ($len < $self->{SHORT}) {      } elsif ($len < $self->{SHORT}) {
532          # It's too short. Treat it as a stop word.          # It's too short. Treat it as a stop word.
533          $retVal = EMPTY;          $retVal = EMPTY;
534      } elsif (exists $cache->{$retVal}) {      } elsif (exists $cache->{$lowered}) {
535          # It's already in the cache. Get the cache entry.          # It's already in the cache. Get the cache entry.
536          my $entry = $cache->{$lowered};          my $entry = $cache->{$lowered};
537          $retVal = $entry->{stem};          $retVal = $entry->{stem};
# Line 732  Line 734 
734                          $activeOp = 0;                          $activeOp = 0;
735                      } else {                      } else {
736                          # An open quote puts us in quote mode. Words inside                          # An open quote puts us in quote mode. Words inside
737                          # quotes do not need the plus added.                          # quotes do not need the plus added, but the
738                            # quote does.
739                          $inQuotes = 1;                          $inQuotes = 1;
740                            $retVal .= "+";
741                      }                      }
742                  } elsif ($op eq ' ') {                  } elsif ($op eq ' ') {
743                      # Spaces detach us from the preceding operator.                      # Spaces detach us from the preceding operator.
# Line 795  Line 799 
799      return @retVal;      return @retVal;
800  }  }
801    
802    =head3 WildsOfEC
803    
804        my @ecWilds = BioWords::WildsOfEC($number);
805    
806    Return a list of all of the possible wild-carded EC numbers that would
807    match the specified EC number.
808    
809    =over 4
810    
811    =item number
812    
813    EC number to process.
814    
815    =item RETURN
816    
817    Returns a list consisting of the original EC number and all other
818    EC numbers that subsume it.
819    
820    =back
821    
822    =cut
823    
824    sub WildsOfEC {
825        # Get the parameters.
826        my ($number) = @_;
827        # Declare the return variable. It contains at the start the original
828        # EC number.
829        my @retVal = $number;
830        # Bust the EC number into pieces.
831        my @pieces = split '.', $number;
832        # Put it back together with hyphens.
833        for (my $i = 1; $i <= $#pieces; $i++) {
834            if ($pieces[$i] ne '-') {
835                my @wildPieces;
836                for (my $j = 0; $j <= $#pieces; $j++) {
837                    push @wildPieces, ($j < $i ? $pieces[$i] : '-');
838                }
839                push @retVal, join(".", @wildPieces);
840            }
841        }
842        # Return the result.
843        return @retVal;
844    }
845    
846    =head3 ExtractECs
847    
848        my @ecThings = BioWords::ExtractECs($string);
849    
850    Return any individual EC numbers found in the specified string.
851    
852    =over 4
853    
854    =item string
855    
856    String containing potential EC numbers.
857    
858    =item RETURN
859    
860    Returns a list of all the EC numbers and subsuming EC numbers found in the string.
861    
862    =back
863    
864    =cut
865    
866    sub ExtractECs {
867        # Get the parameters.
868        my ($string) = @_;
869        # Find all the EC numbers in the string.
870        my @ecs = ($string =~ /ec\s+(\d+(?:\.\d+|\.-){3})/gi);
871        # Get the wild versions.
872        my @retVal = map { WildsOfEc($_) } @ecs;
873        # Return the result.
874        return @retVal;
875    }
876    
877  =head2 Internal Methods  =head2 Internal Methods
878    
879  =head3 _initCache  =head3 _initCache
# Line 814  Line 893 
893      if ($self->{stopFile}) {      if ($self->{stopFile}) {
894          # Read the file.          # Read the file.
895          my @lines = Tracer::GetFile($self->{stopFile});          my @lines = Tracer::GetFile($self->{stopFile});
896            Trace(scalar(@lines) . " lines found in stop file.") if T(3);
897          # Insert it into the cache.          # Insert it into the cache.
898          for my $line (@lines) {          for my $line (@lines) {
899              $self->Stop(lc $line);              $self->Stop(lc $line);
# Line 825  Line 905 
905      if ($self->{exceptionFile}) {      if ($self->{exceptionFile}) {
906          # Read the file.          # Read the file.
907          my @lines = Tracer::GetFile($self->{exceptionFile});          my @lines = Tracer::GetFile($self->{exceptionFile});
908            Trace(scalar(@lines) . " lines found in exception file.") if T(3);
909          # Loop through the lines.          # Loop through the lines.
910          for my $line (@lines) {          for my $line (@lines) {
911              # Extract the words.              # Extract the words.
# Line 883  Line 964 
964      my $p2 = $len - length $r2;      my $p2 = $len - length $r2;
965      # These variables will be used by FindRule.      # These variables will be used by FindRule.
966      my ($prefix, $suffix, $ruleValue);      my ($prefix, $suffix, $ruleValue);
967      # Remove genitives.      # Remove the genitive apostrophe.
968      ($retVal, $suffix, $ruleValue) = FindRule($retVal, q('s') => EMPTY, q('s) => EMPTY, q(') => EMPTY);      ($retVal, $suffix, $ruleValue) = FindRule($retVal, q('s') => EMPTY, q('s) => EMPTY, q(') => EMPTY);
969      # Process latin endings.      # Process latin endings.
970      ($prefix, $suffix, $ruleValue) = FindRule($retVal, us => 'i', um => 'a', ae => 'a');      ($prefix, $suffix, $ruleValue) = FindRule($retVal, us => 'i', um => 'a', ae => 'a');
# Line 981  Line 1062 
1062      # Process the doubled L.      # Process the doubled L.
1063      ($prefix, $suffix, $ruleValue) = FindRule($retVal, ll => 'l');      ($prefix, $suffix, $ruleValue) = FindRule($retVal, ll => 'l');
1064      $retVal = "$prefix$ruleValue";      $retVal = "$prefix$ruleValue";
1065        # Check for an ending 'e'.
1066        $retVal =~ s/([$self->{VOWEL}][^$self->{VOWEL}]+)e$/$1/;
1067      # Return the result.      # Return the result.
1068      return $retVal;      return $retVal;
1069  }  }

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3