[Bio] / Sprout / Sapling.pm Repository:
ViewVC logotype

Diff of /Sprout/Sapling.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.13, Wed Jul 15 23:39:58 2009 UTC revision 1.14, Mon Aug 3 21:32:54 2009 UTC
# Line 603  Line 603 
603  =item RETURN  =item RETURN
604    
605  Returns a FASTA string for the protein. This includes the identification  Returns a FASTA string for the protein. This includes the identification
606  line, the protein letters themselves, and the trailer line.  line and the protein letters themselves.
607    
608  =back  =back
609    
# Line 633  Line 633 
633    
634  =head3 Taxonomy  =head3 Taxonomy
635    
636      my @taxonomy = $sap->Taxonomy($genomeID);      my @taxonomy = $sap->Taxonomy($genomeID, $format);
637    
638  Return the full taxonomy of the specified genome, starting from the  Return the full taxonomy of the specified genome, starting from the
639  domain downward. The returned values will be primary names, not taxonomy  domain downward.
 IDs.  
640    
641  =over 4  =over 4
642    
# Line 647  Line 646 
646  in the database: the version number will be lopped off and the result used as  in the database: the version number will be lopped off and the result used as
647  an entry point into the taxonomy tree.  an entry point into the taxonomy tree.
648    
649    =item format (optional)
650    
651    Format of the taxonomy. C<names> will return primary names, C<numbers> will
652    return taxonomy numbers, and C<both> will return taxonomy number followed by
653    primary name. The default is C<names>.
654    
655  =item RETURN  =item RETURN
656    
657  Returns a list of taxonomy names, starting from the domain and moving  Returns a list of taxonomy names, starting from the domain and moving
# Line 658  Line 663 
663    
664  sub Taxonomy {  sub Taxonomy {
665      # Get the parameters.      # Get the parameters.
666      my ($self, $genomeID) = @_;      my ($self, $genomeID, $format) = @_;
667      # Get the genome's taxonomic group.      # Get the genome's taxonomic group.
668      my ($taxon) = split /\./, $genomeID, 2;      my ($taxon) = split /\./, $genomeID, 2;
669      # We'll put the return data in here.      # We'll put the return data in here.
# Line 682  Line 687 
687              # taxonomy ID with the ID of our parent, priming the next iteration              # taxonomy ID with the ID of our parent, priming the next iteration
688              # of the loop.              # of the loop.
689              my $name;              my $name;
690                my $oldTaxon = $taxon;
691              ($domainFlag, $name, $taxon) = @$taxonData;              ($domainFlag, $name, $taxon) = @$taxonData;
692              # Put the current group's name in the return list.              # Compute the value we want to put in the output list.
693              unshift @retVal, $name;              my $value;
694                if ($format eq 'numbers') {
695                    $value = $oldTaxon;
696                } elsif ($format eq 'both') {
697                    $value = "$oldTaxon $name";
698                } else {
699                    $value = $name;
700                }
701                # Put the current group's data in the return list.
702                unshift @retVal, $value;
703          }          }
704      }      }
705      # Return the result.      # Return the result.
# Line 764  Line 779 
779      return $retVal;      return $retVal;
780  }  }
781    
782    =head3 Alias
783    
784        my $translatedID = $sap->Alias($fid, $source);
785    
786    Return an alternate ID of the specified type for the specified feature.
787    If no alternate ID of that type exists, the incoming value will be
788    returned unchanged.
789    
790    =over 4
791    
792    =item fid
793    
794    FIG ID of the feature whose alias identifier is desired.
795    
796    =item source
797    
798    Database type for the alternate ID (e.g. C<LocusTag>, C<NCBI>, C<RefSeq>). If
799    C<SEED> is specified, the ID will be returned unchanged and no database lookup
800    will occur.
801    
802    =item RETURN
803    
804    Returns an equivalent ID for the specified feature that belongs to the specified
805    database (that is, has the specified source). If no such ID exists, returns the
806    incoming ID.
807    
808    =back
809    
810    =cut
811    
812    sub Alias {
813        # Get the parameters.
814        my ($self, $fid, $source) = @_;
815        # Default to the incoming value.
816        my $retVal = $fid;
817        # We only have work to do if the database type isn't "SEED".
818        if ($source ne 'SEED') {
819            # Look for an alias.
820            my ($alias) = $self->GetFlat("IsIdentifiedBy Identifier",
821                                         'IsIde                                                                                                           ntifiedBy(from-link) = ? AND Identifier(source) = ?',
822                                         [$fid, $source], 'Identifier(natural-form)');
823            # If we found one, return it.
824            if (defined $alias) {
825                $retVal = $alias;
826            }
827        }
828        # Return the result.
829        return $retVal;
830    }
831    
832    
833    =head3 ContigLength
834    
835        my $contigLen = $sap->ContigLength($contigID);
836    
837    Return the number of base pairs in the specified contig.
838    
839    =over 4
840    
841    =item contigID
842    
843    ID of the contig of interest.
844    
845    =item RETURN
846    
847    Returns the number of base pairs in the specified contig, or 0 if the contig
848    does not exist.
849    
850    =back
851    
852    =cut
853    
854    sub ContigLength {
855        # Get the parameters.
856        my ($self, $contigID) = @_;
857        # Try to find the length.
858        my ($retVal) = $self->GetEntityValues(Contig => $contigID, 'length');
859        # Convert not-found to 0.
860        $retVal = 0 if ! defined $retVal;
861        # Return the result.
862        return $retVal;
863    }
864    
865    
866    =head2 Configuration-Related Methods
867    
868  =head3 SubsystemHash  =head3 SubsystemHash
869    
870      my $subHash = $sap->SubsystemHash();      my $subHash = $sap->SubsystemHash();
# Line 904  Line 1005 
1005      return ($name eq GLOBAL);      return ($name eq GLOBAL);
1006  }  }
1007    
1008  =head3 ContigLength  =head2 Special-Purpose Methods
1009    
1010      my $contigLen = $sap->ContigLength($contigID);  =head3 ComputeFeatureFilter
1011    
1012  Return the number of base pairs in the specified contig.      my ($objects, $filter, @parms) = $sap->ComputeFeatureFilter($source);
1013    
1014    Compute the initial object name list, filter string, and parameter list
1015    for a query by feature ID. The object name list will always end with the
1016    B<Feature> entity, and the combination of the filter string and parameter
1017    list will translate the incoming ID from the specified format to a real
1018    FIG feature ID. If the specified format B<is> FIG feature IDs, then the
1019    query will start on the B<Feature> entity; otherwise, it will start with
1020    the B<Identifier> entity. This is a special-purpose method that performs
1021    the task of intelligently modifying queries to allow for external ID
1022    types.
1023    
1024  =over 4  =over 4
1025    
1026  =item contigID  =item source (optional)
1027    
1028  ID of the contig of interest.  Database source of the IDs specified-- C<SEED> for FIG IDs, C<GENE> for standard
1029    gene identifiers, or C<LocusTag> for locus tags. In addition, you may specify
1030    C<RefSeq>, C<CMR>, C<NCBI>, C<Trembl>, or C<UniProt> for IDs from those databases.
1031    Use C<mixed> to allow mixed ID types (though this may cause problems when the same
1032    ID has different meanings in different databases). The default is C<SEED>.
1033    
1034  =item RETURN  =item RETURN
1035    
1036  Returns the number of base pairs in the specified contig, or 0 if the contig  Returns a list containing parameters to the desired query call. The first element
1037  does not exist.  is the prefix for the object name list, the second is the prefix for the filter
1038    string, and the subsequent elements form the prefix for the parameter value list.
1039    
1040  =back  =back
1041    
1042  =cut  =cut
1043    
1044  sub ContigLength {  sub ComputeFeatureFilter {
1045      # Get the parameters.      # Get the parameters.
1046      my ($self, $contigID) = @_;      my ($self, $source) = @_;
1047      # Try to find the length.      # Declare the return variables.
1048      my ($retVal) = $self->GetEntityValues(Contig => $contigID, 'length');      my ($objects, $filter, @parms);
1049      # Convert not-found to 0.      # Determine the source type.
1050      $retVal = 0 if ! defined $retVal;      if (! defined $source || $source eq 'SEED') {
1051            # Here we're directly processing FIG IDs.
1052            $objects = 'Feature';
1053            $filter = 'Feature(id) = ?'
1054        } elsif ($source eq 'mixed') {
1055            # Here we're processing mixed IDs of unknown type.
1056            $objects = 'Identifier Identifies Feature';
1057            $filter = 'Identifier(natural-form) = ?';
1058        } else {
1059            # Here we're processing a fixed ID type from an external database.
1060            # This is the case that requires an additional parameter. Note that
1061            # we insist that the additional parameter matches the first parameter
1062            # mark.
1063            $objects = 'Identifier Identifies Feature';
1064            $filter = 'Identifier(source) = ? AND Identifier(natural-form) = ?';
1065            push @parms, $source;
1066        }
1067        # Return the results.
1068        return ($objects, $filter, @parms);
1069    }
1070    
1071    
1072    =head3 FindGapLeft
1073    
1074        my @operonData = $sap->FindGapLeft($loc, $maxGap, $interval, \%redundancyHash, \$redundancyFlag);
1075    
1076    This method performs a rather arcane task: searching for a gap to the
1077    left of a location in the contig. The search will proceed from the
1078    starting point to the left, and will stop when a gap between occupied
1079    locations is found that is larger than the specified maximum. The caller
1080    has the option of specifying a hash of feature IDs that are redundant. If
1081    any feature in the hash is found, the search will stop early and the
1082    provided redundancy flag will be set. In addition, an interval size can
1083    be specified to tune the process of retrieving data from the database.
1084    
1085    =over 4
1086    
1087    =item loc
1088    
1089    L<BasicLocation> object for the location from which the search is to start.
1090    This gives us the contig ID, the strand of interest (forward or backward),
1091    and the starting point of the search.
1092    
1093    =item maxGap
1094    
1095    The maximum allowable gap. The search will stop at the left end of the contig
1096    or the first gap larger than this amount.
1097    
1098    =item interval (optional)
1099    
1100    Interval to use for retrieving data from the database. This is the size of
1101    the contig segments being retrieved. The default is C<10000>
1102    
1103    =item redundancyHash (optional)
1104    
1105    A hash of feature IDs. If any feature present in this hash is found during
1106    the search, the search will stop and no data will be returned. The default
1107    is an empty hash (no check).
1108    
1109    =item redundancyFlag (optional)
1110    
1111    A reference to a scalar flag. If present, the entire method will be bypassed
1112    if the flag is TRUE. If a redundancy hash is specified and a redundant feature
1113    is found, this flag will be set to TRUE by the method.
1114    
1115    =item RETURN
1116    
1117    Returns a list of 4-tuples. Each tuple will contain a feature ID, a begin
1118    offset, a direction (C<+> or C<->), and a length, representing an occupied
1119    location on the contig and the feature to which it belongs. The complete
1120    list of locations will be to the left of the starting location and relatively
1121    close together, with no gap larger than the caller-specified maximum.
1122    
1123    =back
1124    
1125    =cut
1126    
1127    sub FindGapLeft {
1128        # Get the parameters.
1129        my ($self, $loc, $maxGap, $interval, $redundancyHash, $redundancyFlag) = @_;
1130        # Declare the return variable.
1131        my @retVal;
1132        # Fix up defaults for the missing parameters.
1133        $interval ||= 10000;
1134        if (! defined $redundancyHash) {
1135            $redundancyHash = {};
1136        }
1137        my $fakeFlag = 0;
1138        if (! defined $redundancyFlag) {
1139            $redundancyFlag = \$fakeFlag;
1140        }
1141        # This flag will be set to TRUE if we run out of locations or find a gap.
1142        my $gapFound = 0;
1143        # This will be used to store tuples found. If we are successful, it will
1144        # be copied to the return list.
1145        my @operonData;
1146        # Now we need to set up some data for the loop. In particular, the contig
1147        # ID, the strand (direction), and the starting point. We add one to the
1148        # starting current position to insure that the starting point is included
1149        # in the first search.
1150        my $currentPosition = $loc->Left + 1;
1151        my $contigID = $loc->Contig;
1152        my $strand = $loc->Dir;
1153        # This variable keeps the leftmost begin location found.
1154        my $begin = $loc->Left;
1155        # Loop until we find a redundancy or a gap.
1156        while (! $$redundancyFlag && ! $gapFound && $currentPosition >= 0) {
1157            # Compute the limits of the search interval for this iteration.
1158            my $nextPosition = $currentPosition - $interval;
1159            # Get all the locations in the interval.
1160            my @rows = $self->GetAll("IsLocatedIn",
1161                                     'IsLocatedIn(to-link) = ? AND IsLocatedIn(dir) = ? AND IsLocatedIn(begin) >= ? AND IsLocatedIn(begin) < ?',
1162                                    [$contigID, $strand, $nextPosition, $currentPosition],
1163                                    [qw(from-link begin dir len)]);
1164            # If nothing was found, it's a gap.
1165            if (! @rows) {
1166                $gapFound = 1;
1167            } else {
1168                # We got something, so we can loop through looking for gaps. The search
1169                # requires we sort by right point.
1170                my @sortableTuples;
1171                for my $tuple (@rows) {
1172                    my ($fid, $left, $dir, $len) = @$tuple;
1173                    push @sortableTuples, [$left + $len, $tuple];
1174                }
1175                my @sortedTuples = map { $_->[1] } sort { -($a->[0] <=> $b->[0]) } @sortableTuples;
1176                # Loop through the tuples, stopping at the first redundancy or gap.
1177                for my $tuple (@sortedTuples) { last if $gapFound || $$redundancyFlag;
1178                    # Get this tuple's data.
1179                    my ($fid, $left, $dir, $len) = @$tuple;
1180                    # Is it close enough to be counted?
1181                    if ($begin - ($left + $len) <= $maxGap) {
1182                        # Yes. We can include this tuple.
1183                        push @operonData, $tuple;
1184                        # Update the begin point.
1185                        $begin = $left;
1186                        # Is it redundant? It's only reasonable to ask this if it's
1187                        # an included feature.
1188                        if ($redundancyHash->{$fid}) {
1189                            $$redundancyFlag = 1;
1190                        }
1191                    } else {
1192                        # No, it's not close enough. We've found a gap.
1193                        $gapFound = 1;
1194                    }
1195                }
1196            }
1197            # Set up for the next interval.
1198            $currentPosition = $nextPosition;
1199        }
1200        # If we're nonredundant, save our results.
1201        if (! $$redundancyFlag) {
1202            @retVal = @operonData;
1203        }
1204      # Return the result.      # Return the result.
1205      return $retVal;      return @retVal;
1206    }
1207    
1208    =head3 FindGapRight
1209    
1210        my @operonData = $sap->FindGapRight($loc, $maxGap, $interval, \%redundancyHash, \$redundancyFlag);
1211    
1212    This method is the dual of L</FindGapLeft>: it searches for a gap to the
1213    right of a location in the contig. The search will proceed from the
1214    starting point to the right, and will stop when a gap between occupied
1215    locations is found that is larger than the specified maximum. The caller
1216    has the option of specifying a hash of feature IDs that are redundant. If
1217    any feature in the hash is found, the search will stop early and the
1218    provided redundancy flag will be set. In addition, an interval size can
1219    be specified to tune the process of retrieving data from the database.
1220    
1221    =over 4
1222    
1223    =item loc
1224    
1225    L<BasicLocation> object for the location from which the search is to start.
1226    This gives us the contig ID, the strand of interest (forward or backward),
1227    and the starting point of the search.
1228    
1229    =item maxGap
1230    
1231    The maximum allowable gap. The search will stop at the right end of the contig
1232    or the first gap larger than this amount.
1233    
1234    =item interval (optional)
1235    
1236    Interval to use for retrieving data from the database. This is the size of
1237    the contig segments being retrieved. The default is C<10000>
1238    
1239    =item redundancyHash (optional)
1240    
1241    A hash of feature IDs. If any feature present in this hash is found during
1242    the search, the search will stop and no data will be returned. The default
1243    is an empty hash (no check).
1244    
1245    =item redundancyFlag (optional)
1246    
1247    A reference to a scalar flag. If present, the entire method will be bypassed
1248    if the flag is TRUE. If a redundancy hash is specified and a redundant feature
1249    is found, this flag will be set to TRUE by the method.
1250    
1251    =item RETURN
1252    
1253    Returns a list of 4-tuples. Each tuple will contain a feature ID, a begin
1254    offset, a direction (C<+> or C<->), and a length, representing an occupied
1255    location on the contig and the feature to which it belongs. The complete
1256    list of locations will be to the right of the starting location and relatively
1257    close together, with no gap larger than the caller-specified maximum.
1258    
1259    =back
1260    
1261    =cut
1262    
1263    sub FindGapRight {
1264        # Get the parameters.
1265        my ($self, $loc, $maxGap, $interval, $redundancyHash, $redundancyFlag) = @_;
1266        # Declare the return variable.
1267        my @retVal;
1268        # Fix up defaults for the missing parameters.
1269        $interval ||= 10000;
1270        if (! defined $redundancyHash) {
1271            $redundancyHash = {};
1272        }
1273        my $fakeFlag = 0;
1274        if (! defined $redundancyFlag) {
1275            $redundancyFlag = \$fakeFlag;
1276        }
1277        # This flag will be set to TRUE if we run out of locations or find a gap.
1278        my $gapFound = 0;
1279        # This will be used to store tuples found. If we are successful, it will
1280        # be copied to the return list.
1281        my @operonData;
1282        # Now we need to set up some data for the loop. In particular, the contig
1283        # ID, the strand (direction), and the starting point. We subtract one from the
1284        # starting current position to insure that the starting point is included
1285        # in the first search.
1286        my $currentPosition = $loc->Left - 1;
1287        my $contigID = $loc->Contig;
1288        my $strand = $loc->Dir;
1289        # Get the length of the contig.
1290        my $contigLen = $self->ContigLength($contigID);
1291        # This variable keeps the rightmost end location found.
1292        my $endPoint = $loc->Left;
1293        # Loop until we find a redundancy or a gap.
1294        while (! $$redundancyFlag && ! $gapFound && $currentPosition <= $contigLen) {
1295            # Compute the limits of the search interval for this iteration.
1296            my $nextPosition = $currentPosition + $interval;
1297            # Get all the locations in the interval.
1298            my @rows = $self->GetAll("IsLocatedIn",
1299                                     'IsLocatedIn(to-link) = ? AND IsLocatedIn(dir) = ? AND IsLocatedIn(begin) >= ? AND IsLocatedIn(begin) < ?',
1300                                    [$contigID, $strand, $nextPosition, $currentPosition],
1301                                    [qw(from-link begin dir len)]);
1302            # If nothing was found, it's a gap.
1303            if (! @rows) {
1304                $gapFound = 1;
1305            } else {
1306                # We got something, so we can loop through looking for gaps. The search
1307                # requires we sort by left point.
1308                my @sortedTuples = sort { $a->[1] <=> $b->[1] } @rows;
1309                # Loop through the tuples, stopping at the first redundancy or gap.
1310                for my $tuple (@sortedTuples) { last if $gapFound || $$redundancyFlag;
1311                    # Get this tuple's data.
1312                    my ($fid, $left, $dir, $len) = @$tuple;
1313                    # Is it close enough to be counted?
1314                    if ($left - $endPoint <= $maxGap) {
1315                        # Yes. We can include this tuple.
1316                        push @operonData, $tuple;
1317                        # Update the end point.
1318                        $endPoint = $left + $len;
1319                        # Is it redundant? It's only reasonable to ask this if it's
1320                        # an included feature.
1321                        if ($redundancyHash->{$fid}) {
1322                            $$redundancyFlag = 1;
1323                        }
1324                    } else {
1325                        # No, it's not close enough. We've found a gap.
1326                        $gapFound = 1;
1327                    }
1328                }
1329            }
1330            # Set up for the next interval.
1331            $currentPosition = $nextPosition;
1332        }
1333        # If we're nonredundant, save our results.
1334        if (! $$redundancyFlag) {
1335            @retVal = @operonData;
1336        }
1337        # Return the result.
1338        return @retVal;
1339  }  }
1340    
1341    

Legend:
Removed from v.1.13  
changed lines
  Added in v.1.14

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3