[Bio] / Sprout / Sprout.pm Repository:
ViewVC logotype

Diff of /Sprout/Sprout.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.67, Sun Jun 18 07:39:54 2006 UTC revision 1.82, Sat Sep 2 07:00:54 2006 UTC
# Line 12  Line 12 
12      use DBObject;      use DBObject;
13      use Tracer;      use Tracer;
14      use FIGRules;      use FIGRules;
15        use FidCheck;
16      use Stats;      use Stats;
17      use POSIX qw(strftime);      use POSIX qw(strftime);
18        use BasicLocation;
19    
20  =head1 Sprout Database Manipulation Object  =head1 Sprout Database Manipulation Object
21    
# Line 91  Line 92 
92  sub new {  sub new {
93      # Get the parameters.      # Get the parameters.
94      my ($class, $dbName, $options) = @_;      my ($class, $dbName, $options) = @_;
95        # Compute the DBD directory.
96        my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :
97                                                      $FIG_Config::fig );
98      # Compute the options. We do this by starting with a table of defaults and overwriting with      # Compute the options. We do this by starting with a table of defaults and overwriting with
99      # the incoming data.      # the incoming data.
100      my $optionTable = Tracer::GetOptions({      my $optionTable = Tracer::GetOptions({
# Line 98  Line 102 
102                                                          # database type                                                          # database type
103                         dataDir      => $FIG_Config::sproutData,                         dataDir      => $FIG_Config::sproutData,
104                                                          # data file directory                                                          # data file directory
105                         xmlFileName  => "$FIG_Config::fig/SproutDBD.xml",                         xmlFileName  => "$dbd_dir/SproutDBD.xml",
106                                                          # database definition file name                                                          # database definition file name
107                         userData     => "$FIG_Config::dbuser/$FIG_Config::dbpass",                         userData     => "$FIG_Config::dbuser/$FIG_Config::dbpass",
108                                                          # user name and password                                                          # user name and password
109                         port         => $FIG_Config::dbport,                         port         => $FIG_Config::dbport,
110                                                          # database connection port                                                          # database connection port
111                         sock         => $FIG_Config::dbsock,                         sock         => $FIG_Config::dbsock,
112                           host         => $FIG_Config::dbhost,
113                         maxSegmentLength => 4500,        # maximum feature segment length                         maxSegmentLength => 4500,        # maximum feature segment length
114                         maxSequenceLength => 8000,       # maximum contig sequence length                         maxSequenceLength => 8000,       # maximum contig sequence length
115                         noDBOpen     => 0,               # 1 to suppress the database open                         noDBOpen     => 0,               # 1 to suppress the database open
# Line 118  Line 123 
123      my $dbh;      my $dbh;
124      if (! $optionTable->{noDBOpen}) {      if (! $optionTable->{noDBOpen}) {
125          $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName,          $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName,
126                                  $password, $optionTable->{port}, undef, $optionTable->{sock});                                  $password, $optionTable->{port}, $optionTable->{host}, $optionTable->{sock});
127      }      }
128      # Create the ERDB object.      # Create the ERDB object.
129      my $xmlFileName = "$optionTable->{xmlFileName}";      my $xmlFileName = "$optionTable->{xmlFileName}";
# Line 375  Line 380 
380      # Get the genomes.      # Get the genomes.
381      my @genomes = $self->GetAll(['Genome'], $filterString, $params, ['Genome(id)',      my @genomes = $self->GetAll(['Genome'], $filterString, $params, ['Genome(id)',
382                                                                       'Genome(genus)',                                                                       'Genome(genus)',
383                                                                       'Genome(species)']);                                                                       'Genome(species)',
384                                                                         'Genome(unique-characterization)']);
385      # Sort them by name.      # Sort them by name.
386      my @sorted = sort { lc("$a->[1] $a->[2]") cmp lc("$b->[1] $b->[2]") } @genomes;      my @sorted = sort { lc("$a->[1] $a->[2]") cmp lc("$b->[1] $b->[2]") } @genomes;
387      # Loop through the genomes, creating the option tags.      # Loop through the genomes, creating the option tags.
388      for my $genomeData (@sorted) {      for my $genomeData (@sorted) {
389          # Get the data for this genome.          # Get the data for this genome.
390          my ($genomeID, $genus, $species) = @{$genomeData};          my ($genomeID, $genus, $species, $strain) = @{$genomeData};
391          # Get the contig count.          # Get the contig count.
392          my $count = $self->ContigCount($genomeID);          my $count = $self->ContigCount($genomeID);
393          my $counting = ($count == 1 ? "contig" : "contigs");          my $counting = ($count == 1 ? "contig" : "contigs");
394          # Build the option tag.          # Build the option tag.
395          $retVal .= "<option value=\"$genomeID\">$genus $species ($genomeID) [$count $counting]</option>\n";          $retVal .= "<option value=\"$genomeID\">$genus $species $strain ($genomeID) [$count $counting]</option>\n";
396            Trace("Option tag built for $genomeID: $genus $species $strain.") if T(3);
397      }      }
398      # Close the SELECT tag.      # Close the SELECT tag.
399      $retVal .= "</select>\n";      $retVal .= "</select>\n";
# Line 836  Line 843 
843    
844  =over 4  =over 4
845    
846  =genomeID  =item genomeID
847    
848  ID of the genome whose feature count is desired.  ID of the genome whose feature count is desired.
849    
# Line 1653  Line 1660 
1660  sub CoupledFeatures {  sub CoupledFeatures {
1661      # Get the parameters.      # Get the parameters.
1662      my ($self, $featureID) = @_;      my ($self, $featureID) = @_;
1663        Trace("Looking for features coupled to $featureID.") if T(coupling => 3);
1664      # Create a query to retrieve the functionally-coupled features.      # Create a query to retrieve the functionally-coupled features.
1665      my $query = $self->Get(['ParticipatesInCoupling', 'Coupling'],      my $query = $self->Get(['ParticipatesInCoupling', 'Coupling'],
1666                             "ParticipatesInCoupling(from-link) = ?", [$featureID]);                             "ParticipatesInCoupling(from-link) = ?", [$featureID]);
# Line 1665  Line 1673 
1673          # Get the ID and score of the coupling.          # Get the ID and score of the coupling.
1674          my ($couplingID, $score) = $clustering->Values(['Coupling(id)',          my ($couplingID, $score) = $clustering->Values(['Coupling(id)',
1675                                                          'Coupling(score)']);                                                          'Coupling(score)']);
1676            Trace("$featureID coupled with score $score to ID $couplingID.") if T(coupling => 4);
1677          # Get the other feature that participates in the coupling.          # Get the other feature that participates in the coupling.
1678          my ($otherFeatureID) = $self->GetFlat(['ParticipatesInCoupling'],          my ($otherFeatureID) = $self->GetFlat(['ParticipatesInCoupling'],
1679                                             "ParticipatesInCoupling(to-link) = ? AND ParticipatesInCoupling(from-link) <> ?",                                             "ParticipatesInCoupling(to-link) = ? AND ParticipatesInCoupling(from-link) <> ?",
1680                                             [$couplingID, $featureID], 'ParticipatesInCoupling(from-link)');                                             [$couplingID, $featureID], 'ParticipatesInCoupling(from-link)');
1681            Trace("$couplingID target feature is $otherFeatureID.") if T(coupling => 4);
1682          # Attach the other feature's score to its ID.          # Attach the other feature's score to its ID.
1683          $retVal{$otherFeatureID} = $score;          $retVal{$otherFeatureID} = $score;
1684          $found = 1;          $found = 1;
# Line 1801  Line 1811 
1811      my ($self, $peg1, $peg2) = @_;      my ($self, $peg1, $peg2) = @_;
1812      # Declare the return values. We'll start with the coupling ID and undefine the      # Declare the return values. We'll start with the coupling ID and undefine the
1813      # flag and score until we have more information.      # flag and score until we have more information.
1814      my ($retVal, $inverted, $score) = (CouplingID($peg1, $peg2), undef, undef);      my ($retVal, $inverted, $score) = ($self->CouplingID($peg1, $peg2), undef, undef);
1815      # Find the coupling data.      # Find the coupling data.
1816      my @pegs = $self->GetAll(['Coupling', 'ParticipatesInCoupling'],      my @pegs = $self->GetAll(['Coupling', 'ParticipatesInCoupling'],
1817                                   "Coupling(id) = ? ORDER BY ParticipatesInCoupling(pos)",                                   "Coupling(id) = ? ORDER BY ParticipatesInCoupling(pos)",
# Line 1822  Line 1832 
1832      return ($retVal, $inverted, $score);      return ($retVal, $inverted, $score);
1833  }  }
1834    
1835    =head3 GetSynonymGroup
1836    
1837    C<< my $id = $sprout->GetSynonymGroup($fid); >>
1838    
1839    Return the synonym group name for the specified feature.
1840    
1841    =over 4
1842    
1843    =item fid
1844    
1845    ID of the feature whose synonym group is desired.
1846    
1847    =item RETURN
1848    
1849    The name of the synonym group to which the feature belongs. If the feature does
1850    not belong to a synonym group, the feature ID itself is returned.
1851    
1852    =back
1853    
1854    =cut
1855    
1856    sub GetSynonymGroup {
1857        # Get the parameters.
1858        my ($self, $fid) = @_;
1859        # Declare the return variable.
1860        my $retVal;
1861        # Find the synonym group.
1862        my @groups = $self->GetFlat(['IsSynonymGroupFor'], "IsSynonymGroupFor(to-link) = ?",
1863                                       [$fid], 'IsSynonymGroupFor(from-link)');
1864        # Check to see if we found anything.
1865        if (@groups) {
1866            $retVal = $groups[0];
1867        } else {
1868            $retVal = $fid;
1869        }
1870        # Return the result.
1871        return $retVal;
1872    }
1873    
1874    =head3 GetBoundaries
1875    
1876    C<< my ($contig, $beg, $end) = $sprout->GetBoundaries(@locList); >>
1877    
1878    Determine the begin and end boundaries for the locations in a list. All of the
1879    locations must belong to the same contig and have mostly the same direction in
1880    order for this method to produce a meaningful result. The resulting
1881    begin/end pair will contain all of the bases in any of the locations.
1882    
1883    =over 4
1884    
1885    =item locList
1886    
1887    List of locations to process.
1888    
1889    =item RETURN
1890    
1891    Returns a 3-tuple consisting of the contig ID, the beginning boundary,
1892    and the ending boundary. The beginning boundary will be left of the
1893    end for mostly-forward locations and right of the end for mostly-backward
1894    locations.
1895    
1896    =back
1897    
1898    =cut
1899    
1900    sub GetBoundaries {
1901        # Get the parameters.
1902        my ($self, @locList) = @_;
1903        # Set up the counters used to determine the most popular direction.
1904        my %counts = ( '+' => 0, '-' => 0 );
1905        # Get the last location and parse it.
1906        my $locObject = BasicLocation->new(pop @locList);
1907        # Prime the loop with its data.
1908        my ($contig, $beg, $end) = ($locObject->Contig, $locObject->Left, $locObject->Right);
1909        # Count its direction.
1910        $counts{$locObject->Dir}++;
1911        # Loop through the remaining locations. Note that in most situations, this loop
1912        # will not iterate at all, because most of the time we will be dealing with a
1913        # singleton list.
1914        for my $loc (@locList) {
1915            # Create a location object.
1916            my $locObject = BasicLocation->new($loc);
1917            # Count the direction.
1918            $counts{$locObject->Dir}++;
1919            # Get the left end and the right end.
1920            my $left = $locObject->Left;
1921            my $right = $locObject->Right;
1922            # Merge them into the return variables.
1923            if ($left < $beg) {
1924                $beg = $left;
1925            }
1926            if ($right > $end) {
1927                $end = $right;
1928            }
1929        }
1930        # If the most common direction is reverse, flip the begin and end markers.
1931        if ($counts{'-'} > $counts{'+'}) {
1932            ($beg, $end) = ($end, $beg);
1933        }
1934        # Return the result.
1935        return ($contig, $beg, $end);
1936    }
1937    
1938  =head3 CouplingID  =head3 CouplingID
1939    
1940  C<< my $couplingID = Sprout::CouplingID($peg1, $peg2); >>  C<< my $couplingID = $sprout->CouplingID($peg1, $peg2); >>
1941    
1942  Return the coupling ID for a pair of feature IDs.  Return the coupling ID for a pair of feature IDs.
1943    
# Line 1857  Line 1970 
1970  =cut  =cut
1971  #: Return Type $;  #: Return Type $;
1972  sub CouplingID {  sub CouplingID {
1973      return join " ", sort @_;      my ($self, @pegs) = @_;
1974        return $self->DigestKey(join " ", sort @pegs);
1975  }  }
1976    
1977  =head3 ReadFasta  =head3 ReadFasta
# Line 2216  Line 2330 
2330      return @retVal;      return @retVal;
2331  }  }
2332    
 =head3 Exists  
   
 C<< my $found = $sprout->Exists($entityName, $entityID); >>  
   
 Return TRUE if an entity exists, else FALSE.  
   
 =over 4  
   
 =item entityName  
   
 Name of the entity type (e.g. C<Feature>) relevant to the existence check.  
   
 =item entityID  
   
 ID of the entity instance whose existence is to be checked.  
   
 =item RETURN  
   
 Returns TRUE if the entity instance exists, else FALSE.  
   
 =back  
   
 =cut  
 #: Return Type $;  
 sub Exists {  
     # Get the parameters.  
     my ($self, $entityName, $entityID) = @_;  
     # Check for the entity instance.  
     Trace("Checking existence of $entityName with ID=$entityID.") if T(4);  
     my $testInstance = $self->GetEntity($entityName, $entityID);  
     # Return an existence indicator.  
     my $retVal = ($testInstance ? 1 : 0);  
     return $retVal;  
 }  
   
2333  =head3 FeatureTranslation  =head3 FeatureTranslation
2334    
2335  C<< my $translation = $sprout->FeatureTranslation($featureID); >>  C<< my $translation = $sprout->FeatureTranslation($featureID); >>
# Line 2788  Line 2867 
2867      return @retVal;      return @retVal;
2868  }  }
2869    
2870    =head3 GenomeSubsystemData
2871    
2872    C<< my %featureData = $sprout->GenomeSubsystemData($genomeID); >>
2873    
2874    Return a hash mapping genome features to their subsystem roles.
2875    
2876    =over 4
2877    
2878    =item genomeID
2879    
2880    ID of the genome whose subsystem feature map is desired.
2881    
2882    =item RETURN
2883    
2884    Returns a hash mapping each feature of the genome to a list of 2-tuples. Eacb
2885    2-tuple contains a subsystem name followed by a role ID.
2886    
2887    =back
2888    
2889    =cut
2890    
2891    sub GenomeSubsystemData {
2892        # Get the parameters.
2893        my ($self, $genomeID) = @_;
2894        # Declare the return variable.
2895        my %retVal = ();
2896        # Get a list of the genome features that participate in subsystems. For each
2897        # feature we get its spreadsheet cells and the corresponding roles.
2898        my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf'],
2899                                 "HasFeature(from-link) = ?", [$genomeID],
2900                                 ['HasFeature(to-link)', 'IsRoleOf(to-link)', 'IsRoleOf(from-link)']);
2901        # Now we get a list of the spreadsheet cells and their associated subsystems. Subsystems
2902        # with an unknown variant code (-1) are skipped. Note the genome ID is at both ends of the
2903        # list. We use it at the beginning to get all the spreadsheet cells for the genome and
2904        # again at the end to filter out participation in subsystems with a negative variant code.
2905        my @cellData = $self->GetAll(['IsGenomeOf', 'HasSSCell', 'ParticipatesIn'],
2906                                     "IsGenomeOf(from-link) = ? AND ParticipatesIn(variant-code) >= 0 AND ParticipatesIn(from-link) = ?",
2907                                     [$genomeID, $genomeID], ['HasSSCell(to-link)', 'HasSSCell(from-link)']);
2908        # Now "@roleData" lists the spreadsheet cell and role for each of the genome's features.
2909        # "@cellData" lists the subsystem name for each of the genome's spreadsheet cells. We
2910        # link these two lists together to create the result. First, we want a hash mapping
2911        # spreadsheet cells to subsystem names.
2912        my %subHash = map { $_->[0] => $_->[1] } @cellData;
2913        # We loop through @cellData to build the hash.
2914        for my $roleEntry (@roleData) {
2915            # Get the data for this feature and cell.
2916            my ($fid, $cellID, $role) = @{$roleEntry};
2917            # Check for a subsystem name.
2918            my $subsys = $subHash{$cellID};
2919            if ($subsys) {
2920                # Insure this feature has an entry in the return hash.
2921                if (! exists $retVal{$fid}) { $retVal{$fid} = []; }
2922                # Merge in this new data.
2923                push @{$retVal{$fid}}, [$subsys, $role];
2924            }
2925        }
2926        # Return the result.
2927        return %retVal;
2928    }
2929    
2930  =head3 RelatedFeatures  =head3 RelatedFeatures
2931    
# Line 3041  Line 3178 
3178      return %retVal;      return %retVal;
3179  }  }
3180    
3181    =head3 Sims
3182    
3183    C<< my $simList = $sprout->Sims($fid, $maxN, $maxP, $select, $max_expand, $filters); >>
3184    
3185    Get a list of similarities for a specified feature. Similarity information is not kept in the
3186    Sprout database; rather, they are retrieved from a network server. The similarities are
3187    returned as B<Sim> objects. A Sim object is actually a list reference that has been blessed
3188    so that its elements can be accessed by name.
3189    
3190    Similarities can be either raw or expanded. The raw similarities are basic
3191    hits between features with similar DNA. Expanding a raw similarity drags in any
3192    features considered substantially identical. So, for example, if features B<A1>,
3193    B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
3194    B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
3195    
3196    =over 4
3197    
3198    =item fid
3199    
3200    ID of the feature whose similarities are desired.
3201    
3202    =item maxN
3203    
3204    Maximum number of similarities to return.
3205    
3206    =item maxP
3207    
3208    Minumum allowable similarity score.
3209    
3210    =item select
3211    
3212    Selection criterion: C<raw> means only raw similarities are returned; C<fig>
3213    means only similarities to FIG features are returned; C<all> means all expanded
3214    similarities are returned; and C<figx> means similarities are expanded until the
3215    number of FIG features equals the maximum.
3216    
3217    =item max_expand
3218    
3219    The maximum number of features to expand.
3220    
3221    =item filters
3222    
3223    Reference to a hash containing filter information, or a subroutine that can be
3224    used to filter the sims.
3225    
3226    =item RETURN
3227    
3228    Returns a reference to a list of similarity objects, or C<undef> if an error
3229    occurred.
3230    
3231    =back
3232    
3233    =cut
3234    
3235    sub Sims {
3236        # Get the parameters.
3237        my ($self, $fid, $maxN, $maxP, $select, $max_expand, $filters) = @_;
3238        # Create the shim object to test for deleted FIDs.
3239        my $shim = FidCheck->new($self);
3240        # Ask the network for sims.
3241        my $retVal = FIGRules::GetNetworkSims($shim, $fid, {}, $maxN, $maxP, $select, $max_expand, $filters);
3242        # Return the result.
3243        return $retVal;
3244    }
3245    
3246  =head3 GetGroups  =head3 GetGroups
3247    
3248  C<< my %groups = $sprout->GetGroups(\@groupList); >>  C<< my %groups = $sprout->GetGroups(\@groupList); >>
# Line 3198  Line 3400 
3400      return $retVal;      return $retVal;
3401  }  }
3402    
3403    =head3 Fix
3404    
3405    C<< my %fixedHash = Sprout::Fix(%groupHash); >>
3406    
3407    Prepare a genome group hash (like that returned by L</GetGroups> for processing.
3408    Groups with the same primary name will be combined. The primary name is the
3409    first capitalized word in the group name.
3410    
3411    =over 4
3412    
3413    =item groupHash
3414    
3415    Hash to be fixed up.
3416    
3417    =item RETURN
3418    
3419    Returns a fixed-up version of the hash.
3420    
3421    =back
3422    
3423    =cut
3424    
3425    sub Fix {
3426        # Get the parameters.
3427        my (%groupHash) = @_;
3428        # Create the result hash.
3429        my %retVal = ();
3430        # Copy over the genomes.
3431        for my $groupID (keys %groupHash) {
3432            # Make a safety copy of the group ID.
3433            my $realGroupID = $groupID;
3434            # Yank the primary name.
3435            if ($groupID =~ /([A-Z]\w+)/) {
3436                $realGroupID = $1;
3437            }
3438            # Append this group's genomes into the result hash.
3439            Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});
3440        }
3441        # Return the result hash.
3442        return %retVal;
3443    }
3444    
3445  =head2 Internal Utility Methods  =head2 Internal Utility Methods
3446    
3447  =head3 ParseAssignment  =head3 ParseAssignment
# Line 3254  Line 3498 
3498      }      }
3499      # If we have an assignment, we need to clean the function text. There may be      # If we have an assignment, we need to clean the function text. There may be
3500      # extra junk at the end added as a note from the user.      # extra junk at the end added as a note from the user.
3501      if (@retVal) {      if (defined( $retVal[1] )) {
3502          $retVal[1] =~ s/(\t\S)?\s*$//;          $retVal[1] =~ s/(\t\S)?\s*$//;
3503      }      }
3504      # Return the result list.      # Return the result list.

Legend:
Removed from v.1.67  
changed lines
  Added in v.1.82

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3