[Bio] / Sprout / SimBlocks.pm Repository:
ViewVC logotype

Diff of /Sprout/SimBlocks.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Jan 6 20:35:01 2006 UTC revision 1.10, Tue Apr 10 06:12:29 2007 UTC
# Line 11  Line 11 
11      use PageBuilder;      use PageBuilder;
12      use Genome;      use Genome;
13      use BasicLocation;      use BasicLocation;
14        use Overlap;
15      use FIG_Config;      use FIG_Config;
16        use FIG;
17    
18  =head1 Similarity Block Database  =head1 Similarity Block Database
19    
# Line 211  Line 213 
213      my ($class, $dbname, $dbType, $port) = @_;      my ($class, $dbname, $dbType, $port) = @_;
214      # Plug in the default values.      # Plug in the default values.
215      if (! $dbname) {      if (! $dbname) {
216          $dbname = $FIG_Config::simBlocksDB;          $dbname = DBName();
217      }      }
218      if (! $dbType) {      if (! $dbType) {
219          $dbType = $FIG_Config::dbms;          $dbType = $FIG_Config::dbms;
# Line 224  Line 226 
226      # Get the data directory name.      # Get the data directory name.
227      my $directory = $FIG_Config::simBlocksData;      my $directory = $FIG_Config::simBlocksData;
228      # Create and bless the ERDB object.      # Create and bless the ERDB object.
229      my $retVal = ERDB::new($class, $dbh, "$directory/SimBlocksDBD.xml");      my $retVal = ERDB::new($class, $dbh, "$FIG_Config::fig/SimBlocksDBD.xml");
230      # Return it.      # Return it.
231      return $retVal;      return $retVal;
232  }  }
233    
234    =head3 DBName
235    
236    C<< my $name = SimBlocks::DBName; >>
237    
238    Return the name of the database. This is set from a config variable, but if the
239    variable is undefined a default value is used.
240    
241    =cut
242    
243    sub DBName {
244        my $retVal;
245        if (defined $FIG_Config::simBlocksDB) {
246            $retVal = $FIG_Config::simBlocksDB;
247        } else {
248            $retVal = "Blocks";
249        }
250        return $retVal;
251    }
252    
253  =head3 DefaultDistances  =head3 DefaultDistances
254    
255  C<< my $distances = DefaultDistances(); >>  C<< my $distances = DefaultDistances(); >>
# Line 303  Line 324 
324  Analyze two sets of genomes for commonalities. The group blocks returned will be divided  Analyze two sets of genomes for commonalities. The group blocks returned will be divided
325  into three hashes: one for those common to set 0 and not occurring at all in set 1, one  into three hashes: one for those common to set 0 and not occurring at all in set 1, one
326  for those common to set 1 and not occurring at all in set 0, and one for those common  for those common to set 1 and not occurring at all in set 0, and one for those common
327  to both sets. Each hash is keyed by group ID and will contain B<DBObject>s for  to both sets. Each hash is keyed by group ID and will contain B<ERDBObject>s for
328  B<GroupBlock> records with B<HasInstanceOf> data attached, though the genome ID in  B<GroupBlock> records with B<HasInstanceOf> data attached, though the genome ID in
329  the B<HasInstanceOf> section is not generally predictable.  the B<HasInstanceOf> section is not generally predictable.
330    
# Line 320  Line 341 
341  =item RETURN  =item RETURN
342    
343  Returns a triple of hashes. Each hash is keyed by group ID, and will contain  Returns a triple of hashes. Each hash is keyed by group ID, and will contain
344  B<DBObject>s for records in the B<GroupBlock> table. Groups found in all of the  B<ERDBObject>s for records in the B<GroupBlock> table. Groups found in all of the
345  genomes in set 0 but in none of the genomes of set 1 will be in the first hash,  genomes in set 0 but in none of the genomes of set 1 will be in the first hash,
346  groups found in all of the genomes in set 1 but in none of the genomes of set 0  groups found in all of the genomes in set 1 but in none of the genomes of set 0
347  will be in the second hash, and groups found in all of the genomes of both sets  will be in the second hash, and groups found in all of the genomes of both sets
# Line 422  Line 443 
443  C<< my %blockList = $simBlocks->BlocksInSet($set, $count); >>  C<< my %blockList = $simBlocks->BlocksInSet($set, $count); >>
444    
445  Return a list of the group blocks found in a given number of the genomes in a given  Return a list of the group blocks found in a given number of the genomes in a given
446  set. The list returned will be a hash of B<DBObject>s, each corresponding to a single  set. The list returned will be a hash of B<ERDBObject>s, each corresponding to a single
447  B<GroupBlock> record, with a B<HasInstanceOf> record attached, though the content of  B<GroupBlock> record, with a B<HasInstanceOf> record attached, though the content of
448  the B<HasInstanceOf> record is not predictable. The hash will be keyed by block ID.  the B<HasInstanceOf> record is not predictable. The hash will be keyed by block ID.
449    
# Line 443  Line 464 
464    
465  =item RETURN  =item RETURN
466    
467  Returns a hash of B<DBObject>s corresponding to the group blocks found in the  Returns a hash of B<ERDBObject>s corresponding to the group blocks found in the
468  genomes of the set.  genomes of the set.
469    
470  =back  =back
# Line 467  Line 488 
488      for my $genomeID (@{$set}) {      for my $genomeID (@{$set}) {
489          # Get a list of group blocks for this genome.          # Get a list of group blocks for this genome.
490          my @blocks = $self->GetList(['HasInstanceOf', 'GroupBlock'],          my @blocks = $self->GetList(['HasInstanceOf', 'GroupBlock'],
491                                      "HasInstanceOf(from-link) = ?", $genomeID);                                      "HasInstanceOf(from-link) = ?", [$genomeID]);
492          # Loop through the blocks, storing any new ones in the hash.          # Loop through the blocks, storing any new ones in the hash.
493          for my $block (@blocks) {          for my $block (@blocks) {
494              # Get the ID of this block.              # Get the ID of this block.
# Line 532  Line 553 
553      my $regionCount = 0;      my $regionCount = 0;
554      # Query all the regions for the specified block.      # Query all the regions for the specified block.
555      my $query = $self->Get(['IncludesRegion', 'Region'], "IncludesRegion(from-link) = ?",      my $query = $self->Get(['IncludesRegion', 'Region'], "IncludesRegion(from-link) = ?",
556                             $blockID);                             [$blockID]);
557      # Loop through the query.      # Loop through the query.
558      while (my $region = $query->Fetch) {      while (my $region = $query->Fetch) {
559          # Get this region's data.          # Get this region's data.
# Line 688  Line 709 
709    
710  =item blockObject  =item blockObject
711    
712  A C<DBObject> representing the B<GroupBlock> record for the desired block or  A C<ERDBObject> representing the B<GroupBlock> record for the desired block or
713  the actual ID of the block whose regions are to be examined. It is expected that the  the actual ID of the block whose regions are to be examined. It is expected that the
714  block will have regions in all of the genomes for both sets, but this is not  block will have regions in all of the genomes for both sets, but this is not
715  required by the algorithm.  required by the algorithm.
# Line 717  Line 738 
738      # Get the parameters.      # Get the parameters.
739      my ($self, $blockObject, $set0, $set1) = @_;      my ($self, $blockObject, $set0, $set1) = @_;
740      # Convert an incoming block ID to a block object.      # Convert an incoming block ID to a block object.
741      if (ref $blockObject ne "DBObject") {      if (ref $blockObject ne "ERDBObject") {
742          $blockObject = $self->GetEntity('GroupBlock', $blockObject);          $blockObject = $self->GetEntity('GroupBlock', $blockObject);
743      }      }
744      # Get the ID and length of this block.      # Get the ID and length of this block.
# Line 739  Line 760 
760      }      }
761      # Ask for the regions in the block.      # Ask for the regions in the block.
762      my $query = $self->Get(['IncludesRegion', 'Region'], "IncludesRegion(from-link) = ?",      my $query = $self->Get(['IncludesRegion', 'Region'], "IncludesRegion(from-link) = ?",
763                             $blockID);                             [$blockID]);
764      # Loop through the regions.      # Loop through the regions.
765      while (my $region = $query->Fetch) {      while (my $region = $query->Fetch) {
766          # Determine this region's genome set. We only continue if the region is in          # Determine this region's genome set. We only continue if the region is in
767          # one of the sets.          # one of the sets.
768          my ($location) = $region->Value('Region(from-link)');          my ($location) = $region->Value('Region(id)');
769          my $genomeID = Genome::FromContig($location);          my $genomeID = Genome::FromContig($location);
770          if (exists $setMap{$genomeID}) {          if (exists $setMap{$genomeID}) {
771              my $setIndex = $setMap{$genomeID};              my $setIndex = $setMap{$genomeID};
# Line 823  Line 844 
844      return @retVal;      return @retVal;
845  }  }
846    
   
   
847  =head3 MergeDNA  =head3 MergeDNA
848    
849  C<< my ($groupSequence, $variance) = SimBlocks::MergeDNA($groupSequence, $newSequence); >>  C<< my ($groupSequence, $variance) = SimBlocks::MergeDNA($groupSequence, $newSequence); >>
# Line 1111  Line 1130 
1130    
1131  C<< my $blockData = $simBlocks->GetBlock($blockID); >>  C<< my $blockData = $simBlocks->GetBlock($blockID); >>
1132    
1133  Return a B<DBObject> for a specified group block.  Return a B<ERDBObject> for a specified group block.
1134    
1135  =over 4  =over 4
1136    
# Line 1121  Line 1140 
1140    
1141  =item RETURN  =item RETURN
1142    
1143  Returns a B<DBObject> for the group block in question. The object allows access to  Returns a B<ERDBObject> for the group block in question. The object allows access to
1144  all of the fields in the C<GroupBlock> relation of the database.  all of the fields in the C<GroupBlock> relation of the database.
1145    
1146  =back  =back
# Line 1180  Line 1199 
1199      my $loc = BasicLocation->new($location);      my $loc = BasicLocation->new($location);
1200      # Determine the parameters needed to get the desired list of regions.      # Determine the parameters needed to get the desired list of regions.
1201      my ($left, $right, $contigID) = ($loc->Left, $loc->Right, $loc->Contig);      my ($left, $right, $contigID) = ($loc->Left, $loc->Right, $loc->Contig);
1202        Trace("Searching for regions near " . $loc->String) if T(4);
1203      # Ask for the regions in the section we want.      # Ask for the regions in the section we want.
1204      my @regions = $self->GetAll(['Region', 'IncludesRegion', 'GroupBlock'],      my @regions = $self->GetAll(['Region', 'IncludesRegion', 'GroupBlock'],
1205                                  'Region(end) >= ? AND Region(position) <= ? AND Region(contigID) = ?',                                  'Region(endpoint) >= ? AND Region(position) <= ? AND Region(contigID) = ?',
1206                                  [$left, $right, $contigID],                                  [$left, $right, $contigID],
1207                                  ['GroupBlock(id)', 'GroupBlock(pattern)',                                  ['GroupBlock(id)', 'GroupBlock(pattern)',
1208                                   'GroupBlock(len)',                                   'GroupBlock(len)',
1209                                   'Region(position)', 'Region(end)',                                   'Region(position)', 'Region(endpoint)',
1210                                   'Region(content)']);                                   'Region(content)']);
1211      # Loop through the regions found. For each region we will output a location string.      # Loop through the regions found. For each region we will output a location string.
1212      for my $regionData (@regions) {      for my $regionData (@regions) {
# Line 1243  Line 1263 
1263      return @retVal;      return @retVal;
1264  }  }
1265    
1266    =head3 GetFeatureBlockPieces
1267    
1268    C<< my @pieces = $simBlocks->GetFeatureBlockPieces($fig, \@featureIDs, $distance); >>
1269    
1270    Return a list of the block pieces within the specified distance of the
1271    specified features. This method essentially computes locations from
1272    the distance and the feature locations, then passes them to L/GetBlockPieces>.
1273    That method will return the sections of various similarity blocks that
1274    occur inside the the locations computed.
1275    
1276    =over 4
1277    
1278    =item fig
1279    
1280    A FIG-like object that can be used to convert features into locations.
1281    
1282    =item featureIDs
1283    
1284    Reference to a list of feature IDs. Blocks in the vicinity of any of the
1285    features are returned.
1286    
1287    =item distance
1288    
1289    Distance to search from the feature's actual location.
1290    
1291    =item RETURN
1292    
1293    Returns a list of block piece location objects. A block piece location object
1294    is a standard B<BasicLocation> object with a block ID instead of a contig ID.
1295    
1296    =back
1297    
1298    =cut
1299    #: Return Type @;
1300    sub GetFeatureBlockPieces {
1301        # Get the parameters.
1302        my ($self, $fig, $featureIDs, $distance) = @_;
1303        # Declare a hash for keeping the return data. This will weed out
1304        # duplicates, of which we expect plenty. We'll need to handle
1305        # overlaps later, however.
1306        my %retHash = ();
1307        # Loop through the features.
1308        for my $featureID (@{$featureIDs}) {
1309            # Get the feature's genome ID.
1310            my $genomeID = FIG::genome_of($featureID);
1311            # Get the feature's locations.
1312            my @locations = $fig->feature_location($featureID);
1313            # Loop through the locations, sticking the resulting pieces in the return
1314            # hash.
1315            for my $loc (@locations) {
1316                my $locObject = BasicLocation->new($loc);
1317                # Get the length of the contig in question.
1318                my $len = $fig->contig_ln($genomeID, $locObject->Contig);
1319                # Expand the location.
1320                $locObject->Widen($distance, $len);
1321                # Insure the location is Sprout-style;
1322                $locObject->FixContig($genomeID);
1323                # Get the desired block pieces.
1324                my @pieces = $self->GetBlockPieces($locObject);
1325                Trace(scalar(@pieces) . " pieces found for location $loc.") if T(4);
1326                # Put them in the hash.
1327                for my $piece (@pieces) {
1328                    $retHash{$piece} = 1;
1329                }
1330            }
1331        }
1332        # Now we have all the block pieces that occur in any one of our regions. The
1333        # next step is to merge adjacent and overlapping regions. First we convert
1334        # the pieces to location objects so we can sort them.
1335        my @retVal = ();
1336        for my $piece (keys %retHash) {
1337            my $loc = BasicLocation->new($piece);
1338            push @retVal, $loc;
1339        }
1340        Trace("Beginning sort.") if T(3);
1341        @retVal = sort { BasicLocation::Cmp($a,$b) } @retVal;
1342        Trace(scalar(@retVal) . " pieces found before overlap check.") if T(3);
1343        # Now the locations are sorted by block ID, start position, and descending
1344        # length. This means that if there's an overlap, the two overlapping
1345        # pieces will be next to each other.
1346        my $i = 0;
1347        while ($i < $#retVal) {
1348            # Check for overlap between this location and the next.
1349            my ($type, $len) = Overlap::CheckOverlap($retVal[$i], $retVal[$i+1]);
1350            if ($len == 0) {
1351                # Here there is no overlap, so we push ahead.
1352                $i++;
1353            } elsif ($type eq 'embedded') {
1354                # Here the second piece is inside the first, so we delete the
1355                # second.
1356                delete $retVal[$i+1];
1357            } else {
1358                # Here we have a normal overlap. Because all of our pieces
1359                # are forward locations, this means the second piece starts
1360                # before the end of the of the first. We set the end point
1361                # if the first to the end point of the second and delete the
1362                # second.
1363                $retVal[$i]->SetEnd($retVal[$i+1]->EndPoint);
1364                delete $retVal[$i+1];
1365            }
1366        }
1367        # Return the result.
1368        return @retVal;
1369    }
1370    
1371  =head3 WalkDNA  =head3 WalkDNA
1372    
1373  C<< my $blockPos = SimBlocks::WalkDNA($blockPos, $contigPos, $dna, $loc); >>  C<< my $blockPos = SimBlocks::WalkDNA($blockPos, $contigPos, $dna, $loc); >>

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3