[Bio] / Sprout / SimBlocks.pm Repository:
ViewVC logotype

Diff of /Sprout/SimBlocks.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Thu Jun 9 19:06:55 2005 UTC revision 1.7, Mon Feb 13 15:42:48 2006 UTC
# Line 4  Line 4 
4      use ERDB;      use ERDB;
5      @ISA = qw(Exporter ERDB);      @ISA = qw(Exporter ERDB);
6      @EXPORT = qw(DefaultDistances);      @EXPORT = qw(DefaultDistances);
7      @EXPORT_OK = qw(BlocksInSet GetBlock CompareGenomes DBLoad DistanceMatrix GetAlignment GetRegions ParsePattern RemoveBlocks SequenceDistance SetNumber SnipScan);      @EXPORT_OK = qw(BlocksInSet GetBlock GetBlockPieces CompareGenomes DBLoad DistanceMatrix GetAlignment GetRegions ParsePattern RemoveBlocks SequenceDistance SetNumber SnipScan);
8    
9      use strict;      use strict;
10      use Tracer;      use Tracer;
11      use PageBuilder;      use PageBuilder;
12      use Genome;      use Genome;
13        use BasicLocation;
14        use Overlap;
15      use FIG_Config;      use FIG_Config;
16        use FIG;
17    
18  =head1 Similarity Block Database  =head1 Similarity Block Database
19    
# Line 150  Line 152 
152      $simBlocksData   = "$data/BlockData";   # directory containing SimBlocks data files      $simBlocksData   = "$data/BlockData";   # directory containing SimBlocks data files
153                                              # (used to load the Similarity database)                                              # (used to load the Similarity database)
154      $simBlastData    = "$data/BlastData";   # directory containing SimBlast data files      $simBlastData    = "$data/BlastData";   # directory containing SimBlast data files
155                                              # (input to SimBlast.pl)                                              # (input to SimLoad.pl)
156    
157  Run the C<SimBlast> script to load the database.  Run the C<SimLoad> script to load the database.
158    
159  The C<SimBlockForm.cgi> service enables you to run a similarity analysis between genome  The C<SimBlockForm.cgi> service enables you to run a similarity analysis between genome
160  sets in the database. The C<Html/ERDBTest.html> page enables you to run general queries  sets in the database. The C<Html/ERDBTest.html> page enables you to run general queries
# Line 211  Line 213 
213      my ($class, $dbname, $dbType, $port) = @_;      my ($class, $dbname, $dbType, $port) = @_;
214      # Plug in the default values.      # Plug in the default values.
215      if (! $dbname) {      if (! $dbname) {
216          $dbname = $FIG_Config::simBlocksDB;          $dbname = DBName();
217      }      }
218      if (! $dbType) {      if (! $dbType) {
219          $dbType = $FIG_Config::dbms;          $dbType = $FIG_Config::dbms;
# Line 229  Line 231 
231      return $retVal;      return $retVal;
232  }  }
233    
234    =head3 DBName
235    
236    C<< my $name = SimBlocks::DBName; >>
237    
238    Return the name of the database. This is set from a config variable, but if the
239    variable is undefined a default value is used.
240    
241    =cut
242    
243    sub DBName {
244        my $retVal;
245        if (defined $FIG_Config::simBlocksDB) {
246            $retVal = $FIG_Config::simBlocksDB;
247        } else {
248            $retVal = "Blocks";
249        }
250        return $retVal;
251    }
252    
253  =head3 DefaultDistances  =head3 DefaultDistances
254    
255  C<< my $distances = DefaultDistances(); >>  C<< my $distances = DefaultDistances(); >>
# Line 601  Line 622 
622    
623  C<< my $taggedDNA = SimBlocks::TagDNA($pattern, $dnaString, $prefix, $suffix); >>  C<< my $taggedDNA = SimBlocks::TagDNA($pattern, $dnaString, $prefix, $suffix); >>
624    
625  Convert a DNA string from the B<ContainsRegionIn> relationship to the actual  Convert a DNA string from the B<Region> relation to the actual DNA.
626  DNA, optionally with markings surrounding them. The DNA string will contain  The incoming DNA string will contain only the values corresponding to the
627  only the values corresponding to the question marks in the pattern,  question marks in the pattern. The pattern should be taken from the DNA
628  which should be taken from the DNA string's parent B<GroupBlock>. The  string's parent B<GroupBlock>. The resulting DNA sequence is built by
629  resulting DNA sequence is built by copying  copying the DNA string data into the pattern positions that have question
630    marks. If the string is intended for display in an HTML page, the
631    I<$prefix> and I<$suffix> parameters can be used to surround the
632    question mark positions with HTML markers that specify a different style,
633    weight, or font.
634    
635  =over 4  =over 4
636    
# Line 617  Line 642 
642    
643  String containing DNA string to be marked.  String containing DNA string to be marked.
644    
645  =item prefix  =item prefix (optional)
646    
647  String to be inserted before each position of variance.  String to be inserted before each position of variance.
648    
649  =item suffix  =item suffix (optional)
650    
651  String to be inserted after each position of variance.  String to be inserted after each position of variance.
652    
# Line 740  Line 765 
765      while (my $region = $query->Fetch) {      while (my $region = $query->Fetch) {
766          # Determine this region's genome set. We only continue if the region is in          # Determine this region's genome set. We only continue if the region is in
767          # one of the sets.          # one of the sets.
768          my ($location) = $region->Value('Region(from-link)');          my ($location) = $region->Value('Region(id)');
769          my $genomeID = Genome::FromContig($location);          my $genomeID = Genome::FromContig($location);
770          if (exists $setMap{$genomeID}) {          if (exists $setMap{$genomeID}) {
771              my $setIndex = $setMap{$genomeID};              my $setIndex = $setMap{$genomeID};
# Line 819  Line 844 
844      return @retVal;      return @retVal;
845  }  }
846    
   
   
847  =head3 MergeDNA  =head3 MergeDNA
848    
849  C<< my ($groupSequence, $variance) = SimBlocks::MergeDNA($groupSequence, $newSequence); >>  C<< my ($groupSequence, $variance) = SimBlocks::MergeDNA($groupSequence, $newSequence); >>
# Line 1085  Line 1108 
1108      if (!defined $distances) {      if (!defined $distances) {
1109          $distances = DefaultDistances();          $distances = DefaultDistances();
1110      }      }
1111      # Declare the return value. Initially, this will be a sum of squared. We'll      # Declare the return value. Initially, this will be a sum of squares. We'll
1112      # take the square root and divide out the length later.      # take the square root and divide out the length later.
1113      my $retVal = 0;      my $retVal = 0;
1114      # Loop through the sequences, character by character, comparing.      # Loop through the sequences, character by character, comparing.
# Line 1133  Line 1156 
1156      return $retVal;      return $retVal;
1157  }  }
1158    
1159    =head3 GetBlockPieces
1160    
1161    C<< my @blocks = $blockData->GetBlockPieces($location); >>
1162    
1163    Return a map of the block pieces inside the specified location. The return
1164    value will be a list of block locations. A block location is essentially a
1165    location string with a block ID in place of the contig ID.
1166    
1167    This routine depends upon the fact that there is no overlap between blocks.
1168    A given nucleotide on a Contig is in exactly one block. Therefore, to find
1169    all the blocks that overlap a specific section of the Contig, we find the
1170    first block that ends after the start of the section and proceed until we
1171    find the last block that begins before the end of the section. The only
1172    complexity that occurs is if the Contig folds back in upon itself and
1173    appears twice in the same block. In this case, we might get two segments from
1174    the same block, and they may or may not overlap.
1175    
1176    =over 4
1177    
1178    =item location
1179    
1180    A basic location object or location string defining the range whose block pieces
1181    are desired.
1182    
1183    =item RETURN
1184    
1185    Returns a list of block location strings. Each string consists of a block ID
1186    followed by a start location and an end location. The three pieces of the
1187    string are separated by underscores (C<_>).
1188    
1189    =back
1190    
1191    =cut
1192    #: Return Type %;
1193    sub GetBlockPieces {
1194        # Get the parameters.
1195        my ($self, $location) = @_;
1196        # Declare the return variable.
1197        my @retVal = ();
1198        # Parse the incoming location.
1199        my $loc = BasicLocation->new($location);
1200        # Determine the parameters needed to get the desired list of regions.
1201        my ($left, $right, $contigID) = ($loc->Left, $loc->Right, $loc->Contig);
1202        Trace("Searching for regions near " . $loc->String) if T(4);
1203        # Ask for the regions in the section we want.
1204        my @regions = $self->GetAll(['Region', 'IncludesRegion', 'GroupBlock'],
1205                                    'Region(endpoint) >= ? AND Region(position) <= ? AND Region(contigID) = ?',
1206                                    [$left, $right, $contigID],
1207                                    ['GroupBlock(id)', 'GroupBlock(pattern)',
1208                                     'GroupBlock(len)',
1209                                     'Region(position)', 'Region(endpoint)',
1210                                     'Region(content)']);
1211        # Loop through the regions found. For each region we will output a location string.
1212        for my $regionData (@regions) {
1213            # Declare the variable to contain the formatted location.
1214            my $blockLoc;
1215            # Separate the fields from the region data.
1216            my ($blockID, $pattern, $len, $position, $end, $content) = @{$regionData};
1217            # Determine whether we're using the full region or only part of it.
1218            if ($position >= $left && $end <= $right) {
1219                # Here we're using the full region, so the location is simple.
1220                $blockLoc = "${blockID}_1_${len}";
1221            } else {
1222                # This is the hard part. We need to convert the contig positions to
1223                # block positions. Since the block contains insertion characters and
1224                # the regions don't, this requires fancy dancing. First, we get the
1225                # region's DNA string.
1226                my $dnaSequence = TagDNA($pattern, $content);
1227                # Let's call the "area of interest" the part of the Contig that
1228                # is in the incoming location AND in the current region. This could
1229                # be at either end of the region or somewhere in the middle. We
1230                # need to walk the block DNA to find what positions in the block
1231                # correspond to the desired positions in the contig. First, we
1232                # put ourselves at the beginning of the block and the region,
1233                # and walk to the start point. Note that our position arithmetic
1234                # is 1-based, in keeping with the conventions of biologists.
1235                my $blockPos = 1;
1236                my $contigPos = $position;
1237                # Check to see if we can use the current position as are starting
1238                # point or we need to find it.
1239                if ($left > $position) {
1240                    # Here we need to find the left edge of the area of interest.
1241                    $blockPos = WalkDNA($blockPos, $contigPos, $dnaSequence, $left);
1242                    # Update the contig position to match.
1243                    $contigPos = $left;
1244                }
1245                # Save the left edge.
1246                my $blockLeft = $blockPos;
1247                # Now find the right edge.
1248                if ($right >= $end) {
1249                    # Here the right edge is the end of the block.
1250                    $blockPos = $len;
1251                } else {
1252                    # Here we have to find the right edge of the area of interest.
1253                    $blockPos = WalkDNA($blockPos, $contigPos, $dnaSequence, $right);
1254                }
1255                my $blockRight = $blockPos;
1256                # Format the location.
1257                $blockLoc = "${blockID}_${blockLeft}_${blockRight}";
1258            }
1259            # Push the block location onto the return list.
1260            push @retVal, $blockLoc;
1261        }
1262        # Return the result.
1263        return @retVal;
1264    }
1265    
1266    =head3 GetFeatureBlockPieces
1267    
1268    C<< my @pieces = $simBlocks->GetFeatureBlockPieces($fig, \@featureIDs, $distance); >>
1269    
1270    Return a list of the block pieces within the specified distance of the
1271    specified features. This method essentially computes locations from
1272    the distance and the feature locations, then passes them to L/GetBlockPieces>.
1273    That method will return the sections of various similarity blocks that
1274    occur inside the the locations computed.
1275    
1276    =over 4
1277    
1278    =item fig
1279    
1280    A FIG-like object that can be used to convert features into locations.
1281    
1282    =item featureIDs
1283    
1284    Reference to a list of feature IDs. Blocks in the vicinity of any of the
1285    features are returned.
1286    
1287    =item distance
1288    
1289    Distance to search from the feature's actual location.
1290    
1291    =item RETURN
1292    
1293    Returns a list of block piece location objects. A block piece location object
1294    is a standard B<BasicLocation> object with a block ID instead of a contig ID.
1295    
1296    =back
1297    
1298    =cut
1299    #: Return Type @;
1300    sub GetFeatureBlockPieces {
1301        # Get the parameters.
1302        my ($self, $fig, $featureIDs, $distance) = @_;
1303        # Declare a hash for keeping the return data. This will weed out
1304        # duplicates, of which we expect plenty. We'll need to handle
1305        # overlaps later, however.
1306        my %retHash = ();
1307        # Loop through the features.
1308        for my $featureID (@{$featureIDs}) {
1309            # Get the feature's genome ID.
1310            my $genomeID = FIG::genome_of($featureID);
1311            # Get the feature's locations.
1312            my @locations = $fig->feature_location($featureID);
1313            # Loop through the locations, sticking the resulting pieces in the return
1314            # hash.
1315            for my $loc (@locations) {
1316                my $locObject = BasicLocation->new($loc);
1317                # Get the length of the contig in question.
1318                my $len = $fig->contig_ln($genomeID, $locObject->Contig);
1319                # Expand the location.
1320                $locObject->Widen($distance, $len);
1321                # Insure the location is Sprout-style;
1322                $locObject->FixContig($genomeID);
1323                # Get the desired block pieces.
1324                my @pieces = $self->GetBlockPieces($locObject);
1325                Trace(scalar(@pieces) . " pieces found for location $loc.") if T(4);
1326                # Put them in the hash.
1327                for my $piece (@pieces) {
1328                    $retHash{$piece} = 1;
1329                }
1330            }
1331        }
1332        # Now we have all the block pieces that occur in any one of our regions. The
1333        # next step is to merge adjacent and overlapping regions. First we convert
1334        # the pieces to location objects so we can sort them.
1335        my @retVal = ();
1336        for my $piece (keys %retHash) {
1337            my $loc = BasicLocation->new($piece);
1338            push @retVal, $loc;
1339        }
1340        Trace("Beginning sort.") if T(3);
1341        @retVal = sort { BasicLocation::Cmp($a,$b) } @retVal;
1342        Trace(scalar(@retVal) . " pieces found before overlap check.") if T(3);
1343        # Now the locations are sorted by block ID, start position, and descending
1344        # length. This means that if there's an overlap, the two overlapping
1345        # pieces will be next to each other.
1346        my $i = 0;
1347        while ($i < $#retVal) {
1348            # Check for overlap between this location and the next.
1349            my ($type, $len) = Overlap::CheckOverlap($retVal[$i], $retVal[$i+1]);
1350            if ($len == 0) {
1351                # Here there is no overlap, so we push ahead.
1352                $i++;
1353            } elsif ($type eq 'embedded') {
1354                # Here the second piece is inside the first, so we delete the
1355                # second.
1356                delete $retVal[$i+1];
1357            } else {
1358                # Here we have a normal overlap. Because all of our pieces
1359                # are forward locations, this means the second piece starts
1360                # before the end of the of the first. We set the end point
1361                # if the first to the end point of the second and delete the
1362                # second.
1363                $retVal[$i]->SetEnd($retVal[$i+1]->EndPoint);
1364                delete $retVal[$i+1];
1365            }
1366        }
1367        # Return the result.
1368        return @retVal;
1369    }
1370    
1371    =head3 WalkDNA
1372    
1373    C<< my $blockPos = SimBlocks::WalkDNA($blockPos, $contigPos, $dna, $loc); >>
1374    
1375    Location the desired position within a block of a specified location in a contig.
1376    
1377    The idea here is that a block contains insertion characters and a contig doesn't.
1378    The incoming DNA sequence is for the block. On entry, we have a current position
1379    in the block and a current position in the contig. We advance the two pointers
1380    in parallel until the contig pointer equals the desired location.
1381    
1382    =over 4
1383    
1384    =item blockPos
1385    
1386    Current position in the block (1-based).
1387    
1388    =item contigPos
1389    
1390    Current position in the contig.
1391    
1392    =item dna
1393    
1394    DNA string for the block with the points of variance filled in with data from
1395    the contig. In other word's the block's version of the current contig region.
1396    
1397    =item loc
1398    
1399    Desired location in the contig.
1400    
1401    =item RETURN
1402    
1403    Returns the position in the block corresponding to the desired position in the
1404    contig.
1405    
1406    =back
1407    
1408    =cut
1409    #: Return Type $;
1410    sub WalkDNA {
1411        # Get the parameters.
1412        my ($blockPos, $contigPos, $dna, $loc) = @_;
1413        # Copy the incoming positions into variables with which we can play. While
1414        # we're at it, we'll convert the block position from 1-based to 0-based.
1415        my ($retVal, $contigPtr) = ($blockPos - 1, $contigPos);
1416        # Loop until we find our destination point.
1417        while ($contigPtr < $loc) {
1418            # Find the next insertion character in the DNA string.
1419            my $nextHyphen = index $dna, '-', $retVal;
1420            # At this point, "nextHyphen" is either -1 (indicating no hyphen
1421            # found) or it is positioned on a hyphen. If it's -1, move it after
1422            # the end of the dna string.
1423            if ($nextHyphen < 0) {
1424                $nextHyphen = length $dna;
1425            }
1426            # Now, if we subtract $retVal from $nextHyphen, we have the largest
1427            # value by which we can advance the contig pointer without getting
1428            # out of sync. The first step is to see if this would move us past
1429            # our desired location.
1430            my $offset = $nextHyphen - $retVal;
1431            my $contigNext = $contigPtr + $offset;
1432            if ($contigNext >= $loc) {
1433                # We now know that everything between the current position and
1434                # the desired position is real nucleotides. We simply push both
1435                # pointers forward by the same amount so that the contig pointer
1436                # becomes $loc.
1437                $retVal += $loc - $contigPtr;
1438                $contigPtr = $loc
1439            } else {
1440                # Here we need to go the whole distance to the next hyphen and
1441                # keep going.
1442                $contigPtr += $offset;
1443                $retVal += $offset;
1444                $retVal++;
1445            }
1446        }
1447        # Return the result. Note we convert back to 1-based.
1448        return $retVal + 1;
1449    }
1450    
1451  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3