[Bio] / Sprout / SimBlocks.pm Repository:
ViewVC logotype

Diff of /Sprout/SimBlocks.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Thu Jun 9 19:06:55 2005 UTC revision 1.3, Fri Jan 6 20:35:01 2006 UTC
# Line 4  Line 4 
4      use ERDB;      use ERDB;
5      @ISA = qw(Exporter ERDB);      @ISA = qw(Exporter ERDB);
6      @EXPORT = qw(DefaultDistances);      @EXPORT = qw(DefaultDistances);
7      @EXPORT_OK = qw(BlocksInSet GetBlock CompareGenomes DBLoad DistanceMatrix GetAlignment GetRegions ParsePattern RemoveBlocks SequenceDistance SetNumber SnipScan);      @EXPORT_OK = qw(BlocksInSet GetBlock GetBlockPieces CompareGenomes DBLoad DistanceMatrix GetAlignment GetRegions ParsePattern RemoveBlocks SequenceDistance SetNumber SnipScan);
8    
9      use strict;      use strict;
10      use Tracer;      use Tracer;
11      use PageBuilder;      use PageBuilder;
12      use Genome;      use Genome;
13        use BasicLocation;
14      use FIG_Config;      use FIG_Config;
15    
16  =head1 Similarity Block Database  =head1 Similarity Block Database
# Line 150  Line 150 
150      $simBlocksData   = "$data/BlockData";   # directory containing SimBlocks data files      $simBlocksData   = "$data/BlockData";   # directory containing SimBlocks data files
151                                              # (used to load the Similarity database)                                              # (used to load the Similarity database)
152      $simBlastData    = "$data/BlastData";   # directory containing SimBlast data files      $simBlastData    = "$data/BlastData";   # directory containing SimBlast data files
153                                              # (input to SimBlast.pl)                                              # (input to SimLoad.pl)
154    
155  Run the C<SimBlast> script to load the database.  Run the C<SimLoad> script to load the database.
156    
157  The C<SimBlockForm.cgi> service enables you to run a similarity analysis between genome  The C<SimBlockForm.cgi> service enables you to run a similarity analysis between genome
158  sets in the database. The C<Html/ERDBTest.html> page enables you to run general queries  sets in the database. The C<Html/ERDBTest.html> page enables you to run general queries
# Line 601  Line 601 
601    
602  C<< my $taggedDNA = SimBlocks::TagDNA($pattern, $dnaString, $prefix, $suffix); >>  C<< my $taggedDNA = SimBlocks::TagDNA($pattern, $dnaString, $prefix, $suffix); >>
603    
604  Convert a DNA string from the B<ContainsRegionIn> relationship to the actual  Convert a DNA string from the B<Region> relation to the actual DNA.
605  DNA, optionally with markings surrounding them. The DNA string will contain  The incoming DNA string will contain only the values corresponding to the
606  only the values corresponding to the question marks in the pattern,  question marks in the pattern. The pattern should be taken from the DNA
607  which should be taken from the DNA string's parent B<GroupBlock>. The  string's parent B<GroupBlock>. The resulting DNA sequence is built by
608  resulting DNA sequence is built by copying  copying the DNA string data into the pattern positions that have question
609    marks. If the string is intended for display in an HTML page, the
610    I<$prefix> and I<$suffix> parameters can be used to surround the
611    question mark positions with HTML markers that specify a different style,
612    weight, or font.
613    
614  =over 4  =over 4
615    
# Line 617  Line 621 
621    
622  String containing DNA string to be marked.  String containing DNA string to be marked.
623    
624  =item prefix  =item prefix (optional)
625    
626  String to be inserted before each position of variance.  String to be inserted before each position of variance.
627    
628  =item suffix  =item suffix (optional)
629    
630  String to be inserted after each position of variance.  String to be inserted after each position of variance.
631    
# Line 1085  Line 1089 
1089      if (!defined $distances) {      if (!defined $distances) {
1090          $distances = DefaultDistances();          $distances = DefaultDistances();
1091      }      }
1092      # Declare the return value. Initially, this will be a sum of squared. We'll      # Declare the return value. Initially, this will be a sum of squares. We'll
1093      # take the square root and divide out the length later.      # take the square root and divide out the length later.
1094      my $retVal = 0;      my $retVal = 0;
1095      # Loop through the sequences, character by character, comparing.      # Loop through the sequences, character by character, comparing.
# Line 1133  Line 1137 
1137      return $retVal;      return $retVal;
1138  }  }
1139    
1140    =head3 GetBlockPieces
1141    
1142    C<< my @blocks = $blockData->GetBlockPieces($location); >>
1143    
1144    Return a map of the block pieces inside the specified location. The return
1145    value will be a list of block locations. A block location is essentially a
1146    location string with a block ID in place of the contig ID.
1147    
1148    This routine depends upon the fact that there is no overlap between blocks.
1149    A given nucleotide on a Contig is in exactly one block. Therefore, to find
1150    all the blocks that overlap a specific section of the Contig, we find the
1151    first block that ends after the start of the section and proceed until we
1152    find the last block that begins before the end of the section. The only
1153    complexity that occurs is if the Contig folds back in upon itself and
1154    appears twice in the same block. In this case, we might get two segments from
1155    the same block, and they may or may not overlap.
1156    
1157    =over 4
1158    
1159    =item location
1160    
1161    A basic location object or location string defining the range whose block pieces
1162    are desired.
1163    
1164    =item RETURN
1165    
1166    Returns a list of block location strings. Each string consists of a block ID
1167    followed by a start location and an end location. The three pieces of the
1168    string are separated by underscores (C<_>).
1169    
1170    =back
1171    
1172    =cut
1173    #: Return Type %;
1174    sub GetBlockPieces {
1175        # Get the parameters.
1176        my ($self, $location) = @_;
1177        # Declare the return variable.
1178        my @retVal = ();
1179        # Parse the incoming location.
1180        my $loc = BasicLocation->new($location);
1181        # Determine the parameters needed to get the desired list of regions.
1182        my ($left, $right, $contigID) = ($loc->Left, $loc->Right, $loc->Contig);
1183        # Ask for the regions in the section we want.
1184        my @regions = $self->GetAll(['Region', 'IncludesRegion', 'GroupBlock'],
1185                                    'Region(end) >= ? AND Region(position) <= ? AND Region(contigID) = ?',
1186                                    [$left, $right, $contigID],
1187                                    ['GroupBlock(id)', 'GroupBlock(pattern)',
1188                                     'GroupBlock(len)',
1189                                     'Region(position)', 'Region(end)',
1190                                     'Region(content)']);
1191        # Loop through the regions found. For each region we will output a location string.
1192        for my $regionData (@regions) {
1193            # Declare the variable to contain the formatted location.
1194            my $blockLoc;
1195            # Separate the fields from the region data.
1196            my ($blockID, $pattern, $len, $position, $end, $content) = @{$regionData};
1197            # Determine whether we're using the full region or only part of it.
1198            if ($position >= $left && $end <= $right) {
1199                # Here we're using the full region, so the location is simple.
1200                $blockLoc = "${blockID}_1_${len}";
1201            } else {
1202                # This is the hard part. We need to convert the contig positions to
1203                # block positions. Since the block contains insertion characters and
1204                # the regions don't, this requires fancy dancing. First, we get the
1205                # region's DNA string.
1206                my $dnaSequence = TagDNA($pattern, $content);
1207                # Let's call the "area of interest" the part of the Contig that
1208                # is in the incoming location AND in the current region. This could
1209                # be at either end of the region or somewhere in the middle. We
1210                # need to walk the block DNA to find what positions in the block
1211                # correspond to the desired positions in the contig. First, we
1212                # put ourselves at the beginning of the block and the region,
1213                # and walk to the start point. Note that our position arithmetic
1214                # is 1-based, in keeping with the conventions of biologists.
1215                my $blockPos = 1;
1216                my $contigPos = $position;
1217                # Check to see if we can use the current position as are starting
1218                # point or we need to find it.
1219                if ($left > $position) {
1220                    # Here we need to find the left edge of the area of interest.
1221                    $blockPos = WalkDNA($blockPos, $contigPos, $dnaSequence, $left);
1222                    # Update the contig position to match.
1223                    $contigPos = $left;
1224                }
1225                # Save the left edge.
1226                my $blockLeft = $blockPos;
1227                # Now find the right edge.
1228                if ($right >= $end) {
1229                    # Here the right edge is the end of the block.
1230                    $blockPos = $len;
1231                } else {
1232                    # Here we have to find the right edge of the area of interest.
1233                    $blockPos = WalkDNA($blockPos, $contigPos, $dnaSequence, $right);
1234                }
1235                my $blockRight = $blockPos;
1236                # Format the location.
1237                $blockLoc = "${blockID}_${blockLeft}_${blockRight}";
1238            }
1239            # Push the block location onto the return list.
1240            push @retVal, $blockLoc;
1241        }
1242        # Return the result.
1243        return @retVal;
1244    }
1245    
1246    =head3 WalkDNA
1247    
1248    C<< my $blockPos = SimBlocks::WalkDNA($blockPos, $contigPos, $dna, $loc); >>
1249    
1250    Location the desired position within a block of a specified location in a contig.
1251    
1252    The idea here is that a block contains insertion characters and a contig doesn't.
1253    The incoming DNA sequence is for the block. On entry, we have a current position
1254    in the block and a current position in the contig. We advance the two pointers
1255    in parallel until the contig pointer equals the desired location.
1256    
1257    =over 4
1258    
1259    =item blockPos
1260    
1261    Current position in the block (1-based).
1262    
1263    =item contigPos
1264    
1265    Current position in the contig.
1266    
1267    =item dna
1268    
1269    DNA string for the block with the points of variance filled in with data from
1270    the contig. In other word's the block's version of the current contig region.
1271    
1272    =item loc
1273    
1274    Desired location in the contig.
1275    
1276    =item RETURN
1277    
1278    Returns the position in the block corresponding to the desired position in the
1279    contig.
1280    
1281    =back
1282    
1283    =cut
1284    #: Return Type $;
1285    sub WalkDNA {
1286        # Get the parameters.
1287        my ($blockPos, $contigPos, $dna, $loc) = @_;
1288        # Copy the incoming positions into variables with which we can play. While
1289        # we're at it, we'll convert the block position from 1-based to 0-based.
1290        my ($retVal, $contigPtr) = ($blockPos - 1, $contigPos);
1291        # Loop until we find our destination point.
1292        while ($contigPtr < $loc) {
1293            # Find the next insertion character in the DNA string.
1294            my $nextHyphen = index $dna, '-', $retVal;
1295            # At this point, "nextHyphen" is either -1 (indicating no hyphen
1296            # found) or it is positioned on a hyphen. If it's -1, move it after
1297            # the end of the dna string.
1298            if ($nextHyphen < 0) {
1299                $nextHyphen = length $dna;
1300            }
1301            # Now, if we subtract $retVal from $nextHyphen, we have the largest
1302            # value by which we can advance the contig pointer without getting
1303            # out of sync. The first step is to see if this would move us past
1304            # our desired location.
1305            my $offset = $nextHyphen - $retVal;
1306            my $contigNext = $contigPtr + $offset;
1307            if ($contigNext >= $loc) {
1308                # We now know that everything between the current position and
1309                # the desired position is real nucleotides. We simply push both
1310                # pointers forward by the same amount so that the contig pointer
1311                # becomes $loc.
1312                $retVal += $loc - $contigPtr;
1313                $contigPtr = $loc
1314            } else {
1315                # Here we need to go the whole distance to the next hyphen and
1316                # keep going.
1317                $contigPtr += $offset;
1318                $retVal += $offset;
1319                $retVal++;
1320            }
1321        }
1322        # Return the result. Note we convert back to 1-based.
1323        return $retVal + 1;
1324    }
1325    
1326  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3