[Bio] / Sprout / SimBlocks.pm Repository:
ViewVC logotype

Annotation of /Sprout/SimBlocks.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 package SimBlocks;
2 :    
3 :     require Exporter;
4 :     use ERDB;
5 :     @ISA = qw(Exporter ERDB);
6 :     @EXPORT = qw();
7 :     @EXPORT_OK = qw();
8 :    
9 :     use strict;
10 :     use Tracer;
11 :     use PageBuilder;
12 :     use Genome;
13 :    
14 :     use FIG_Config;
15 :    
16 :     =head1 Similarity Block Database
17 :    
18 :     =head2 Introduction
19 :    
20 :     A I<similarity Block Database> describes the similarities amongst a group
21 :     of contigs. The load files for the database are produced by the
22 :     C<SimBlast.pl> script from genome FASTA files. The database is described
23 :     by the C<SimBlocksDBD.xml> file.
24 :    
25 :     The primary entities of the database are B<Genome>, which represents a specified
26 :     organism, B<Group>, which represents a set of similar regions, and B<Contig>,
27 :     which represents a contiguous segment of DNA. The key relationship is
28 :     B<ContainsRegionIn>, which maps groups to contigs. The mapping information
29 :     includes the DNA nucleotides as well as the location in the contig.
30 :    
31 :     For a set of genomes, we will define a I<common group> as a group that
32 :     is present in most of the genomes. (The definition of I<most> is
33 :     determined by a parameter.)
34 :    
35 :     The similarity block database must be able to answer two questions, taking
36 :     as input two sets of genomes.
37 :    
38 :     =over 4
39 :    
40 :     =item (1) Which groups are common in both sets and which are common in only one or the other?
41 :    
42 :     =item (2) For each group that is common in both sets, how do the individual regions differ?
43 :    
44 :     =back
45 :    
46 :     This class is a subclass of B<ERDB>, and inherits all of its public methods.
47 :    
48 :     =cut
49 :    
50 :     #: Constructor SimBlocks->new();
51 :    
52 :     =head2 Public Methods
53 :    
54 :     =head3 new
55 :    
56 :     C<< my $simdb = SimBlocks->new($dbname, $dbType, $port); >>
57 :    
58 :     Construct a new SimBlocks object connected to the specified database. If no
59 :     database is specified, the default database indicated by C<FIG_Config::simBlocksDB>
60 :     will be used. Similarly, if the database type is omitted, the type will be
61 :     taken from C<FIG_Config::dbms>, and if the port is omitted, C<FIG_Config::dbport>
62 :     will be used. The user name and password for connecting are taken from
63 :     C<FIG_Config::dbuser> and C<FIG_Config::dbpass>.
64 :    
65 :     =over 4
66 :    
67 :     =item dbname (optional)
68 :    
69 :     Database schema name. This is how we choose the desired database on the server.
70 :     If omitted, the value of C<$FIG_Config::simBlocksDB> is used.
71 :    
72 :     =item dbType
73 :    
74 :     Database type (e.g. C<mysql> for MySQL or C<pg> for Postgres). If omitted, the
75 :     value of C<$FIG_Config::dbms> will be used.
76 :    
77 :     =item port
78 :    
79 :     Port number for connecting to the database server. If omitted, the value of
80 :     C<$FIG_Config::dbport> will be used.
81 :    
82 :     =back
83 :    
84 :     =cut
85 :    
86 :     sub new {
87 :     # Get the parameters.
88 :     my ($class, $dbname, $dbType, $port) = @_;
89 :     # Plug in the default values.
90 :     if (! $dbname) {
91 :     $dbname = $FIG_Config::simBlocksDB;
92 :     }
93 :     if (! $dbType) {
94 :     $dbType = $FIG_Config::dbms;
95 :     }
96 :     if (! $port) {
97 :     $port = $FIG_Config::dbport;
98 :     }
99 :     # Connect to the database.
100 :     my $dbh = DBKernel->new($dbType, $dbname, $FIG_Config::dbuser, $FIG_Config::dbpass, $port);
101 :     # Get the data directory name.
102 :     my $directory = $FIG_Config::simBlocksData;
103 :     # Create and bless the ERDB object.
104 :     my $retVal = ERDB::new($class, $dbh, "$directory/SimBlocksDBD.xml");
105 :     # Return it.
106 :     return $retVal;
107 :     }
108 :    
109 :     =head3 DBLoad
110 :    
111 :     C<< my $stats = $simBlocks->DBLoad($rebuild); >>
112 :    
113 :     Load the database from the default directory. This is essentially identical to
114 :     a B<LoadTables> call with the default directory used instead of a caller-specified
115 :     directory.
116 :    
117 :     =over 4
118 :    
119 :     =item rebuild
120 :    
121 :     TRUE if the tables should be re-created; FALSE if the tables should be cleared and
122 :     reloaded. The default is FALSE; however, TRUE is considerably faster.
123 :    
124 :     =item RETURN
125 :    
126 :     Returns a statistics object describing the duration of the load, the number of
127 :     records loaded, and a list of error messages.
128 :    
129 :     =back
130 :    
131 :     =cut
132 :     #: Return Type $%@;
133 :     sub DBLoad {
134 :     # Get the parameters.
135 :     my ($self, $rebuild) = @_;
136 :     # Load the tables.
137 :     my $retVal = $self->LoadTables($FIG_Config::simBlocksData, $rebuild);
138 :     # Return the result.
139 :     return $retVal;
140 :     }
141 :    
142 :     =head3 CompareGenomes
143 :    
144 :     C<< my (\%set1Blocks, \%set2Blocks, \%bothBlocks) = $simBlocks->CompareGenomes(\@set1, \@set2); >>
145 :    
146 :     Analyze two sets of genomes for commonalities. The group blocks returned will be divided
147 :     into three hashes: one for those found only in set 1, one for those found only in set 2,
148 :     and one for those found in both sets. Each hash is keyed by group ID and will contain
149 :     B<DBObject>s for B<GroupBlock> records with B<HasInstanceOf> data attached, though the
150 :     genome ID in the B<HasInstanceOf> section is not generally predictable.
151 :    
152 :     =over 4
153 :    
154 :     =item set1
155 :    
156 :     Reference to a list of genome IDs for the genomes in the first set.
157 :    
158 :     =item set2
159 :    
160 :     Reference to a list of genome IDs for the genomes in the second set.
161 :    
162 :     =item RETURN
163 :    
164 :     Returns a list of hashes of lists. Each hash is keyed by group ID, and will contain
165 :     B<DBObject>s for records in the B<GroupBlock> table. Groups found only in set
166 :     1 will be in the first hash, groups found only in set 2 will be in the second
167 :     hash, and groups found in both sets will be in the third hash.
168 :    
169 :     =back
170 :    
171 :     =cut
172 :     #: Return Type @%;
173 :     sub CompareGenomes {
174 :     # Get the parameters.
175 :     my ($self, $set1, $set2) = @_;
176 :     # Declare the three hashes.
177 :     my (%set1Blocks, %set2Blocks, %bothBlocks);
178 :     # Our first task is to find the groups in each genome set.
179 :     my %groups1 = $self->BlocksInSet($set1);
180 :     my %groups2 = $self->BlocksInSet($set2);
181 :     # Create a trailer key to help tighten the loops.
182 :     my $trailer = "\xFF";
183 :     # The groups are hashed by group ID. We will move through them in key order
184 :     # to get the "set1", "set2", and "both" groups.
185 :     my @blockIDs1 = (sort keys %groups1, $trailer);
186 :     my @blockIDs2 = (sort keys %groups2, $trailer);
187 :     # Prime the loop by getting the first ID from each list. Thanks to the trailers
188 :     # we know this process can't fail.
189 :     my $blockID1 = shift @blockIDs1;
190 :     my $blockID2 = shift @blockIDs2;
191 :     while ($blockID1 lt $trailer || $blockID2 lt $trailer) {
192 :     # Compare the block IDs.
193 :     if ($blockID1 lt $blockID2) {
194 :     # Block 1 is only in the first set.
195 :     $set1Blocks{$blockID1} = $groups1{$blockID1};
196 :     $blockID1 = shift @blockIDs1;
197 :     } elsif ($blockID1 gt $blockID2) {
198 :     # Block 2 is only in the second set.
199 :     $set2Blocks{$blockID2} = $groups2{$blockID2};
200 :     $blockID2 = shift @blockIDs2;
201 :     } else {
202 :     # We have a block in both sets.
203 :     $bothBlocks{$blockID1} = $groups1{$blockID1};
204 :     $blockID1 = shift @blockIDs1;
205 :     $blockID2 = shift @blockIDs2;
206 :     }
207 :     }
208 :     # Return the result.
209 :     return (\%set1Blocks, \%set2Blocks, \%bothBlocks);
210 :     }
211 :    
212 :     =head3 BlocksInSet
213 :    
214 :     C<< my %blockList = $simBlocks->BlocksInSet($set); >>
215 :    
216 :     Return a list of the group blocks found in a given set of genomes. The list returned
217 :     will be a hash of B<DBObject>s, each corresponding to a single B<GroupBlock> record,
218 :     with a B<HasInstanceOf> record attached, though the content of the B<HasInstanceOf>
219 :     record is not predictable. The hash will be keyed by group ID.
220 :    
221 :     =over 4
222 :    
223 :     =item set
224 :    
225 :     Reference to a list of genome IDs. All blocks appearing in any one of the genome
226 :     IDs will be in the list returned.
227 :    
228 :     =item RETURN
229 :    
230 :     Returns a hash of B<DBObject>s corresponding to the group blocks found in the
231 :     genomes of the set.
232 :    
233 :     =back
234 :    
235 :     =cut
236 :     #: Return Type %%;
237 :     sub BlocksInSet {
238 :     # Get the parameters.
239 :     my ($self, $set) = @_;
240 :     # Create a hash to hold the groups found. The hash will be keyed by group ID
241 :     # and contain the relevant DB object.
242 :     my %retVal = ();
243 :     # Loop through the genome IDs in the specified set.
244 :     for my $genomeID (@{$set}) {
245 :     # Get a list of group blocks for this genome.
246 :     my @blocks = $self->GetList(['HasInstanceOf', 'GroupBlock'],
247 :     "HasInstanceOf(from-link) = ?", $genomeID);
248 :     # Loop through the blocks, storing any new ones in the hash.
249 :     for my $block (@blocks) {
250 :     # Get the ID of this block.
251 :     my ($blockID) = $block->Value('GroupBlock(id)');
252 :     # Store it in the hash. If it's a duplicate, it will erase the
253 :     # old one.
254 :     $retVal{$blockID} = $block;
255 :     }
256 :     }
257 :     # Return the result.
258 :     return %retVal;
259 :     }
260 :    
261 :     =head3 GetRegions
262 :    
263 :     C<< my %regions = $simBlocks->GetRegions($blockID, \@genomes); >>
264 :    
265 :     Return the regions of the specified block that occur in the contigs of
266 :     the specified genomes. The return value is a hash of DNA strings keyed
267 :     on the region's location descriptor.
268 :    
269 :     =over 4
270 :    
271 :     =item blockID
272 :    
273 :     ID of the group block whose regions are to be returned.
274 :    
275 :     =item genomes
276 :    
277 :     Reference to a list of genome IDs. Only regions in one of the listed
278 :     genomes will be returned.
279 :    
280 :     =item RETURN
281 :    
282 :     Returns a hash keyed by region location. The objects in the hash are
283 :     the DNA strings for the specified region.
284 :    
285 :     =back
286 :    
287 :     =cut
288 :     #: Return Type %$;
289 :     sub GetRegions {
290 :     # Get the parameters.
291 :     my ($self, $blockID, $genomes) = @_;
292 :     # Create the return hash.
293 :     my %retVal = ();
294 :     # Query all the regions for the specified block.
295 :     my $query = $self->Get(['ContainsRegionIn'], "ContainsRegionIn(from-link) = ?", $blockID);
296 :     # Loop through the query.
297 :     while (my $region = $query->Fetch) {
298 :     # Get this region's data.
299 :     my ($contigID, $start, $dir, $len, $dna) =
300 :     $region->Values(['ContainsRegionIn(to-link)', 'ContainsRegionIn(position)',
301 :     'ContainsRegionIn(direction)', 'ContainsRegionIn(len)',
302 :     'ContainsRegionIn(content)']);
303 :     # Insure it belongs to one of our genomes.
304 :     my $genomeID = Genome::FromContig($contigID);
305 :     my $found = grep($_ eq $genomeID, @{$genomes});
306 :     # If it does, put it in the hash.
307 :     if ($found) {
308 :     my $location = "${contigID}_$start$dir$len";
309 :     $retVal{$location} = $dna;
310 :     }
311 :     }
312 :     # Return the result.
313 :     return %retVal;
314 :     }
315 :    
316 :     =head3 TagDNA
317 :    
318 :     C<< my $taggedDNA = SimBlocks::TagDNA($pattern, $dnaString, $prefix, $suffix); >>
319 :    
320 :     Convert a DNA string from the B<ContainsRegionIn> relationship to the actual
321 :     DNA, optionally with markings surrounding them. The DNA string will contain
322 :     only the values corresponding to the question marks in the pattern,
323 :     which should be taken from the DNA string's parent B<GroupBlock>. The
324 :     resulting DNA sequence is built by copying
325 :    
326 :     =over 4
327 :    
328 :     =item pattern
329 :    
330 :     DNA pattern from which the positions of variance are to be taken.
331 :    
332 :     =item dnaString
333 :    
334 :     String containing DNA string to be marked.
335 :    
336 :     =item prefix
337 :    
338 :     String to be inserted before each position of variance.
339 :    
340 :     =item suffix
341 :    
342 :     String to be inserted after each position of variance.
343 :    
344 :     =item RETURN
345 :    
346 :     Returns a copy of the pattern string with the positions of variance filled
347 :     in from the DNA string. If a prefix and suffix are specified, they are
348 :     placed before and after each character copied into the pattern string.
349 :    
350 :     =back
351 :    
352 :     =cut
353 :     #: Return Type $;
354 :     sub TagDNA {
355 :     # Get the parameters.
356 :     my ($pattern, $dnaString, $prefix, $suffix) = @_;
357 :     # Insure the prefix and suffix behave properly.
358 :     if (!defined $prefix) {
359 :     $prefix = "";
360 :     }
361 :     if (!defined $suffix) {
362 :     $suffix = "";
363 :     }
364 :     # Initialize the return variable.
365 :     my $retVal = "";
366 :     # Convert the DNA string to a list with the prefixes and suffixes
367 :     # inserted.
368 :     my @dnaList = map { $prefix . $_ . $suffix } split //, $dnaString;
369 :     # Loop through the DNA string.
370 :     my ($start, $position) = (0, 0);
371 :     while (($position = index $pattern, "?", $start) >= 0) {
372 :     # If we have invariant text, copy it over.
373 :     if ($position > $start) {
374 :     $retVal .= substr $pattern, $start, $position - $start;
375 :     }
376 :     # Pop off the next position from the DNA string.
377 :     $retVal .= shift @dnaList;
378 :     # Start past the question mark.
379 :     $start = $position + 1;
380 :     }
381 :     # Return the result.
382 :     return $retVal;
383 :     }
384 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3