[Bio] / FigKernelPackages / AlignsAndTrees.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/AlignsAndTrees.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (view) (download) (as text)

1 : fangfang 1.1 #
2 : golsen 1.11 # Copyright (c) 2003-2011 University of Chicago and Fellowship
3 : fangfang 1.1 # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 :     package AlignsAndTrees;
19 :    
20 :     #===============================================================================
21 : golsen 1.2 # perl functions for loading and accessing Alignments and Trees based on md5
22 : fangfang 1.1 #
23 :     # Usage: use AlignsAndTrees;
24 :     #
25 : golsen 1.2 # @alignIDs = all_alignIDs();
26 :     # \@alignIDs = all_alignIDs();
27 :     #
28 :     # @alignIDs = aligns_with_md5ID( $md5 );
29 :     # \@alignIDs = aligns_with_md5ID( $md5 );
30 :     #
31 : fangfang 1.13 # \%md5_alignIDs = md5IDs_to_aligns( $sap, @md5IDs );
32 :     #
33 : fangfang 1.17 # @md5IDs = md5IDs_in_align( $sap, $alignID );
34 :     # \@md5IDs = md5IDs_in_align( $sap, $alignID );
35 : golsen 1.2 #
36 : fangfang 1.17 # \@seqs = md5_alignment_by_ID( $sap, $alignID );
37 :     # ( \@seqs, \%md5_row_metadata ) = md5_alignment_by_ID( $sap, $alignID );
38 :     # \%md5_row_metadata = md5_alignment_metadata( $sap, $alignID );
39 : golsen 1.3 #
40 : fangfang 1.17 # $md5_row_metadata{ $seqID } = [ $md5, $peg_length, $trim_beg, $trim_end, $location_string ]
41 :     #
42 :     # \%md5_row_metadata = alignment_metadata_by_md5( $sap, $alignID, @md5IDs );
43 :     #
44 :     # $md5_row_metadata{ $seqID } = [ $md5ID, $peg_length, $trim_beg, $trim_end, $location_string ]
45 :     #
46 :     # @metadata = alignments_metadata_by_md5( $sap, @md5IDs );
47 :     # \@metadata = alignments_metadata_by_md5( $sap, @md5IDs );
48 :     #
49 :     # @metadata = ( [ $alignID, $seqID, $md5, $peg_length, $trim_beg, $trim_end, $location_string ], ... )
50 : golsen 1.2 #
51 :     # @treeIDs = all_treeIDs( );
52 :     # \@treeIDs = all_treeIDs( );
53 :     #
54 : golsen 1.3 # @treeIDs = trees_with_md5ID( $md5 );
55 :     # \@treeIDs = trees_with_md5ID( $md5 );
56 : golsen 1.2 #
57 : fangfang 1.13 # \%md5_treeIDs = md5IDs_to_trees( $sap, @md5IDs );
58 :     #
59 : golsen 1.2 # @md5IDs = md5IDs_in_tree( $treeID );
60 :     # \@md5IDs = md5IDs_in_tree( $treeID );
61 :     #
62 : fangfang 1.9 # $tree = md5_tree_by_ID( $treeID );
63 :     # ( $tree, \%metadata) = md5_tree_by_ID( $treeID );
64 :     # \%metadata = md5_tree_metadata( $treeID );
65 :     #
66 :     # @fids = md5_to_pegs($sap, $md5);
67 :     # \@fids = md5_to_pegs($sap, $md5);
68 :     #
69 :     # $md5 = peg_to_md5($sap, $peg);
70 :     #
71 :     # ( \@md5_align, \%md5_metadata ) = fid_align_to_md5_align( $sap, \@fid_align, \%fid_metadata, $relaxed );
72 :     # ( \@fid_align, \%fid_metadata ) = md5_align_to_fid_align( $sap, \@md5_align, \%md5_metadata, $relaxed );
73 :     #
74 :     # $md5_tree = fid_tree_to_md5_tree( $sap, $fid_tree, $relaxed );
75 :     # $fid_tree = md5_tree_to_fid_tree( $sap, $md5_tree, $relaxed );
76 : golsen 1.2 #
77 : fangfang 1.10 # ( \@fid_align, $fid_tree, \%fid_metadata) = md5_align_and_tree_to_fid_version( \@md5_align, $md5_tree, \%md5_metadata)
78 :     # ( \@md5_align, $md5_tree, \%md5_metadata) = fid_align_and_tree_to_md5_version( \@fid_align, $fid_tree, \%fid_metadata)
79 :     #
80 :     # $alignID = load_md5_alignment_and_tree( $sap, $md5_align, $md5_tree, \@seq_metadata, \@alignment_tree_metadata );
81 :     # delete_md5_alignment_and_tree( $sap, $alignID );
82 :     #
83 :     # @metadata = alignment_tree_metadata( $sap, $alignID );
84 :     # \@metadata = alignment_tree_metadata( $sap, $alignID );
85 :     #
86 :     # @metadata = ( [ alignment-method, alignment-parameters, alignment-properties,
87 :     # tree-method, tree-parameters, tree-properties ], ... )
88 :     #
89 : golsen 1.11 #-------------------------------------------------------------------------------
90 :     # Some internal subroutines:
91 :     #
92 :     # Alignments and trees are in files in a data directory. This locates the
93 :     # directory so that they can be read and written.
94 :     #
95 :     # $data_dir = locate_data_dir();
96 :     #
97 :     # Some basic sanity checks on the ids in an alignment, tree and associated
98 :     # per sequence metadata.
99 :     #
100 :     # $okFlag = validate_alignment_and_tree_ids( $alignment, $tree, $sequenceMetadata);
101 :     #
102 :     # In deciding if two sequences with the same MD5 are really the saome thing,
103 :     # we ask if they overlap by >80% of the length of the shorter sequence.
104 :     #
105 :     # $boolean = overlaps($beg1, $end1, $beg2, $end2);
106 :     #
107 :     # An MD5 tree tip can be expanded to a large number of tree tips. This
108 :     # routine uses the new_names hash to relabel each tree tip to a list of
109 :     # tips, creating a multifurcation of zero-length branches when expanding
110 :     # one to several.
111 :     #
112 :     # $node = expand_duplicate_tips( $node, \%new_names )
113 :     #
114 : fangfang 1.1 #===============================================================================
115 : golsen 1.3
116 :     use gjoseqlib qw( read_fasta );
117 :     use gjonewicklib qw( read_newick_tree );
118 : golsen 1.12 # use FIG_Config;
119 : parrello 1.5 use Tracer;
120 : fangfang 1.14 use Storable qw( nfreeze thaw );
121 :     use SeedUtils;
122 : parrello 1.5 use strict;
123 : golsen 1.3
124 : parrello 1.5 my $data_dir = undef;
125 :     sub locate_data_dir
126 :     {
127 :     if ( ! $data_dir )
128 :     {
129 :     if ( $ENV{ ATNG } && -d $ENV{ ATNG } ) {
130 :     $data_dir = $ENV{ ATNG };
131 :     } else {
132 : golsen 1.12 require FIG_Config; # Only necessary for a few functions
133 : parrello 1.5 $data_dir = "$FIG_Config::fig/ATNG";
134 :     }
135 :    
136 :     if ( ! $data_dir || ! -d $data_dir ) {
137 :     die "Could not locate directory of alignments and trees.\n";
138 :     }
139 :     }
140 :    
141 :     $data_dir;
142 :     }
143 : golsen 1.3
144 : parrello 1.5 #-------------------------------------------------------------------------------
145 :     #
146 :     # Load an MD5 alignment and tree.
147 :     #
148 :     # $alignID = load_md5_alignment_and_tree($sap, $alignStruct, $treeStruct,
149 :     # $sequenceMetadata, $alignmentMetadata);
150 :     #
151 :     # where $sap is the Sapling database object.
152 :     # $alignStruct describes the alignment
153 :     # $treeStruct describes the tree
154 :     # $sequenceMetadata describes the relationship of each protein sequence to the alignment
155 :     # $alignmentMetadata contains information about how the alignment and tree were computed
156 :     #
157 :     # $treeStruct is a gjonewick tree structure
158 :     # $alignStruct is a reference to an array of protein FASTA triples (id, comment, sequence)
159 :     # $sequenceMetadata is a reference to an array of tuples containing the description of
160 :     # how each protein sequence is used by the alignment (sequence-id, md5, len, begin, end, locations)
161 :     # $alignmentMetadata is reference to a list of strings (alignment-method,
162 :     # alignment-parameters, alignment-properties, tree-method,
163 :     # tree-parameters, tree-properties)
164 :     #
165 :     #-------------------------------------------------------------------------------
166 :     sub load_md5_alignment_and_tree {
167 :     # Get the parameters.
168 :     my ($sap, $alignStruct, $treeStruct, $sequenceMetadata, $alignmentMetadata) = @_;
169 :     # Insure the alignment is valid.
170 :     if (! validate_alignment_and_tree_ids($alignStruct, $treeStruct, $sequenceMetadata)) {
171 :     die "Invalid alignment data.\n";
172 :     }
173 :     # Insure we have a place to store the alignment flat files.
174 :     my $dataDir = locate_data_dir();
175 :     if (! $dataDir) {
176 :     die "No alignment storage directory.\n";
177 :     }
178 :     # First, we compute the ID by looking in the database to see the current largest
179 :     # ID number.
180 :     my ($newID) = $sap->GetFlat('AlignmentTree', "ORDER BY AlignmentTree(id) DESC LIMIT 1",
181 :     [], "id");
182 :     if (! $newID) {
183 :     $newID = "00000001";
184 :     } else {
185 :     $newID++;
186 :     }
187 :     # Insert the alignment-tree root record.
188 : golsen 1.15 $sap->InsertObject('AlignmentTree',
189 :     id => $newID,
190 :     alignment_method => $alignmentMetadata->[0],
191 : parrello 1.5 alignment_parameters => $alignmentMetadata->[1],
192 :     alignment_properties => $alignmentMetadata->[2],
193 : golsen 1.15 tree_method => $alignmentMetadata->[3],
194 :     tree_parameters => $alignmentMetadata->[4],
195 :     tree_properties => $alignmentMetadata->[5]);
196 : parrello 1.5 # Now we store the alignment file in the data directory.
197 :     my $alignFile = "$dataDir/ali$newID.fa";
198 :     gjoseqlib::print_alignment_as_fasta($alignFile, $alignStruct);
199 :     # Next we store the tree file in the same directory.
200 :     my $treeFile = "$dataDir/tree$newID.nwk";
201 :     gjonewicklib::writeNewickTree($treeStruct, $treeFile);
202 : fangfang 1.14 # Then we store a frozen version of the tree in a subdirectory.
203 :     my $frozenFile = "$dataDir/FROZEN/tree$newID.frozen";
204 :     SeedUtils::verify_dir("$dataDir/FROZEN");
205 :     open(F, ">$frozenFile") or die "Could not open $frozenFile";
206 :     print F nfreeze($treeStruct);
207 :     close(F);
208 : parrello 1.5 # Finally, we create the realtionships that connect the protein sequences to
209 :     # the alignment in the database.
210 :     for my $seqTuple (@$sequenceMetadata) {
211 :     # Get the components of this sequence's tuple.
212 :     my ($seqID, $md5, $len, $begin, $end, $locations) = @$seqTuple;
213 :     # Insert a relationship record for this sequence/alignment pair.
214 :     $sap->InsertObject('Aligns', from_link => $newID, sequence_id => $seqID,
215 :     to_link => $md5, len => $len, begin => $begin,
216 :     end => $end, properties => $locations);
217 :     }
218 :     # Return the alignment/tree ID.
219 :     return $newID;
220 :     }
221 :    
222 :     #-------------------------------------------------------------------------------
223 :     #
224 :     # Delete an md5 alignment and tree.
225 :     #
226 :     # delete_md5_alignment_and_tree($sap, $alignID);
227 :     #
228 :     # where $sap is the Sapling database object
229 :     # $alignID is the ID of the alignment/tree to delete
230 :     #
231 :     #-------------------------------------------------------------------------------
232 :     sub delete_md5_alignment_and_tree {
233 :     # Get the parameters.
234 :     my ($sap, $alignID) = @_;
235 :     # Get the data directory.
236 :     my $dataDir = locate_data_dir();
237 :     if (! $dataDir) {
238 :     die "No alignment/tree data directory present.\n";
239 :     }
240 :     # Delete the alignment file if it exists.
241 :     my $alignFile = "$dataDir/ali$alignID.fa";
242 :     if (-f $alignFile) {
243 :     unlink $alignFile;
244 :     }
245 :     # Delete the tree file if it exists.
246 :     my $treeFile = "$dataDir/tree$alignID.nwk";
247 :     if (-f $treeFile) {
248 :     unlink $treeFile;
249 :     }
250 : fangfang 1.14 # Delete the frozen tree file if it exists.
251 :     my $frozenFile = "$dataDir/FROZEN/tree$alignID.frozen";
252 :     if (-f $frozenFile) {
253 :     unlink $frozenFile;
254 :     }
255 : parrello 1.5 # Delete the alignment from the database.
256 :     $sap->Delete(AlignmentTree => $alignID);
257 :     }
258 :    
259 :     #-------------------------------------------------------------------------------
260 :     #
261 :     # Verify that the sequence IDs are consistent in the load data for an md5 alignment
262 :     # and tree.
263 :     #
264 : parrello 1.6 # $okFlag = validate_alignment_and_tree_ids($alignStruct, $treeStruct,
265 : parrello 1.5 # $sequenceMetadata);
266 :     #
267 : parrello 1.7 # where $alignStruct describes the alignment (if undef, will not be checked)
268 :     # $treeStruct describes the tree (if undef, will not be checked)
269 : parrello 1.5 # $sequenceMetadata describes the relationship of each protein sequence to the alignment
270 :     #
271 :     # $treeStruct is a gjonewick tree structure
272 : parrello 1.7 # $alignStruct is a reference to an array of protein FASTA triples (id, comment, sequence)
273 : parrello 1.5 # $sequenceMetadata is a reference to an array of tuples containing the description of
274 :     # how each protein sequence is used by the alignment (sequence-id, md5, len, begin, end, locations)
275 : parrello 1.7 # or a reference to a hash mapping each sequence ID to the description
276 :     # (sequence-id => [md5, len, begin, end, locations])
277 : parrello 1.5 #-------------------------------------------------------------------------------
278 :     sub validate_alignment_and_tree_ids {
279 : parrello 1.6 # Get the parameters.
280 :     my ($alignStruct, $treeStruct, $sequenceMetadata) = @_;
281 :     # Assume we're valid unless we determine otherwise.
282 :     my $retVal = 1;
283 :     # Get a hash of the sequence IDs from the sequence metadata. Each of the other
284 :     # objects must have exactly this set of sequence IDs.
285 : parrello 1.7 my $sequences;
286 :     if (! $sequenceMetadata) {
287 :     warn "No sequence metadata passed into alignment validation.\n";
288 :     return 0;
289 :     } elsif (ref $sequenceMetadata eq 'HASH') {
290 :     # Hash passed in. Just copy it.
291 :     $sequences = $sequenceMetadata;
292 :     } elsif (ref $sequenceMetadata eq 'ARRAY') {
293 :     # Array passed in. Create a hash from the sequence elements.
294 :     $sequences = { map { $_->[0] => 1 } @$sequenceMetadata };
295 :     # Verify that there are no duplicates in the sequence metadata.
296 :     if (scalar(keys %$sequences) != scalar @$sequenceMetadata) {
297 :     warn "Duplicate sequence ID found in sequence metadata for alignment.\n";
298 :     return 0;
299 :     }
300 : parrello 1.6 } else {
301 : parrello 1.7 warn "Unrecognized sequence data structure passed into alignment validator.\n";
302 :     return 0;
303 :     }
304 :     my $sequenceCount = scalar keys %$sequences;
305 :     # The sequence metadata appears valid.
306 :     if ($alignStruct) {
307 :     # We have an alignment. Now loop through the alignment structure
308 : parrello 1.6 # to insure its sequence IDs match. We'll use this hash to make sure there are
309 :     # no duplicates.
310 :     my %alignSeen;
311 : parrello 1.7 # Verify that the alignment is a valid array.
312 :     if (ref $alignStruct ne 'ARRAY') {
313 :     $retVal = 0;
314 :     warn "Alignment is not an array.\n";
315 :     } else {
316 :     for my $alignRow (@$alignStruct) {
317 :     my $sequence = $alignRow->[0];
318 :     # Insure this sequence exists.
319 :     if (! $sequences->{$sequence}) {
320 :     # Here it was not in the metadata.
321 :     $retVal = 0;
322 :     warn "Invalid sequence ID $sequence found in alignment FASTA.\n";
323 :     } elsif ($alignSeen{$sequence}) {
324 :     # Here we've seen it twice.
325 :     $retVal = 0;
326 :     warn "Duplicate sequence ID $sequence found in alignment FASTA.\n";
327 :     } else {
328 :     # Here the sequence is valid.
329 :     $alignSeen{$sequence} = 1;
330 :     }
331 :     }
332 :     # Insure everything was found.
333 :     if (scalar(keys %alignSeen) != $sequenceCount) {
334 : parrello 1.6 $retVal = 0;
335 : parrello 1.7 warn "Some sequence IDs were missing from the alignment FASTA.\n";
336 : parrello 1.6 }
337 :     }
338 : parrello 1.7 }
339 :     # Only check the tree structure if it exists.
340 :     if ($treeStruct) {
341 : parrello 1.6 # Now we validate the tree. We'll use this hash to make sure there are
342 :     # no duplicates in the tree.
343 :     my %treeSeen;
344 : parrello 1.7 # Verify that the tree is a valid newick tree.
345 :     if (ref $treeStruct ne 'ARRAY') {
346 :     $retVal = 0;
347 :     warn "Alignment tree is not an array.\n";
348 :     } else {
349 :     # Get the list of sequences in the tree.
350 :     for my $sequence (gjonewicklib::newick_tip_list($treeStruct)) {
351 :     # Insure this sequence exists.
352 :     if (! $sequences->{$sequence}) {
353 :     # Here it was not in the metadata.
354 :     $retVal = 0;
355 :     warn "Invalid sequence ID $sequence found in alignment tree.\n";
356 :     } elsif ($treeSeen{$sequence}) {
357 :     # Here we've seen it twice.
358 :     $retVal = 0;
359 :     warn "Duplicate sequence ID $sequence found in alignment tree.\n";
360 :     } else {
361 :     # Here the sequence is valid.
362 :     $treeSeen{$sequence} = 1;
363 :     }
364 :     }
365 :     # Insure everything was found.
366 :     if (scalar(keys %treeSeen) != $sequenceCount) {
367 : parrello 1.6 $retVal = 0;
368 : parrello 1.7 warn "Some sequence IDs were missing from the alignment tree.\n";
369 : parrello 1.6 }
370 :     }
371 :     }
372 :     # Return the determination indicator.
373 :     return $retVal;
374 : parrello 1.5 }
375 : golsen 1.3
376 :     #-------------------------------------------------------------------------------
377 : golsen 1.2 #
378 : parrello 1.5 # @alignIDs = all_alignIDs($sap);
379 :     # \@alignIDs = all_alignIDs($sap);
380 :     #
381 : golsen 1.2 #
382 : parrello 1.5 # where $sap is the Sapling database object.
383 : golsen 1.2 #-------------------------------------------------------------------------------
384 : fangfang 1.1 sub all_alignIDs
385 :     {
386 : golsen 1.12 my ( $sap ) = @_;
387 :     my @ids = $sap ? $sap->GetFlat('AlignmentTree', "", [], "id") : ();
388 : golsen 1.2 wantarray ? @ids : \@ids;
389 : fangfang 1.1 }
390 :    
391 : golsen 1.2 #-------------------------------------------------------------------------------
392 :     #
393 : parrello 1.5 # @alignIDs = aligns_with_md5ID( $sap, $md5 );
394 :     # \@alignIDs = aligns_with_md5ID( $sap, $md5 );
395 : golsen 1.2 #
396 : parrello 1.5 # where $sap is the Sapling database object.
397 :     # $md5 is an MD5 protein sequence ID whose alignments are desired
398 : golsen 1.2 #-------------------------------------------------------------------------------
399 : fangfang 1.1 sub aligns_with_md5ID
400 :     {
401 : parrello 1.5 my ( $sap, $md5 ) = @_;
402 :     my @ids;
403 : golsen 1.12 if ( $sap && $md5 ) {
404 : parrello 1.5 @ids = $sap->GetFlat('Aligns', 'Aligns(to-link) = ?', [$md5],
405 :     'from-link');
406 :     }
407 : golsen 1.2 wantarray ? @ids : \@ids;
408 : fangfang 1.1 }
409 :    
410 : golsen 1.2 #-------------------------------------------------------------------------------
411 :     #
412 : fangfang 1.13 # \%md5_alignIDs = md5IDs_to_aligns( $sap, @md5IDs );
413 :     #
414 :     # $md5_alignIDs{ $md5ID } = \@alignIDs_for_md5ID
415 :     #
416 :     # where $sap is the Sapling database object.
417 :     # @md5IDs is an list of MD5 protein sequence IDs whose alignments are desired
418 :     #-------------------------------------------------------------------------------
419 :     sub md5IDs_to_aligns
420 :     {
421 :     my ( $sap, @md5IDs ) = @_;
422 :     $sap && @md5IDs or return {};
423 :    
424 :     # Declare the return hash.
425 :     my $retVal = {};
426 :     # Loop through the incoming model IDs.
427 :     for my $md5 (@md5IDs) {
428 :     # Get the list of reactions for this model.
429 :     my @ids = $sap->GetFlat('Aligns', "Aligns(to-link) = ?", [$md5], 'from-link');
430 :     # Store them in the return hash.
431 :     $retVal->{$md5} = \@ids;
432 :     }
433 :    
434 :     # Return the results.
435 :     return $retVal;
436 :     }
437 :    
438 :     #-------------------------------------------------------------------------------
439 :     #
440 : parrello 1.5 # @metadata = alignment_tree_metadata($sap, $alignID);
441 : golsen 1.12 # \@metadata = alignment_tree_metadata($sap, $alignID);
442 : golsen 1.2 #
443 : parrello 1.5 # where $sap is the Sapling database object.
444 :     # $alignID is an alignment whose metadata is desired.
445 :     # returns (alignment-method, alignment-parameters, alignment-properties,
446 :     # tree-method, tree-parameters, tree-properties)
447 :     #
448 :     #-------------------------------------------------------------------------------
449 : golsen 1.12 sub alignment_tree_metadata
450 :     {
451 : parrello 1.5 # Get the parameters.
452 :     my ($sap, $alignID) = @_;
453 :     my $fields = [];
454 :     if ($alignID) {
455 :     # Retrieve the metadata from the main alignment/tree record.
456 : golsen 1.15 ($fields) = $sap->GetAll("AlignmentTree",
457 :     "AlignmentTree(id) = ?",
458 :     [$alignID],
459 : golsen 1.16 [qw(alignment-method
460 :     alignment-parameters
461 :     alignment-properties
462 :     tree-method
463 :     tree-parameters
464 :     tree-properties)]);
465 : parrello 1.5 }
466 :     # Return the result in the user-specified manner.
467 :     wantarray ? @$fields : $fields;
468 :     }
469 :    
470 :     #-------------------------------------------------------------------------------
471 :     #
472 :     # @md5IDs = md5IDs_in_align( $sap, $alignID );
473 :     # \@md5IDs = md5IDs_in_align( $sap, $alignID );
474 :     #
475 :     # where $sap is the Sapling database object.
476 :     # $alignID is an alignment whose protein list is desired.
477 : golsen 1.2 #-------------------------------------------------------------------------------
478 : fangfang 1.1 sub md5IDs_in_align
479 :     {
480 : parrello 1.5 my ( $sap, $alignID ) = @_;
481 : golsen 1.12 $sap && $alignID or return wantarray ? () : [];
482 : parrello 1.5 my %seen;
483 : golsen 1.12 my @md5IDs = grep { ! $seen{$_}++ }
484 :     $sap->GetFlat('Aligns', 'Aligns(from-link) = ?', [$alignID], 'to-link');
485 : golsen 1.2 wantarray ? @md5IDs : \@md5IDs;
486 : fangfang 1.1 }
487 :    
488 : golsen 1.2 #-------------------------------------------------------------------------------
489 :     #
490 : parrello 1.5 # \@seqs = md5_alignment_by_ID( $sap, $alignID );
491 :     # ( \@seqs, \%metadata ) = md5_alignment_by_ID( $sap, $alignID );
492 :     # \%metadata = md5_alignment_metadata( $sap, $alignID );
493 : golsen 1.3 #
494 : parrello 1.5 # $metadata{ $seqID } = [ $md5, $peg_length, $trim_beg, $trim_end, $location_string ]
495 : golsen 1.2 #
496 : parrello 1.5 # where $sap is the Sapling database object.
497 :     # $alignID is an alignment whose MD5 relationship data is desired.
498 : golsen 1.2 #-------------------------------------------------------------------------------
499 : golsen 1.3 sub md5_alignment_by_ID
500 : fangfang 1.1 {
501 : parrello 1.5 my ( $sap, $alignID ) = @_;
502 : golsen 1.12 $sap && $alignID or return ();
503 : parrello 1.5 my @align;
504 : golsen 1.12 if ( $data_dir ||= locate_data_dir() )
505 : parrello 1.5 {
506 :     my $file = "$data_dir/ali$alignID.fa";
507 :     @align = map { $_->[1] = ''; $_ } gjoseqlib::read_fasta( $file ) if -f $file;
508 :     }
509 :    
510 :     wantarray ? ( \@align, md5_alignment_metadata( $sap, $alignID ) ) : \@align;
511 : fangfang 1.1 }
512 :    
513 : fangfang 1.17 #-------------------------------------------------------------------------------
514 :     #
515 :     # \%md5_row_metadata = md5_alignment_metadata( $sap, $alignID );
516 :     #
517 :     # $md5_row_metadata{ $seqID } = [ $md5ID, $peg_length, $trim_beg, $trim_end, $location_string ]
518 :     #
519 :     # where $TreeServerO is an alignment and tree server object.
520 :     # $alignID is an alignment whose MD5 relationship data is desired.
521 :     #
522 :     # $metadata{ $sedID } = [ $md5, $peg_length, $trim_beg, $trim_end, $location_string ]
523 :     #-------------------------------------------------------------------------------
524 : golsen 1.3 sub md5_alignment_metadata
525 :     {
526 : parrello 1.5 my ( $sap, $alignID ) = @_;
527 :     my %metadata;
528 : golsen 1.12 if ( $sap && $alignID )
529 : parrello 1.5 {
530 :     %metadata = map { my ($md5, @data) = @$_; ( $md5 => \@data ) }
531 :     $sap->GetAll('Aligns', 'Aligns(from-link) = ?', [$alignID],
532 :     [qw(sequence-id to-link len begin end properties)]);
533 :     }
534 : golsen 1.3 \%metadata;
535 :     }
536 : fangfang 1.1
537 : fangfang 1.17 #-------------------------------------------------------------------------------
538 :     #
539 :     # %metadata = alignment_metadata_by_md5( $sap, $alignID, @md5IDs );
540 :     # \%metadata = alignment_metadata_by_md5( $sap, $alignID, @md5IDs );
541 :     #
542 :     # where $sap is the Sapling database object.
543 :     # $alignID is an alignment whose MD5 relationship data is desired.
544 :     # \@md5IDs is a list of the md5IDs for which the data are desired.
545 :     #
546 :     # $metadata{ $seqID } = [ $md5, $peg_length, $trim_beg, $trim_end, $location_string ]
547 :     #-------------------------------------------------------------------------------
548 :     sub alignment_metadata_by_md5
549 :     {
550 :     my ( $sap, $alignID, @md5IDs ) = @_;
551 :     my %metadata;
552 :    
553 :     if ( $sap && $alignID && @md5IDs )
554 :     {
555 :     %metadata = map { my ( $seqID, @data ) = @$_; $seqID => \@data }
556 :     $sap->GetAll('Aligns',
557 :     'Aligns(from-link) = ? AND Aligns(to-link) IN ( ' . join( ', ', qw(?) x @md5IDs ) . ' )',
558 :     [ $alignID, @md5IDs ],
559 :     [qw( sequence-id to-link len begin end properties )]);
560 :     }
561 :     \%metadata;
562 :     }
563 :    
564 :     #-------------------------------------------------------------------------------
565 :     #
566 :     # @metadata = alignments_metadata_by_md5( $sap, @md5IDs );
567 :     # \@metadata = alignments_metadata_by_md5( $sap, @md5IDs );
568 :     #
569 :     # where $sap is the Sapling database object.
570 :     # \@md5IDs is a list of the md5IDs for which the data are desired.
571 :     #
572 :     # @metadata = ( [ $alignID, $seqID, $md5, $peg_length, $trim_beg, $trim_end, $location_string ], ... )
573 :     #-------------------------------------------------------------------------------
574 :     sub alignments_metadata_by_md5
575 :     {
576 :     my ( $sap, @md5IDs ) = @_;
577 :     my @metadata = ();
578 :    
579 :     if ( $sap && @md5IDs )
580 :     {
581 :     @metadata = $sap->GetAll('Aligns',
582 :     'Aligns(to-link) IN ( ' . join( ', ', qw(?) x @md5IDs ) . ' )',
583 :     \@md5IDs,
584 :     [qw( from-link sequence-id to-link len begin end properties )]);
585 :     }
586 :    
587 :     wantarray ? @metadata : \@metadata;
588 :     }
589 : fangfang 1.1
590 : golsen 1.2 #-------------------------------------------------------------------------------
591 :     #
592 : parrello 1.5 # @treeIDs = all_treeIDs( $sap );
593 :     # \@treeIDs = all_treeIDs( $sap );
594 : golsen 1.2 #
595 :     #-------------------------------------------------------------------------------
596 : fangfang 1.1 sub all_treeIDs
597 :     {
598 : parrello 1.5 return all_alignIDs(@_);
599 : fangfang 1.1 }
600 :    
601 : golsen 1.2 #-------------------------------------------------------------------------------
602 :     #
603 : parrello 1.5 # @treeIDs = trees_with_md5ID( $sap, $md5 );
604 :     # \@treeIDs = trees_with_md5ID( $sap, $md5 );
605 : golsen 1.2 #
606 :     #-------------------------------------------------------------------------------
607 : fangfang 1.1 sub trees_with_md5ID
608 :     {
609 : parrello 1.5 return aligns_with_md5ID(@_);
610 : fangfang 1.1 }
611 :    
612 : fangfang 1.13
613 :     #-------------------------------------------------------------------------------
614 :     #
615 :     # \%md5_treeIDs = md5IDs_to_trees( $sap, @md5IDs );
616 :     #
617 :     #-------------------------------------------------------------------------------
618 :     sub md5IDs_to_trees
619 :     {
620 :     return md5IDs_to_aligns(@_);
621 :     }
622 :    
623 :    
624 : golsen 1.2 #-------------------------------------------------------------------------------
625 :     #
626 : parrello 1.5 # @md5IDs = md5IDs_in_tree( $sap, $treeID );
627 :     # \@md5IDs = md5IDs_in_tree( $sap, $treeID );
628 : golsen 1.2 #
629 :     #-------------------------------------------------------------------------------
630 :     sub md5IDs_in_tree
631 : fangfang 1.1 {
632 : parrello 1.5 return md5IDs_in_align(@_);
633 : fangfang 1.1 }
634 :    
635 : golsen 1.2 #-------------------------------------------------------------------------------
636 :     #
637 : parrello 1.5 # $tree = md5_tree_by_ID( $sap, $treeID );
638 :     # ( $tree, \%metadata ) = md5_tree_by_ID( $sap, $treeID );
639 : golsen 1.12 # \%metadata = md5_tree_metadata( $sap, $treeID );
640 : golsen 1.2 #
641 : parrello 1.5 # $metadata{ $seqID } = [ $md5, $peg_length, $trim_beg, $trim_end, $location_string ]
642 :     #
643 :     # where $sap is the Sapling database object.
644 :     # $treeID is a tree whose MD5 relationship data is desired.
645 : golsen 1.2 #-------------------------------------------------------------------------------
646 : golsen 1.3 sub md5_tree_by_ID
647 : fangfang 1.1 {
648 : parrello 1.5 my ( $sap, $treeID ) = @_;
649 : fangfang 1.14 my ( $file, $frozen );
650 : parrello 1.5 if ( $treeID && ( $data_dir ||= locate_data_dir() ) )
651 :     {
652 : fangfang 1.14 $file = "$data_dir/tree$treeID.nwk";
653 :     $frozen = "$data_dir/FROZEN/tree$treeID.frozen";
654 : parrello 1.5 }
655 :    
656 : fangfang 1.14 my $tree = $frozen && -f $frozen ? thaw_tree_file( $frozen ) :
657 :     $file && -f $file ? gjonewicklib::read_newick_tree( $file ) : undef;
658 : parrello 1.5
659 :     wantarray ? ( $tree, md5_tree_metadata( $sap, $treeID ) ) : $tree;
660 :     }
661 :    
662 : fangfang 1.14 sub thaw_tree_file
663 :     {
664 :     my ( $file ) = @_;
665 :     return undef unless $file && -s $file;
666 :    
667 :     my $ref_frozen_tree = gjoseqlib::slurp( $file );
668 :     my $tree = thaw($$ref_frozen_tree);
669 :    
670 :     return $tree;
671 :     }
672 :    
673 :    
674 : parrello 1.5 sub md5_tree_metadata
675 :     {
676 :     return md5_alignment_metadata(@_);
677 : fangfang 1.1 }
678 : golsen 1.3
679 : parrello 1.7 #-------------------------------------------------------------------------------
680 :     #
681 : fangfang 1.10 # @fids = md5_to_pegs($sap, $md5);
682 : parrello 1.7 # \@fids = md5_to_pegs($sap, $md5);
683 :     #
684 :     # where $sap is the Sapling database object.
685 :     # $md5 is an MD5 protein ID
686 :     #
687 :     # This method returns all the pegs that produce the indicated protein.
688 :     #
689 :     #-------------------------------------------------------------------------------
690 :     sub md5_to_pegs {
691 :     # Get the parameters.
692 :     my ($sap, $md5) = @_;
693 :     # Read the features for the indicated protein.
694 :     my @retVal = $sap->GetFlat("IsProteinFor", 'IsProteinFor(from-link) = ?',
695 :     [$md5], 'to-link');
696 :     # Return the result in the caller-specified manner.
697 :     wantarray ? @retVal : \@retVal;
698 :     }
699 :    
700 :    
701 :     #-------------------------------------------------------------------------------
702 :     #
703 :     # $md5 = peg_to_md5($sap, $peg);
704 :     #
705 :     # Return the MD5 ID of the protein produced by the indicated peg.
706 :     #
707 :     # where $sap is the Sapling database object.
708 :     # $peg is a feature ID.
709 :     #
710 :     #-------------------------------------------------------------------------------
711 :     sub peg_to_md5 {
712 :     # Get the parameters.
713 :     my ($sap, $peg) = @_;
714 :     # Read the protein for the indicated PEG.
715 :     my ($retVal) = $sap->GetFlat("Produces", "Produces(from-link) = ?",
716 :     [$peg], 'to-link');
717 :     # Return the result.
718 :     $retVal;
719 :     }
720 :    
721 : fangfang 1.13
722 : parrello 1.7 #===============================================================================
723 :     # Functions for interconverting alignments and trees that md5-based ids and
724 :     # fid-based ids. Because the md5 id is based on the sequences, multiple
725 :     # fids can have the same md5 id. These are reduced to a single instance on
726 :     # conversion to md5, and expanded to all known corresponding fids on conversion
727 :     # back to fids.
728 :     #
729 :     # (\@md5_align, \%md5_metadata) = fid_align_to_md5_align($sap, \@fid_align, \%fid_metadata, $relaxed );
730 :     # (\@fid_align, \%fid_metadata) = md5_align_to_fid_align($sap, \@md5_align, \%md5_metadata, $relaxed );
731 :     # $md5_tree = fid_tree_to_md5_tree($sap, $fid_tree, $relaxed );
732 :     # $fid_tree = md5_tree_to_fid_tree($sap, $md5_tree, $relaxed );
733 :     #
734 :     # sap Sapling database object
735 :     # @fid_align An alignment, as fid_definition_sequence triples.
736 :     # @md5_align An alignment, as md5_definition_sequence triples.
737 :     # %md5_metadata hash mapping sequence IDs to MD5s with additional metadata
738 :     # (sequence id => [md5, len, beg, end, locations])
739 :     # %fid_metadata hash mapping feature IDs to sequence IDs with additional metadata
740 :     # (unique id => [fid, len, beg, end, locations])
741 :     # $fid_tree A gjonewick tree structure with fid ids.
742 :     # $md5_tree A gjonewick tree structure with md5 ids.
743 :     # $relaxed If set to a true value, untranslatable ids are passed through,
744 :     # rather than deleted.
745 :     #===============================================================================
746 : parrello 1.8
747 :     ## Run through the metadata associated with a FID-based alignment or tree and
748 :     ## produce the md5 metadata and a map that translates FID IDs to MD5 IDs.
749 :     sub map_fid_to_md5
750 : parrello 1.7 {
751 : parrello 1.8 my ( $sap, $fid_metadata, $relaxed ) = @_;
752 :     $fid_metadata && ref( $fid_metadata ) eq 'HASH'
753 : parrello 1.7 or return ();
754 :    
755 :     my %md5_metadata;
756 :     my %md5_covered; # maps each md5 to a list of the covered locations [beg, end]
757 : parrello 1.8 my %fid_id_to_md5_id_map;
758 : parrello 1.7
759 : parrello 1.8 foreach my $id ( keys %$fid_metadata )
760 : parrello 1.7 {
761 :     my $fidMeta = $fid_metadata->{$id};
762 :     my ($fid, $len, $beg, $end, $locations) = @$fidMeta;
763 :     my $md5 = peg_to_md5( $sap, $fid );
764 :     $md5 = $fid if ! $md5 && $relaxed;
765 :     next if ! $md5;
766 :     my $cover = $md5_covered{$md5} || [];
767 :     my $found;
768 :     foreach my $b_e (@$cover) {
769 :     if (overlaps(@$b_e, $beg, $end)) {
770 :     $found = 1;
771 :     last;
772 :     }
773 :     }
774 :     next if $found;
775 :     my $md5ID = @$cover ? "$md5-" . scalar @$cover : $md5;
776 :     push @{$md5_covered{$md5}}, [$beg, $end];
777 :     $md5_metadata{$md5ID} = [$md5, $len, $beg, $end, $locations];
778 : parrello 1.8 $fid_id_to_md5_id_map{$id} = $md5ID;
779 : parrello 1.7 }
780 :    
781 : parrello 1.8 return (\%md5_metadata, \%fid_id_to_md5_id_map);
782 : parrello 1.7 }
783 :    
784 : parrello 1.8 sub fid_align_to_md5_align
785 :     {
786 :     my ( $fid_align, $fid_id_to_md5_id_map ) = @_;
787 :     $fid_align && ref( $fid_align ) eq 'ARRAY' &&
788 :     $fid_id_to_md5_id_map && ref( $fid_id_to_md5_id_map ) eq 'HASH'
789 :     or return ();
790 : parrello 1.7
791 : parrello 1.8 my @md5_align;
792 :    
793 :     foreach ( @$fid_align )
794 :     {
795 :     my $id = $_->[0];
796 :     my $md5ID = $fid_id_to_md5_id_map->{$id};
797 :     next if ! $md5ID;
798 :     push @md5_align, [ $md5ID, $_->[1], $_->[2] ];
799 :     }
800 :    
801 :     return \@md5_align;
802 :     }
803 :    
804 :    
805 :     sub map_md5_to_fid
806 : parrello 1.7 {
807 : parrello 1.8 my ( $sap, $md5_metadata, $relaxed ) = @_;
808 :     $md5_metadata && ref( $md5_metadata ) eq 'HASH'
809 : parrello 1.7 or return ();
810 :    
811 :     my %fid_metadata;
812 :     my %fids_seen;
813 : parrello 1.8 my %md5_id_to_fid_ids_map;
814 : parrello 1.7
815 : parrello 1.8 foreach my $md5ID ( keys %$md5_metadata )
816 : parrello 1.7 {
817 :     my $md5Metadata = $md5_metadata->{$md5ID};
818 :     my ($md5, $len, $beg, $end, $location) = @$md5Metadata;
819 :     my @fids = md5_to_pegs( $sap, $md5 );
820 :     @fids = ( $md5 ) if ! @fids && $relaxed;
821 :     foreach my $fid ( @fids )
822 :     {
823 :     my $fidID = $fid;
824 :     if ($fids_seen{$fid}++) {
825 :     $fidID = "$fid-" . $fids_seen{$fid};
826 :     }
827 : parrello 1.8 $fid_metadata{$fidID} = [$fid, $len, $beg, $end, $location];
828 :     push @{$md5_id_to_fid_ids_map{$md5ID}}, $fidID;
829 :     }
830 :     }
831 :    
832 :     return (\%fid_metadata, \%md5_id_to_fid_ids_map);
833 :     }
834 :    
835 :     sub md5_align_to_fid_align
836 :     {
837 :     my ( $md5_align, $md5_id_to_fid_ids_map ) = @_;
838 :     $md5_align && ref( $md5_align ) eq 'ARRAY' && $md5_id_to_fid_ids_map &&
839 :     ref( $md5_id_to_fid_ids_map ) eq 'HASH'
840 :     or return ();
841 :    
842 :     my @fid_align;
843 :     my %fid_metadata;
844 :    
845 :     foreach ( @$md5_align )
846 :     {
847 :     my $md5ID = $_->[0];
848 :     my @fidIDs = @{$md5_id_to_fid_ids_map->{$md5ID}};
849 :     foreach my $fidID ( @fidIDs )
850 :     {
851 : parrello 1.7 push @fid_align, [ $fidID, $_->[1], $_->[2] ];
852 :     }
853 :     }
854 :    
855 : parrello 1.8 return \@fid_align;
856 : parrello 1.7 }
857 :    
858 :    
859 :     sub fid_tree_to_md5_tree
860 :     {
861 : parrello 1.8 my ( $fid_tree, $fid_id_to_md5_id_map ) = @_;
862 :     $fid_tree && ref( $fid_tree ) eq 'ARRAY' &&
863 :     $fid_id_to_md5_id_map && ref( $fid_id_to_md5_id_map ) eq 'HASH'
864 : parrello 1.7 or return undef;
865 :    
866 : parrello 1.8 gjonewicklib::newick_relabel_tips( gjonewicklib::newick_subtree( $fid_tree, keys %$fid_id_to_md5_id_map ), $fid_id_to_md5_id_map );
867 : parrello 1.7 }
868 :    
869 :    
870 :     sub md5_tree_to_fid_tree
871 :     {
872 : parrello 1.8 my ( $md5_tree, $md5_id_to_fid_ids_map ) = @_;
873 :     $md5_tree && ref( $md5_tree ) eq 'ARRAY' &&
874 :     $md5_id_to_fid_ids_map && ref( $md5_id_to_fid_ids_map ) eq 'HASH'
875 : parrello 1.7 or return ();
876 :    
877 :     my @tips = gjonewicklib::newick_tip_list( $md5_tree );
878 :     @tips or return undef;
879 :    
880 :     my $prune = 0;
881 : parrello 1.8 foreach my $md5ID ( @tips )
882 : parrello 1.7 {
883 : parrello 1.8 $prune = 1 if (! $md5_id_to_fid_ids_map->{$md5ID});
884 : parrello 1.7 }
885 :    
886 : parrello 1.8 $md5_tree = gjonewicklib::newick_subtree( $md5_tree, [ keys %$md5_id_to_fid_ids_map ] ) if $prune;
887 :     return expand_duplicate_tips( gjonewicklib::copy_newick_tree( $md5_tree ), $md5_id_to_fid_ids_map );
888 :     }
889 :    
890 : golsen 1.11
891 : parrello 1.8 sub md5_align_and_tree_to_fid_version {
892 :     my ($sap, $md5_align, $md5_tree, $md5_metadata, $relaxed) = @_;
893 :     my ($fid_metadata, $md5_id_to_fid_ids_map) = map_md5_to_fid($sap, $md5_metadata, $relaxed);
894 :     my $fid_tree = md5_tree_to_fid_tree($md5_tree, $md5_id_to_fid_ids_map);
895 :     my $fid_align = md5_align_to_fid_align($md5_align, $md5_id_to_fid_ids_map);
896 :     return ($fid_align, $fid_tree, $fid_metadata);
897 : parrello 1.7 }
898 :    
899 : golsen 1.11
900 : parrello 1.8 sub fid_align_and_tree_to_md5_version {
901 :     my ($sap, $fid_align, $fid_tree, $fid_metadata, $relaxed) = @_;
902 :     my ($md5_metadata, $fid_id_to_md5_id_map) = map_fid_to_md5($sap, $fid_metadata, $relaxed);
903 :     my $md5_tree = fid_tree_to_md5_tree($fid_tree, $fid_id_to_md5_id_map);
904 :     my $md5_align = fid_align_to_md5_align($fid_align, $fid_id_to_md5_id_map);
905 :     return ($md5_align, $md5_tree, $md5_metadata);
906 :     }
907 : parrello 1.7
908 : golsen 1.11
909 : parrello 1.7 #-------------------------------------------------------------------------------
910 :     #
911 :     # Return TRUE if the overlap between two sets of coordinates is sufficient to
912 :     # treat them as essentially the same.
913 :     #
914 :     # $okFlag = overlaps($beg1, $end1, $beg2, $end2);
915 :     #
916 :     #-------------------------------------------------------------------------------
917 :     sub overlaps {
918 :     # Get the parameters.
919 :     my ($beg1, $end1, $beg2, $end2) = @_;
920 :     # Compute the number of overlapping residues.
921 :     my $over = min($end1, $end2) - max($beg1, $beg2) + 1;
922 :     # Return TRUE if the overlap is 80% of the shorter length.
923 :     return $over >= 0.8 * min($end1 - $beg1 + 1, $end2 - $beg2 + 1);
924 :     }
925 :    
926 :     sub min { $_[0] < $_[1] ? $_[0] : $_[1] }
927 :     sub max { $_[0] > $_[1] ? $_[0] : $_[1] }
928 : parrello 1.8
929 :    
930 : parrello 1.7 #-------------------------------------------------------------------------------
931 :     # Use a hash to relabel, and potentially expand the tips in a newick tree.
932 :     #
933 :     # $node = expand_duplicate_tips( $node, \%new_names )
934 :     #
935 :     #-------------------------------------------------------------------------------
936 :     sub expand_duplicate_tips
937 :     {
938 :     my ( $node, $new_names ) = @_;
939 :    
940 :     my @desc = gjonewicklib::newick_desc_list( $node );
941 :    
942 :     if ( @desc )
943 :     {
944 :     foreach ( @desc ) { expand_duplicate_tips( $_, $new_names ) }
945 :     }
946 :     else
947 :     {
948 :     my $new;
949 :     if ( gjonewicklib::node_has_lbl( $node )
950 :     && defined( $new = $new_names->{ gjonewicklib::newick_lbl( $node ) } )
951 :     )
952 :     {
953 :     my @new = @$new;
954 :     if ( @new == 1 )
955 :     {
956 :     gjonewicklib::set_newick_lbl( $node, $new[0] );
957 :     }
958 :     elsif ( @new > 1 )
959 :     {
960 :     gjonewicklib::set_newick_desc_ref( $node, [ map { [ [], $_, 0 ] } @new ] );
961 :     gjonewicklib::set_newick_lbl( $node, undef );
962 :     }
963 :     }
964 :     }
965 :    
966 :     $node;
967 :     }
968 :    
969 : parrello 1.5
970 : golsen 1.3 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3