[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use CGI qw(-nosticky);
26 :     use BasicLocation;
27 :     use HyperLink;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use LoaderUtils;
30 : parrello 1.9 use SeedUtils;
31 : parrello 1.14 use gjoseqlib;
32 : parrello 1.15 use MD5Computer;
33 : parrello 1.1 use base 'BaseSaplingLoader';
34 :    
35 :     =head1 Sapling Feature Load Group Class
36 :    
37 :     =head2 Introduction
38 :    
39 :     The Feature Load Group includes all of the major feature-related tables.
40 :    
41 :     =head3 new
42 :    
43 :     my $sl = FeatureSaplingLoader->new($erdb, $options, @tables);
44 :    
45 :     Construct a new FeatureSaplingLoader object.
46 :    
47 :     =over 4
48 :    
49 :     =item erdb
50 :    
51 : parrello 1.4 L<Sapling> object for the database being loaded.
52 : parrello 1.1
53 :     =item options
54 :    
55 :     Reference to a hash of command-line options.
56 :    
57 :     =item tables
58 :    
59 :     List of tables in this load group.
60 :    
61 :     =back
62 :    
63 :     =cut
64 :    
65 :     sub new {
66 :     # Get the parameters.
67 :     my ($class, $erdb, $options) = @_;
68 :     # Create the table list.
69 : parrello 1.2 my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
70 : parrello 1.12 FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
71 : parrello 1.3 Identifier IsNamedBy ProteinSequence Concerns
72 : parrello 1.9 IsAttachmentSiteFor Publication IsProteinFor
73 : parrello 1.10 Role RoleIndex IsFunctionalIn);
74 : parrello 1.1 # Create the BaseSaplingLoader object.
75 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
76 :     # Return it.
77 :     return $retVal;
78 :     }
79 :    
80 :     =head2 Public Methods
81 :    
82 :     =head3 Generate
83 :    
84 :     $sl->Generate();
85 :    
86 :     Generate the data for the feature-related files.
87 :    
88 :     =cut
89 :    
90 :     sub Generate {
91 :     # Get the parameters.
92 :     my ($self) = @_;
93 :     # Get the database object.
94 :     my $erdb = $self->db();
95 : parrello 1.9 # Check for local or global.
96 : parrello 1.1 if (! $self->global()) {
97 : parrello 1.9 # Here we are generating data for a genome.
98 : parrello 1.1 my $genomeID = $self->section();
99 :     # Load this genome's features.
100 :     $self->LoadGenomeFeatures($genomeID);
101 : parrello 1.9 } else {
102 : parrello 1.10 # The global data is the roles from subsystems and the publications.
103 : parrello 1.9 my $fig = $self->source();
104 : parrello 1.10 # We need the master map of roles to IDs.
105 :     my %roleHash;
106 :     my $lastRoleIndex = -1;
107 :     my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
108 :     if (-f $roleMapFile) {
109 :     for my $mapLine (Tracer::GetFile($roleMapFile)) {
110 :     my ($role, $idx) = split /\t/, $mapLine;
111 :     $roleHash{$role} = $idx;
112 :     if ($idx > $lastRoleIndex) {
113 :     $lastRoleIndex = $idx;
114 :     }
115 :     }
116 :     }
117 :     # We'll track duplicate roles in here.
118 :     my %roleList;
119 :     # Now we get the subsystem list.
120 : parrello 1.9 my $subHash = $erdb->SubsystemHash();
121 :     for my $sub (sort keys %$subHash) {
122 :     $self->Add(subsystems => 1);
123 :     Trace("Processing roles for $sub.") if T(3);
124 :     # Get this subsystem's roles and write them out.
125 :     my @roles = $fig->subsystem_to_roles($sub);
126 :     for my $role (@roles) {
127 :     $self->Add(subsystemRoles => 1);
128 : parrello 1.10 # Check to see if this role is hypothetical.
129 :     my $hypo = hypo($role);
130 :     if (! $hypo) {
131 :     # Is this role in the role index hash?
132 :     my $roleIndex = $roleHash{$role};
133 :     if (! defined $roleIndex) {
134 :     # No, compute a new index for it.
135 :     $roleIndex = ++$lastRoleIndex;
136 :     $roleHash{$role} = $roleIndex;
137 :     }
138 :     if (! $roleList{$role}) {
139 :     $roleList{$role} = 1;
140 :     $self->PutE(RoleIndex => $role, role_index => $roleIndex);
141 :     }
142 :     }
143 :     $self->PutE(Role => $role, hypothetical => $hypo);
144 :     }
145 :     }
146 :     Trace("Subsystem roles generated.") if T(2);
147 :     # Write out the role master file.
148 :     Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
149 :     Trace("Role master file written to $roleMapFile.") if T(2);
150 :     # Now, we get the publications.
151 :     my $pubs = $fig->all_titles();
152 :     for my $pub (@$pubs) {
153 :     # Get the ID and title.
154 :     my ($pubmedID, $title) = @$pub;
155 :     # Only proceed if the ID is valid.
156 :     if ($pubmedID) {
157 :     # Create a hyperlink from the title and the pubmed ID.
158 :     my $link;
159 :     if (! $title) {
160 :     $link = HyperLink->new("<unknown>");
161 :     } else {
162 :     $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
163 :     }
164 :     # Create the publication record.
165 :     $self->PutE(Publication => $pubmedID, citation => $link);
166 : parrello 1.9 }
167 :     }
168 : parrello 1.10 Trace("Publications generated.") if T(2);
169 : parrello 1.1 }
170 :     }
171 :    
172 :     =head3 LoadGenomeFeatures
173 :    
174 :     $sl->LoadGenomeFeatures($genomeID);
175 :    
176 :     Load the feature-related data for a single genome.
177 :    
178 :     =over 4
179 :    
180 :     =item genomeID
181 :    
182 :     ID of the genome whose feature data is to be loaded.
183 :    
184 :     =back
185 :    
186 :     =cut
187 :    
188 :     sub LoadGenomeFeatures {
189 :     # Get the parameters.
190 :     my ($self, $genomeID) = @_;
191 :     # Get the source object.
192 :     my $fig = $self->source();
193 :     # Get the database.
194 :     my $sapling = $self->db();
195 :     # Get the maximum location segment length. We'll need this later.
196 :     my $maxLength = $sapling->TuningParameter('maxLocationLength');
197 : parrello 1.3 # Get the genome's aliases.
198 :     my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
199 :     my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
200 :     if (! defined $aliasHash) {
201 : parrello 1.9 Trace("No aliases found for $genomeID.") if T(ERDBLoadGroup => 1);
202 : parrello 1.3 $self->Add(missingAliasFile => 1);
203 :     $aliasHash = {};
204 :     }
205 : parrello 1.14 # Get all of this genome's protein sequences.
206 :     my %seqs = map { $_->[0] => $_->[2] } gjoseqlib::read_fasta("$FIG_Config::organisms/$genomeID/Features/peg/fasta");
207 : parrello 1.1 # Get all of this genome's features.
208 :     my $featureList = $fig->all_features_detailed_fast($genomeID);
209 : parrello 1.15 # Compute the MD5 identifiers for the genome and its contigs.
210 :     my $genomeMD5Data = MD5Computer->new_from_fasta("$FIG_Config::organisms/$genomeID/contigs");
211 : parrello 1.1 # Loop through them.
212 :     for my $feature (@$featureList) {
213 :     # Get this feature's data.
214 :     my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
215 :     $assignmentMaker, $quality) = @$feature;
216 :     $self->Track(Features => $fid, 1000);
217 : parrello 1.7 # Fix missing assignments. For RNAs, the assignment may be in the alias list.
218 : parrello 1.1 if (! defined $assignment) {
219 : parrello 1.7 if ($type eq 'rna') {
220 :     $assignment = $aliases;
221 :     $assignmentMaker ||= 'master';
222 :     } else {
223 :     $assignment = '';
224 :     }
225 : parrello 1.1 }
226 :     # Convert the location string to a list of location objects.
227 :     my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
228 :     # Now we need to run through the locations. We'll put the total sequence
229 :     # length in here.
230 :     my $seqLen = 0;
231 :     # This will track the ordinal position of the current location segment.
232 :     my $locN = 1;
233 :     # Loop through the location objects.
234 :     for my $loc (@locs) {
235 :     # Add this location's length to the total length.
236 :     $seqLen += $loc->Length();
237 :     # Extract the contig ID. Note that we need to prefix the
238 :     # genome ID to make it unique.
239 :     my $contig = $loc->Contig();
240 :     my $contigID = "$genomeID:$contig";
241 :     # We also need the location's direction.
242 :     my $dir = $loc->Dir();
243 :     # Now we peel off sections of the location and connect them
244 :     # to the feature.
245 :     my $peel = $loc->Peel($maxLength);
246 :     while (defined $peel) {
247 : parrello 1.2 $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN++,
248 :     begin => $peel->Left(), len => $peel->Length(),
249 :     dir => $dir);
250 : parrello 1.1 $peel = $loc->Peel($maxLength);
251 :     }
252 :     # Output the residual. There will always be one, because of the way
253 :     # Peel works.
254 : parrello 1.2 $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
255 :     begin => $loc->Left(), dir => $dir, len => $loc->Length());
256 : parrello 1.1 }
257 : parrello 1.3 # Is this an attachment site?
258 :     if ($type eq 'att') {
259 :     # Yes, connect it to the attached feature.
260 :     if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
261 :     $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
262 :     } else {
263 : parrello 1.9 Trace("Invalid attachment function for $fid: $assignment") if T(ERDBLoadGroup => 1);
264 : parrello 1.3 $self->Add(badAttachment => 1);
265 :     }
266 :     }
267 : parrello 1.16 # Compute the MD5 identifier. This may fail if there is an error in the
268 :     # feature definition.
269 :     my $md5Alias;
270 :     eval {
271 : parrello 1.18 $md5Alias = $genomeMD5Data->ComputeFeatureMD5($type, map { $_->String() } @locs);
272 : parrello 1.16 };
273 :     if ($@) {
274 : parrello 1.17 Trace("Error in $fid MD5 computation: $@") if T(0);
275 : parrello 1.16 $self->Add(md5ComputeError => 1);
276 :     }
277 : parrello 1.1 # Emit the feature record.
278 :     $self->PutE(Feature => $fid, feature_type => $type,
279 : parrello 1.2 sequence_length => $seqLen, function => $assignment,
280 :     locked => $fig->is_locked_fid($fid));
281 : parrello 1.1 # Connect the feature to its genome.
282 :     $self->PutR(IsOwnerOf => $genomeID, $fid);
283 : parrello 1.9 # Connect the feature to its roles.
284 : parrello 1.11 my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
285 : parrello 1.9 if (! defined $roles) {
286 :     # Here the functional assignment was suspicious.
287 :     $self->Add(suspiciousFunction => 1);
288 :     Trace("$fid has a suspicious function: $assignment") if T(ERDBLoadGroup => 1);
289 :     } else {
290 :     # Here we have a good assignment.
291 :     for my $role (@$roles) {
292 :     $self->Add(featureRole => 1);
293 :     $self->PutR(IsFunctionalIn => $role, $fid);
294 :     $self->PutE(Role => $role, hypothetical => hypo($role));
295 :     }
296 :     $self->Add(badFeatureRoles => $errors);
297 :     }
298 : parrello 1.1 # Now we have a whole bunch of attribute-related stuff to store in
299 : parrello 1.10 # secondary Feature tables. First is the evidence codes. This is special
300 :     # because we have to save the DLIT numbers.
301 :     my @dlits;
302 : parrello 1.1 my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
303 :     for my $evidenceTuple (@evidenceTuples) {
304 :     my (undef, undef, $code) = @$evidenceTuple;
305 :     $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
306 : parrello 1.10 # If this is a direct literature reference, save it.
307 :     if ($code =~ /dlit\((\d+)/) {
308 :     push @dlits, $1;
309 :     $self->Add(dlits => 1);
310 :     }
311 : parrello 1.1 }
312 :     # Now we have the external links. These are stored using hyperlink objects.
313 :     my @links = $fig->fid_links($fid);
314 :     for my $link (@links) {
315 :     my $hl = HyperLink->newFromHtml($link);
316 :     $self->PutE(FeatureLink => $fid, link => $hl);
317 :     }
318 :     # Virulence data is next. This is also hyperlink data.
319 :     my @virulenceTuples = $fig->get_attributes($fid, 'virulence_associated%');
320 :     for my $virulenceTuple (@virulenceTuples) {
321 :     my (undef, undef, $text, $url) = @$virulenceTuple;
322 :     my $hl = HyperLink->new($text, $url);
323 :     $self->PutE(FeatureVirulent => $fid, virulent => $hl);
324 :     }
325 :     # Finally, the essentiality stuff, which is the last of the hyperlinks.
326 :     my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
327 :     for my $essentialTuple (@essentials) {
328 :     my (undef, undef, $essentialityType, $url) = @$essentialTuple;
329 : parrello 1.5 # Only keep this datum if it has a URL. The ones without URLs are
330 :     # all duplicates.
331 :     if ($url) {
332 :     # Form a hyperlink from this essentiality tuple.
333 :     my $link = HyperLink->new($essentialityType, $url);
334 :     # Store it as essentiality data for this feature.
335 :     $self->PutE(FeatureEssential => $fid, essential => $link);
336 :     }
337 : parrello 1.1 }
338 :     # If this is a PEG, we have a protein sequence.
339 : parrello 1.3 my $proteinID;
340 : parrello 1.1 if ($type eq 'peg') {
341 :     # Get the translation.
342 : parrello 1.14 my $proteinSequence = $seqs{$fid};
343 : parrello 1.7 if (! $proteinSequence) {
344 : parrello 1.13 Trace("No protein sequence found for $fid.") if T(ERDBLoadGroup => 2);
345 : parrello 1.7 $self->Add(missingProtein => 1);
346 :     # Here there was some sort of error and the protein sequence did
347 :     # not come back. Ask for the DNA and translate it instead.
348 :     my $dna = $fig->get_dna_seq($fid);
349 :     $proteinSequence = FIG::translate($dna, undef, 1);
350 :     }
351 : parrello 1.1 # Compute the ID.
352 : parrello 1.8 $proteinID = $sapling->ProteinID($proteinSequence);
353 : parrello 1.1 # Create the protein record.
354 :     $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
355 : parrello 1.3 $self->PutR(IsProteinFor => $proteinID, $fid);
356 : parrello 1.10 # Connect this protein to the feature's publications (if any).
357 :     for my $pub (@dlits) {
358 :     $self->PutR(Concerns => $pub, $proteinID);
359 : parrello 1.1 }
360 : parrello 1.2 }
361 : parrello 1.3 # Now we need to compute the identifiers. We start with the aliases.
362 :     # Get the alias data for this feature. If there is none, we force an
363 :     # empty list.
364 :     my $aliasList = $aliasHash->{$fid} || [];
365 :     # Loop through the aliases found.
366 :     for my $aliasTuple (@$aliasList) {
367 :     my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
368 :     # Get the natural form. If there is none, then the canonical
369 : parrello 1.13 # form IS the natural form. Note we have to make a special check
370 :     # for locus tags, which have an insane number of variants.
371 :     my $natural;
372 :     if ($aliasID =~ /LocusTag:(.+)/) {
373 :     $natural = $1;
374 :     } else {
375 :     $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
376 :     }
377 : parrello 1.3 # Create the identifier record.
378 :     $self->PutE(Identifier => $aliasID, natural_form => $natural,
379 : parrello 1.5 source => $aliasType);
380 : parrello 1.3 # Is this a protein alias?
381 :     if ($aliasConf eq 'C' && $proteinID) {
382 :     # Yes. Connect it using IsNamedBy.
383 :     $self->PutR(IsNamedBy => $proteinID, $aliasID);
384 : parrello 1.2 } else {
385 : parrello 1.3 # No. Connect it to the feature.
386 : parrello 1.12 $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
387 : parrello 1.2 }
388 :     }
389 : parrello 1.15 # Make the MD5 identifier an alias.
390 : parrello 1.16 if (defined $md5Alias) {
391 : parrello 1.18 $self->PutE(Identifier => "ubi|$md5Alias", natural_form => $md5Alias,
392 :     source => 'UBI');
393 :     $self->PutR(IsIdentifiedBy => $fid, "ubi|$md5Alias", conf => 'A');
394 : parrello 1.16 }
395 : parrello 1.3 # Finally, this feature is an alias of itself.
396 :     $self->PutE(Identifier => $fid, natural_form => $fid,
397 :     source => 'SEED');
398 : parrello 1.12 $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
399 : parrello 1.1 }
400 :     }
401 :    
402 :    
403 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3