[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use CGI qw(-nosticky);
26 :     use BasicLocation;
27 :     use HyperLink;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use LoaderUtils;
30 : parrello 1.8 use Digest::MD5;
31 : parrello 1.1 use base 'BaseSaplingLoader';
32 :    
33 :     =head1 Sapling Feature Load Group Class
34 :    
35 :     =head2 Introduction
36 :    
37 :     The Feature Load Group includes all of the major feature-related tables.
38 :    
39 :     =head3 new
40 :    
41 :     my $sl = FeatureSaplingLoader->new($erdb, $options, @tables);
42 :    
43 :     Construct a new FeatureSaplingLoader object.
44 :    
45 :     =over 4
46 :    
47 :     =item erdb
48 :    
49 : parrello 1.4 L<Sapling> object for the database being loaded.
50 : parrello 1.1
51 :     =item options
52 :    
53 :     Reference to a hash of command-line options.
54 :    
55 :     =item tables
56 :    
57 :     List of tables in this load group.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub new {
64 :     # Get the parameters.
65 :     my ($class, $erdb, $options) = @_;
66 :     # Create the table list.
67 : parrello 1.2 my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
68 :     FeatureVirulent IsOwnerOf IsLocatedIn Identifies
69 : parrello 1.3 Identifier IsNamedBy ProteinSequence Concerns
70 :     IsAttachmentSiteFor Publication IsProteinFor);
71 : parrello 1.1 # Create the BaseSaplingLoader object.
72 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
73 :     # Return it.
74 :     return $retVal;
75 :     }
76 :    
77 :     =head2 Public Methods
78 :    
79 :     =head3 Generate
80 :    
81 :     $sl->Generate();
82 :    
83 :     Generate the data for the feature-related files.
84 :    
85 :     =cut
86 :    
87 :     sub Generate {
88 :     # Get the parameters.
89 :     my ($self) = @_;
90 :     # Get the database object.
91 :     my $erdb = $self->db();
92 :     # Only proceed if this is a normal section. There's no global feature data.
93 :     if (! $self->global()) {
94 :     # Get the section ID.
95 :     my $genomeID = $self->section();
96 :     # Load this genome's features.
97 :     $self->LoadGenomeFeatures($genomeID);
98 :     }
99 :     }
100 :    
101 :     =head3 LoadGenomeFeatures
102 :    
103 :     $sl->LoadGenomeFeatures($genomeID);
104 :    
105 :     Load the feature-related data for a single genome.
106 :    
107 :     =over 4
108 :    
109 :     =item genomeID
110 :    
111 :     ID of the genome whose feature data is to be loaded.
112 :    
113 :     =back
114 :    
115 :     =cut
116 :    
117 :     sub LoadGenomeFeatures {
118 :     # Get the parameters.
119 :     my ($self, $genomeID) = @_;
120 :     # Get the source object.
121 :     my $fig = $self->source();
122 :     # Get the database.
123 :     my $sapling = $self->db();
124 :     # Get the maximum location segment length. We'll need this later.
125 :     my $maxLength = $sapling->TuningParameter('maxLocationLength');
126 : parrello 1.3 # Get the genome's aliases.
127 :     my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
128 :     my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
129 :     if (! defined $aliasHash) {
130 :     Trace("No aliases found for $genomeID.") if T(1);
131 :     $self->Add(missingAliasFile => 1);
132 :     $aliasHash = {};
133 :     }
134 : parrello 1.1 # Get all of this genome's features.
135 :     my $featureList = $fig->all_features_detailed_fast($genomeID);
136 :     # Loop through them.
137 :     for my $feature (@$featureList) {
138 :     # Get this feature's data.
139 :     my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
140 :     $assignmentMaker, $quality) = @$feature;
141 :     $self->Track(Features => $fid, 1000);
142 : parrello 1.7 # Fix missing assignments. For RNAs, the assignment may be in the alias list.
143 : parrello 1.1 if (! defined $assignment) {
144 : parrello 1.7 if ($type eq 'rna') {
145 :     $assignment = $aliases;
146 :     $assignmentMaker ||= 'master';
147 :     } else {
148 :     $assignment = '';
149 :     }
150 : parrello 1.1 }
151 :     # Convert the location string to a list of location objects.
152 :     my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
153 :     # Now we need to run through the locations. We'll put the total sequence
154 :     # length in here.
155 :     my $seqLen = 0;
156 :     # This will track the ordinal position of the current location segment.
157 :     my $locN = 1;
158 :     # Loop through the location objects.
159 :     for my $loc (@locs) {
160 :     # Add this location's length to the total length.
161 :     $seqLen += $loc->Length();
162 :     # Extract the contig ID. Note that we need to prefix the
163 :     # genome ID to make it unique.
164 :     my $contig = $loc->Contig();
165 :     my $contigID = "$genomeID:$contig";
166 :     # We also need the location's direction.
167 :     my $dir = $loc->Dir();
168 :     # Now we peel off sections of the location and connect them
169 :     # to the feature.
170 :     my $peel = $loc->Peel($maxLength);
171 :     while (defined $peel) {
172 : parrello 1.2 $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN++,
173 :     begin => $peel->Left(), len => $peel->Length(),
174 :     dir => $dir);
175 : parrello 1.1 $peel = $loc->Peel($maxLength);
176 :     }
177 :     # Output the residual. There will always be one, because of the way
178 :     # Peel works.
179 : parrello 1.2 $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
180 :     begin => $loc->Left(), dir => $dir, len => $loc->Length());
181 : parrello 1.1 }
182 : parrello 1.3 # Is this an attachment site?
183 :     if ($type eq 'att') {
184 :     # Yes, connect it to the attached feature.
185 :     if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
186 :     $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
187 :     } else {
188 :     Trace("Invalid attachment function for $fid: $assignment") if T(1);
189 :     $self->Add(badAttachment => 1);
190 :     }
191 :     }
192 : parrello 1.1 # Emit the feature record.
193 :     $self->PutE(Feature => $fid, feature_type => $type,
194 : parrello 1.2 sequence_length => $seqLen, function => $assignment,
195 :     locked => $fig->is_locked_fid($fid));
196 : parrello 1.1 # Connect the feature to its genome.
197 :     $self->PutR(IsOwnerOf => $genomeID, $fid);
198 :     # Now we have a whole bunch of attribute-related stuff to store in
199 :     # secondary Feature tables. First is the evidence codes.
200 :     my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
201 :     for my $evidenceTuple (@evidenceTuples) {
202 :     my (undef, undef, $code) = @$evidenceTuple;
203 :     $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
204 :     }
205 :     # Now we have the external links. These are stored using hyperlink objects.
206 :     my @links = $fig->fid_links($fid);
207 :     for my $link (@links) {
208 :     my $hl = HyperLink->newFromHtml($link);
209 :     $self->PutE(FeatureLink => $fid, link => $hl);
210 :     }
211 :     # Virulence data is next. This is also hyperlink data.
212 :     my @virulenceTuples = $fig->get_attributes($fid, 'virulence_associated%');
213 :     for my $virulenceTuple (@virulenceTuples) {
214 :     my (undef, undef, $text, $url) = @$virulenceTuple;
215 :     my $hl = HyperLink->new($text, $url);
216 :     $self->PutE(FeatureVirulent => $fid, virulent => $hl);
217 :     }
218 :     # Finally, the essentiality stuff, which is the last of the hyperlinks.
219 :     my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
220 :     for my $essentialTuple (@essentials) {
221 :     my (undef, undef, $essentialityType, $url) = @$essentialTuple;
222 : parrello 1.5 # Only keep this datum if it has a URL. The ones without URLs are
223 :     # all duplicates.
224 :     if ($url) {
225 :     # Form a hyperlink from this essentiality tuple.
226 :     my $link = HyperLink->new($essentialityType, $url);
227 :     # Store it as essentiality data for this feature.
228 :     $self->PutE(FeatureEssential => $fid, essential => $link);
229 :     }
230 : parrello 1.1 }
231 :     # If this is a PEG, we have a protein sequence.
232 : parrello 1.3 my $proteinID;
233 : parrello 1.1 if ($type eq 'peg') {
234 :     # Get the translation.
235 :     my $proteinSequence = $fig->get_translation($fid);
236 : parrello 1.7 if (! $proteinSequence) {
237 :     Trace("No protein sequence found for $fid.") if T(2);
238 :     $self->Add(missingProtein => 1);
239 :     # Here there was some sort of error and the protein sequence did
240 :     # not come back. Ask for the DNA and translate it instead.
241 :     my $dna = $fig->get_dna_seq($fid);
242 :     $proteinSequence = FIG::translate($dna, undef, 1);
243 :     }
244 : parrello 1.1 # Compute the ID.
245 : parrello 1.8 $proteinID = $sapling->ProteinID($proteinSequence);
246 : parrello 1.1 # Create the protein record.
247 :     $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
248 : parrello 1.3 $self->PutR(IsProteinFor => $proteinID, $fid);
249 : parrello 1.1 # Get the publications for this PEG.
250 :     my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
251 :     for my $pub (@pubs) {
252 :     # Parse out the article title from the data.
253 : parrello 1.3 my (undef, undef, $data, $url) = @$pub;
254 : parrello 1.1 my @pieces = split /,/, $data, 3;
255 :     if (defined $pieces[2]) {
256 :     # Create the publication record.
257 : parrello 1.3 my $hl = HyperLink->new($pieces[2], $url);
258 : parrello 1.1 my $key = ERDB::DigestKey($url);
259 :     $self->PutE(Publication => $key, citation => $hl);
260 :     # Connect it to the protein.
261 :     $self->PutR(Concerns => $key, $proteinID);
262 :     }
263 :     }
264 : parrello 1.2 }
265 : parrello 1.3 # Now we need to compute the identifiers. We start with the aliases.
266 :     # Get the alias data for this feature. If there is none, we force an
267 :     # empty list.
268 :     my $aliasList = $aliasHash->{$fid} || [];
269 :     # Loop through the aliases found.
270 :     for my $aliasTuple (@$aliasList) {
271 :     my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
272 :     # Get the natural form. If there is none, then the canonical
273 :     # form IS the natural form.
274 :     my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
275 :     # Create the identifier record.
276 :     $self->PutE(Identifier => $aliasID, natural_form => $natural,
277 : parrello 1.5 source => $aliasType);
278 : parrello 1.3 # Is this a protein alias?
279 :     if ($aliasConf eq 'C' && $proteinID) {
280 :     # Yes. Connect it using IsNamedBy.
281 :     $self->PutR(IsNamedBy => $proteinID, $aliasID);
282 : parrello 1.2 } else {
283 : parrello 1.3 # No. Connect it to the feature.
284 :     $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);
285 : parrello 1.2 }
286 :     }
287 : parrello 1.3 # Finally, this feature is an alias of itself.
288 :     $self->PutE(Identifier => $fid, natural_form => $fid,
289 :     source => 'SEED');
290 :     $self->PutR(Identifies => $fid, $fid, conf => 'A');
291 : parrello 1.1 }
292 :     }
293 :    
294 :    
295 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3