[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package FeatureSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use CGI qw(-nosticky);
26 :     use BasicLocation;
27 :     use HyperLink;
28 :     use base 'BaseSaplingLoader';
29 :    
30 :     =head1 Sapling Feature Load Group Class
31 :    
32 :     =head2 Introduction
33 :    
34 :     The Feature Load Group includes all of the major feature-related tables.
35 :    
36 :     =head3 new
37 :    
38 :     my $sl = FeatureSaplingLoader->new($erdb, $options, @tables);
39 :    
40 :     Construct a new FeatureSaplingLoader object.
41 :    
42 :     =over 4
43 :    
44 :     =item erdb
45 :    
46 :     [[SaplingPm]] object for the database being loaded.
47 :    
48 :     =item options
49 :    
50 :     Reference to a hash of command-line options.
51 :    
52 :     =item tables
53 :    
54 :     List of tables in this load group.
55 :    
56 :     =back
57 :    
58 :     =cut
59 :    
60 :     sub new {
61 :     # Get the parameters.
62 :     my ($class, $erdb, $options) = @_;
63 :     # Create the table list.
64 :     my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink FeatureVirulent
65 :     Identifier IsSequenceFor ProteinSequence Concerns Publication
66 :     IdentifierSet IncludesIdentifier IsLocatedIn IsOwnerOf);
67 :     # Create the BaseSaplingLoader object.
68 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
69 :     # Return it.
70 :     return $retVal;
71 :     }
72 :    
73 :     =head2 Public Methods
74 :    
75 :     =head3 Generate
76 :    
77 :     $sl->Generate();
78 :    
79 :     Generate the data for the feature-related files.
80 :    
81 :     =cut
82 :    
83 :     sub Generate {
84 :     # Get the parameters.
85 :     my ($self) = @_;
86 :     # Get the database object.
87 :     my $erdb = $self->db();
88 :     # Only proceed if this is a normal section. There's no global feature data.
89 :     if (! $self->global()) {
90 :     # Get the section ID.
91 :     my $genomeID = $self->section();
92 :     # Load this genome's features.
93 :     $self->LoadGenomeFeatures($genomeID);
94 :     }
95 :     }
96 :    
97 :     =head3 LoadGenomeFeatures
98 :    
99 :     $sl->LoadGenomeFeatures($genomeID);
100 :    
101 :     Load the feature-related data for a single genome.
102 :    
103 :     =over 4
104 :    
105 :     =item genomeID
106 :    
107 :     ID of the genome whose feature data is to be loaded.
108 :    
109 :     =back
110 :    
111 :     =cut
112 :    
113 :     sub LoadGenomeFeatures {
114 :     # Get the parameters.
115 :     my ($self, $genomeID) = @_;
116 :     # Get the source object.
117 :     my $fig = $self->source();
118 :     # Get the database.
119 :     my $sapling = $self->db();
120 :     # Get the maximum location segment length. We'll need this later.
121 :     my $maxLength = $sapling->TuningParameter('maxLocationLength');
122 :     # This hash will be used to track identifiers. Each identifier can only
123 :     # be used once.
124 :     my %identifiers;
125 :     # Get all of this genome's features.
126 :     my $featureList = $fig->all_features_detailed_fast($genomeID);
127 :     # Loop through them.
128 :     for my $feature (@$featureList) {
129 :     # Get this feature's data.
130 :     my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
131 :     $assignmentMaker, $quality) = @$feature;
132 :     $self->Track(Features => $fid, 1000);
133 :     # Fix the assignment for non-PEG features.
134 :     if (! defined $assignment) {
135 :     $assignment = $aliases;
136 :     $assignmentMaker ||= 'master';
137 :     }
138 :     # Convert the location string to a list of location objects.
139 :     my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
140 :     # Now we need to run through the locations. We'll put the total sequence
141 :     # length in here.
142 :     my $seqLen = 0;
143 :     # This will track the ordinal position of the current location segment.
144 :     my $locN = 1;
145 :     # Loop through the location objects.
146 :     for my $loc (@locs) {
147 :     # Add this location's length to the total length.
148 :     $seqLen += $loc->Length();
149 :     # Extract the contig ID. Note that we need to prefix the
150 :     # genome ID to make it unique.
151 :     my $contig = $loc->Contig();
152 :     my $contigID = "$genomeID:$contig";
153 :     # We also need the location's direction.
154 :     my $dir = $loc->Dir();
155 :     # Now we peel off sections of the location and connect them
156 :     # to the feature.
157 :     my $peel = $loc->Peel($maxLength);
158 :     while (defined $peel) {
159 :     $self->PutR(IsLocatedIn => $fid, $contigID, beg => $peel->Left(),
160 :     dir => $dir, len => $maxLength, locN => $locN++);
161 :     $peel = $loc->Peel($maxLength);
162 :     }
163 :     # Output the residual. There will always be one, because of the way
164 :     # Peel works.
165 :     $self->PutR(IsLocatedIn => $fid, $contigID, beg => $loc->Left(),
166 :     dir => $dir, len => $loc->Length(), locN => $locN);
167 :     }
168 :     # Emit the feature record.
169 :     $self->PutE(Feature => $fid, feature_type => $type,
170 :     sequence_length => $seqLen, function => $assignment);
171 :     # Connect the feature to its genome.
172 :     $self->PutR(IsOwnerOf => $genomeID, $fid);
173 :     # Now we have a whole bunch of attribute-related stuff to store in
174 :     # secondary Feature tables. First is the evidence codes.
175 :     my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
176 :     for my $evidenceTuple (@evidenceTuples) {
177 :     my (undef, undef, $code) = @$evidenceTuple;
178 :     $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
179 :     }
180 :     # Now we have the external links. These are stored using hyperlink objects.
181 :     my @links = $fig->fid_links($fid);
182 :     for my $link (@links) {
183 :     my $hl = HyperLink->newFromHtml($link);
184 :     $self->PutE(FeatureLink => $fid, link => $hl);
185 :     }
186 :     # Virulence data is next. This is also hyperlink data.
187 :     my @virulenceTuples = $fig->get_attributes($fid, 'virulence_associated%');
188 :     for my $virulenceTuple (@virulenceTuples) {
189 :     my (undef, undef, $text, $url) = @$virulenceTuple;
190 :     my $hl = HyperLink->new($text, $url);
191 :     $self->PutE(FeatureVirulent => $fid, virulent => $hl);
192 :     }
193 :     # Finally, the essentiality stuff, which is the last of the hyperlinks.
194 :     my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
195 :     for my $essentialTuple (@essentials) {
196 :     my (undef, undef, $essentialityType, $url) = @$essentialTuple;
197 :     # Form a hyperlink from this essentiality tuple.
198 :     my $link = HyperLink->new($essentialityType, $url);
199 :     # Store it as essentiality data for this feature.
200 :     $self->PutE(FeatureEssential => $fid, essential => $link);
201 :     }
202 :     # If this is a PEG, we have a protein sequence.
203 :     if ($type eq 'peg') {
204 :     # Get the translation.
205 :     my $proteinSequence = $fig->get_translation($fid);
206 :     # Compute the ID.
207 :     my $proteinID = ERDB::DigestKey($proteinSequence);
208 :     # Create the protein record.
209 :     $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
210 :     # Get the publications for this PEG.
211 :     my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
212 :     for my $pub (@pubs) {
213 :     # Parse out the article title from the data.
214 :     my (undef, undef, $data, $url) = @_;
215 :     my @pieces = split /,/, $data, 3;
216 :     if (defined $pieces[2]) {
217 :     # Create the publication record.
218 :     my $hl = Hyperlink->new($pieces[2], $url);
219 :     my $key = ERDB::DigestKey($url);
220 :     $self->PutE(Publication => $key, citation => $hl);
221 :     # Connect it to the protein.
222 :     $self->PutR(Concerns => $key, $proteinID);
223 :     }
224 :     }
225 :     # Now we need to get the identifiers for this feature and put
226 :     # them in the protein's identifier set. The "1" tells FigPm to
227 :     # send back the database name with each identifier. Note that
228 :     # the FIG ID will come back with this list, but there may not be
229 :     # a list if the genome is new.
230 :     my @idTuples = grep { $_->[0] !~ /^[A-Z][A-Z]_\d+$/ } $fig->get_corresponding_ids($fid, 1); ##HACK: grep out the contig IDs
231 :     if (! @idTuples) {
232 :     push @idTuples, [$fid, 'SEED'];
233 :     }
234 :     # Compute the identifier set name and create the set.
235 :     my $setID = "$proteinID:$genomeID";
236 :     $self->PutE(IdentifierSet => $setID);
237 :     # Create the identifiers and onnect them to the protein and
238 :     # the set.
239 :     for my $idTuple (@idTuples) {
240 :     my ($id, $source) = @$idTuple;
241 :     # Only process this identifier if it's new. An identifier
242 :     # can only be in one identifier set. Thankfully, the
243 :     # identifiers belong to genomes, so we don't need to worry
244 :     # about duplicates in other sections.
245 :     if (exists $identifiers{$id} && $identifiers{$id} ne $proteinID) {
246 :     $self->Add(ambiguousProtein => 1);
247 :     } else {
248 :     $self->PutE(Identifier => $id, source => $source);
249 :     $self->PutR(IsSequenceFor => $proteinID, $id);
250 :     $self->PutR(IncludesIdentifier => $setID, $id);
251 :     $identifiers{$id} = $proteinID;
252 :     }
253 :     }
254 :     }
255 :     }
256 :     }
257 :    
258 :    
259 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3