[Bio] / Sprout / SmallHarvester.pl Repository:
ViewVC logotype

Annotation of /Sprout/SmallHarvester.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use ERDB;
23 :     use FIG;
24 :     use Sapling;
25 :     use Stats;
26 :     use Time::HiRes;
27 :     use FullLocation;
28 :    
29 :     =head1 Small Feature Harvester Script
30 :    
31 :     SmallHarvester [options] <genome1> <genome2> ...
32 :    
33 :     Missing Feature Harvester
34 :    
35 :     =head2 Introduction
36 :    
37 :     This script looks for features in particular genomes that are missing from the
38 :     Sapling database and adds them. Only the basic feature and location
39 :     information is inserted. The rest is left for the next real load.
40 :    
41 :     =head2 Positional Parameters
42 :    
43 :     =over 4
44 :    
45 :     =item genome1, genome2
46 :    
47 :     IDs of the genomes whose features are to be harvested.
48 :    
49 :     =back
50 :    
51 :     =head2 Command-Line Options
52 :    
53 :     =over 4
54 :    
55 :     =item trace
56 :    
57 :     Specifies the tracing level. The higher the tracing level, the more messages
58 :     will appear in the trace log. Use E to specify emergency tracing.
59 :    
60 :     =item user
61 :    
62 :     Name suffix to be used for log files. If omitted, the PID is used.
63 :    
64 :     =item sql
65 :    
66 :     If specified, turns on tracing of SQL activity.
67 :    
68 :     =item background
69 :    
70 :     Save the standard and error output to files. The files will be created
71 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
72 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
73 :     B<user> option above.
74 :    
75 :     =item help
76 :    
77 :     Display this command's parameters and options.
78 :    
79 :     =item warn
80 :    
81 :     Create an event in the RSS feed when an error occurs.
82 :    
83 :     =item phone
84 :    
85 :     Phone number to message when the script is complete.
86 :    
87 :     =back
88 :    
89 :     =cut
90 :    
91 :     # Get the command-line options and parameters.
92 :     my ($options, @parameters) = StandardSetup([qw(ERDB Stats Sapling) ],
93 :     {
94 :     trace => ["2", "tracing level"],
95 :     phone => ["", "phone number (international format) to call when load finishes"],
96 :     },
97 :     "<genome1> <genome2> ...",
98 :     @ARGV);
99 :     # Set a variable to contain return type information.
100 :     my $rtype;
101 :     # Insure we catch errors.
102 :     eval {
103 :     # Create a statistics object to track our progress.
104 :     my $stats = Stats->new();
105 :     # Start a timer.
106 :     my $totalStart = time();
107 :     # Connect to the sapling database.
108 :     my $sap = ERDB::GetDatabase('Sapling');
109 :     # Get the maximum location segment length. We'll need this later.
110 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
111 :     # Get the SEED database.
112 :     my $fig = FIG->new();
113 :     # Loop through the genome list.
114 :     for my $genome (@parameters) {
115 :     $stats->Add(genome => 1);
116 :     Trace("Processing genome $genome.") if T(2);
117 :     # Get this genome's features in the Sapling into a hash. We use this
118 :     # to see which features are missing.
119 :     my %sapFeatures = map { $_ => 1 } $sap->GetFlat("IsOwnerOf",
120 :     'IsOwnerOf(from-link) = ?',
121 :     [$genome], 'to-link');
122 :     # Get the features from the SEED.
123 :     my $featureList = $fig->all_features_detailed($genome);
124 :     # Loop through them.
125 :     for my $featureTuple (@$featureList) {
126 :     Trace($stats->Ask('feature') . " features processed.")
127 :     if $stats->Check(features => 1000);
128 :     # Get the current feature.
129 :     my ($fid, $locs, undef, $type) = @$featureTuple;
130 :     # Is it in the sapling?
131 :     if ($sapFeatures{$fid}) {
132 :     # Yes. Count it.
133 :     $stats->Add(found => 1);
134 :     } else {
135 :     # No, we have to add it, so we need more information.
136 :     # Start with the translation (if any).
137 :     my $translation;
138 :     # Is this a PEG?
139 :     if ($type eq 'peg') {
140 :     # Yes. Get the translation of the protein sequence.
141 :     $translation = $fig->get_translation($fid);
142 :     # Compute the protein's ID.
143 :     my $prot_id = ERDB::DigestKey($translation);
144 :     # Does it already exist?
145 :     if (! $sap->Exists(ProteinSequence => $prot_id)) {
146 :     # No, we must create it.
147 :     $sap->InsertObject('ProteinSequence', id => $prot_id,
148 :     sequence => $translation);
149 :     }
150 :     # Connect it to the new feature.
151 :     $sap->InsertObject('IsProteinFor', from_link => $prot_id,
152 :     to_link => $fid);
153 :     }
154 :     # Now we create the feature itself. Get the location
155 :     # information.
156 :     my $loc = FullLocation->new($fig, $genome, $locs, $translation);
157 :     # Get the sequence length.
158 :     my $sequenceLength = $loc->Length;
159 :     # Get the functional assignment.
160 :     my $function = $fig->function_of($fid);
161 :     # Create the feature.
162 :     $sap->InsertObject('Feature', id => $fid, function => $function,
163 :     locked => 0, sequence_length => $sequenceLength);
164 :     # Connect it to the genome.
165 :     $sap->InsertObject('IsOwnerOf', from_link => $genome,
166 :     to_link => $fid);
167 :     # Create its default identifier.
168 :     $sap->InsertObject('Identifier', id => $fid, natural_form => $fid,
169 :     source => 'SEED');
170 :     $sap->InsertObject('Identifies', from_link => $fid,
171 :     to_link => $fid);
172 :     # Now we must create the location connections. This will track
173 :     # the ordinal position of the current location segment.
174 :     my $locN = 1;
175 :     # Loop through the individual basic location objects.
176 :     for my $loc (@{$loc->Locs}) {
177 :     $stats->Add(locations => 1);
178 :     # Extract the contig ID. Note that we need to prefix the
179 :     # genome ID to convert it to Sapling form.
180 :     my $contig = $loc->Contig();
181 :     my $contigID = "$genome:$contig";
182 :     # Now we peel off sections of the location and connect them
183 :     # to the feature.
184 :     my $peel = $loc->Peel($maxLength);
185 :     while (defined $peel) {
186 :     ConnectPeel($sap, $fid, $contigID, $loc, $locN, $peel,
187 :     $stats);
188 :     $peel = $loc->Peel($maxLength);
189 :     }
190 :     # Output the residual. There will always be one, because of the way
191 :     # Peel works.
192 :     ConnectPeel($sap, $fid, $contigID, $loc, $locN, $peel, $stats);
193 :     }
194 :     }
195 :     }
196 :     }
197 :     # Compute the total time.
198 :     $stats->Add('total-time' => (time() - $totalStart));
199 :     # Display the statistics from this run.
200 :     Trace("Statistics for reload:\n" . $stats->Show()) if T(2);
201 :     };
202 :     if ($@) {
203 :     Trace("Script failed with error: $@") if T(0);
204 :     } else {
205 :     Trace("Script complete.") if T(2);
206 :     }
207 :     if ($options->{phone}) {
208 :     my $msgID = Tracer::SendSMS($options->{phone}, "ERDBReloader completed.");
209 :     if ($msgID) {
210 :     Trace("Phone message sent with ID $msgID.") if T(2);
211 :     } else {
212 :     Trace("Phone message not sent.") if T(2);
213 :     }
214 :     }
215 :    
216 :     ## This is a dink little subroutine that creates a location connection from a
217 :     ## peeled location.
218 :     sub ConnectPeel {
219 :     # Get the parameters.
220 :     my ($sap, $fid, $contigID, $loc, $locN, $peel, $stats) = @_;
221 :     # Output the connection.
222 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contigID,
223 :     ordinal => $locN, begin => $loc->Left(),
224 :     dir => $loc->Dir, len => $loc->Length());
225 :     # Record this segment.
226 :     $stats->Add(segments => 1);
227 :     }
228 :    
229 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3