[Bio] / Sprout / ProteinSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/ProteinSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package ProteinSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.4 use AliasAnalysis;
26 : parrello 1.1 use base 'BaseSaplingLoader';
27 :    
28 :     =head1 Sapling Protein Load Group Class
29 :    
30 :     =head2 Introduction
31 :    
32 :     The Protein Load Group includes all of the major protein and annotation data tables.
33 :    
34 :     =head3 new
35 :    
36 :     my $sl = ProteinSaplingLoader->new($erdb, $options, @tables);
37 :    
38 :     Construct a new ProteinSaplingLoader object.
39 :    
40 :     =over 4
41 :    
42 :     =item erdb
43 :    
44 : parrello 1.2 L<Sapling> object for the database being loaded.
45 : parrello 1.1
46 :     =item options
47 :    
48 :     Reference to a hash of command-line options.
49 :    
50 :     =item tables
51 :    
52 :     List of tables in this load group.
53 :    
54 :     =back
55 :    
56 :     =cut
57 :    
sub new {
    # The class name comes first; the database object and the option hash
    # follow it. Any further arguments are ignored: the table list is fixed.
    my $class = shift;
    my ($erdb, $options) = @_;
    # Tables belonging to this load group, listed in sorted (string) order.
    my @groupTables = qw(Annotation HasAssertionFrom IsAnnotatedBy Source);
    # Hand everything to the base-class constructor and pass its result
    # straight back to the caller.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options, @groupTables);
    return $loader;
}
68 :    
69 :     =head3 BLACKLIST
70 :    
71 :     BLACKLIST is a reference to a hash of protein sources to be ignored
72 :     when processing non-expert assertions. For example, if C<SEED> were included
73 :     in the list, then the SEED subdirectory of the non-expert assertion cluster
74 :     would be bypassed.
75 :    
76 :     =cut
77 :    
# Sources whose non-expert assertion directories are skipped; each listed
# name maps to a true value so membership is a plain hash lookup.
use constant BLACKLIST => { map { ($_ => 1) } qw(SEED NMPDR) };
79 :    
80 :     =head2 Public Methods
81 :    
82 :     =head3 Generate
83 :    
84 :     $sl->Generate();
85 :    
86 :     Generate the data for the protein and annotation data files.
87 :    
88 :     =cut
89 :    
sub Generate {
    # Get the parameters.
    my ($self) = @_;
    # Get the database object. It is not referenced below; the accessor call
    # is retained as-is. NOTE(review): if db() is a plain accessor this local
    # can be removed.
    my $erdb = $self->db();
    # Get the source object.
    my $fig = $self->source();
    # Is this the global section?
    if ($self->global()) {
        # Yes. We do the assertions here. First, the expert assertions.
        # These are taken from the expert assertion file on the FTP. It has to be
        # unzipped so we can read it.
        Trace("Reading expert assertions.") if T(ERDBLoadGroup => 3);
        my $ah = Open(undef, "gunzip -cd /vol/ftp.theseed.org/AnnotationClearingHouse/ach_expert_assertions.gz |");
        # We'll track the assertion sources in this hash so that each source
        # record is emitted only once.
        my %sources;
        while (! eof $ah) {
            # Get the current assertion from the file. The third column is
            # not used.
            my ($id, $function, undef, $source) = Tracer::GetLine($ah);
            $self->Add("assertions-expert" => 1);
            $self->Track(assertionRows => $id, 1000);
            # Strip trailing whitespace from the function. The guard avoids an
            # uninitialized-value warning on a row with no function column.
            $function =~ s/\s+$// if defined $function;
            # Insure this user has a source record.
            if (! exists $sources{$source}) {
                $self->PutE(Source => $source);
                $sources{$source} = 1;
            }
            # Attach his assertion to the identifier as an expert assertion.
            $self->PutR(HasAssertionFrom => $id, $source, function => $function,
                        expert => 1);
        }
        # Closing a pipe handle waits for the child process; a false return
        # means gunzip (or the close itself) failed, so make that visible
        # instead of silently accepting a possibly truncated file.
        if (! close $ah) {
            Trace("Error closing expert assertion pipe (status $?).") if T(ERDBLoadGroup => 1);
        }
        # Create the SEED source. Its data is loaded during the section processing.
        $self->PutE(Source => 'SEED');
        # Now we need the non-expert assertions. These are kept in flat
        # files called "assigned_functions" in the $FIG_Config::NR subdirectory.
        # The sub-directory names are used for the source.
        my $nr_directory = $FIG_Config::NR;
        my @sources = Tracer::OpenDir($nr_directory, 1);
        # Loop through the sources.
        for my $source (@sources) {
            # Insure this is a source we want.
            if (BLACKLIST->{$source}) {
                Trace("Assertions ignored for blacklisted source $source.") if T(ERDBLoadGroup => 3);
            } else {
                # Check for a non-empty assigned-functions file.
                my $functionFile = "$nr_directory/$source/assigned_functions";
                if (-s $functionFile) {
                    # Put this source in the source table.
                    $self->PutE(Source => $source);
                    Trace("Processing assertions for $source.") if T(ERDBLoadGroup => 3);
                    # Loop through the assigned function file.
                    my $ih = Open(undef, "<$functionFile");
                    while (! eof $ih) {
                        # Get the identifier and function from this row.
                        my ($fid, $function) = Tracer::GetLine($ih);
                        # If this is a RefSeq ID, convert it to its normal form.
                        $fid =~ s/^ref\|// if defined $fid;
                        # Count this identifier.
                        $self->Track(nrIdentifiers => $fid, 10000);
                        # Insure this identifier has a valid function. The test
                        # must come before the function is touched; otherwise a
                        # row with no function column triggers a warning.
                        if (! defined $function) {
                            $self->Add("badFunction-$source" => 1);
                        } else {
                            # Strip trailing whitespace from the function.
                            $function =~ s/\s+$//;
                            # Put it in the assertion relationship as a
                            # non-expert assertion.
                            $self->PutR(HasAssertionFrom => $fid, $source,
                                        function => $function, expert => 0);
                            $self->Add("goodFunction-$source" => 1);
                        }
                    }
                    # Release the handle before moving to the next source.
                    close $ih;
                }
            }
        }
    } else {
        # Get the section ID. For this group the section is a genome ID.
        my $genomeID = $self->section();
        # Now we process the annotations for the specified genome.
        # Create a hash of timestamps. We use this to prevent duplicate time stamps
        # from showing up for a single PEG's annotations.
        my %seenTimestamps = ();
        # Get the genome's annotations.
        my @annotations = $fig->read_all_annotations($genomeID);
        Trace("Processing annotations.") if T(2);
        for my $tuple (@annotations) {
            # Get the annotation tuple.
            my ($peg, $timestamp, $user, $text) = @{$tuple};
            $self->Track(Annotations => "$peg:$timestamp", 1000);
            # Change assignments by the master user to FIG assignments.
            $text =~ s/Set master function/Set FIG function/s if defined $text;
            # Insure the time stamp is valid.
            if ($timestamp =~ /^\d+$/) {
                # Here it's a number. We need to insure the one we use to form
                # the key is unique, so bump it until it is unused for this PEG.
                my $keyStamp = $timestamp;
                while ($seenTimestamps{"$peg:$keyStamp"}) {
                    $keyStamp++;
                }
                # The ID embeds (9999999999 - key stamp), zero-padded to 10
                # characters; the stored annotation_time keeps the original
                # time stamp.
                my $annotationID = "$peg:" . Tracer::Pad(9999999999 - $keyStamp, 10,
                                                         1, "0");
                $seenTimestamps{"$peg:$keyStamp"} = 1;
                # Generate the annotation.
                $self->PutE(Annotation => $annotationID, annotation_time => $timestamp,
                            comment => $text, annotator => $user);
                $self->PutR(IsAnnotatedBy => $peg, $annotationID);
            } else {
                # Here we have an invalid time stamp.
                Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(ERDBLoadGroup => 1);
            }
        }
        # Get the genome's assertions. These serve as the non-expert assertions
        # from the SEED.
        Trace("Processing assertions.") if T(2);
        my $featureFile = "$FIG_Config::organisms/$genomeID/assigned_functions";
        if (! -f $featureFile) {
            Trace("Missing $featureFile for $genomeID.") if T(1);
            $self->Add(missingAssignedFunction => 1);
        } else {
            my $ih = Open(undef, "<$featureFile");
            while (! eof $ih) {
                # Get the FIG ID and function from this row.
                my ($fid, $function) = Tracer::GetLine($ih);
                # Count this ID.
                $self->Track(figAssertions => $fid, 5000);
                # Insure this identifier has a valid function.
                if (! defined $function) {
                    $self->Add("badFunction-SEED" => 1);
                } else {
                    # It does, so put it in the assertion relationship.
                    $self->PutR(HasAssertionFrom => $fid, 'SEED',
                                function => $function, expert => 0);
                    $self->Add("goodFunction-SEED" => 1);
                }
            }
            # Release the handle now that the file has been consumed.
            close $ih;
        }
    }
}
230 :    
231 :     =head3 PostProcess
232 :    
233 :     my $stats = $sl->PostProcess();
234 :    
235 :     Post-process the load files for this group. This method is called after all
236 :     of the load files have been assembled, but before anything is actually loaded.
237 :    
238 :     This method returns a statistics object describing the post-processing activity,
239 :     or an undefined value if nothing happened.
240 :    
241 :     For the Protein group, the post-processing removes assertions for identifiers
242 :     that are not in our database.
243 :    
244 :     =cut
245 :    
sub PostProcess {
    my $self = shift;
    # Filter the HasAssertionFrom load file on its "from" side and hand the
    # resulting value back to the caller unchanged.
    my $stats = $self->FilterRelationship(from => 'HasAssertionFrom');
    return $stats;
}
251 :    
252 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3