[Bio] / Sprout / SaplingFamilyLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingFamilyLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingFamilyLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 :     use base qw(SaplingDataLoader);
29 :    
30 :     =head1 Sapling Family Loader
31 :    
32 :     This class reloads FIGfam data into a Sapling database from a specified
33 :     directory. This loader is designed for updating a populated database only. Links
34 :     to features and genomes are put in, but not the features and genomes themselves.
35 :     The Family tables will be dropped and then repopulated from the flat files.
36 :    
37 :     =head2 Main Methods
38 :    
39 :     =head3 Process
40 :    
41 :     my $stats = SaplingFamilyLoader::Process($sap, $directory);
42 :    
43 :     Reload FIGfam data from the specified directory. The existing data will be
44 :     deleted.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object for accessing the database.
51 :    
52 :     =item directory
53 :    
54 :     Name of the directory containing the FIGfam data files.
55 :    
56 :     =item RETURN
57 :    
58 :     Returns a statistics object describing the activity during the reload.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
# Static entry point for a complete FIGfam reload.
#
# Parameters:
#   $sap        Sapling database object; its family tables are emptied here.
#   $directory  Name of the directory holding the FIGfam flat files.
#
# Returns the statistics object handed back by the loader's Load method.
sub Process {
    my ($sap, $directory) = @_;
    # The loader is constructed before any table is emptied.
    my $loader = SaplingFamilyLoader->new($sap, $directory);
    # Empty each family-related table so the load starts from a clean slate.
    foreach my $tableName ('Family', 'HasMember', 'IsCoupledTo', 'IsFamilyFor',
                           'HasRepresentativeOf') {
        if (T(2)) {
            Trace("Clearing $tableName.");
        }
        $sap->TruncateTable($tableName);
    }
    # Repopulate the tables from the flat files and pass the statistics back.
    my $loadStats = $loader->Load();
    return $loadStats;
}
81 :    
82 :    
83 :     =head2 Loader Object Methods
84 :    
85 :     =head3 new
86 :    
87 :     my $loaderObject = SaplingFamilyLoader->new($sap, $directory);
88 :    
89 :     Create a loader object that can be used to facilitate loading Sapling data from a
90 :     FIGfam release directory.
91 :    
92 :     =over 4
93 :    
94 :     =item sap
95 :    
96 :     L<Sapling> object used to access the target database.
97 :    
98 :     =item directory
99 :    
100 :     Name of the directory containing the FIGfam release.
101 :    
102 :     =back
103 :    
104 :     The object created contains the following fields.
105 :    
106 :     =over 4
107 :    
108 :     =item supportRecords
109 :    
110 :     A hash of hashes, used to track the support records known to exist in the database.
111 :    
112 :     =item sap
113 :    
114 :     L<Sapling> object used to access the database.
115 :    
116 :     =item stats
117 :    
118 :     L<Stats> object for tracking statistical information about the load.
119 :    
120 :     =item directory
121 :    
122 :     Name of the directory containing the FIGfam data.
123 :    
124 :     =back
125 :    
126 :     =cut
127 :    
# Constructor for the FIGfam loader.
#
# Parameters:
#   $class      Name of the class to instantiate.
#   $sap        Sapling database object to load into.
#   $directory  Name of the directory holding the FIGfam release.
#
# Returns the new loader object, with the directory name stored in its
# "directory" field.
sub new {
    my ($class, $sap, $directory) = @_;
    # The base-class constructor builds the underlying object. 'FIGfams' is
    # passed through as its trailing argument -- presumably a label for this
    # kind of load; confirm against SaplingDataLoader::new.
    my $self = SaplingDataLoader::new($class, $sap, 'FIGfams');
    # Remember where the flat files live; Load reads this field later.
    $self->{directory} = $directory;
    return $self;
}
138 :    
139 :     =head2 Internal Utility Methods
140 :    
141 :     =head3 Load
142 :    
143 :     my $stats = $loader->Load();
144 :    
145 :     Load the FIGfam data from the previously selected directory into the database.
146 :     The family tables should exist, but must be empty. The statistics from the load
147 :     will be returned.
148 :    
149 :     =cut
150 :    
# Load the FIGfam data from this loader's directory into the database.
#
# Three tab-delimited files in the directory are read, in this order:
#   family.functions  (family ID, function)    -> Family, IsFamilyFor
#   families.2c       (family ID, feature ID)  -> HasMember, HasRepresentativeOf
#   coupling.values   (from, to, expression score, contig score) -> IsCoupledTo
#
# Every record read and every row inserted is counted in the loader's
# statistics object, which is returned to the caller.
sub Load {
    # Get the parameters.
    my ($self) = @_;
    # Get the Sapling database object and the statistics object.
    my $sap = $self->{sap};
    my $stats = $self->{stats};
    # Get the name of the FIGfam release directory.
    my $figFamDir = $self->{directory};
    Trace("FIGfams will be loaded from $figFamDir.") if T(SaplingDataLoader => 2);
    # We will keep the FIGfam IDs in here. We need them to filter the coupling
    # file.
    my %figFams;
    # Read the family functions.
    Trace("Processing family functions.") if T(SaplingDataLoader => 2);
    my $ih = Open(undef, "<$figFamDir/family.functions");
    while (! eof $ih) {
        my ($fam, $function) = Tracer::GetLine($ih);
        $stats->Add(familyFunctionRecord => 1);
        # A record without a function column is loaded with an empty function.
        if (! defined $function) {
            $function = "";
            $stats->Add(missingFamilyFunction => 1);
        }
        # Output the family record.
        $sap->InsertObject('Family', id => $fam, family_function => $function);
        $stats->Add('insert-Family' => 1);
        # Remember that this is a valid family.
        $figFams{$fam} = 1;
        # Connect the family to its roles.
        my ($roles, $errors) = SeedUtils::roles_for_loading($function);
        if (! defined $roles) {
            # Here the family function was suspicious.
            $stats->Add(suspiciousFamilyFunction => 1);
        } else {
            # Here we have a good function.
            for my $role (@$roles) {
                $stats->Add(figfamRole => 1);
                $sap->InsertObject('IsFamilyFor', from_link => $fam,
                                   to_link => $role);
                $stats->Add('insert-IsFamilyFor' => 1);
            }
            $stats->Add(badFigfamRoles => $errors);
        }
    }
    close $ih;
    # Now we need to process the memberships. This hash will map each family
    # to a hash of the associated genomes.
    my %famGenomes;
    # We also need a list of the genomes in the database, so that we only
    # process features in those genomes.
    my %genomeHash = map { $_ => 1 } $sap->GetFlat('Genome', "", [], 'id');
    # Read the memberships.
    Trace("Processing family memberships.") if T(SaplingDataLoader => 2);
    $ih = Open(undef, "<$figFamDir/families.2c");
    while (! eof $ih) {
        my ($fam, $featureID) = Tracer::GetLine($ih);
        $stats->Add(familyFeatureRecord => 1);
        # Extract the genome ID. The defined-check keeps a short line (no
        # feature column) from raising an uninitialized-value warning; such
        # a line is skipped exactly as a non-matching feature ID is.
        if (defined $featureID && $featureID =~ /^fig\|(\d+\.\d+)/) {
            # Insure it's one of ours.
            my $genomeID = $1;
            if (! $genomeHash{$genomeID}) {
                $stats->Add(familyFeatureNotInDb => 1);
            } else {
                # It is. Connect the family to the feature.
                $sap->InsertObject('HasMember', from_link => $fam,
                                   to_link => $featureID);
                $stats->Add('insert-HasMember' => 1);
                # Connect the family to the genome. The saved copy is used
                # rather than $1 so this does not depend on the capture
                # variable surviving the method calls above.
                $famGenomes{$fam}{$genomeID} = 1;
            }
        }
    }
    close $ih;
    # Connect the FIGfams to the genomes found.
    Trace("Connecting families to genomes.") if T(SaplingDataLoader => 2);
    for my $fam (keys %famGenomes) {
        my $genomeH = $famGenomes{$fam};
        for my $genome (keys %$genomeH) {
            $sap->InsertObject('HasRepresentativeOf', from_link => $genome,
                               to_link => $fam);
            $stats->Add('insert-HasRepresentativeOf' => 1);
        }
    }
    # Now read the coupling data.
    Trace("Processing coupling data.") if T(SaplingDataLoader => 2);
    $ih = Open(undef, "<$figFamDir/coupling.values");
    while (! eof $ih) {
        my ($from, $to, $expScore, $contigScore) = Tracer::GetLine($ih);
        $stats->Add(familyCouplingRecord => 1);
        # Verify that both FIGfams are ours and are distinct.
        if (! $figFams{$from} || ! $figFams{$to}) {
            $stats->Add(couplingFigFamNotFound => 1);
        } elsif ($from eq $to) {
            $stats->Add(couplingFigFamReflexive => 1);
        } else {
            # Everything's okay, so we can connect the two figfams together.
            # Insure the ordering is correct: the lower ID becomes the
            # from-link. Family IDs are handled as strings throughout this
            # method (hash keys, "eq" above), so the comparison must be a
            # string one; a numeric ">" would evaluate non-numeric IDs as 0
            # and never swap them.
            if ($from gt $to) {
                ($from, $to) = ($to, $from);
            }
            # Forge the connection.
            $sap->InsertObject('IsCoupledTo', from_link => $from,
                               to_link => $to, co_occurrence_evidence => $contigScore,
                               co_expression_evidence => $expScore);
            $stats->Add('insert-IsCoupledTo' => 1);
        }
    }
    close $ih;
    # Return the statistics.
    return $stats;
}
262 :    
263 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3