[Bio] / Sprout / SubsystemSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SubsystemSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SubsystemSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Subsystem Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major subsystem-based tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = SproutLoader->new($erdb, $source, $options, @tables);
36 :    
37 :     Construct a new SproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     sub new {
58 :     # Get the parameters.
59 : parrello 1.2 my ($class, $erdb, $options) = @_;
60 : parrello 1.1 # Create the table list.
61 :     my @tables = sort qw(Diagram RoleOccursIn Subsystem Role RoleEC IsIdentifiedByEC Catalyzes SSCell ContainsFeature IsGenomeOf IsRoleOf OccursInSubsystem ParticipatesIn HasSSCell RoleSubset GenomeSubset ConsistsOfRoles ConsistsOfGenomes HasRoleSubset HasGenomeSubset SubsystemClass SubsystemHopeNotes);
62 :     # Create the BaseSproutLoader object.
63 : parrello 1.2 my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
64 : parrello 1.1 # Return it.
65 :     return $retVal;
66 :     }
67 :    
68 :     =head2 Public Methods
69 :    
70 :     =head3 Generate
71 :    
72 :     $sl->Generate();
73 :    
74 :     Generate the data for the subsystem-based files.
75 :    
76 :     =cut
77 :    
78 :     sub Generate {
79 :     # Get the parameters.
80 :     my ($self) = @_;
81 :     # Get the sprout object.
82 :     my $sprout = $self->db();
83 :     # Get the FIG object.
84 :     my $fig = $self->source();
85 :     # Get the subsystem list.
86 :     my $subHash = $self->GetSubsystems();
87 : parrello 1.3 # Check the section type.
88 : parrello 1.1 if ($self->global()) {
89 :     # In global mode, we generate the subsystem/role data.
90 :     Trace("Generating subsystem data.") if T(2);
91 : parrello 1.3 # Get the list of Sprout genomes. We need this for pruning the genome subsets.
92 :     # The genomes are the section IDs that look like genome numbers.
93 :     my %genomeHash = map { $_ => 1 } grep { $_ =~ /\d+\.\d+/ } BaseSproutLoader::GetSectionList($sprout, $fig);
94 : parrello 1.1 # This hash will contain the roles for each EC. When we're done, this
95 :     # information will be used to generate the Catalyzes table.
96 :     my %ecToRoles = ();
97 :     # Loop through the subsystems. Our first task will be to create the
98 :     # roles. We do this by looping through the subsystems and creating a
99 :     # role hash. The hash tracks each role ID so that we don't create
100 :     # duplicates. As we move along, we'll connect the roles and subsystems
101 :     # and memorize up the reactions.
102 :     my ($genomeID, $roleID);
103 :     my %roleData = ();
104 :     for my $subsysID (sort keys %$subHash) {
105 :     # Get the subsystem object.
106 :     my $sub = $fig->get_subsystem($subsysID);
107 :     # Only proceed if the subsystem has a spreadsheet.
108 :     if (defined($sub) && ! $sub->{empty_ss}) {
109 :     Trace("Creating subsystem $subsysID.") if T(3);
110 :     $self->Add("subsystemIn");
111 : parrello 1.2 # Get the subsystem description. If it's undefined we change it to a
112 :     # null string.
113 :     my $description = $sub->get_description() || "";
114 : parrello 1.1 # Create the subsystem record.
115 :     $self->PutE(Subsystem => $subsysID, curator => $sub->get_curator(),
116 : parrello 1.2 version => $sub->get_version(), description => $description,
117 : parrello 1.1 notes => $sub->get_notes);
118 :     # Add the hope notes.
119 :     my $hopeNotes = $sub->get_hope_curation_notes();
120 :     if ($hopeNotes) {
121 : parrello 1.2 $self->PutE(SubsystemHopeNotes => $subsysID, 'hope-curation-notes' => $hopeNotes);
122 : parrello 1.1 }
123 :     # Now for the classification string. This comes back as a list
124 :     # reference and we convert it to a splitter-delimited string.
125 :     my $classList = $fig->subsystem_classification($subsysID);
126 :     my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
127 :     $self->PutE(SubsystemClass => $subsysID, classification => $classString);
128 :     # Connect the subsystem to its roles. Each role is a column in the subsystem spreadsheet.
129 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
130 :     # Get the role's abbreviation.
131 :     my $abbr = $sub->get_role_abbr($col);
132 : parrello 1.2 # Get its relevance.
133 :     my $aux = ($fig->is_aux_role_in_subsystem($subsysID, $roleID) ? 1 : 0);
134 : parrello 1.1 # Get its reaction note.
135 :     my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
136 :     # Connect to this role.
137 :     $self->Add(roleIn => 1);
138 :     $self->PutR(OccursInSubsystem => $roleID, $subsysID, abbr => $abbr, auxiliary => $aux,
139 :     'column-number' => $col, 'hope-reaction-note' => $hope_note);
140 :     # If it's a new role, add it to the role table.
141 :     if (! exists $roleData{$roleID}) {
142 :     # Add the role.
143 :     $self->Put('Role', id => $roleID);
144 :     $roleData{$roleID} = 1;
145 :     # Check for an EC number.
146 :     if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
147 :     my $ec = $1;
148 :     $self->PutR(IsIdentifiedByEC => $roleID, $ec);
149 :     # Check to see if this is our first encounter with this EC.
150 :     if (exists $ecToRoles{$ec}) {
151 :     # No, so just add this role to the EC list.
152 :     push @{$ecToRoles{$ec}}, $roleID;
153 :     } else {
154 :     # Output this EC.
155 :     $self->PutE(RoleEC => $ec);
156 :     # Create its role list.
157 :     $ecToRoles{$ec} = [$roleID];
158 :     }
159 :     }
160 :     }
161 :     }
162 : parrello 1.2 # Connect this subsystem's roles to its reactions.
163 :     my %reactions = $sub->get_hope_reactions();
164 :     for my $role (keys %reactions) {
165 :     my @reactions = @{$reactions{$role}};
166 :     for my $reaction (@reactions) {
167 :     $self->PutR(Catalyzes => $role, $reaction);
168 : parrello 1.1 }
169 :     }
170 :     }
171 : parrello 1.2 # Now we need to generate the subsets. The subset names must be concatenated to
172 :     # the subsystem name to make them unique keys. There are two types of subsets:
173 :     # genome subsets and role subsets. We do the role subsets first.
174 :     my @subsetNames = $sub->get_subset_names();
175 :     for my $subsetID (@subsetNames) {
176 :     # Create the subset record.
177 :     my $actualID = "$subsysID:$subsetID";
178 :     $self->PutE(RoleSubset => $actualID);
179 :     # Connect the subset to the subsystem.
180 :     $self->PutR(HasRoleSubset => $subsysID, $actualID);
181 :     # Connect the subset to its roles.
182 :     my @roles = $sub->get_subsetC_roles($subsetID);
183 :     for my $roleID (@roles) {
184 :     $self->PutR(ConsistsOfRoles => $actualID, $roleID);
185 :     }
186 :     }
187 :     # Next the genome subsets.
188 :     @subsetNames = $sub->get_subset_namesR();
189 :     for my $subsetID (@subsetNames) {
190 :     # Create the subset record.
191 :     my $actualID = "$subsysID:$subsetID";
192 :     $self->PutE(GenomeSubset => $actualID);
193 :     # Connect the subset to the subsystem.
194 :     $self->PutR(HasGenomeSubset => $subsysID, $actualID);
195 :     # Connect the subset to its genomes.
196 :     my @genomes = $sub->get_subsetR($subsetID);
197 :     for my $genomeID (@genomes) {
198 : parrello 1.3 # Only include genomes that are ours.
199 :     if ($genomeHash{$genomeID}) {
200 :     $self->PutR(ConsistsOfGenomes => $actualID, $genomeID);
201 :     }
202 : parrello 1.2 }
203 :     }
204 : parrello 1.4 # Clear the subsystem cache to make room for more data.
205 :     $fig->clear_subsystem_cache();
206 : parrello 1.1 }
207 :     # Now we loop through the diagrams. We need to create the diagram records
208 :     # and link each diagram to its roles. Note that only roles which occur
209 :     # in subsystems (and therefore appear in the %ecToRoles hash) are
210 :     # included.
211 :     for my $map ($fig->all_maps()) {
212 :     Trace("Loading diagram $map.") if T(3);
213 :     # Get the diagram's descriptive name.
214 :     my $name = $fig->map_name($map);
215 :     $self->PutE(Diagram => $map, name => $name);
216 :     # Now we need to link all the map's roles to it.
217 :     # A hash is used to prevent duplicates.
218 :     my %roleHash = ();
219 :     for my $ec ($fig->map_to_ecs($map)) {
220 :     if (exists $ecToRoles{$ec}) {
221 :     for my $role (@{$ecToRoles{$ec}}) {
222 :     if (! $roleHash{$role}) {
223 :     $self->PutR(RoleOccursIn => $role, $map);
224 :     $roleHash{$role} = 1;
225 :     }
226 :     }
227 :     }
228 :     }
229 :     }
230 :     } else {
231 :     # Here we have a genome section.
232 :     my $genomeID = $self->section();
233 :     Trace("Connecting $genomeID to subsystems.") if T(3);
234 :     # Get the subsystem data for this genome.
235 :     my %subLists = $fig->get_all_subsystem_pegs($genomeID);
236 :     # Loop through the subsystems.
237 :     for my $subsysID (sort keys %subLists) {
238 : parrello 1.2 # Only proceed if this subsystem is one of ours.
239 :     if (exists $subHash->{$subsysID}) {
240 :     Trace("Processing subsystem $subsysID.") if T(3);
241 :     # Get the subsystem's roles. We create a hash that maps role ID to its
242 :     # column index.
243 :     my $col = 0;
244 :     my $subObject = $fig->get_subsystem($subsysID);
245 :     my %roles = map { $_ => $col++ } $subObject->get_roles();
246 :     # Create a list for the PEGs we find. This list will be used
247 :     # to generate cluster numbers.
248 :     my %pegsFound = ();
249 :     my $aPegFound = 0;
250 :     # Create hashes that maps spreadsheet IDs to pegs in cells.
251 :     # We will use this to generate the ContainsFeature data
252 :     # after we have the cluster numbers.
253 :     my %cellPegs = ();
254 :     # We'll stash the variant code in here.
255 :     my $variant;
256 :     # Loop through the subsystem's pegs.
257 :     for my $pegTuple (@{$subLists{$subsysID}}) {
258 :     my ($roleID, $pegID, $variantCode) = @$pegTuple;
259 :     # Save the variant ID. (It should be the same for each peg.)
260 :     $variant = $variantCode;
261 :     # Only proceed if this feature exists.
262 :     if ($fig->is_deleted_fid($pegID)) {
263 :     $self->Add('deleted-pegs' => 1);
264 :     } else {
265 :     # Get the column number.
266 :     my $col = $roles{$roleID};
267 :     # Compute the spreadsheet cell ID.
268 :     my $cellID = ERDB::DigestKey("$subsysID:$genomeID:$col");
269 :     # Remember this feature. Note we put -1 in the pegs-found
270 :     # hash. This is to help us when we generate cluster numbers.
271 :     $pegsFound{$pegID} = -1;
272 :     $aPegFound = 1;
273 :     # Record this cells.
274 :     push @{$cellPegs{$cellID}}, $pegID;
275 :     }
276 :     }
277 :     # Generate all the subsystem's spreadsheet cells for this genome.
278 :     for my $roleID (keys %roles) {
279 :     # Compute the cell ID.
280 :     my $columnNumber = $roles{$roleID};
281 :     my $cellID = ERDB::DigestKey("$subsysID:$genomeID:$columnNumber");
282 :     # Connect it to the subsystem.
283 :     $self->PutE(SSCell => $cellID, column_number => $columnNumber);
284 :     $self->PutR(HasSSCell => $subsysID, $cellID);
285 :     $self->PutR(IsRoleOf => $roleID, $cellID);
286 :     $self->PutR(IsGenomeOf => $genomeID, $cellID);
287 : parrello 1.1 }
288 : parrello 1.2 # If we found some cells for this genome, we need to compute clusters and
289 :     # denote it participates in the subsystem.
290 :     if ($aPegFound) {
291 :     # Connect the genome to the subsystem.
292 :     $self->PutR(ParticipatesIn => $genomeID, $subsysID,
293 :     'variant-code' => $variant);
294 :     # Partition the PEGs found into clusters.
295 :     my @clusters = $fig->compute_clusters([keys %pegsFound], $subObject);
296 :     for (my $i = 0; $i <= $#clusters; $i++) {
297 :     my $subList = $clusters[$i];
298 :     for my $peg (@{$subList}) {
299 :     $pegsFound{$peg} = $i;
300 :     }
301 :     }
302 :     # Create the ContainsFeature data.
303 :     for my $cellID (keys %cellPegs) {
304 :     my $cellList = $cellPegs{$cellID};
305 :     for my $cellPeg (@$cellList) {
306 :     $self->PutR(ContainsFeature => $cellID, $cellPeg,
307 :     'cluster-number' => $pegsFound{$cellPeg});
308 :     }
309 : parrello 1.1 }
310 :     }
311 : parrello 1.4 # Clear the cache to make room for more subsystems.
312 :     $fig->clear_subsystem_cache();
313 : parrello 1.1 }
314 :     }
315 :     }
316 :     }
317 :    
318 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3