[Bio] / Sprout / SubsystemSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SubsystemSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SubsystemSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Subsystem Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major subsystem-based tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = SproutLoader->new($erdb, $source, $options, @tables);
36 :    
37 :     Construct a new SproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     sub new {
58 :     # Get the parameters.
59 : parrello 1.2 my ($class, $erdb, $options) = @_;
60 : parrello 1.1 # Create the table list.
61 :     my @tables = sort qw(Diagram RoleOccursIn Subsystem Role RoleEC IsIdentifiedByEC Catalyzes SSCell ContainsFeature IsGenomeOf IsRoleOf OccursInSubsystem ParticipatesIn HasSSCell RoleSubset GenomeSubset ConsistsOfRoles ConsistsOfGenomes HasRoleSubset HasGenomeSubset SubsystemClass SubsystemHopeNotes);
62 :     # Create the BaseSproutLoader object.
63 : parrello 1.2 my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
64 : parrello 1.1 # Return it.
65 :     return $retVal;
66 :     }
67 :    
68 :     =head2 Public Methods
69 :    
70 :     =head3 Generate
71 :    
72 :     $sl->Generate();
73 :    
74 :     Generate the data for the subsystem-based files.
75 :    
76 :     =cut
77 :    
78 :     sub Generate {
79 :     # Get the parameters.
80 :     my ($self) = @_;
81 :     # Get the sprout object.
82 :     my $sprout = $self->db();
83 :     # Get the FIG object.
84 :     my $fig = $self->source();
85 :     # Get the subsystem list.
86 :     my $subHash = $self->GetSubsystems();
87 :     if ($self->global()) {
88 :     # In global mode, we generate the subsystem/role data.
89 :     Trace("Generating subsystem data.") if T(2);
90 :     # This hash will contain the roles for each EC. When we're done, this
91 :     # information will be used to generate the Catalyzes table.
92 :     my %ecToRoles = ();
93 :     # Loop through the subsystems. Our first task will be to create the
94 :     # roles. We do this by looping through the subsystems and creating a
95 :     # role hash. The hash tracks each role ID so that we don't create
96 :     # duplicates. As we move along, we'll connect the roles and subsystems
97 :     # and memorize up the reactions.
98 :     my ($genomeID, $roleID);
99 :     my %roleData = ();
100 :     for my $subsysID (sort keys %$subHash) {
101 :     # Get the subsystem object.
102 :     my $sub = $fig->get_subsystem($subsysID);
103 :     # Only proceed if the subsystem has a spreadsheet.
104 :     if (defined($sub) && ! $sub->{empty_ss}) {
105 :     Trace("Creating subsystem $subsysID.") if T(3);
106 :     $self->Add("subsystemIn");
107 : parrello 1.2 # Get the subsystem description. If it's undefined we change it to a
108 :     # null string.
109 :     my $description = $sub->get_description() || "";
110 : parrello 1.1 # Create the subsystem record.
111 :     $self->PutE(Subsystem => $subsysID, curator => $sub->get_curator(),
112 : parrello 1.2 version => $sub->get_version(), description => $description,
113 : parrello 1.1 notes => $sub->get_notes);
114 :     # Add the hope notes.
115 :     my $hopeNotes = $sub->get_hope_curation_notes();
116 :     if ($hopeNotes) {
117 : parrello 1.2 $self->PutE(SubsystemHopeNotes => $subsysID, 'hope-curation-notes' => $hopeNotes);
118 : parrello 1.1 }
119 :     # Now for the classification string. This comes back as a list
120 :     # reference and we convert it to a splitter-delimited string.
121 :     my $classList = $fig->subsystem_classification($subsysID);
122 :     my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
123 :     $self->PutE(SubsystemClass => $subsysID, classification => $classString);
124 :     # Connect the subsystem to its roles. Each role is a column in the subsystem spreadsheet.
125 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
126 :     # Get the role's abbreviation.
127 :     my $abbr = $sub->get_role_abbr($col);
128 : parrello 1.2 # Get its relevance.
129 :     my $aux = ($fig->is_aux_role_in_subsystem($subsysID, $roleID) ? 1 : 0);
130 : parrello 1.1 # Get its reaction note.
131 :     my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
132 :     # Connect to this role.
133 :     $self->Add(roleIn => 1);
134 :     $self->PutR(OccursInSubsystem => $roleID, $subsysID, abbr => $abbr, auxiliary => $aux,
135 :     'column-number' => $col, 'hope-reaction-note' => $hope_note);
136 :     # If it's a new role, add it to the role table.
137 :     if (! exists $roleData{$roleID}) {
138 :     # Add the role.
139 :     $self->Put('Role', id => $roleID);
140 :     $roleData{$roleID} = 1;
141 :     # Check for an EC number.
142 :     if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
143 :     my $ec = $1;
144 :     $self->PutR(IsIdentifiedByEC => $roleID, $ec);
145 :     # Check to see if this is our first encounter with this EC.
146 :     if (exists $ecToRoles{$ec}) {
147 :     # No, so just add this role to the EC list.
148 :     push @{$ecToRoles{$ec}}, $roleID;
149 :     } else {
150 :     # Output this EC.
151 :     $self->PutE(RoleEC => $ec);
152 :     # Create its role list.
153 :     $ecToRoles{$ec} = [$roleID];
154 :     }
155 :     }
156 :     }
157 :     }
158 : parrello 1.2 # Connect this subsystem's roles to its reactions.
159 :     my %reactions = $sub->get_hope_reactions();
160 :     for my $role (keys %reactions) {
161 :     my @reactions = @{$reactions{$role}};
162 :     for my $reaction (@reactions) {
163 :     $self->PutR(Catalyzes => $role, $reaction);
164 : parrello 1.1 }
165 :     }
166 :     }
167 : parrello 1.2 # Now we need to generate the subsets. The subset names must be concatenated to
168 :     # the subsystem name to make them unique keys. There are two types of subsets:
169 :     # genome subsets and role subsets. We do the role subsets first.
170 :     my @subsetNames = $sub->get_subset_names();
171 :     for my $subsetID (@subsetNames) {
172 :     # Create the subset record.
173 :     my $actualID = "$subsysID:$subsetID";
174 :     $self->PutE(RoleSubset => $actualID);
175 :     # Connect the subset to the subsystem.
176 :     $self->PutR(HasRoleSubset => $subsysID, $actualID);
177 :     # Connect the subset to its roles.
178 :     my @roles = $sub->get_subsetC_roles($subsetID);
179 :     for my $roleID (@roles) {
180 :     $self->PutR(ConsistsOfRoles => $actualID, $roleID);
181 :     }
182 :     }
183 :     # Next the genome subsets.
184 :     @subsetNames = $sub->get_subset_namesR();
185 :     for my $subsetID (@subsetNames) {
186 :     # Create the subset record.
187 :     my $actualID = "$subsysID:$subsetID";
188 :     $self->PutE(GenomeSubset => $actualID);
189 :     # Connect the subset to the subsystem.
190 :     $self->PutR(HasGenomeSubset => $subsysID, $actualID);
191 :     # Connect the subset to its genomes.
192 :     my @genomes = $sub->get_subsetR($subsetID);
193 :     for my $genomeID (@genomes) {
194 :     $self->PutR(ConsistsOfGenomes => $actualID, $genomeID);
195 :     }
196 :     }
197 : parrello 1.1 }
198 :     # Now we loop through the diagrams. We need to create the diagram records
199 :     # and link each diagram to its roles. Note that only roles which occur
200 :     # in subsystems (and therefore appear in the %ecToRoles hash) are
201 :     # included.
202 :     for my $map ($fig->all_maps()) {
203 :     Trace("Loading diagram $map.") if T(3);
204 :     # Get the diagram's descriptive name.
205 :     my $name = $fig->map_name($map);
206 :     $self->PutE(Diagram => $map, name => $name);
207 :     # Now we need to link all the map's roles to it.
208 :     # A hash is used to prevent duplicates.
209 :     my %roleHash = ();
210 :     for my $ec ($fig->map_to_ecs($map)) {
211 :     if (exists $ecToRoles{$ec}) {
212 :     for my $role (@{$ecToRoles{$ec}}) {
213 :     if (! $roleHash{$role}) {
214 :     $self->PutR(RoleOccursIn => $role, $map);
215 :     $roleHash{$role} = 1;
216 :     }
217 :     }
218 :     }
219 :     }
220 :     }
221 :     } else {
222 :     # Here we have a genome section.
223 :     my $genomeID = $self->section();
224 :     Trace("Connecting $genomeID to subsystems.") if T(3);
225 :     # Get the subsystem data for this genome.
226 :     my %subLists = $fig->get_all_subsystem_pegs($genomeID);
227 :     # Loop through the subsystems.
228 :     for my $subsysID (sort keys %subLists) {
229 : parrello 1.2 # Only proceed if this subsystem is one of ours.
230 :     if (exists $subHash->{$subsysID}) {
231 :     Trace("Processing subsystem $subsysID.") if T(3);
232 :     # Get the subsystem's roles. We create a hash that maps role ID to its
233 :     # column index.
234 :     my $col = 0;
235 :     my $subObject = $fig->get_subsystem($subsysID);
236 :     my %roles = map { $_ => $col++ } $subObject->get_roles();
237 :     # Create a list for the PEGs we find. This list will be used
238 :     # to generate cluster numbers.
239 :     my %pegsFound = ();
240 :     my $aPegFound = 0;
241 :     # Create hashes that maps spreadsheet IDs to pegs in cells.
242 :     # We will use this to generate the ContainsFeature data
243 :     # after we have the cluster numbers.
244 :     my %cellPegs = ();
245 :     # We'll stash the variant code in here.
246 :     my $variant;
247 :     # Loop through the subsystem's pegs.
248 :     for my $pegTuple (@{$subLists{$subsysID}}) {
249 :     my ($roleID, $pegID, $variantCode) = @$pegTuple;
250 :     # Save the variant ID. (It should be the same for each peg.)
251 :     $variant = $variantCode;
252 :     # Only proceed if this feature exists.
253 :     if ($fig->is_deleted_fid($pegID)) {
254 :     $self->Add('deleted-pegs' => 1);
255 :     } else {
256 :     # Get the column number.
257 :     my $col = $roles{$roleID};
258 :     # Compute the spreadsheet cell ID.
259 :     my $cellID = ERDB::DigestKey("$subsysID:$genomeID:$col");
260 :     # Remember this feature. Note we put -1 in the pegs-found
261 :     # hash. This is to help us when we generate cluster numbers.
262 :     $pegsFound{$pegID} = -1;
263 :     $aPegFound = 1;
264 :     # Record this cells.
265 :     push @{$cellPegs{$cellID}}, $pegID;
266 :     }
267 :     }
268 :     # Generate all the subsystem's spreadsheet cells for this genome.
269 :     for my $roleID (keys %roles) {
270 :     # Compute the cell ID.
271 :     my $columnNumber = $roles{$roleID};
272 :     my $cellID = ERDB::DigestKey("$subsysID:$genomeID:$columnNumber");
273 :     # Connect it to the subsystem.
274 :     $self->PutE(SSCell => $cellID, column_number => $columnNumber);
275 :     $self->PutR(HasSSCell => $subsysID, $cellID);
276 :     $self->PutR(IsRoleOf => $roleID, $cellID);
277 :     $self->PutR(IsGenomeOf => $genomeID, $cellID);
278 : parrello 1.1 }
279 : parrello 1.2 # If we found some cells for this genome, we need to compute clusters and
280 :     # denote it participates in the subsystem.
281 :     if ($aPegFound) {
282 :     # Connect the genome to the subsystem.
283 :     $self->PutR(ParticipatesIn => $genomeID, $subsysID,
284 :     'variant-code' => $variant);
285 :     # Partition the PEGs found into clusters.
286 :     my @clusters = $fig->compute_clusters([keys %pegsFound], $subObject);
287 :     for (my $i = 0; $i <= $#clusters; $i++) {
288 :     my $subList = $clusters[$i];
289 :     for my $peg (@{$subList}) {
290 :     $pegsFound{$peg} = $i;
291 :     }
292 :     }
293 :     # Create the ContainsFeature data.
294 :     for my $cellID (keys %cellPegs) {
295 :     my $cellList = $cellPegs{$cellID};
296 :     for my $cellPeg (@$cellList) {
297 :     $self->PutR(ContainsFeature => $cellID, $cellPeg,
298 :     'cluster-number' => $pegsFound{$cellPeg});
299 :     }
300 : parrello 1.1 }
301 :     }
302 :     }
303 :     }
304 :     }
305 :     }
306 :    
307 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3