[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory | Revision Log | View Patch

revision 1.3, Thu May 28 18:08:56 2009 UTC | revision 1.12, Sun Feb 13 13:02:30 2011 UTC
# Line 27  Line 27 
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29      use LoaderUtils;      use LoaderUtils;
30        use Digest::MD5;
31        use SeedUtils;
32      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
33    
34  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 45  Line 47 
47    
48  =item erdb  =item erdb
49    
50  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
51    
52  =item options  =item options
53    
# Line 64  Line 66 
66      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
67      # Create the table list.      # Create the table list.
68      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
69                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
70                           Identifier IsNamedBy ProteinSequence Concerns                           Identifier IsNamedBy ProteinSequence Concerns
71                           IsAttachmentSiteFor Publication IsProteinFor);                           IsAttachmentSiteFor Publication IsProteinFor
72                             Role RoleIndex IsFunctionalIn);
73      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
74      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
75      # Return it.      # Return it.
# Line 88  Line 91 
91      my ($self) = @_;      my ($self) = @_;
92      # Get the database object.      # Get the database object.
93      my $erdb = $self->db();      my $erdb = $self->db();
94      # Only proceed if this is a normal section. There's no global feature data.      # Check for local or global.
95      if (! $self->global()) {      if (! $self->global()) {
96          # Get the section ID.          # Here we are generating data for a genome.
97          my $genomeID = $self->section();          my $genomeID = $self->section();
98          # Load this genome's features.          # Load this genome's features.
99          $self->LoadGenomeFeatures($genomeID);          $self->LoadGenomeFeatures($genomeID);
100        } else {
101            # The global data is the roles from subsystems and the publications.
102            my $fig = $self->source();
103            # We need the master map of roles to IDs.
104            my %roleHash;
105            my $lastRoleIndex = -1;
106            my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
107            if (-f $roleMapFile) {
108                for my $mapLine (Tracer::GetFile($roleMapFile)) {
109                    my ($role, $idx) = split /\t/, $mapLine;
110                    $roleHash{$role} = $idx;
111                    if ($idx > $lastRoleIndex) {
112                        $lastRoleIndex = $idx;
113                    }
114                }
115            }
116            # We'll track duplicate roles in here.
117            my %roleList;
118            # Now we get the subsystem list.
119            my $subHash = $erdb->SubsystemHash();
120            for my $sub (sort keys %$subHash) {
121                $self->Add(subsystems => 1);
122                Trace("Processing roles for $sub.") if T(3);
123                # Get this subsystem's roles and write them out.
124                my @roles = $fig->subsystem_to_roles($sub);
125                for my $role (@roles) {
126                    $self->Add(subsystemRoles => 1);
127                    # Check to see if this role is hypothetical.
128                    my $hypo = hypo($role);
129                    if (! $hypo) {
130                        # Is this role in the role index hash?
131                        my $roleIndex = $roleHash{$role};
132                        if (! defined $roleIndex) {
133                            # No, compute a new index for it.
134                            $roleIndex = ++$lastRoleIndex;
135                            $roleHash{$role} = $roleIndex;
136                        }
137                        if (! $roleList{$role}) {
138                            $roleList{$role} = 1;
139                            $self->PutE(RoleIndex => $role, role_index => $roleIndex);
140                        }
141                    }
142                    $self->PutE(Role => $role, hypothetical => $hypo);
143                }
144            }
145            Trace("Subsystem roles generated.") if T(2);
146            # Write out the role master file.
147            Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
148            Trace("Role master file written to $roleMapFile.") if T(2);
149            # Now, we get the publications.
150            my $pubs = $fig->all_titles();
151            for my $pub (@$pubs) {
152                # Get the ID and title.
153                my ($pubmedID, $title) = @$pub;
154                # Only proceed if the ID is valid.
155                if ($pubmedID) {
156                    # Create a hyperlink from the title and the pubmed ID.
157                    my $link;
158                    if (! $title) {
159                        $link = HyperLink->new("<unknown>");
160                    } else {
161                        $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
162                    }
163                    # Create the publication record.
164                    $self->PutE(Publication => $pubmedID, citation => $link);
165                }
166            }
167            Trace("Publications generated.") if T(2);
168      }      }
169  }  }
170    
# Line 126  Line 197 
197      my $aliasDir = $sapling->LoadDirectory() . "/AliasData";      my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
198      my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);      my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
199      if (! defined $aliasHash) {      if (! defined $aliasHash) {
200          Trace("No aliases found for $genomeID.") if T(1);          Trace("No aliases found for $genomeID.") if T(ERDBLoadGroup => 1);
201          $self->Add(missingAliasFile => 1);          $self->Add(missingAliasFile => 1);
202          $aliasHash = {};          $aliasHash = {};
203      }      }
# Line 138  Line 209 
209          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
210              $assignmentMaker, $quality) = @$feature;              $assignmentMaker, $quality) = @$feature;
211          $self->Track(Features => $fid, 1000);          $self->Track(Features => $fid, 1000);
212          # Fix the assignment for non-PEG features.          # Fix missing assignments. For RNAs, the assignment may be in the alias list.
213          if (! defined $assignment) {          if (! defined $assignment) {
214                if ($type eq 'rna') {
215              $assignment = $aliases;              $assignment = $aliases;
216              $assignmentMaker ||= 'master';              $assignmentMaker ||= 'master';
217                } else {
218                    $assignment = '';
219                }
220          }          }
221          # Convert the location string to a list of location objects.          # Convert the location string to a list of location objects.
222          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
# Line 180  Line 255 
255              if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {              if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
256                  $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);                  $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
257              } else {              } else {
258                  Trace("Invalid attachment function for $fid: $assignment") if T(1);                  Trace("Invalid attachment function for $fid: $assignment") if T(ERDBLoadGroup => 1);
259                  $self->Add(badAttachment => 1);                  $self->Add(badAttachment => 1);
260              }              }
261          }          }
# Line 190  Line 265 
265                      locked => $fig->is_locked_fid($fid));                      locked => $fig->is_locked_fid($fid));
266          # Connect the feature to its genome.          # Connect the feature to its genome.
267          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
268            # Connect the feature to its roles.
269            my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
270            if (! defined $roles) {
271                # Here the functional assignment was suspicious.
272                $self->Add(suspiciousFunction => 1);
273                Trace("$fid has a suspicious function: $assignment") if T(ERDBLoadGroup => 1);
274            } else {
275                # Here we have a good assignment.
276                for my $role (@$roles) {
277                    $self->Add(featureRole => 1);
278                    $self->PutR(IsFunctionalIn => $role, $fid);
279                    $self->PutE(Role => $role, hypothetical => hypo($role));
280                }
281                $self->Add(badFeatureRoles => $errors);
282            }
283          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
284          # secondary Feature tables. First is the evidence codes.          # secondary Feature tables. First is the evidence codes. This is special
285            # because we have to save the DLIT numbers.
286            my @dlits;
287          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
288          for my $evidenceTuple (@evidenceTuples) {          for my $evidenceTuple (@evidenceTuples) {
289              my (undef, undef, $code) = @$evidenceTuple;              my (undef, undef, $code) = @$evidenceTuple;
290              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
291                # If this is a direct literature reference, save it.
292                if ($code =~ /dlit\((\d+)/) {
293                    push @dlits, $1;
294                    $self->Add(dlits => 1);
295                }
296          }          }
297          # Now we have the external links. These are stored using hyperlink objects.          # Now we have the external links. These are stored using hyperlink objects.
298          my @links = $fig->fid_links($fid);          my @links = $fig->fid_links($fid);
# Line 214  Line 311 
311          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
312          for my $essentialTuple (@essentials) {          for my $essentialTuple (@essentials) {
313              my (undef, undef, $essentialityType, $url) = @$essentialTuple;              my (undef, undef, $essentialityType, $url) = @$essentialTuple;
314                # Only keep this datum if it has a URL. The ones without URLs are
315                # all duplicates.
316                if ($url) {
317              # Form a hyperlink from this essentiality tuple.              # Form a hyperlink from this essentiality tuple.
318              my $link = HyperLink->new($essentialityType, $url);              my $link = HyperLink->new($essentialityType, $url);
319              # Store it as essentiality data for this feature.              # Store it as essentiality data for this feature.
320              $self->PutE(FeatureEssential => $fid, essential => $link);              $self->PutE(FeatureEssential => $fid, essential => $link);
321          }          }
322            }
323          # If this is a PEG, we have a protein sequence.          # If this is a PEG, we have a protein sequence.
324          my $proteinID;          my $proteinID;
325          if ($type eq 'peg') {          if ($type eq 'peg') {
326              # Get the translation.              # Get the translation.
327              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $fig->get_translation($fid);
328                if (! $proteinSequence) {
329                    Trace("No protein sequence found for $fid.") if T(2);
330                    $self->Add(missingProtein => 1);
331                    # Here there was some sort of error and the protein sequence did
332                    # not come back. Ask for the DNA and translate it instead.
333                    my $dna = $fig->get_dna_seq($fid);
334                    $proteinSequence = FIG::translate($dna, undef, 1);
335                }
336              # Compute the ID.              # Compute the ID.
337              $proteinID = ERDB::DigestKey($proteinSequence);              $proteinID = $sapling->ProteinID($proteinSequence);
338              # Create the protein record.              # Create the protein record.
339              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
340              $self->PutR(IsProteinFor => $proteinID, $fid);              $self->PutR(IsProteinFor => $proteinID, $fid);
341              # Get the publications for this PEG.              # Connect this protein to the feature's publications (if any).
342              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              for my $pub (@dlits) {
343              for my $pub (@pubs) {                  $self->PutR(Concerns => $pub, $proteinID);
                 # Parse out the article title from the data.  
                 my (undef, undef, $data, $url) = @$pub;  
                 my @pieces = split /,/, $data, 3;  
                 if (defined $pieces[2]) {  
                     # Create the publication record.  
                     my $hl = HyperLink->new($pieces[2], $url);  
                     my $key = ERDB::DigestKey($url);  
                     $self->PutE(Publication => $key, citation => $hl);  
                     # Connect it to the protein.  
                     $self->PutR(Concerns => $key, $proteinID);  
                 }  
344              }              }
345          }          }
346          # Now we need to compute the identifiers. We start with the aliases.          # Now we need to compute the identifiers. We start with the aliases.
# Line 257  Line 355 
355              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
356              # Create the identifier record.              # Create the identifier record.
357              $self->PutE(Identifier => $aliasID, natural_form => $natural,              $self->PutE(Identifier => $aliasID, natural_form => $natural,
358                          source => $type);                          source => $aliasType);
359              # Is this a protein alias?              # Is this a protein alias?
360              if ($aliasConf eq 'C' && $proteinID) {              if ($aliasConf eq 'C' && $proteinID) {
361                  # Yes. Connect it using IsNamedBy.                  # Yes. Connect it using IsNamedBy.
362                  $self->PutR(IsNamedBy => $proteinID, $aliasID);                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
363              } else {              } else {
364                  # No. Connect it to the feature.                  # No. Connect it to the feature.
365                  $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);                  $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
366              }              }
367          }          }
368          # Finally, this feature is an alias of itself.          # Finally, this feature is an alias of itself.
369          $self->PutE(Identifier => $fid, natural_form => $fid,          $self->PutE(Identifier => $fid, natural_form => $fid,
370                      source => 'SEED');                      source => 'SEED');
371          $self->PutR(Identifies => $fid, $fid, conf => 'A');          $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
         if ($proteinID) {  
             $self->PutR(IsNamedBy => $proteinID, $fid);  
         }  
372      }      }
373  }  }
374    

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3