[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Mon Mar 2 22:22:11 2009 UTC revision 1.18, Wed Aug 24 17:58:54 2011 UTC
# Line 26  Line 26 
26      use BasicLocation;      use BasicLocation;
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29        use LoaderUtils;
30        use SeedUtils;
31        use gjoseqlib;
32        use MD5Computer;
33      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
34    
35  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 44  Line 48 
48    
49  =item erdb  =item erdb
50    
51  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
52    
53  =item options  =item options
54    
# Line 63  Line 67 
67      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
68      # Create the table list.      # Create the table list.
69      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
70                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
71                           Identifier IsNamedBy ProteinSequence Concerns Publication);                           Identifier IsNamedBy ProteinSequence Concerns
72                             IsAttachmentSiteFor Publication IsProteinFor
73                             Role RoleIndex IsFunctionalIn);
74      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
75      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
76      # Return it.      # Return it.
# Line 86  Line 92 
92      my ($self) = @_;      my ($self) = @_;
93      # Get the database object.      # Get the database object.
94      my $erdb = $self->db();      my $erdb = $self->db();
95      # Only proceed if this is a normal section. There's no global feature data.      # Check for local or global.
96      if (! $self->global()) {      if (! $self->global()) {
97          # Get the section ID.          # Here we are generating data for a genome.
98          my $genomeID = $self->section();          my $genomeID = $self->section();
99          # Load this genome's features.          # Load this genome's features.
100          $self->LoadGenomeFeatures($genomeID);          $self->LoadGenomeFeatures($genomeID);
101        } else {
102            # The global data is the roles from subsystems and the publications.
103            my $fig = $self->source();
104            # We need the master map of roles to IDs.
105            my %roleHash;
106            my $lastRoleIndex = -1;
107            my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
108            if (-f $roleMapFile) {
109                for my $mapLine (Tracer::GetFile($roleMapFile)) {
110                    my ($role, $idx) = split /\t/, $mapLine;
111                    $roleHash{$role} = $idx;
112                    if ($idx > $lastRoleIndex) {
113                        $lastRoleIndex = $idx;
114                    }
115                }
116            }
117            # We'll track duplicate roles in here.
118            my %roleList;
119            # Now we get the subsystem list.
120            my $subHash = $erdb->SubsystemHash();
121            for my $sub (sort keys %$subHash) {
122                $self->Add(subsystems => 1);
123                Trace("Processing roles for $sub.") if T(3);
124                # Get this subsystem's roles and write them out.
125                my @roles = $fig->subsystem_to_roles($sub);
126                for my $role (@roles) {
127                    $self->Add(subsystemRoles => 1);
128                    # Check to see if this role is hypothetical.
129                    my $hypo = hypo($role);
130                    if (! $hypo) {
131                        # Is this role in the role index hash?
132                        my $roleIndex = $roleHash{$role};
133                        if (! defined $roleIndex) {
134                            # No, compute a new index for it.
135                            $roleIndex = ++$lastRoleIndex;
136                            $roleHash{$role} = $roleIndex;
137                        }
138                        if (! $roleList{$role}) {
139                            $roleList{$role} = 1;
140                            $self->PutE(RoleIndex => $role, role_index => $roleIndex);
141                        }
142                    }
143                    $self->PutE(Role => $role, hypothetical => $hypo);
144                }
145            }
146            Trace("Subsystem roles generated.") if T(2);
147            # Write out the role master file.
148            Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
149            Trace("Role master file written to $roleMapFile.") if T(2);
150            # Now, we get the publications.
151            my $pubs = $fig->all_titles();
152            for my $pub (@$pubs) {
153                # Get the ID and title.
154                my ($pubmedID, $title) = @$pub;
155                # Only proceed if the ID is valid.
156                if ($pubmedID) {
157                    # Create a hyperlink from the title and the pubmed ID.
158                    my $link;
159                    if (! $title) {
160                        $link = HyperLink->new("<unknown>");
161                    } else {
162                        $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
163                    }
164                    # Create the publication record.
165                    $self->PutE(Publication => $pubmedID, citation => $link);
166                }
167            }
168            Trace("Publications generated.") if T(2);
169      }      }
170  }  }
171    
# Line 120  Line 194 
194      my $sapling = $self->db();      my $sapling = $self->db();
195      # Get the maximum location  segment length. We'll need this later.      # Get the maximum location  segment length. We'll need this later.
196      my $maxLength = $sapling->TuningParameter('maxLocationLength');      my $maxLength = $sapling->TuningParameter('maxLocationLength');
197        # Get the genome's aliases.
198        my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
199        my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
200        if (! defined $aliasHash) {
201            Trace("No aliases found for $genomeID.") if T(ERDBLoadGroup => 1);
202            $self->Add(missingAliasFile => 1);
203            $aliasHash = {};
204        }
205        # Get all of this genome's protein sequences.
206        my %seqs = map { $_->[0] => $_->[2] } gjoseqlib::read_fasta("$FIG_Config::organisms/$genomeID/Features/peg/fasta");
207      # Get all of this genome's features.      # Get all of this genome's features.
208      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
209        # Compute the MD5 identifiers for the genome and its contigs.
210        my $genomeMD5Data = MD5Computer->new_from_fasta("$FIG_Config::organisms/$genomeID/contigs");
211      # Loop through them.      # Loop through them.
212      for my $feature (@$featureList) {      for my $feature (@$featureList) {
213          # Get this feature's data.          # Get this feature's data.
214          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
215              $assignmentMaker, $quality) = @$feature;              $assignmentMaker, $quality) = @$feature;
216          $self->Track(Features => $fid, 1000);          $self->Track(Features => $fid, 1000);
217          # Fix the assignment for non-PEG features.          # Fix missing assignments. For RNAs, the assignment may be in the alias list.
218          if (! defined $assignment) {          if (! defined $assignment) {
219                if ($type eq 'rna') {
220              $assignment = $aliases;              $assignment = $aliases;
221              $assignmentMaker ||= 'master';              $assignmentMaker ||= 'master';
222                } else {
223                    $assignment = '';
224                }
225          }          }
226          # Convert the location string to a list of location objects.          # Convert the location string to a list of location objects.
227          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
# Line 164  Line 254 
254              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
255                          begin => $loc->Left(), dir => $dir, len => $loc->Length());                          begin => $loc->Left(), dir => $dir, len => $loc->Length());
256          }          }
257            # Is this an attachment site?
258            if ($type eq 'att') {
259                # Yes, connect it to the attached feature.
260                if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
261                    $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
262                } else {
263                    Trace("Invalid attachment function for $fid: $assignment") if T(ERDBLoadGroup => 1);
264                    $self->Add(badAttachment => 1);
265                }
266            }
267            # Compute the MD5 identifier. This may fail if there is an error in the
268            # feature definition.
269            my $md5Alias;
270            eval {
271                $md5Alias = $genomeMD5Data->ComputeFeatureMD5($type, map { $_->String() } @locs);
272            };
273            if ($@) {
274                Trace("Error in $fid MD5 computation: $@") if T(0);
275                $self->Add(md5ComputeError => 1);
276            }
277          # Emit the feature record.          # Emit the feature record.
278          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
279                      sequence_length => $seqLen, function => $assignment,                      sequence_length => $seqLen, function => $assignment,
280                      locked => $fig->is_locked_fid($fid));                      locked => $fig->is_locked_fid($fid));
281          # Connect the feature to its genome.          # Connect the feature to its genome.
282          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
283            # Connect the feature to its roles.
284            my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
285            if (! defined $roles) {
286                # Here the functional assignment was suspicious.
287                $self->Add(suspiciousFunction => 1);
288                Trace("$fid has a suspicious function: $assignment") if T(ERDBLoadGroup => 1);
289            } else {
290                # Here we have a good assignment.
291                for my $role (@$roles) {
292                    $self->Add(featureRole => 1);
293                    $self->PutR(IsFunctionalIn => $role, $fid);
294                    $self->PutE(Role => $role, hypothetical => hypo($role));
295                }
296                $self->Add(badFeatureRoles => $errors);
297            }
298          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
299          # secondary Feature tables. First is the evidence codes.          # secondary Feature tables. First is the evidence codes. This is special
300            # because we have to save the DLIT numbers.
301            my @dlits;
302          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
303          for my $evidenceTuple (@evidenceTuples) {          for my $evidenceTuple (@evidenceTuples) {
304              my (undef, undef, $code) = @$evidenceTuple;              my (undef, undef, $code) = @$evidenceTuple;
305              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
306                # If this is a direct literature reference, save it.
307                if ($code =~ /dlit\((\d+)/) {
308                    push @dlits, $1;
309                    $self->Add(dlits => 1);
310                }
311          }          }
312          # Now we have the external links. These are stored using hyperlink objects.          # Now we have the external links. These are stored using hyperlink objects.
313          my @links = $fig->fid_links($fid);          my @links = $fig->fid_links($fid);
# Line 194  Line 326 
326          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
327          for my $essentialTuple (@essentials) {          for my $essentialTuple (@essentials) {
328              my (undef, undef, $essentialityType, $url) = @$essentialTuple;              my (undef, undef, $essentialityType, $url) = @$essentialTuple;
329                # Only keep this datum if it has a URL. The ones without URLs are
330                # all duplicates.
331                if ($url) {
332              # Form a hyperlink from this essentiality tuple.              # Form a hyperlink from this essentiality tuple.
333              my $link = HyperLink->new($essentialityType, $url);              my $link = HyperLink->new($essentialityType, $url);
334              # Store it as essentiality data for this feature.              # Store it as essentiality data for this feature.
335              $self->PutE(FeatureEssential => $fid, essential => $link);              $self->PutE(FeatureEssential => $fid, essential => $link);
336          }          }
337            }
338          # If this is a PEG, we have a protein sequence.          # If this is a PEG, we have a protein sequence.
339            my $proteinID;
340          if ($type eq 'peg') {          if ($type eq 'peg') {
341              # Get the translation.              # Get the translation.
342              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $seqs{$fid};
343                if (! $proteinSequence) {
344                    Trace("No protein sequence found for $fid.") if T(ERDBLoadGroup => 2);
345                    $self->Add(missingProtein => 1);
346                    # Here there was some sort of error and the protein sequence did
347                    # not come back. Ask for the DNA and translate it instead.
348                    my $dna = $fig->get_dna_seq($fid);
349                    $proteinSequence = FIG::translate($dna, undef, 1);
350                }
351              # Compute the ID.              # Compute the ID.
352              my $proteinID = ERDB::DigestKey($proteinSequence);              $proteinID = $sapling->ProteinID($proteinSequence);
353              # Create the protein record.              # Create the protein record.
354              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
355              # Get the publications for this PEG.              $self->PutR(IsProteinFor => $proteinID, $fid);
356              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              # Connect this protein to the feature's publications (if any).
357              for my $pub (@pubs) {              for my $pub (@dlits) {
358                  # Parse out the article title from the data.                  $self->PutR(Concerns => $pub, $proteinID);
359                  my (undef, undef, $data, $url) = @_;              }
360                  my @pieces = split /,/, $data, 3;          }
361                  if (defined $pieces[2]) {          # Now we need to compute the identifiers. We start with the aliases.
362                      # Create the publication record.          # Get the alias data for this feature. If there is none, we force an
363                      my $hl = Hyperlink->new($pieces[2], $url);          # empty list.
364                      my $key = ERDB::DigestKey($url);          my $aliasList = $aliasHash->{$fid} || [];
365                      $self->PutE(Publication => $key, citation => $hl);          # Loop through the aliases found.
366                      # Connect it to the protein.          for my $aliasTuple (@$aliasList) {
367                      $self->PutR(Concerns => $key, $proteinID);              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
368                  }              # Get the natural form. If there is none, then the canonical
369              }              # form IS the natural form. Note we have to make a special check
370          }              # for locus tags, which have an insane number of variants.
371          # Now we need to compute the identifiers. We start with the aliases              my $natural;
372          # We need to insure we don't put multiple copies of the same alias              if ($aliasID =~ /LocusTag:(.+)/) {
373          # in the keyword list, so we'll put all aliases found in this hash.                  $natural = $1;
         my %aliasHash;  
         # Loop through the aliases, recording the normal and natural forms/  
         for my $alias (split /,/, $aliases) {  
             # Compute the alias type.  
             my $aliasType = AliasAnalysis::TypeOf($alias);  
             # Is this alias a known type?  
             if (! defined $aliasType) {  
                 # Check to see if it's a locus tag.  
                 if ($alias =~ /^[A-Z]{3}\d+$/) {  
                     # It is, so convert it to internal form.  
                     my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);  
                     $aliasHash{$normalized} = 'LocusTag';  
374                  } else {                  } else {
375                      # Here it's a complete mystery. If we haven't seen it yet,                  $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
                     # mark it as being an unknown type.  
                     if (! exists $aliasHash{$alias}) {  
                         $aliasHash{$alias} = "";  
                     }  
                 }  
             } else {  
                 # Here we have a known type.  
                 $aliasHash{$alias} = $aliasType;  
             }  
376          }          }
377          # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).              # Create the identifier record.
378          my @corresponders = $fig->get_corresponding_ids($fid, 1);              $self->PutE(Identifier => $aliasID, natural_form => $natural,
379          for my $tuple (@corresponders) {                          source => $aliasType);
380              my ($id, $xdb) = @{$tuple};              # Is this a protein alias?
381              # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug              if ($aliasConf eq 'C' && $proteinID) {
382              # at PIR.                  # Yes. Connect it using IsNamedBy.
383              if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
                 # Compute the ID's normalized form.  
                 my $normalized = AliasAnalysis::Normalize($xdb => $id);  
                 # Add it to the alias hash.  
                 $aliasHash{$normalized} = $xdb;  
             }  
         }  
         # Convert the aliases to feature identifiers.  
         for my $alias (keys %aliasHash) {  
             # Compute the identifier's natural form and the real type.  
             my $type = $aliasHash{$alias};  
             my $naturalForm;  
             if (! $type) {  
                 # Here we have an unknown type. The natural form is the same  
                 # as the internal form.  
                 $naturalForm = $alias;  
                 $type = 'Miscellaneous';  
384              } else {              } else {
385                  # Here we have a known type.                  # No. Connect it to the feature.
386                  $naturalForm = AliasAnalysis::Type($type => $alias);                  $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
387              }              }
             # Create the identifier and connect it to the feature.  
             $self->PutE(Identifier => $alias, source => $type,  
                         natural_form => $naturalForm);  
             $self->PutR(Identifies => $alias, $fid);  
388          }          }
389            # Make the MD5 identifier an alias.
390            if (defined $md5Alias) {
391                $self->PutE(Identifier => "ubi|$md5Alias", natural_form => $md5Alias,
392                            source => 'UBI');
393                $self->PutR(IsIdentifiedBy => $fid, "ubi|$md5Alias", conf => 'A');
394            }
395            # Finally, this feature is an alias of itself.
396            $self->PutE(Identifier => $fid, natural_form => $fid,
397                        source => 'SEED');
398            $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
399      }      }
400  }  }
401    

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3