[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory | Revision Log | View Patch

revision 1.2, Mon Mar 2 22:22:11 2009 UTC | revision 1.8, Wed Dec 16 01:45:19 2009 UTC
# Line 26  Line 26 
26      use BasicLocation;      use BasicLocation;
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29        use LoaderUtils;
30        use Digest::MD5;
31      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
32    
33  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 44  Line 46 
46    
47  =item erdb  =item erdb
48    
49  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
50    
51  =item options  =item options
52    
# Line 64  Line 66 
66      # Create the table list.      # Create the table list.
67      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
68                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies
69                           Identifier IsNamedBy ProteinSequence Concerns Publication);                           Identifier IsNamedBy ProteinSequence Concerns
70                             IsAttachmentSiteFor Publication IsProteinFor);
71      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
72      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
73      # Return it.      # Return it.
# Line 120  Line 123 
123      my $sapling = $self->db();      my $sapling = $self->db();
124      # Get the maximum location  segment length. We'll need this later.      # Get the maximum location  segment length. We'll need this later.
125      my $maxLength = $sapling->TuningParameter('maxLocationLength');      my $maxLength = $sapling->TuningParameter('maxLocationLength');
126        # Get the genome's aliases.
127        my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
128        my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
129        if (! defined $aliasHash) {
130            Trace("No aliases found for $genomeID.") if T(1);
131            $self->Add(missingAliasFile => 1);
132            $aliasHash = {};
133        }
134      # Get all of this genome's features.      # Get all of this genome's features.
135      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
136      # Loop through them.      # Loop through them.
# Line 128  Line 139 
139          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
140              $assignmentMaker, $quality) = @$feature;              $assignmentMaker, $quality) = @$feature;
141          $self->Track(Features => $fid, 1000);          $self->Track(Features => $fid, 1000);
142          # Fix the assignment for non-PEG features.          # Fix missing assignments. For RNAs, the assignment may be in the alias list.
143          if (! defined $assignment) {          if (! defined $assignment) {
144                if ($type eq 'rna') {
145              $assignment = $aliases;              $assignment = $aliases;
146              $assignmentMaker ||= 'master';              $assignmentMaker ||= 'master';
147                } else {
148                    $assignment = '';
149                }
150          }          }
151          # Convert the location string to a list of location objects.          # Convert the location string to a list of location objects.
152          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
# Line 164  Line 179 
179              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
180                          begin => $loc->Left(), dir => $dir, len => $loc->Length());                          begin => $loc->Left(), dir => $dir, len => $loc->Length());
181          }          }
182            # Is this an attachment site?
183            if ($type eq 'att') {
184                # Yes, connect it to the attached feature.
185                if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
186                    $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
187                } else {
188                    Trace("Invalid attachment function for $fid: $assignment") if T(1);
189                    $self->Add(badAttachment => 1);
190                }
191            }
192          # Emit the feature record.          # Emit the feature record.
193          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
194                      sequence_length => $seqLen, function => $assignment,                      sequence_length => $seqLen, function => $assignment,
# Line 194  Line 219 
219          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
220          for my $essentialTuple (@essentials) {          for my $essentialTuple (@essentials) {
221              my (undef, undef, $essentialityType, $url) = @$essentialTuple;              my (undef, undef, $essentialityType, $url) = @$essentialTuple;
222                # Only keep this datum if it has a URL. The ones without URLs are
223                # all duplicates.
224                if ($url) {
225              # Form a hyperlink from this essentiality tuple.              # Form a hyperlink from this essentiality tuple.
226              my $link = HyperLink->new($essentialityType, $url);              my $link = HyperLink->new($essentialityType, $url);
227              # Store it as essentiality data for this feature.              # Store it as essentiality data for this feature.
228              $self->PutE(FeatureEssential => $fid, essential => $link);              $self->PutE(FeatureEssential => $fid, essential => $link);
229          }          }
230            }
231          # If this is a PEG, we have a protein sequence.          # If this is a PEG, we have a protein sequence.
232            my $proteinID;
233          if ($type eq 'peg') {          if ($type eq 'peg') {
234              # Get the translation.              # Get the translation.
235              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $fig->get_translation($fid);
236                if (! $proteinSequence) {
237                    Trace("No protein sequence found for $fid.") if T(2);
238                    $self->Add(missingProtein => 1);
239                    # Here there was some sort of error and the protein sequence did
240                    # not come back. Ask for the DNA and translate it instead.
241                    my $dna = $fig->get_dna_seq($fid);
242                    $proteinSequence = FIG::translate($dna, undef, 1);
243                }
244              # Compute the ID.              # Compute the ID.
245              my $proteinID = ERDB::DigestKey($proteinSequence);              $proteinID = $sapling->ProteinID($proteinSequence);
246              # Create the protein record.              # Create the protein record.
247              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
248                $self->PutR(IsProteinFor => $proteinID, $fid);
249              # Get the publications for this PEG.              # Get the publications for this PEG.
250              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
251              for my $pub (@pubs) {              for my $pub (@pubs) {
252                  # Parse out the article title from the data.                  # Parse out the article title from the data.
253                  my (undef, undef, $data, $url) = @_;                  my (undef, undef, $data, $url) = @$pub;
254                  my @pieces = split /,/, $data, 3;                  my @pieces = split /,/, $data, 3;
255                  if (defined $pieces[2]) {                  if (defined $pieces[2]) {
256                      # Create the publication record.                      # Create the publication record.
257                      my $hl = Hyperlink->new($pieces[2], $url);                      my $hl = HyperLink->new($pieces[2], $url);
258                      my $key = ERDB::DigestKey($url);                      my $key = ERDB::DigestKey($url);
259                      $self->PutE(Publication => $key, citation => $hl);                      $self->PutE(Publication => $key, citation => $hl);
260                      # Connect it to the protein.                      # Connect it to the protein.
# Line 223  Line 262 
262                  }                  }
263              }              }
264          }          }
265          # Now we need to compute the identifiers. We start with the aliases          # Now we need to compute the identifiers. We start with the aliases.
266          # We need to insure we don't put multiple copies of the same alias          # Get the alias data for this feature. If there is none, we force an
267          # in the keyword list, so we'll put all aliases found in this hash.          # empty list.
268          my %aliasHash;          my $aliasList = $aliasHash->{$fid} || [];
269          # Loop through the aliases, recording the normal and natural forms/          # Loop through the aliases found.
270          for my $alias (split /,/, $aliases) {          for my $aliasTuple (@$aliasList) {
271              # Compute the alias type.              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
272              my $aliasType = AliasAnalysis::TypeOf($alias);              # Get the natural form. If there is none, then the canonical
273              # Is this alias a known type?              # form IS the natural form.
274              if (! defined $aliasType) {              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
275                  # Check to see if it's a locus tag.              # Create the identifier record.
276                  if ($alias =~ /^[A-Z]{3}\d+$/) {              $self->PutE(Identifier => $aliasID, natural_form => $natural,
277                      # It is, so convert it to internal form.                          source => $aliasType);
278                      my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);              # Is this a protein alias?
279                      $aliasHash{$normalized} = 'LocusTag';              if ($aliasConf eq 'C' && $proteinID) {
280                  } else {                  # Yes. Connect it using IsNamedBy.
281                      # Here it's a complete mystery. If we haven't seen it yet,                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
                     # mark it as being an unknown type.  
                     if (! exists $aliasHash{$alias}) {  
                         $aliasHash{$alias} = "";  
                     }  
                 }  
             } else {  
                 # Here we have a known type.  
                 $aliasHash{$alias} = $aliasType;  
             }  
         }  
         # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).  
         my @corresponders = $fig->get_corresponding_ids($fid, 1);  
         for my $tuple (@corresponders) {  
             my ($id, $xdb) = @{$tuple};  
             # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug  
             # at PIR.  
             if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {  
                 # Compute the ID's normalized form.  
                 my $normalized = AliasAnalysis::Normalize($xdb => $id);  
                 # Add it to the alias hash.  
                 $aliasHash{$normalized} = $xdb;  
             }  
         }  
         # Convert the aliases to feature identifiers.  
         for my $alias (keys %aliasHash) {  
             # Compute the identifier's natural form and the real type.  
             my $type = $aliasHash{$alias};  
             my $naturalForm;  
             if (! $type) {  
                 # Here we have an unknown type. The natural form is the same  
                 # as the internal form.  
                 $naturalForm = $alias;  
                 $type = 'Miscellaneous';  
282              } else {              } else {
283                  # Here we have a known type.                  # No. Connect it to the feature.
284                  $naturalForm = AliasAnalysis::Type($type => $alias);                  $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);
285              }              }
             # Create the identifier and connect it to the feature.  
             $self->PutE(Identifier => $alias, source => $type,  
                         natural_form => $naturalForm);  
             $self->PutR(Identifies => $alias, $fid);  
286          }          }
287            # Finally, this feature is an alias of itself.
288            $self->PutE(Identifier => $fid, natural_form => $fid,
289                        source => 'SEED');
290            $self->PutR(Identifies => $fid, $fid, conf => 'A');
291      }      }
292  }  }
293    

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3