[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Jan 19 21:43:27 2009 UTC revision 1.7, Wed Nov 25 21:00:45 2009 UTC
# Line 25  Line 25 
25      use CGI qw(-nosticky);      use CGI qw(-nosticky);
26      use BasicLocation;      use BasicLocation;
27      use HyperLink;      use HyperLink;
28        use AliasAnalysis;
29        use LoaderUtils;
30      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
31    
32  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 43  Line 45 
45    
46  =item erdb  =item erdb
47    
48  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
49    
50  =item options  =item options
51    
# Line 61  Line 63 
63      # Get the parameters.      # Get the parameters.
64      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
65      # Create the table list.      # Create the table list.
66      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink FeatureVirulent      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
67                          Identifier IsSequenceFor ProteinSequence Concerns Publication                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies
68                          IdentifierSet IncludesIdentifier IsLocatedIn IsOwnerOf);                           Identifier IsNamedBy ProteinSequence Concerns
69                             IsAttachmentSiteFor Publication IsProteinFor);
70      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
71      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
72      # Return it.      # Return it.
# Line 119  Line 122 
122      my $sapling = $self->db();      my $sapling = $self->db();
123      # Get the maximum location  segment length. We'll need this later.      # Get the maximum location  segment length. We'll need this later.
124      my $maxLength = $sapling->TuningParameter('maxLocationLength');      my $maxLength = $sapling->TuningParameter('maxLocationLength');
125      # This hash will be used to track identifiers. Each identifier can only      # Get the genome's aliases.
126      # be used once.      my $aliasDir = $sapling->LoadDirectory() . "/AliasData";
127      my %identifiers;      my $aliasHash = LoaderUtils::ReadAliasFile($aliasDir, $genomeID);
128        if (! defined $aliasHash) {
129            Trace("No aliases found for $genomeID.") if T(1);
130            $self->Add(missingAliasFile => 1);
131            $aliasHash = {};
132        }
133      # Get all of this genome's features.      # Get all of this genome's features.
134      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
135      # Loop through them.      # Loop through them.
# Line 130  Line 138 
138          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,          my ($fid, $locationString, $aliases, $type, undef, undef, $assignment,
139              $assignmentMaker, $quality) = @$feature;              $assignmentMaker, $quality) = @$feature;
140          $self->Track(Features => $fid, 1000);          $self->Track(Features => $fid, 1000);
141          # Fix the assignment for non-PEG features.          # Fix missing assignments. For RNAs, the assignment may be in the alias list.
142          if (! defined $assignment) {          if (! defined $assignment) {
143                if ($type eq 'rna') {
144              $assignment = $aliases;              $assignment = $aliases;
145              $assignmentMaker ||= 'master';              $assignmentMaker ||= 'master';
146                } else {
147                    $assignment = '';
148                }
149          }          }
150          # Convert the location string to a list of location objects.          # Convert the location string to a list of location objects.
151          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;          my @locs = map { BasicLocation->new($_) } split /\s*,\s*/, $locationString;
# Line 156  Line 168 
168              # to the feature.              # to the feature.
169              my $peel = $loc->Peel($maxLength);              my $peel = $loc->Peel($maxLength);
170              while (defined $peel) {              while (defined $peel) {
171                  $self->PutR(IsLocatedIn => $fid, $contigID, beg => $peel->Left(),                  $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN++,
172                              dir => $dir, len => $maxLength, locN => $locN++);                              begin => $peel->Left(), len => $peel->Length(),
173                                dir => $dir);
174                  $peel = $loc->Peel($maxLength);                  $peel = $loc->Peel($maxLength);
175              }              }
176              # Output the residual. There will always be one, because of the way              # Output the residual. There will always be one, because of the way
177              # Peel works.              # Peel works.
178              $self->PutR(IsLocatedIn => $fid, $contigID, beg => $loc->Left(),              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
179                          dir => $dir, len => $loc->Length(), locN => $locN);                          begin => $loc->Left(), dir => $dir, len => $loc->Length());
180            }
181            # Is this an attachment site?
182            if ($type eq 'att') {
183                # Yes, connect it to the attached feature.
184                if ($assignment =~ /att([LR])\s+for\s+(fig\|.+)/) {
185                    $self->PutR(IsAttachmentSiteFor => $fid, $2, edge => $1);
186                } else {
187                    Trace("Invalid attachment function for $fid: $assignment") if T(1);
188                    $self->Add(badAttachment => 1);
189                }
190          }          }
191          # Emit the feature record.          # Emit the feature record.
192          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
193                      sequence_length => $seqLen, function => $assignment);                      sequence_length => $seqLen, function => $assignment,
194                        locked => $fig->is_locked_fid($fid));
195          # Connect the feature to its genome.          # Connect the feature to its genome.
196          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
197          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
# Line 194  Line 218 
218          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);          my @essentials = $fig->get_attributes($fid, undef, ['essential', 'potential-essential']);
219          for my $essentialTuple (@essentials) {          for my $essentialTuple (@essentials) {
220              my (undef, undef, $essentialityType, $url) = @$essentialTuple;              my (undef, undef, $essentialityType, $url) = @$essentialTuple;
221                # Only keep this datum if it has a URL. The ones without URLs are
222                # all duplicates.
223                if ($url) {
224              # Form a hyperlink from this essentiality tuple.              # Form a hyperlink from this essentiality tuple.
225              my $link = HyperLink->new($essentialityType, $url);              my $link = HyperLink->new($essentialityType, $url);
226              # Store it as essentiality data for this feature.              # Store it as essentiality data for this feature.
227              $self->PutE(FeatureEssential => $fid, essential => $link);              $self->PutE(FeatureEssential => $fid, essential => $link);
228          }          }
229            }
230          # If this is a PEG, we have a protein sequence.          # If this is a PEG, we have a protein sequence.
231            my $proteinID;
232          if ($type eq 'peg') {          if ($type eq 'peg') {
233              # Get the translation.              # Get the translation.
234              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $fig->get_translation($fid);
235                if (! $proteinSequence) {
236                    Trace("No protein sequence found for $fid.") if T(2);
237                    $self->Add(missingProtein => 1);
238                    # Here there was some sort of error and the protein sequence did
239                    # not come back. Ask for the DNA and translate it instead.
240                    my $dna = $fig->get_dna_seq($fid);
241                    $proteinSequence = FIG::translate($dna, undef, 1);
242                }
243              # Compute the ID.              # Compute the ID.
244              my $proteinID = ERDB::DigestKey($proteinSequence);              $proteinID = ERDB::DigestKey($proteinSequence);
245              # Create the protein record.              # Create the protein record.
246              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
247                $self->PutR(IsProteinFor => $proteinID, $fid);
248              # Get the publications for this PEG.              # Get the publications for this PEG.
249              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');
250              for my $pub (@pubs) {              for my $pub (@pubs) {
251                  # Parse out the article title from the data.                  # Parse out the article title from the data.
252                  my (undef, undef, $data, $url) = @_;                  my (undef, undef, $data, $url) = @$pub;
253                  my @pieces = split /,/, $data, 3;                  my @pieces = split /,/, $data, 3;
254                  if (defined $pieces[2]) {                  if (defined $pieces[2]) {
255                      # Create the publication record.                      # Create the publication record.
256                      my $hl = Hyperlink->new($pieces[2], $url);                      my $hl = HyperLink->new($pieces[2], $url);
257                      my $key = ERDB::DigestKey($url);                      my $key = ERDB::DigestKey($url);
258                      $self->PutE(Publication => $key, citation => $hl);                      $self->PutE(Publication => $key, citation => $hl);
259                      # Connect it to the protein.                      # Connect it to the protein.
260                      $self->PutR(Concerns => $key, $proteinID);                      $self->PutR(Concerns => $key, $proteinID);
261                  }                  }
262              }              }
             # Now we need to get the identifiers for this feature and put  
             # them in the protein's identifier set. The "1" tells FigPm to  
             # send back the database name with each identifier. Note that  
             # the FIG ID will come back with this list, but there may not be  
             # a list if the genome is new.  
             my @idTuples = grep { $_->[0] !~ /^[A-Z][A-Z]_\d+$/ } $fig->get_corresponding_ids($fid, 1); ##HACK: grep out the contig IDs  
             if (! @idTuples) {  
                 push @idTuples, [$fid, 'SEED'];  
             }  
             # Compute the identifier set name and create the set.  
             my $setID = "$proteinID:$genomeID";  
             $self->PutE(IdentifierSet => $setID);  
             # Create the identifiers and onnect them to the protein and  
             # the set.  
             for my $idTuple (@idTuples) {  
                 my ($id, $source) = @$idTuple;  
                 # Only process this identifier if it's new. An identifier  
                 # can only be in one identifier set. Thankfully, the  
                 # identifiers belong to genomes, so we don't need to worry  
                 # about duplicates in other sections.  
                 if (exists $identifiers{$id} && $identifiers{$id} ne $proteinID) {  
                     $self->Add(ambiguousProtein => 1);  
                 } else {  
                     $self->PutE(Identifier => $id, source => $source);  
                     $self->PutR(IsSequenceFor => $proteinID, $id);  
                     $self->PutR(IncludesIdentifier => $setID, $id);  
                     $identifiers{$id} = $proteinID;  
263                  }                  }
264            # Now we need to compute the identifiers. We start with the aliases.
265            # Get the alias data for this feature. If there is none, we force an
266            # empty list.
267            my $aliasList = $aliasHash->{$fid} || [];
268            # Loop through the aliases found.
269            for my $aliasTuple (@$aliasList) {
270                my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
271                # Get the natural form. If there is none, then the canonical
272                # form IS the natural form.
273                my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
274                # Create the identifier record.
275                $self->PutE(Identifier => $aliasID, natural_form => $natural,
276                            source => $aliasType);
277                # Is this a protein alias?
278                if ($aliasConf eq 'C' && $proteinID) {
279                    # Yes. Connect it using IsNamedBy.
280                    $self->PutR(IsNamedBy => $proteinID, $aliasID);
281                } else {
282                    # No. Connect it to the feature.
283                    $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);
284              }              }
285          }          }
286            # Finally, this feature is an alias of itself.
287            $self->PutE(Identifier => $fid, natural_form => $fid,
288                        source => 'SEED');
289            $self->PutR(Identifies => $fid, $fid, conf => 'A');
290      }      }
291  }  }
292    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3