[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Jan 19 21:43:27 2009 UTC revision 1.2, Mon Mar 2 22:22:11 2009 UTC
# Line 25  Line 25 
25      use CGI qw(-nosticky);      use CGI qw(-nosticky);
26      use BasicLocation;      use BasicLocation;
27      use HyperLink;      use HyperLink;
28        use AliasAnalysis;
29      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
30    
31  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 61  Line 62 
62      # Get the parameters.      # Get the parameters.
63      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
64      # Create the table list.      # Create the table list.
65      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink FeatureVirulent      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
66                          Identifier IsSequenceFor ProteinSequence Concerns Publication                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies
67                          IdentifierSet IncludesIdentifier IsLocatedIn IsOwnerOf);                           Identifier IsNamedBy ProteinSequence Concerns Publication);
68      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
69      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
70      # Return it.      # Return it.
# Line 119  Line 120 
120      my $sapling = $self->db();      my $sapling = $self->db();
121      # Get the maximum location  segment length. We'll need this later.      # Get the maximum location  segment length. We'll need this later.
122      my $maxLength = $sapling->TuningParameter('maxLocationLength');      my $maxLength = $sapling->TuningParameter('maxLocationLength');
     # This hash will be used to track identifiers. Each identifier can only  
     # be used once.  
     my %identifiers;  
123      # Get all of this genome's features.      # Get all of this genome's features.
124      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
125      # Loop through them.      # Loop through them.
# Line 156  Line 154 
154              # to the feature.              # to the feature.
155              my $peel = $loc->Peel($maxLength);              my $peel = $loc->Peel($maxLength);
156              while (defined $peel) {              while (defined $peel) {
157                  $self->PutR(IsLocatedIn => $fid, $contigID, beg => $peel->Left(),                  $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN++,
158                              dir => $dir, len => $maxLength, locN => $locN++);                              begin => $peel->Left(), len => $peel->Length(),
159                                dir => $dir);
160                  $peel = $loc->Peel($maxLength);                  $peel = $loc->Peel($maxLength);
161              }              }
162              # Output the residual. There will always be one, because of the way              # Output the residual. There will always be one, because of the way
163              # Peel works.              # Peel works.
164              $self->PutR(IsLocatedIn => $fid, $contigID, beg => $loc->Left(),              $self->PutR(IsLocatedIn => $fid, $contigID, ordinal => $locN,
165                          dir => $dir, len => $loc->Length(), locN => $locN);                          begin => $loc->Left(), dir => $dir, len => $loc->Length());
166          }          }
167          # Emit the feature record.          # Emit the feature record.
168          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
169                      sequence_length => $seqLen, function => $assignment);                      sequence_length => $seqLen, function => $assignment,
170                        locked => $fig->is_locked_fid($fid));
171          # Connect the feature to its genome.          # Connect the feature to its genome.
172          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
173          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
# Line 222  Line 222 
222                      $self->PutR(Concerns => $key, $proteinID);                      $self->PutR(Concerns => $key, $proteinID);
223                  }                  }
224              }              }
225              # Now we need to get the identifiers for this feature and put          }
226              # them in the protein's identifier set. The "1" tells FigPm to          # Now we need to compute the identifiers. We start with the aliases.
227              # send back the database name with each identifier. Note that          # We need to ensure we don't put multiple copies of the same alias
228              # the FIG ID will come back with this list, but there may not be          # in the keyword list, so we'll put all aliases found in this hash.
229              # a list if the genome is new.          my %aliasHash;
230              my @idTuples = grep { $_->[0] !~ /^[A-Z][A-Z]_\d+$/ } $fig->get_corresponding_ids($fid, 1); ##HACK: grep out the contig IDs          # Loop through the aliases, recording the normal and natural forms.
231              if (! @idTuples) {          for my $alias (split /,/, $aliases) {
232                  push @idTuples, [$fid, 'SEED'];              # Compute the alias type.
233              }              my $aliasType = AliasAnalysis::TypeOf($alias);
234              # Compute the identifier set name and create the set.              # Is this alias a known type?
235              my $setID = "$proteinID:$genomeID";              if (! defined $aliasType) {
236              $self->PutE(IdentifierSet => $setID);                  # Check to see if it's a locus tag.
237              # Create the identifiers and connect them to the protein and                  if ($alias =~ /^[A-Z]{3}\d+$/) {
238              # the set.                      # It is, so convert it to internal form.
239              for my $idTuple (@idTuples) {                      my $normalized = AliasAnalysis::Normalize(LocusTag => $alias);
240                  my ($id, $source) = @$idTuple;                      $aliasHash{$normalized} = 'LocusTag';
241                  # Only process this identifier if it's new. An identifier                  } else {
242                  # can only be in one identifier set. Thankfully, the                      # Here it's a complete mystery. If we haven't seen it yet,
243                  # identifiers belong to genomes, so we don't need to worry                      # mark it as being an unknown type.
244                  # about duplicates in other sections.                      if (! exists $aliasHash{$alias}) {
245                  if (exists $identifiers{$id} && $identifiers{$id} ne $proteinID) {                          $aliasHash{$alias} = "";
246                      $self->Add(ambiguousProtein => 1);                      }
247                    }
248                  } else {                  } else {
249                      $self->PutE(Identifier => $id, source => $source);                  # Here we have a known type.
250                      $self->PutR(IsSequenceFor => $proteinID, $id);                  $aliasHash{$alias} = $aliasType;
251                      $self->PutR(IncludesIdentifier => $setID, $id);              }
                     $identifiers{$id} = $proteinID;  
252                  }                  }
253            # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
254            my @corresponders = $fig->get_corresponding_ids($fid, 1);
255            for my $tuple (@corresponders) {
256                my ($id, $xdb) = @{$tuple};
257                # Ignore SEED: that's us. Also ignore contig IDs. Those result from a bug
258                # at PIR.
259                if ($xdb ne 'SEED' && ! ($xdb eq 'RefSeq' && $id =~ /^[A-Z][A-Z]_\d+$/)) {
260                    # Compute the ID's normalized form.
261                    my $normalized = AliasAnalysis::Normalize($xdb => $id);
262                    # Add it to the alias hash.
263                    $aliasHash{$normalized} = $xdb;
264                }
265            }
266            # Convert the aliases to feature identifiers.
267            for my $alias (keys %aliasHash) {
268                # Compute the identifier's natural form and the real type.
269                my $type = $aliasHash{$alias};
270                my $naturalForm;
271                if (! $type) {
272                    # Here we have an unknown type. The natural form is the same
273                    # as the internal form.
274                    $naturalForm = $alias;
275                    $type = 'Miscellaneous';
276                } else {
277                    # Here we have a known type.
278                    $naturalForm = AliasAnalysis::Type($type => $alias);
279              }              }
280                # Create the identifier and connect it to the feature.
281                $self->PutE(Identifier => $alias, source => $type,
282                            natural_form => $naturalForm);
283                $self->PutR(Identifies => $alias, $fid);
284          }          }
285      }      }
286  }  }

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3