[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9, Mon Feb 1 20:14:28 2010 UTC revision 1.17, Mon Jun 27 20:20:12 2011 UTC
# Line 27  Line 27 
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29      use LoaderUtils;      use LoaderUtils;
     use Digest::MD5;  
30      use SeedUtils;      use SeedUtils;
31        use gjoseqlib;
32        use MD5Computer;
33      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
34    
35  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 66  Line 67 
67      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
68      # Create the table list.      # Create the table list.
69      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
70                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
71                           Identifier IsNamedBy ProteinSequence Concerns                           Identifier IsNamedBy ProteinSequence Concerns
72                           IsAttachmentSiteFor Publication IsProteinFor                           IsAttachmentSiteFor Publication IsProteinFor
73                           Role IsFunctionalIn);                           Role RoleIndex IsFunctionalIn);
74      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
75      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
76      # Return it.      # Return it.
# Line 98  Line 99 
99          # Load this genome's features.          # Load this genome's features.
100          $self->LoadGenomeFeatures($genomeID);          $self->LoadGenomeFeatures($genomeID);
101      } else {      } else {
102          # The global data is the roles from subsystems.          # The global data is the roles from subsystems and the publications.
103          my $fig = $self->source();          my $fig = $self->source();
104          # First, we get the subsystem list.          # We need the master map of roles to IDs.
105            my %roleHash;
106            my $lastRoleIndex = -1;
107            my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
108            if (-f $roleMapFile) {
109                for my $mapLine (Tracer::GetFile($roleMapFile)) {
110                    my ($role, $idx) = split /\t/, $mapLine;
111                    $roleHash{$role} = $idx;
112                    if ($idx > $lastRoleIndex) {
113                        $lastRoleIndex = $idx;
114                    }
115                }
116            }
117            # We'll track duplicate roles in here.
118            my %roleList;
119            # Now we get the subsystem list.
120          my $subHash = $erdb->SubsystemHash();          my $subHash = $erdb->SubsystemHash();
121          for my $sub (sort keys %$subHash) {          for my $sub (sort keys %$subHash) {
122              $self->Add(subsystems => 1);              $self->Add(subsystems => 1);
# Line 109  Line 125 
125              my @roles = $fig->subsystem_to_roles($sub);              my @roles = $fig->subsystem_to_roles($sub);
126              for my $role (@roles) {              for my $role (@roles) {
127                  $self->Add(subsystemRoles => 1);                  $self->Add(subsystemRoles => 1);
128                  $self->PutE(Role => $role, hypothetical => hypo($role));                  # Check to see if this role is hypothetical.
129                    my $hypo = hypo($role);
130                    if (! $hypo) {
131                        # Is this role in the role index hash?
132                        my $roleIndex = $roleHash{$role};
133                        if (! defined $roleIndex) {
134                            # No, compute a new index for it.
135                            $roleIndex = ++$lastRoleIndex;
136                            $roleHash{$role} = $roleIndex;
137                        }
138                        if (! $roleList{$role}) {
139                            $roleList{$role} = 1;
140                            $self->PutE(RoleIndex => $role, role_index => $roleIndex);
141                        }
142                    }
143                    $self->PutE(Role => $role, hypothetical => $hypo);
144                }
145            }
146            Trace("Subsystem roles generated.") if T(2);
147            # Write out the role master file.
148            Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
149            Trace("Role master file written to $roleMapFile.") if T(2);
150            # Now, we get the publications.
151            my $pubs = $fig->all_titles();
152            for my $pub (@$pubs) {
153                # Get the ID and title.
154                my ($pubmedID, $title) = @$pub;
155                # Only proceed if the ID is valid.
156                if ($pubmedID) {
157                    # Create a hyperlink from the title and the pubmed ID.
158                    my $link;
159                    if (! $title) {
160                        $link = HyperLink->new("<unknown>");
161                    } else {
162                        $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
163                    }
164                    # Create the publication record.
165                    $self->PutE(Publication => $pubmedID, citation => $link);
166              }              }
167          }          }
168          Trace("Subsystem roles generated.") if T(3);          Trace("Publications generated.") if T(2);
169      }      }
170  }  }
171    
# Line 149  Line 202 
202          $self->Add(missingAliasFile => 1);          $self->Add(missingAliasFile => 1);
203          $aliasHash = {};          $aliasHash = {};
204      }      }
205        # Get all of this genome's protein sequences.
206        my %seqs = map { $_->[0] => $_->[2] } gjoseqlib::read_fasta("$FIG_Config::organisms/$genomeID/Features/peg/fasta");
207      # Get all of this genome's features.      # Get all of this genome's features.
208      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
209        # Compute the MD5 identifiers for the genome and its contigs.
210        my $genomeMD5Data = MD5Computer->new_from_fasta("$FIG_Config::organisms/$genomeID/contigs");
211      # Loop through them.      # Loop through them.
212      for my $feature (@$featureList) {      for my $feature (@$featureList) {
213          # Get this feature's data.          # Get this feature's data.
# Line 207  Line 264 
264                  $self->Add(badAttachment => 1);                  $self->Add(badAttachment => 1);
265              }              }
266          }          }
267            # Compute the MD5 identifier. This may fail if there is an error in the
268            # feature definition.
269            my $md5Alias;
270            eval {
271                $md5Alias = $genomeMD5Data->ComputeGeneMD5(map { $_->String() } @locs);
272            };
273            if ($@) {
274                Trace("Error in $fid MD5 computation: $@") if T(0);
275                $self->Add(md5ComputeError => 1);
276            }
277          # Emit the feature record.          # Emit the feature record.
278          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
279                      sequence_length => $seqLen, function => $assignment,                      sequence_length => $seqLen, function => $assignment,
# Line 214  Line 281 
281          # Connect the feature to its genome.          # Connect the feature to its genome.
282          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
283          # Connect the feature to its roles.          # Connect the feature to its roles.
284          my ($roles, $errors) = LoaderUtils::RolesForLoading($assignment);          my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
285          if (! defined $roles) {          if (! defined $roles) {
286              # Here the functional assignment was suspicious.              # Here the functional assignment was suspicious.
287              $self->Add(suspiciousFunction => 1);              $self->Add(suspiciousFunction => 1);
# Line 229  Line 296 
296              $self->Add(badFeatureRoles => $errors);              $self->Add(badFeatureRoles => $errors);
297          }          }
298          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
299          # secondary Feature tables. First is the evidence codes.          # secondary Feature tables. First is the evidence codes. This is special
300            # because we have to save the DLIT numbers.
301            my @dlits;
302          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
303          for my $evidenceTuple (@evidenceTuples) {          for my $evidenceTuple (@evidenceTuples) {
304              my (undef, undef, $code) = @$evidenceTuple;              my (undef, undef, $code) = @$evidenceTuple;
305              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
306                # If this is a direct literature reference, save it.
307                if ($code =~ /dlit\((\d+)/) {
308                    push @dlits, $1;
309                    $self->Add(dlits => 1);
310                }
311          }          }
312          # Now we have the external links. These are stored using hyperlink objects.          # Now we have the external links. These are stored using hyperlink objects.
313          my @links = $fig->fid_links($fid);          my @links = $fig->fid_links($fid);
# Line 265  Line 339 
339          my $proteinID;          my $proteinID;
340          if ($type eq 'peg') {          if ($type eq 'peg') {
341              # Get the translation.              # Get the translation.
342              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $seqs{$fid};
343              if (! $proteinSequence) {              if (! $proteinSequence) {
344                  Trace("No protein sequence found for $fid.") if T(2);                  Trace("No protein sequence found for $fid.") if T(ERDBLoadGroup => 2);
345                  $self->Add(missingProtein => 1);                  $self->Add(missingProtein => 1);
346                  # Here there was some sort of error and the protein sequence did                  # Here there was some sort of error and the protein sequence did
347                  # not come back. Ask for the DNA and translate it instead.                  # not come back. Ask for the DNA and translate it instead.
# Line 279  Line 353 
353              # Create the protein record.              # Create the protein record.
354              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
355              $self->PutR(IsProteinFor => $proteinID, $fid);              $self->PutR(IsProteinFor => $proteinID, $fid);
356              # Get the publications for this PEG.              # Connect this protein to the feature's publications (if any).
357              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              for my $pub (@dlits) {
358              for my $pub (@pubs) {                  $self->PutR(Concerns => $pub, $proteinID);
                 # Parse out the article title from the data.  
                 my (undef, undef, $data, $url) = @$pub;  
                 my @pieces = split /,/, $data, 3;  
                 if (defined $pieces[2]) {  
                     # Create the publication record.  
                     my $hl = HyperLink->new($pieces[2], $url);  
                     my $key = ERDB::DigestKey($url);  
                     $self->PutE(Publication => $key, citation => $hl);  
                     # Connect it to the protein.  
                     $self->PutR(Concerns => $key, $proteinID);  
                 }  
359              }              }
360          }          }
361          # Now we need to compute the identifiers. We start with the aliases.          # Now we need to compute the identifiers. We start with the aliases.
# Line 303  Line 366 
366          for my $aliasTuple (@$aliasList) {          for my $aliasTuple (@$aliasList) {
367              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
368              # Get the natural form. If there is none, then the canonical              # Get the natural form. If there is none, then the canonical
369              # form IS the natural form.              # form IS the natural form. Note we have to make a special check
370              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;              # for locus tags, which have an insane number of variants.
371                my $natural;
372                if ($aliasID =~ /LocusTag:(.+)/) {
373                    $natural = $1;
374                } else {
375                    $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
376                }
377              # Create the identifier record.              # Create the identifier record.
378              $self->PutE(Identifier => $aliasID, natural_form => $natural,              $self->PutE(Identifier => $aliasID, natural_form => $natural,
379                          source => $aliasType);                          source => $aliasType);
# Line 314  Line 383 
383                  $self->PutR(IsNamedBy => $proteinID, $aliasID);                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
384              } else {              } else {
385                  # No. Connect it to the feature.                  # No. Connect it to the feature.
386                  $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);                  $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
387                }
388              }              }
389            # Make the MD5 identifier an alias.
390            if (defined $md5Alias) {
391                $self->PutE(Identifier => "md5g|$md5Alias", natural_form => $md5Alias,
392                            source => 'MD5');
393                $self->PutR(IsIdentifiedBy => $fid, "md5g|$md5Alias", conf => 'A');
394          }          }
395          # Finally, this feature is an alias of itself.          # Finally, this feature is an alias of itself.
396          $self->PutE(Identifier => $fid, natural_form => $fid,          $self->PutE(Identifier => $fid, natural_form => $fid,
397                      source => 'SEED');                      source => 'SEED');
398          $self->PutR(Identifies => $fid, $fid, conf => 'A');          $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
399      }      }
400  }  }
401    

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3