[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9, Mon Feb 1 20:14:28 2010 UTC revision 1.15, Fri Jun 24 20:46:53 2011 UTC
# Line 27  Line 27 
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29      use LoaderUtils;      use LoaderUtils;
     use Digest::MD5;  
30      use SeedUtils;      use SeedUtils;
31        use gjoseqlib;
32        use MD5Computer;
33      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
34    
35  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 66  Line 67 
67      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
68      # Create the table list.      # Create the table list.
69      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
70                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
71                           Identifier IsNamedBy ProteinSequence Concerns                           Identifier IsNamedBy ProteinSequence Concerns
72                           IsAttachmentSiteFor Publication IsProteinFor                           IsAttachmentSiteFor Publication IsProteinFor
73                           Role IsFunctionalIn);                           Role RoleIndex IsFunctionalIn);
74      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
75      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
76      # Return it.      # Return it.
# Line 98  Line 99 
99          # Load this genome's features.          # Load this genome's features.
100          $self->LoadGenomeFeatures($genomeID);          $self->LoadGenomeFeatures($genomeID);
101      } else {      } else {
102          # The global data is the roles from subsystems.          # The global data is the roles from subsystems and the publications.
103          my $fig = $self->source();          my $fig = $self->source();
104          # First, we get the subsystem list.          # We need the master map of roles to IDs.
105            my %roleHash;
106            my $lastRoleIndex = -1;
107            my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
108            if (-f $roleMapFile) {
109                for my $mapLine (Tracer::GetFile($roleMapFile)) {
110                    my ($role, $idx) = split /\t/, $mapLine;
111                    $roleHash{$role} = $idx;
112                    if ($idx > $lastRoleIndex) {
113                        $lastRoleIndex = $idx;
114                    }
115                }
116            }
117            # We'll track duplicate roles in here.
118            my %roleList;
119            # Now we get the subsystem list.
120          my $subHash = $erdb->SubsystemHash();          my $subHash = $erdb->SubsystemHash();
121          for my $sub (sort keys %$subHash) {          for my $sub (sort keys %$subHash) {
122              $self->Add(subsystems => 1);              $self->Add(subsystems => 1);
# Line 109  Line 125 
125              my @roles = $fig->subsystem_to_roles($sub);              my @roles = $fig->subsystem_to_roles($sub);
126              for my $role (@roles) {              for my $role (@roles) {
127                  $self->Add(subsystemRoles => 1);                  $self->Add(subsystemRoles => 1);
128                  $self->PutE(Role => $role, hypothetical => hypo($role));                  # Check to see if this role is hypothetical.
129                    my $hypo = hypo($role);
130                    if (! $hypo) {
131                        # Is this role in the role index hash?
132                        my $roleIndex = $roleHash{$role};
133                        if (! defined $roleIndex) {
134                            # No, compute a new index for it.
135                            $roleIndex = ++$lastRoleIndex;
136                            $roleHash{$role} = $roleIndex;
137                        }
138                        if (! $roleList{$role}) {
139                            $roleList{$role} = 1;
140                            $self->PutE(RoleIndex => $role, role_index => $roleIndex);
141                        }
142                    }
143                    $self->PutE(Role => $role, hypothetical => $hypo);
144                }
145            }
146            Trace("Subsystem roles generated.") if T(2);
147            # Write out the role master file.
148            Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
149            Trace("Role master file written to $roleMapFile.") if T(2);
150            # Now, we get the publications.
151            my $pubs = $fig->all_titles();
152            for my $pub (@$pubs) {
153                # Get the ID and title.
154                my ($pubmedID, $title) = @$pub;
155                # Only proceed if the ID is valid.
156                if ($pubmedID) {
157                    # Create a hyperlink from the title and the pubmed ID.
158                    my $link;
159                    if (! $title) {
160                        $link = HyperLink->new("<unknown>");
161                    } else {
162                        $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
163                    }
164                    # Create the publication record.
165                    $self->PutE(Publication => $pubmedID, citation => $link);
166              }              }
167          }          }
168          Trace("Subsystem roles generated.") if T(3);          Trace("Publications generated.") if T(2);
169      }      }
170  }  }
171    
# Line 149  Line 202 
202          $self->Add(missingAliasFile => 1);          $self->Add(missingAliasFile => 1);
203          $aliasHash = {};          $aliasHash = {};
204      }      }
205        # Get all of this genome's protein sequences.
206        my %seqs = map { $_->[0] => $_->[2] } gjoseqlib::read_fasta("$FIG_Config::organisms/$genomeID/Features/peg/fasta");
207      # Get all of this genome's features.      # Get all of this genome's features.
208      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
209        # Compute the MD5 identifiers for the genome and its contigs.
210        my $genomeMD5Data = MD5Computer->new_from_fasta("$FIG_Config::organisms/$genomeID/contigs");
211      # Loop through them.      # Loop through them.
212      for my $feature (@$featureList) {      for my $feature (@$featureList) {
213          # Get this feature's data.          # Get this feature's data.
# Line 207  Line 264 
264                  $self->Add(badAttachment => 1);                  $self->Add(badAttachment => 1);
265              }              }
266          }          }
267            # Compute the MD5 identifier.
268            my $md5Alias = $genomeMD5Data->ComputeGeneMD5(map { $_->String() } @locs);
269          # Emit the feature record.          # Emit the feature record.
270          $self->PutE(Feature => $fid, feature_type => $type,          $self->PutE(Feature => $fid, feature_type => $type,
271                      sequence_length => $seqLen, function => $assignment,                      sequence_length => $seqLen, function => $assignment,
# Line 214  Line 273 
273          # Connect the feature to its genome.          # Connect the feature to its genome.
274          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
275          # Connect the feature to its roles.          # Connect the feature to its roles.
276          my ($roles, $errors) = LoaderUtils::RolesForLoading($assignment);          my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
277          if (! defined $roles) {          if (! defined $roles) {
278              # Here the functional assignment was suspicious.              # Here the functional assignment was suspicious.
279              $self->Add(suspiciousFunction => 1);              $self->Add(suspiciousFunction => 1);
# Line 229  Line 288 
288              $self->Add(badFeatureRoles => $errors);              $self->Add(badFeatureRoles => $errors);
289          }          }
290          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
291          # secondary Feature tables. First is the evidence codes.          # secondary Feature tables. First is the evidence codes. This is special
292            # because we have to save the DLIT numbers.
293            my @dlits;
294          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
295          for my $evidenceTuple (@evidenceTuples) {          for my $evidenceTuple (@evidenceTuples) {
296              my (undef, undef, $code) = @$evidenceTuple;              my (undef, undef, $code) = @$evidenceTuple;
297              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
298                # If this is a direct literature reference, save it.
299                if ($code =~ /dlit\((\d+)/) {
300                    push @dlits, $1;
301                    $self->Add(dlits => 1);
302                }
303          }          }
304          # Now we have the external links. These are stored using hyperlink objects.          # Now we have the external links. These are stored using hyperlink objects.
305          my @links = $fig->fid_links($fid);          my @links = $fig->fid_links($fid);
# Line 265  Line 331 
331          my $proteinID;          my $proteinID;
332          if ($type eq 'peg') {          if ($type eq 'peg') {
333              # Get the translation.              # Get the translation.
334              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $seqs{$fid};
335              if (! $proteinSequence) {              if (! $proteinSequence) {
336                  Trace("No protein sequence found for $fid.") if T(2);                  Trace("No protein sequence found for $fid.") if T(ERDBLoadGroup => 2);
337                  $self->Add(missingProtein => 1);                  $self->Add(missingProtein => 1);
338                  # Here there was some sort of error and the protein sequence did                  # Here there was some sort of error and the protein sequence did
339                  # not come back. Ask for the DNA and translate it instead.                  # not come back. Ask for the DNA and translate it instead.
# Line 279  Line 345 
345              # Create the protein record.              # Create the protein record.
346              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
347              $self->PutR(IsProteinFor => $proteinID, $fid);              $self->PutR(IsProteinFor => $proteinID, $fid);
348              # Get the publications for this PEG.              # Connect this protein to the feature's publications (if any).
349              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              for my $pub (@dlits) {
350              for my $pub (@pubs) {                  $self->PutR(Concerns => $pub, $proteinID);
                 # Parse out the article title from the data.  
                 my (undef, undef, $data, $url) = @$pub;  
                 my @pieces = split /,/, $data, 3;  
                 if (defined $pieces[2]) {  
                     # Create the publication record.  
                     my $hl = HyperLink->new($pieces[2], $url);  
                     my $key = ERDB::DigestKey($url);  
                     $self->PutE(Publication => $key, citation => $hl);  
                     # Connect it to the protein.  
                     $self->PutR(Concerns => $key, $proteinID);  
                 }  
351              }              }
352          }          }
353          # Now we need to compute the identifiers. We start with the aliases.          # Now we need to compute the identifiers. We start with the aliases.
# Line 303  Line 358 
358          for my $aliasTuple (@$aliasList) {          for my $aliasTuple (@$aliasList) {
359              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
360              # Get the natural form. If there is none, then the canonical              # Get the natural form. If there is none, then the canonical
361              # form IS the natural form.              # form IS the natural form. Note we have to make a special check
362              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;              # for locus tags, which have an insane number of variants.
363                my $natural;
364                if ($aliasID =~ /LocusTag:(.+)/) {
365                    $natural = $1;
366                } else {
367                    $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
368                }
369              # Create the identifier record.              # Create the identifier record.
370              $self->PutE(Identifier => $aliasID, natural_form => $natural,              $self->PutE(Identifier => $aliasID, natural_form => $natural,
371                          source => $aliasType);                          source => $aliasType);
# Line 314  Line 375 
375                  $self->PutR(IsNamedBy => $proteinID, $aliasID);                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
376              } else {              } else {
377                  # No. Connect it to the feature.                  # No. Connect it to the feature.
378                  $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);                  $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
379              }              }
380          }          }
381            # Make the MD5 identifier an alias.
382            $self->PutE(Identifier => "md5g|$md5Alias", natural_form => $md5Alias,
383                        source => 'MD5');
384            $self->PutR(IsIdentifiedBy => $fid, "md5g|$md5Alias", conf => 'A');
385          # Finally, this feature is an alias of itself.          # Finally, this feature is an alias of itself.
386          $self->PutE(Identifier => $fid, natural_form => $fid,          $self->PutE(Identifier => $fid, natural_form => $fid,
387                      source => 'SEED');                      source => 'SEED');
388          $self->PutR(Identifies => $fid, $fid, conf => 'A');          $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
389      }      }
390  }  }
391    

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3