[Bio] / Sprout / FeatureSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/FeatureSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9, Mon Feb 1 20:14:28 2010 UTC revision 1.19, Tue Sep 13 21:07:59 2011 UTC
# Line 27  Line 27 
27      use HyperLink;      use HyperLink;
28      use AliasAnalysis;      use AliasAnalysis;
29      use LoaderUtils;      use LoaderUtils;
     use Digest::MD5;  
30      use SeedUtils;      use SeedUtils;
31        use gjoseqlib;
32        use MD5Computer;
33      use base 'BaseSaplingLoader';      use base 'BaseSaplingLoader';
34    
35  =head1 Sapling Feature Load Group Class  =head1 Sapling Feature Load Group Class
# Line 66  Line 67 
67      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
68      # Create the table list.      # Create the table list.
69      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink      my @tables = sort qw(Feature FeatureEssential FeatureEvidence FeatureLink
70                           FeatureVirulent IsOwnerOf IsLocatedIn Identifies                           FeatureVirulent IsOwnerOf IsLocatedIn IsIdentifiedBy
71                           Identifier IsNamedBy ProteinSequence Concerns                           Identifier IsNamedBy ProteinSequence Concerns
72                           IsAttachmentSiteFor Publication IsProteinFor                           IsAttachmentSiteFor Publication IsProteinFor
73                           Role IsFunctionalIn);                           Role RoleIndex IsFunctionalIn);
74      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
75      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
76      # Return it.      # Return it.
# Line 98  Line 99 
99          # Load this genome's features.          # Load this genome's features.
100          $self->LoadGenomeFeatures($genomeID);          $self->LoadGenomeFeatures($genomeID);
101      } else {      } else {
102          # The global data is the roles from subsystems.          # The global data is the roles from subsystems and the publications.
103          my $fig = $self->source();          my $fig = $self->source();
104          # First, we get the subsystem list.          # We need the master map of roles to IDs.
105            my %roleHash;
106            my $lastRoleIndex = -1;
107            my $roleMapFile = $erdb->LoadDirectory() . "/roleMap.tbl";
108            if (-f $roleMapFile) {
109                for my $mapLine (Tracer::GetFile($roleMapFile)) {
110                    my ($role, $idx) = split /\t/, $mapLine;
111                    $roleHash{$role} = $idx;
112                    if ($idx > $lastRoleIndex) {
113                        $lastRoleIndex = $idx;
114                    }
115                }
116            }
117            # We'll track duplicate roles in here.
118            my %roleList;
119            # Now we get the subsystem list.
120          my $subHash = $erdb->SubsystemHash();          my $subHash = $erdb->SubsystemHash();
121          for my $sub (sort keys %$subHash) {          for my $sub (sort keys %$subHash) {
122              $self->Add(subsystems => 1);              $self->Add(subsystems => 1);
# Line 109  Line 125 
125              my @roles = $fig->subsystem_to_roles($sub);              my @roles = $fig->subsystem_to_roles($sub);
126              for my $role (@roles) {              for my $role (@roles) {
127                  $self->Add(subsystemRoles => 1);                  $self->Add(subsystemRoles => 1);
128                  $self->PutE(Role => $role, hypothetical => hypo($role));                  # Check to see if this role is hypothetical.
129                    my $hypo = hypo($role);
130                    if (! $hypo) {
131                        # Is this role in the role index hash?
132                        my $roleIndex = $roleHash{$role};
133                        if (! defined $roleIndex) {
134                            # No, compute a new index for it.
135                            $roleIndex = ++$lastRoleIndex;
136                            $roleHash{$role} = $roleIndex;
137                        }
138                        if (! $roleList{$role}) {
139                            $roleList{$role} = 1;
140                            $self->PutE(RoleIndex => $role, role_index => $roleIndex);
141                        }
142                    }
143                    $self->PutE(Role => $role, hypothetical => $hypo);
144                }
145            }
146            Trace("Subsystem roles generated.") if T(2);
147            # Write out the role master file.
148            Tracer::PutFile($roleMapFile, [map { "$_\t$roleHash{$_}" } keys %roleHash]);
149            Trace("Role master file written to $roleMapFile.") if T(2);
150            # Now, we get the publications.
151            my $pubs = $fig->all_titles();
152            for my $pub (@$pubs) {
153                # Get the ID and title.
154                my ($pubmedID, $title) = @$pub;
155                # Only proceed if the ID is valid.
156                if ($pubmedID) {
157                    # Create a hyperlink from the title and the pubmed ID.
158                    my $link;
159                    if (! $title) {
160                        $link = HyperLink->new("<unknown>");
161                    } else {
162                        $link = HyperLink->new($title, "http://www.ncbi.nlm.nih.gov/pubmed/$pubmedID");
163                    }
164                    # Create the publication record.
165                    $self->PutE(Publication => $pubmedID, citation => $link);
166              }              }
167          }          }
168          Trace("Subsystem roles generated.") if T(3);          Trace("Publications generated.") if T(2);
169      }      }
170  }  }
171    
# Line 149  Line 202 
202          $self->Add(missingAliasFile => 1);          $self->Add(missingAliasFile => 1);
203          $aliasHash = {};          $aliasHash = {};
204      }      }
205        # Get all of this genome's protein sequences.
206        my %seqs = map { $_->[0] => $_->[2] } gjoseqlib::read_fasta("$FIG_Config::organisms/$genomeID/Features/peg/fasta");
207      # Get all of this genome's features.      # Get all of this genome's features.
208      my $featureList = $fig->all_features_detailed_fast($genomeID);      my $featureList = $fig->all_features_detailed_fast($genomeID);
209      # Loop through them.      # Loop through them.
# Line 214  Line 269 
269          # Connect the feature to its genome.          # Connect the feature to its genome.
270          $self->PutR(IsOwnerOf => $genomeID, $fid);          $self->PutR(IsOwnerOf => $genomeID, $fid);
271          # Connect the feature to its roles.          # Connect the feature to its roles.
272          my ($roles, $errors) = LoaderUtils::RolesForLoading($assignment);          my ($roles, $errors) = SeedUtils::roles_for_loading($assignment);
273          if (! defined $roles) {          if (! defined $roles) {
274              # Here the functional assignment was suspicious.              # Here the functional assignment was suspicious.
275              $self->Add(suspiciousFunction => 1);              $self->Add(suspiciousFunction => 1);
# Line 229  Line 284 
284              $self->Add(badFeatureRoles => $errors);              $self->Add(badFeatureRoles => $errors);
285          }          }
286          # Now we have a whole bunch of attribute-related stuff to store in          # Now we have a whole bunch of attribute-related stuff to store in
287          # secondary Feature tables. First is the evidence codes.          # secondary Feature tables. First is the evidence codes. This is special
288            # because we have to save the DLIT numbers.
289            my @dlits;
290          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');          my @evidenceTuples = $fig->get_attributes($fid, 'evidence_code');
291          for my $evidenceTuple (@evidenceTuples) {          for my $evidenceTuple (@evidenceTuples) {
292              my (undef, undef, $code) = @$evidenceTuple;              my (undef, undef, $code) = @$evidenceTuple;
293              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);              $self->PutE(FeatureEvidence => $fid, 'evidence-code' => $code);
294                # If this is a direct literature reference, save it.
295                if ($code =~ /dlit\((\d+)/) {
296                    push @dlits, $1;
297                    $self->Add(dlits => 1);
298                }
299          }          }
300          # Now we have the external links. These are stored using hyperlink objects.          # Now we have the external links. These are stored using hyperlink objects.
301          my @links = $fig->fid_links($fid);          my @links = $fig->fid_links($fid);
# Line 265  Line 327 
327          my $proteinID;          my $proteinID;
328          if ($type eq 'peg') {          if ($type eq 'peg') {
329              # Get the translation.              # Get the translation.
330              my $proteinSequence = $fig->get_translation($fid);              my $proteinSequence = $seqs{$fid};
331              if (! $proteinSequence) {              if (! $proteinSequence) {
332                  Trace("No protein sequence found for $fid.") if T(2);                  Trace("No protein sequence found for $fid.") if T(ERDBLoadGroup => 2);
333                  $self->Add(missingProtein => 1);                  $self->Add(missingProtein => 1);
334                  # Here there was some sort of error and the protein sequence did                  # Here there was some sort of error and the protein sequence did
335                  # not come back. Ask for the DNA and translate it instead.                  # not come back. Ask for the DNA and translate it instead.
# Line 279  Line 341 
341              # Create the protein record.              # Create the protein record.
342              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);              $self->PutE(ProteinSequence => $proteinID, sequence => $proteinSequence);
343              $self->PutR(IsProteinFor => $proteinID, $fid);              $self->PutR(IsProteinFor => $proteinID, $fid);
344              # Get the publications for this PEG.              # Connect this protein to the feature's publications (if any).
345              my @pubs = $fig->get_attributes($fid, 'PUBMED_CURATED_RELEVANT');              for my $pub (@dlits) {
346              for my $pub (@pubs) {                  $self->PutR(Concerns => $pub, $proteinID);
                 # Parse out the article title from the data.  
                 my (undef, undef, $data, $url) = @$pub;  
                 my @pieces = split /,/, $data, 3;  
                 if (defined $pieces[2]) {  
                     # Create the publication record.  
                     my $hl = HyperLink->new($pieces[2], $url);  
                     my $key = ERDB::DigestKey($url);  
                     $self->PutE(Publication => $key, citation => $hl);  
                     # Connect it to the protein.  
                     $self->PutR(Concerns => $key, $proteinID);  
                 }  
347              }              }
348          }          }
349          # Now we need to compute the identifiers. We start with the aliases.          # Now we need to compute the identifiers. We start with the aliases.
# Line 303  Line 354 
354          for my $aliasTuple (@$aliasList) {          for my $aliasTuple (@$aliasList) {
355              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;              my ($aliasID, $aliasType, $aliasConf) = @$aliasTuple;
356              # Get the natural form. If there is none, then the canonical              # Get the natural form. If there is none, then the canonical
357              # form IS the natural form.              # form IS the natural form. Note we have to make a special check
358              my $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;              # for locus tags, which have an insane number of variants.
359                my $natural;
360                if ($aliasID =~ /LocusTag:(.+)/) {
361                    $natural = $1;
362                } else {
363                    $natural = AliasAnalysis::Type($aliasType => $aliasID) || $aliasID;
364                }
365              # Create the identifier record.              # Create the identifier record.
366              $self->PutE(Identifier => $aliasID, natural_form => $natural,              $self->PutE(Identifier => $aliasID, natural_form => $natural,
367                          source => $aliasType);                          source => $aliasType);
# Line 314  Line 371 
371                  $self->PutR(IsNamedBy => $proteinID, $aliasID);                  $self->PutR(IsNamedBy => $proteinID, $aliasID);
372              } else {              } else {
373                  # No. Connect it to the feature.                  # No. Connect it to the feature.
374                  $self->PutR(Identifies => $aliasID, $fid, conf => $aliasConf);                  $self->PutR(IsIdentifiedBy => $fid, $aliasID, conf => $aliasConf);
375              }              }
376          }          }
377          # Finally, this feature is an alias of itself.          # Finally, this feature is an alias of itself.
378          $self->PutE(Identifier => $fid, natural_form => $fid,          $self->PutE(Identifier => $fid, natural_form => $fid,
379                      source => 'SEED');                      source => 'SEED');
380          $self->PutR(Identifies => $fid, $fid, conf => 'A');          $self->PutR(IsIdentifiedBy => $fid, $fid, conf => 'A');
381      }      }
382  }  }
383    

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.19

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3