[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC revision 1.2, Tue Jan 11 15:04:03 2011 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29    
30  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
31    
# Line 65  Line 66 
66      # Create the loader object.      # Create the loader object.
67      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
68      # Load the contigs.      # Load the contigs.
69        Trace("Loading contigs for $genome.") if T(2);
70      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
71      # Load the features.      # Load the features.
72        Trace("Loading features for $genome.") if T(2);
73      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
74      # Load the subsystem bindings.      # Load the subsystem bindings.
75        Trace("Loading subsystems for $genome.") if T(2);
76      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
77      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
78        Trace("Creating root for $genome.") if T(2);
79      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
80      # Return the statistics.      # Return the statistics.
81      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 300  Line 305 
305      # Compute the chunk ID.      # Compute the chunk ID.
306      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
307      # Connect this sequence to the contig.      # Connect this sequence to the contig.
308      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
309      # Create the DNA sequence.      # Create the DNA sequence.
310      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
311      # Record the chunk.      # Record the chunk.
# Line 419  Line 424 
424              if (exists $fids{$fid}) {              if (exists $fids{$fid}) {
425                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
426                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
             } else {  
                 # Otherwise connect it to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
427              }              }
428                # Connect this feature to the genome (done even when a duplicate was just deleted above).
429                $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
430              # Now we must parse the locations. This will contain a list of the location              # Now we must parse the locations. This will contain a list of the location
431              # data 4-tuples (contig, start, dir, len).              # data 4-tuples (contig, start, dir, len).
432              my @locData;              my @locData;
# Line 482  Line 486 
486                  # Output the last segment.                  # Output the last segment.
487                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
488              }              }
489                # Now we process the aliases and create the identifiers. We don't do this
490                # for RNA, because the RNA function is stored in the aliases.
491                if ($type ne 'rna') {
492                    for my $alias (@aliases) {
493                        my $normalized;
494                        # Determine the type.
495                        my $aliasType = AliasAnalysis::TypeOf($alias);
496                        $stats->Add(aliasAll => 1);
497                        # Is this a recognized type?
498                        if ($aliasType) {
499                            $stats->Add(aliasNormal => 1);
500                            # Yes. Write it normally.
501                            $self->CreateIdentifier($alias, B => $aliasType, $fid);
502                        } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
503                            # No, but this is a specially-marked locus tag.
504                            $normalized = $1;
505                            $stats->Add(aliasLocus => 1);
506                            $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
507                        } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
508                            # No, but this is a natural locus tag.
509                            $stats->Add(aliasLocus => 1);
510                            $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
511                        } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
512                            # No, but this is a natural gene name.
513                            $stats->Add(aliasGene => 1);
514                            $self->CreateIdentifier($normalized, B => 'GENE', $fid);
515                        } elsif ($alias =~ /^\d+$/) {
516                            # Here it's a naked number, which means it's a GI number
517                            # of some sort.
518                            $stats->Add(aliasGI => 1);
519                            $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
520                        } elsif ($alias =~ /^protein_id\|(.+)/) {
521                            # Here we have a REFSEQ protein ID. Right now we don't have a way to
522                            # handle that, because we don't know the feature's protein ID here.
523                            $stats->Add(aliasProtein => 1);
524                        } elsif ($alias =~ /[:|]/) {
525                            # Here it's an alias of an unknown type, so we skip it.
526                            $stats->Add(aliasUnknown => 1);
527                        } else {
528                            # Here it's a miscellaneous type.
529                            $stats->Add(aliasMisc => 1);
530                            $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
531                        }
532                    }
533                }
534          }          }
535      }      }
536  }  }
# Line 1030  Line 1079 
1079      return %retVal;      return %retVal;
1080  }  }
1081    
1082    =head3 CreateIdentifier
1083    
1084        $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
1085    
1086    Link an identifier to a feature. The identifier is presented in prefixed form and is of the
1087    specified type and the specified confidence level.
1088    
1089    =over 4
1090    
1091    =item alias
1092    
1093    Identifier to connect to the feature.
1094    
1095    =item conf
1096    
1097    Confidence level (C<A> curated, C<B> normal, C<C> protein only).
1098    
1099    =item aliasType
1100    
1101    Type of alias (e.g. C<NCBI>, C<LocusTag>).
1102    
1103    =item fid
1104    
1105    ID of the relevant feature.
1106    
1107    =back
1108    
1109    =cut
1110    
1111    sub CreateIdentifier {
1112        # Unpack the parameters: identifier text, confidence code, identifier source type, feature ID.
1113        my ($self, $alias, $conf, $aliasType, $fid) = @_;
1114        # Get the Sapling database object stored on this loader.
1115        my $sap = $self->{sap};
1116        # Compute the natural form: the text after the first ":" or "|" that has something following it, else the whole alias.
1117        my $natural = $alias;
1118        if ($natural =~ /[:|](.+)/) {
1119            $natural = $1;
1120        }
1121        # Ensure the Identifier entity exists in the database (via InsureEntity), passing its source type and natural form.
1122        $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1123        # Connect the identifier to the feature with an Identifies link carrying the confidence level.
1124        $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);
1125    }
1126    
1127  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3