[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.5, Fri Sep 9 14:55:01 2005 UTC | revision 1.7, Tue Sep 13 19:05:20 2005 UTC
# Line 220  Line 220 
220      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
221      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
222          Trace("Loading data for genome $genomeID.") if T(3);          Trace("Loading data for genome $genomeID.") if T(3);
223            $loadGenome->Add("genomeIn");
224          # The access code comes in via the genome hash.          # The access code comes in via the genome hash.
225          my $accessCode = $genomeHash->{$genomeID};          my $accessCode = $genomeHash->{$genomeID};
226          # Get the genus, species, and strain from the scientific name. Note that we append          # Get the genus, species, and strain from the scientific name. Note that we append
# Line 235  Line 236 
236          my @contigs = $fig->all_contigs($genomeID);          my @contigs = $fig->all_contigs($genomeID);
237          for my $contigID (@contigs) {          for my $contigID (@contigs) {
238              Trace("Processing contig $contigID for $genomeID.") if T(4);              Trace("Processing contig $contigID for $genomeID.") if T(4);
239                $loadContig->Add("contigIn");
240                $loadSequence->Add("contigIn");
241              # Create the contig ID.              # Create the contig ID.
242              my $sproutContigID = "$genomeID:$contigID";              my $sproutContigID = "$genomeID:$contigID";
243              # Create the contig record and relate it to the genome.              # Create the contig record and relate it to the genome.
# Line 246  Line 249 
249              # Now we get the sequence a chunk at a time.              # Now we get the sequence a chunk at a time.
250              my $contigLen = $fig->contig_ln($genomeID, $contigID);              my $contigLen = $fig->contig_ln($genomeID, $contigID);
251              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {              for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
252                    $loadSequence->Add("chunkIn");
253                  # Compute the endpoint of this chunk.                  # Compute the endpoint of this chunk.
254                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);                  my $end = FIG::min($i + $chunkSize - 1, $contigLen);
255                  # Get the actual DNA.                  # Get the actual DNA.
# Line 310  Line 314 
314      # Loop through the genomes found.      # Loop through the genomes found.
315      for my $genome (sort keys %{$genomeFilter}) {      for my $genome (sort keys %{$genomeFilter}) {
316          Trace("Generating coupling data for $genome.") if T(3);          Trace("Generating coupling data for $genome.") if T(3);
317            $loadCoupling->Add("genomeIn");
318          # Create a hash table for holding coupled pairs. We use this to prevent          # Create a hash table for holding coupled pairs. We use this to prevent
319          # duplicates. For example, if A is coupled to B, we don't want to also          # duplicates. For example, if A is coupled to B, we don't want to also
320          # assert that B is coupled to A, because we already know it. Fortunately,          # assert that B is coupled to A, because we already know it. Fortunately,
# Line 320  Line 325 
325          my @pegs = $fig->pegs_of($genome);          my @pegs = $fig->pegs_of($genome);
326          # Loop through the PEGs.          # Loop through the PEGs.
327          for my $peg1 (@pegs) {          for my $peg1 (@pegs) {
328                $loadCoupling->Add("pegIn");
329              Trace("Processing PEG $peg1 for $genome.") if T(4);              Trace("Processing PEG $peg1 for $genome.") if T(4);
330              # Get a list of the coupled PEGs.              # Get a list of the coupled PEGs.
331              my @couplings = $fig->coupled_to($peg1);              my @couplings = $fig->coupled_to($peg1);
# Line 330  Line 336 
336                  # Compute the coupling ID.                  # Compute the coupling ID.
337                  my $coupleID = Sprout::CouplingID($peg1, $peg2);                  my $coupleID = Sprout::CouplingID($peg1, $peg2);
338                  if (! exists $dupHash{$coupleID}) {                  if (! exists $dupHash{$coupleID}) {
339                        $loadCoupling->Add("couplingIn");
340                      # Here we have a new coupling to store in the load files.                      # Here we have a new coupling to store in the load files.
341                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);                      Trace("Storing coupling ($coupleID) with score $score.") if T(4);
342                      # Ensure we don't do this again.                      # Ensure we don't do this again.
# Line 345  Line 352 
352                      my %evidenceMap = ();                      my %evidenceMap = ();
353                      # Process each evidence item.                      # Process each evidence item.
354                      for my $evidenceData (@evidence) {                      for my $evidenceData (@evidence) {
355                            $loadPCH->Add("evidenceIn");
356                          my ($peg3, $peg4, $usage) = @{$evidenceData};                          my ($peg3, $peg4, $usage) = @{$evidenceData};
357                          # Only proceed if the evidence is from a Sprout                          # Only proceed if the evidence is from a Sprout
358                          # genome.                          # genome.
359                          if ($genomeFilter->{$fig->genome_of($peg3)}) {                          if ($genomeFilter->{$fig->genome_of($peg3)}) {
360                                $loadUsesAsEvidence->Add("evidenceChosen");
361                              my $evidenceKey = "$coupleID $peg3 $peg4";                              my $evidenceKey = "$coupleID $peg3 $peg4";
362                              # We store this evidence in the hash if the usage                              # We store this evidence in the hash if the usage
363                              # is nonzero or no prior evidence has been found. This                              # is nonzero or no prior evidence has been found. This
# Line 429  Line 438 
438      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
439      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
440          Trace("Loading features for genome $genomeID.") if T(3);          Trace("Loading features for genome $genomeID.") if T(3);
441            $loadFeature->Add("genomeIn");
442          # Get the feature list for this genome.          # Get the feature list for this genome.
443          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
444          # Loop through the features.          # Loop through the features.
445          for my $featureData (@{$features}) {          for my $featureData (@{$features}) {
446                $loadFeature->Add("featureIn");
447              # Split the tuple.              # Split the tuple.
448              my ($featureID, $locations, $aliases, $type) = @{$featureData};              my ($featureID, $locations, $aliases, $type) = @{$featureData};
449              # Create the feature record.              # Create the feature record.
450              $loadFeature->Put("$genomeID:$featureID", 1, $type);              $loadFeature->Put($featureID, 1, $type);
451              # Create the aliases.              # Create the aliases.
452              for my $alias (split /\s*,\s*/, $aliases) {              for my $alias (split /\s*,\s*/, $aliases) {
453                  $loadFeatureAlias->Put($featureID, $alias);                  $loadFeatureAlias->Put($featureID, $alias);
# Line 448  Line 459 
459              }              }
460              # If this is a peg, generate the translation and the upstream.              # If this is a peg, generate the translation and the upstream.
461              if ($type eq 'peg') {              if ($type eq 'peg') {
462                    $loadFeatureTranslation->Add("pegIn");
463                  my $translation = $fig->get_translation($featureID);                  my $translation = $fig->get_translation($featureID);
464                  if ($translation) {                  if ($translation) {
465                      $loadFeatureTranslation->Put($featureID, $translation);                      $loadFeatureTranslation->Put($featureID, $translation);
# Line 470  Line 482 
482                  # Split it into a list of chunks.                  # Split it into a list of chunks.
483                  my @locOList = ();                  my @locOList = ();
484                  while (my $peeling = $locObject->Peel($chunkSize)) {                  while (my $peeling = $locObject->Peel($chunkSize)) {
485                        $loadIsLocatedIn->Add("peeling");
486                      push @locOList, $peeling;                      push @locOList, $peeling;
487                  }                  }
488                  push @locOList, $locObject;                  push @locOList, $locObject;
# Line 528  Line 541 
541      Trace("Beginning BBH load.") if T(2);      Trace("Beginning BBH load.") if T(2);
542      # Now we loop through the genomes, generating the data for each one.      # Now we loop through the genomes, generating the data for each one.
543      for my $genomeID (sort keys %{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
544            $loadIsBidirectionalBestHitOf->Add("genomeIn");
545          Trace("Processing features for genome $genomeID.") if T(3);          Trace("Processing features for genome $genomeID.") if T(3);
546          # Get the feature list for this genome.          # Get the feature list for this genome.
547          my $features = $fig->all_features_detailed($genomeID);          my $features = $fig->all_features_detailed($genomeID);
# Line 625  Line 639 
639      my %roleData = ();      my %roleData = ();
640      for my $subsysID (@subsysIDs) {      for my $subsysID (@subsysIDs) {
641          Trace("Creating subsystem $subsysID.") if T(3);          Trace("Creating subsystem $subsysID.") if T(3);
642            $loadSubsystem->Add("subsystemIn");
643          # Create the subsystem record.          # Create the subsystem record.
644          $loadSubsystem->Put($subsysID);          $loadSubsystem->Put($subsysID);
645          # Get the subsystem's roles.          # Get the subsystem's roles.
646          my @roles = $fig->subsys_to_roles($subsysID);          my @roles = $fig->subsystem_to_roles($subsysID);
647          # Connect the roles to the subsystem. If a role is new, we create          # Connect the roles to the subsystem. If a role is new, we create
648          # a role record for it.          # a role record for it.
649          for my $roleID (@roles) {          for my $roleID (@roles) {
650                $loadOccursInSubsystem->Add("roleIn");
651              $loadOccursInSubsystem->Put($roleID, $subsysID);              $loadOccursInSubsystem->Put($roleID, $subsysID);
652              if (! exists $roleData{$roleID}) {              if (! exists $roleData{$roleID}) {
653                  $loadRole->Put($roleID);                  $loadRole->Put($roleID);
# Line 653  Line 669 
669                  for (my $i = 0; $i <= $#roles; $i++) {                  for (my $i = 0; $i <= $#roles; $i++) {
670                      my $role = $roles[$i];                      my $role = $roles[$i];
671                      # Get the features in the spreadsheet cell for this genome and role.                      # Get the features in the spreadsheet cell for this genome and role.
672                      my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i);                      my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i);
673                      # Only proceed if features exist.                      # Only proceed if features exist.
674                      if (@pegs > 0) {                      if (@pegs > 0) {
675                          # Create the spreadsheet cell.                          # Create the spreadsheet cell.
# Line 781  Line 797 
797      my $nextID = 1;      my $nextID = 1;
798      # Loop through the genomes.      # Loop through the genomes.
799      for my $genomeID (keys %{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
800            $loadProperty->Add("genomeIn");
801          # Get the genome's features. The feature ID is the first field in the          # Get the genome's features. The feature ID is the first field in the
802          # tuples returned by "all_features_detailed". We use "all_features_detailed"          # tuples returned by "all_features_detailed". We use "all_features_detailed"
803          # rather than "all_features" because we want all features regardless of type.          # rather than "all_features" because we want all features regardless of type.
804          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};          my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
805          # Loop through the features, creating HasProperty records.          # Loop through the features, creating HasProperty records.
806          for my $fid (@features) {          for my $fid (@features) {
807                $loadProperty->Add("featureIn");
808              # Get all attributes for this feature. We do this one feature at a time              # Get all attributes for this feature. We do this one feature at a time
809              # to insure we do not get any genome attributes.              # to insure we do not get any genome attributes.
810              my @attributeList = $fig->get_attributes($fid, '', '', '');              my @attributeList = $fig->get_attributes($fid, '', '', '');
# Line 872  Line 890 
890      # Get the current time.      # Get the current time.
891      my $time = time();      my $time = time();
892      # Loop through the genomes.      # Loop through the genomes.
893      for my $genomeID (%{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
894          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
895          # Get the genome's PEGs.          # Get the genome's PEGs.
896          my @pegs = $fig->pegs_of($genomeID);          my @pegs = $fig->pegs_of($genomeID);
# Line 897  Line 915 
915                  }                  }
916                  # Now loop through the real annotations.                  # Now loop through the real annotations.
917                  for my $tuple ($fig->feature_annotations($peg, "raw")) {                  for my $tuple ($fig->feature_annotations($peg, "raw")) {
918                      my ($fid, $timestamp, $user, $text) = $tuple;                      my ($fid, $timestamp, $user, $text) = @{$tuple};
919                      # Here we fix up the annotation text. "\r" is removed,                      # Here we fix up the annotation text. "\r" is removed,
920                      # and "\t" and "\n" are escaped. Note we use the "s"                      # and "\t" and "\n" are escaped. Note we use the "s"
921                      # modifier so that new-lines inside the text do not                      # modifier so that new-lines inside the text do not
# Line 984  Line 1002 
1002      my %sourceDesc = ();      my %sourceDesc = ();
1003      # Loop through the genomes.      # Loop through the genomes.
1004      my $line;      my $line;
1005      for my $genomeID (%{$genomeHash}) {      for my $genomeID (sort keys %{$genomeHash}) {
1006          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
1007          # Open the project file.          # Open the project file.
1008          if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) &&          if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) &&
1009              defined($line = <TMP>)) {              defined($line = <TMP>)) {
1010              chomp $line;              chomp $line;
1011              my($sourceID, $desc, $url) = split(/\t/,$_);              my($sourceID, $desc, $url) = split(/\t/,$line);
1012              $loadComesFrom->Put($genomeID, $sourceID);              $loadComesFrom->Put($genomeID, $sourceID);
1013              if ($url && ! exists $sourceURL{$genomeID}) {              if ($url && ! exists $sourceURL{$genomeID}) {
1014                  $loadSourceURL->Put($sourceID, $url);                  $loadSourceURL->Put($sourceID, $url);
# Line 1008  Line 1026 
1026      return $retVal;      return $retVal;
1027  }  }
1028    
1029    =head3 LoadExternalData
1030    
1031    C<< my $stats = $spl->LoadExternalData(); >>
1032    
1033    Load the external data from FIG into Sprout.
1034    
1035    External data contains information about external feature IDs.
1036    
1037    The following relations are loaded by this method.
1038    
1039        ExternalAliasFunc
1040        ExternalAliasOrg
1041    
1042    The support for external IDs in FIG is hidden beneath layers of other data, so
1043    we access the SEED files directly to create these tables. This is also one of
1044    the few load methods that does not proceed genome by genome.
1045    
1046    =over 4
1047    
1048    =item RETURNS
1049    
1050    Returns a statistics object for the loads.
1051    
1052    =back
1053    
1054    =cut
1055    #: Return Type $%;
1056    sub LoadExternalData { # Fill ExternalAliasOrg and ExternalAliasFunc from the global ext_org.table and ext_func.table files.
1057        # Get this object instance.
1058        my ($self) = @_;
1059        # Get the FIG object.
1060        my $fig = $self->{fig};
1061        # Get the genome hash.
1062        my $genomeHash = $self->{genomes};
1063        my $genomeCount = (keys %{$genomeHash}); # scalar assignment, so this is the number of genomes
1064        # Convert the genome hash. We'll get the genus and species for each genome and make
1065        # it the key. NOTE(review): %speciesHash is not referenced again in this method.
1066        my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1067        # Create load objects for each of the tables we're loading. The second argument scales with the
1068        my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000); # genome count; presumably a row estimate (TODO confirm)
1069        my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);
1070        Trace("Beginning external data load.") if T(2);
1071        # We loop through the files one at a time. First, the organism file.
1072        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
1073        my $orgLine;
1074        while (defined($orgLine = <ORGS>)) {
1075            # Clean the input line. NOTE(review): unlike the function loop below, blank lines are not skipped here.
1076            chomp $orgLine;
1077            # Parse the organism name: the first two tab-delimited fields are kept, any others are dropped.
1078            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
1079            $loadExternalAliasOrg->Put($protID, $name);
1080        }
1081        close ORGS;
1082        # Now the function file.
1083        my $funcLine;
1084        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
1085        while (defined($funcLine = <FUNCS>)) {
1086            # Clean the line ending.
1087            chomp $funcLine;
1088            # Only proceed if the line is non-blank.
1089            if ($funcLine) {
1090                # Split it into fields on tabs, discarding whitespace around each tab.
1091                my @funcFields = split /\s*\t\s*/, $funcLine;
1092                # If there's an EC number (third field starting with "EC "), append it to the description.
1093                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
1094                    $funcFields[1] .= " $1";
1095                }
1096                # Output the function line: only the first two fields are written.
1097                $loadExternalAliasFunc->Put(@funcFields[0,1]);
1098            }
1099        }
1100        # Finish the load and hand back whatever _FinishAll returns. NOTE(review): FUNCS is never closed here, unlike ORGS.
1101        my $retVal = $self->_FinishAll();
1102        return $retVal;
1103    }
1104    
1105  =head3 LoadGroupData  =head3 LoadGroupData
1106    
# Line 1045  Line 1138 
1138      Trace("Beginning group data load.") if T(2);      Trace("Beginning group data load.") if T(2);
1139      # Loop through the genomes.      # Loop through the genomes.
1140      my $line;      my $line;
1141      for my $genomeID (%{$genomeHash}) {      for my $genomeID (keys %{$genomeHash}) {
1142          Trace("Processing $genomeID.") if T(3);          Trace("Processing $genomeID.") if T(3);
1143          # Open the NMPDR group file for this genome.          # Open the NMPDR group file for this genome.
1144          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&          if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
1145              defined($line = <TMP>)) {              defined($line = <TMP>)) {
1146              # Clean the line ending.              # Clean the line ending.
1147              chomp;              chomp $line;
1148              # Add the group to the table. Note that there can only be one group              # Add the group to the table. Note that there can only be one group
1149              # per genome.              # per genome.
1150              $loadGenomeGroups->Put($genomeID, $line);              $loadGenomeGroups->Put($genomeID, $line);

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3