[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Diff of /Sprout/SproutLoad.pm

Parent Directory | Revision Log | View Patch

revision 1.87, Mon Sep 10 18:16:54 2007 UTC | revision 1.92, Sun Mar 23 16:33:15 2008 UTC
# Line 12  Line 12 
12      use Stats;      use Stats;
13      use BasicLocation;      use BasicLocation;
14      use HTML;      use HTML;
15        use AliasAnalysis;
16    
17  =head1 Sprout Load Methods  =head1 Sprout Load Methods
18    
# Line 51  Line 52 
52    
53  =head3 new  =head3 new
54    
55  C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>      my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
56    
57  Construct a new Sprout Loader object, specifying the two participating databases and  Construct a new Sprout Loader object, specifying the two participating databases and
58  the name of the files containing the list of genomes and subsystems to use.  the name of the files containing the list of genomes and subsystems to use.
# Line 102  Line 103 
103              # Here we want all the complete genomes and an access code of 1.              # Here we want all the complete genomes and an access code of 1.
104              my @genomeList = $fig->genomes(1);              my @genomeList = $fig->genomes(1);
105              %genomes = map { $_ => 1 } @genomeList;              %genomes = map { $_ => 1 } @genomeList;
106                Trace(scalar(keys %genomes) . " genomes found.") if T(3);
107          } else {          } else {
108              my $type = ref $genomeFile;              my $type = ref $genomeFile;
109              Trace("Genome file parameter type is \"$type\".") if T(3);              Trace("Genome file parameter type is \"$type\".") if T(3);
# Line 168  Line 170 
170          for my $subsystem (keys %subsystems) {          for my $subsystem (keys %subsystems) {
171              my $name = $subsystem;              my $name = $subsystem;
172              $name =~ s/_/ /g;              $name =~ s/_/ /g;
 #            my $classes = $fig->subsystem_classification($subsystem);  
 #            $name .= " " . join(" ", @{$classes});  
173              $subsystems{$subsystem} = $name;              $subsystems{$subsystem} = $name;
174          }          }
175      }      }
# Line 196  Line 196 
196    
197  =head3 LoadOnly  =head3 LoadOnly
198    
199  C<< my $flag = $spl->LoadOnly; >>      my $flag = $spl->LoadOnly;
200    
201  Return TRUE if we are in load-only mode, else FALSE.  Return TRUE if we are in load-only mode, else FALSE.
202    
# Line 210  Line 210 
210    
211  =head3 LoadGenomeData  =head3 LoadGenomeData
212    
213  C<< my $stats = $spl->LoadGenomeData(); >>      my $stats = $spl->LoadGenomeData();
214    
215  Load the Genome, Contig, and Sequence data from FIG into Sprout.  Load the Genome, Contig, and Sequence data from FIG into Sprout.
216    
# Line 255  Line 255 
255          Trace("Loading from existing files.") if T(2);          Trace("Loading from existing files.") if T(2);
256      } else {      } else {
257          Trace("Generating genome data.") if T(2);          Trace("Generating genome data.") if T(2);
258            # Get the full info for the FIG genomes.
259            my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
260                                                pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
261          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
262          for my $genomeID (sort keys %{$genomeHash}) {          for my $genomeID (sort keys %{$genomeHash}) {
263              Trace("Generating data for genome $genomeID.") if T(3);              Trace("Generating data for genome $genomeID.") if T(3);
# Line 284  Line 287 
287                  $group = $FIG_Config::otherGroup;                  $group = $FIG_Config::otherGroup;
288              }              }
289              close TMP;              close TMP;
290                # Get the contigs.
291                my @contigs = $fig->all_contigs($genomeID);
292                # Get this genome's info array.
293                my $info = $genomeInfo{$genomeID};
294              # Output the genome record.              # Output the genome record.
295              $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),              $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
296                               $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);                               $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
297              # Now we loop through each of the genome's contigs.              # Now we loop through each of the genome's contigs.
             my @contigs = $fig->all_contigs($genomeID);  
298              for my $contigID (@contigs) {              for my $contigID (@contigs) {
299                  Trace("Processing contig $contigID for $genomeID.") if T(4);                  Trace("Processing contig $contigID for $genomeID.") if T(4);
300                  $loadContig->Add("contigIn");                  $loadContig->Add("contigIn");
# Line 326  Line 332 
332    
333  =head3 LoadFeatureData  =head3 LoadFeatureData
334    
335  C<< my $stats = $spl->LoadFeatureData(); >>      my $stats = $spl->LoadFeatureData();
336    
337  Load the feature data from FIG into Sprout.  Load the feature data from FIG into Sprout.
338    
# Line 397  Line 403 
403      } else {      } else {
404          Trace("Generating feature data.") if T(2);          Trace("Generating feature data.") if T(2);
405          # Now we loop through the genomes, generating the data for each one.          # Now we loop through the genomes, generating the data for each one.
406          for my $genomeID (sort keys %{$genomeHash}) {          my @allGenomes = sort keys %{$genomeHash};
407            Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
408            for my $genomeID (@allGenomes) {
409              Trace("Loading features for genome $genomeID.") if T(3);              Trace("Loading features for genome $genomeID.") if T(3);
410              $loadFeature->Add("genomeIn");              $loadFeature->Add("genomeIn");
411              # Get the feature list for this genome.              # Get the feature list for this genome.
# Line 409  Line 417 
417              Trace("$count features found for genome $genomeID.") if T(3);              Trace("$count features found for genome $genomeID.") if T(3);
418              # Get the attributes for this genome and put them in a hash by feature ID.              # Get the attributes for this genome and put them in a hash by feature ID.
419              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);              my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
420                Trace("Looping through features for $genomeID.") if T(3);
421              # Set up for our duplicate-feature check.              # Set up for our duplicate-feature check.
422              my $oldFeatureID = "";              my $oldFeatureID = "";
423              # Loop through the features.              # Loop through the features.
# Line 448  Line 457 
457                              $alias{$alias} = 1;                              $alias{$alias} = 1;
458                          }                          }
459                      }                      }
460                        # Add the corresponding IDs. Note we have to remove the FIG ID from the
461                        # return list. It's already among the keywords.
462                        my @corresponders = grep { $_ !~ /^fig/} $fig->get_corresponding_ids($featureID);
463                        push @keywords, @corresponders;
464                      Trace("Assignment for $featureID is: $assignment") if T(4);                      Trace("Assignment for $featureID is: $assignment") if T(4);
465                      # Break the assignment into words and shove it onto the                      # Break the assignment into words and shove it onto the
466                      # keyword list.                      # keyword list.
# Line 552  Line 565 
565                          my @cddData = sort keys %{$cddHash};                          my @cddData = sort keys %{$cddHash};
566                          for my $cdd (@cddData) {                          for my $cdd (@cddData) {
567                              # Extract the score for this CDD and decode it.                              # Extract the score for this CDD and decode it.
568                              my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]);                              my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
569                              my $realScore = FIGRules::DecodeScore($codeScore);                              my $realScore = FIGRules::DecodeScore($codeScore);
570                                # We can't afford to crash because of a bad attribute
571                                # value, hence the IF below.
572                                if (! defined($realScore)) {
573                                    # Bad score, so count it.
574                                    $loadFeature->Add('badCDDscore');
575                                } else {
576                              # Create the connection.                              # Create the connection.
577                              $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);                              $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
578                              # If this CDD does not yet exist, create its record.                              # If this CDD does not yet exist, create its record.
# Line 563  Line 582 
582                              }                              }
583                          }                          }
584                      }                      }
585                        }
586                      # Now we need to bust up hyphenated words in the keyword                      # Now we need to bust up hyphenated words in the keyword
587                      # list. We keep them separate and put them at the end so                      # list. We keep them separate and put them at the end so
588                      # the original word order is available.                      # the original word order is available.
# Line 587  Line 607 
607                      my @locationList = split /\s*,\s*/, $locations;                      my @locationList = split /\s*,\s*/, $locations;
608                      # Next, we convert them to Sprout location objects.                      # Next, we convert them to Sprout location objects.
609                      my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;                      my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
610                        # Assemble them into a sprout location string for later.
611                        my $locationString = join(", ", map { $_->String } @locObjectList);
612                      # This part is the roughest. We need to relate the features to contig                      # This part is the roughest. We need to relate the features to contig
613                      # locations, and the locations must be split so that none of them exceed                      # locations, and the locations must be split so that none of them exceed
614                      # the maximum segment size. This simplifies the genes_in_region processing                      # the maximum segment size. This simplifies the genes_in_region processing
# Line 610  Line 632 
632                          }                          }
633                      }                      }
634                      # Finally, reassemble the location objects into a list of Sprout location strings.                      # Finally, reassemble the location objects into a list of Sprout location strings.
                     $locations = join(", ", map { $_->String } @locObjectList);  
635                      # Create the feature record.                      # Create the feature record.
636                      $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations);                      $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
637                  }                  }
638              }              }
639                Trace("Genome $genomeID processed.") if T(3);
640          }          }
641      }      }
642      # Finish the loads.      # Finish the loads.
# Line 624  Line 646 
646    
647  =head3 LoadSubsystemData  =head3 LoadSubsystemData
648    
649  C<< my $stats = $spl->LoadSubsystemData(); >>      my $stats = $spl->LoadSubsystemData();
650    
651  Load the subsystem data from FIG into Sprout.  Load the subsystem data from FIG into Sprout.
652    
# Line 727  Line 749 
749                  # Create the subsystem record.                  # Create the subsystem record.
750                  my $curator = $sub->get_curator();                  my $curator = $sub->get_curator();
751                  my $notes = $sub->get_notes();                  my $notes = $sub->get_notes();
752                  $loadSubsystem->Put($subsysID, $curator, $notes);                  my $description = $sub->get_description();
753                    $loadSubsystem->Put($subsysID, $curator, $description, $notes);
754                  # Now for the classification string. This comes back as a list                  # Now for the classification string. This comes back as a list
755                  # reference and we convert it to a space-delimited string.                  # reference and we convert it to a space-delimited string.
756                  my $classList = $fig->subsystem_classification($subsysID);                  my $classList = $fig->subsystem_classification($subsysID);
# Line 906  Line 929 
929    
930  =head3 LoadPropertyData  =head3 LoadPropertyData
931    
932  C<< my $stats = $spl->LoadPropertyData(); >>      my $stats = $spl->LoadPropertyData();
933    
934  Load the attribute data from FIG into Sprout.  Load the attribute data from FIG into Sprout.
935    
# Line 994  Line 1017 
1017    
1018  =head3 LoadAnnotationData  =head3 LoadAnnotationData
1019    
1020  C<< my $stats = $spl->LoadAnnotationData(); >>      my $stats = $spl->LoadAnnotationData();
1021    
1022  Load the annotation data from FIG into Sprout.  Load the annotation data from FIG into Sprout.
1023    
# Line 1101  Line 1124 
1124    
1125  =head3 LoadSourceData  =head3 LoadSourceData
1126    
1127  C<< my $stats = $spl->LoadSourceData(); >>      my $stats = $spl->LoadSourceData();
1128    
1129  Load the source data from FIG into Sprout.  Load the source data from FIG into Sprout.
1130    
# Line 1179  Line 1202 
1202    
1203  =head3 LoadExternalData  =head3 LoadExternalData
1204    
1205  C<< my $stats = $spl->LoadExternalData(); >>      my $stats = $spl->LoadExternalData();
1206    
1207  Load the external data from FIG into Sprout.  Load the external data from FIG into Sprout.
1208    
# Line 1259  Line 1282 
1282    
1283  =head3 LoadReactionData  =head3 LoadReactionData
1284    
1285  C<< my $stats = $spl->LoadReactionData(); >>      my $stats = $spl->LoadReactionData();
1286    
1287  Load the reaction data from FIG into Sprout.  Load the reaction data from FIG into Sprout.
1288    
# Line 1373  Line 1396 
1396    
1397  =head3 LoadSynonymData  =head3 LoadSynonymData
1398    
1399  C<< my $stats = $spl->LoadSynonymData(); >>      my $stats = $spl->LoadSynonymData();
1400    
1401  Load the synonym groups into Sprout.  Load the synonym groups into Sprout.
1402    
# Line 1418  Line 1441 
1441          if (! defined($result)) {          if (! defined($result)) {
1442              Confess("Database error in Synonym load: " . $sth->errstr());              Confess("Database error in Synonym load: " . $sth->errstr());
1443          } else {          } else {
1444                Trace("Processing synonym results.") if T(2);
1445              # Remember the current synonym.              # Remember the current synonym.
1446              my $current_syn = "";              my $current_syn = "";
1447              # Count the features.              # Count the features.
1448              my $featureCount = 0;              my $featureCount = 0;
1449                my $entryCount = 0;
1450              # Loop through the synonym/peg pairs.              # Loop through the synonym/peg pairs.
1451              while (my @row = $sth->fetchrow()) {              while (my @row = $sth->fetchrow()) {
1452                  # Get the synonym group ID and feature ID.                  # Get the synonym group ID and feature ID.
1453                  my ($syn_id, $peg) = @row;                  my ($syn_id, $peg) = @row;
1454                    # Count this row.
1455                    $entryCount++;
1456                    if ($entryCount % 1000 == 0) {
1457                        Trace("$entryCount rows processed.") if T(3);
1458                    }
1459                  # Insure it's for one of our genomes.                  # Insure it's for one of our genomes.
1460                  my $genomeID = FIG::genome_of($peg);                  my $genomeID = FIG::genome_of($peg);
1461                  if (exists $genomeHash->{$genomeID}) {                  if (exists $genomeHash->{$genomeID}) {
# Line 1444  Line 1474 
1474                      }                      }
1475                  }                  }
1476              }              }
1477                Trace("$entryCount rows produced $featureCount features.") if T(2);
1478          }          }
1479      }      }
1480      # Finish the load.      # Finish the load.
# Line 1453  Line 1484 
1484    
1485  =head3 LoadFamilyData  =head3 LoadFamilyData
1486    
1487  C<< my $stats = $spl->LoadFamilyData(); >>      my $stats = $spl->LoadFamilyData();
1488    
1489  Load the protein families into Sprout.  Load the protein families into Sprout.
1490    
# Line 1521  Line 1552 
1552    
1553  =head3 LoadDrugData  =head3 LoadDrugData
1554    
1555  C<< my $stats = $spl->LoadDrugData(); >>      my $stats = $spl->LoadDrugData();
1556    
1557  Load the drug target data into Sprout.  Load the drug target data into Sprout.
1558    
# Line 1655  Line 1686 
1686                          # Decode the score.                          # Decode the score.
1687                          my $realScore = FIGRules::DecodeScore($score);                          my $realScore = FIGRules::DecodeScore($score);
1688                          # Connect the PDB to the feature.                          # Connect the PDB to the feature.
1689                          $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end);                          $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1690                      }                      }
1691                  }                  }
1692              }              }
# Line 1720  Line 1751 
1751    
1752  =head3 SpecialAttribute  =head3 SpecialAttribute
1753    
1754  C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >>      my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1755    
1756  Look for special attributes of a given type. A special attribute is found by comparing one of  Look for special attributes of a given type. A special attribute is found by comparing one of
1757  the columns of the incoming attribute list to a search pattern. If a match is found, then  the columns of the incoming attribute list to a search pattern. If a match is found, then
# Line 1896  Line 1927 
1927    
1928  =head3 GetGenomeAttributes  =head3 GetGenomeAttributes
1929    
1930  C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >>      my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
1931    
1932  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related  Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
1933  attributes for all the features of a genome in a single call, then organizes them into  attributes for all the features of a genome in a single call, then organizes them into

Legend:
Removed from v.1.87  
changed lines
  Added in v.1.92

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3