[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.17, Tue Mar 26 02:10:46 2013 UTC revision 1.18, Wed May 8 20:28:44 2013 UTC
# Line 26  Line 26 
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28      use AliasAnalysis;      use AliasAnalysis;
29        use BaseSaplingLoader;
30        use MD5Computer;
31      use base qw(SaplingDataLoader);      use base qw(SaplingDataLoader);
32    
33  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
# Line 134  Line 136 
136      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
137      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
138      $stats->Accumulate($subStats);      $stats->Accumulate($subStats);
139        Trace("Statistics for delete of $genome:\n" . $stats->Show()) if T(SaplingDataLoader => 3);
140      # Return the statistics object.      # Return the statistics object.
141      return $stats;      return $stats;
142  }  }
# Line 249  Line 252 
252      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
253      # Leave the assignment hash undefined until we populate it.      # Leave the assignment hash undefined until we populate it.
254      $retVal->{assignHash} = undef;      $retVal->{assignHash} = undef;
255        # Create an MD5 Computer for this genome.
256        $retVal->{md5} = MD5Computer->new();
257      # Return the result.      # Return the result.
258      return $retVal;      return $retVal;
259  }  }
# Line 274  Line 279 
279      # Get the genome ID.      # Get the genome ID.
280      my $genome = $self->{genome};      my $genome = $self->{genome};
281      # These variables will contain the current contig ID, the current contig length,      # These variables will contain the current contig ID, the current contig length,
282      # the ordinal of the current chunk, the accumulated DNA sequence, and its length.      # the accumulated DNA sequence for the current chunk, and its length.
283      my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);      my ($contigID, $contigLen, $chunk, $chunkLen) = (undef, 0, '', 0);
284        # This variable contains a list of the chunks, for use in computing the MD5.
285        my @chunks;
286      # Loop through the contig file.      # Loop through the contig file.
287      while (! eof $ih) {      while (! eof $ih) {
288          # Get the current record.          # Get the current record.
# Line 286  Line 293 
293              my $newContigID = $1;              my $newContigID = $1;
294              # Is there a current contig?              # Is there a current contig?
295              if (defined $contigID) {              if (defined $contigID) {
296                  # Yes. Output the current chunk.                  # Yes. Output the contig.
297                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
                 # Output the contig.  
                 $self->OutputContig($contigID, $contigLen);  
298              }              }
299              # Compute the new contig ID. We need to insure it has a genome ID in front.              # Compute the new contig ID. We need to insure it has a genome ID in front.
300              $self->FixContig(\$newContigID);              $self->FixContig(\$newContigID);
301              # Initialize the new contig.              # Initialize the new contig.
302              $contigID = $newContigID;              $contigID = $newContigID;
303              $contigLen = 0;              $contigLen = 0;
             $ordinal = 0;  
304              $chunk = '';              $chunk = '';
305              $chunkLen = 0;              $chunkLen = 0;
306                @chunks = ();
307          } else {          } else {
308              # Here we have more DNA in the current contig. Are we at the end of              # Here we have more DNA in the current contig. Are we at the end of
309              # the current chunk?              # the current chunk?
# Line 312  Line 317 
317                  # Yes. Create the actual chunk.                  # Yes. Create the actual chunk.
318                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);
319                  # Write it out.                  # Write it out.
320                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputChunk($contigID, scalar @chunks, $chunk);
321                  # Set up the new chunk.                  # Set up the new chunk.
322                  $chunk = substr($line, $segmentLength - $chunkLen);                  $chunk = substr($line, $segmentLength - $chunkLen);
                 $ordinal++;  
323                  $chunkLen = length($chunk);                  $chunkLen = length($chunk);
324                    push @chunks, $chunk;
325              }              }
326              # Update the contig length.              # Update the contig length.
327              $contigLen += $lineLen;              $contigLen += $lineLen;
# Line 324  Line 329 
329      }      }
330      # Is there a current contig?      # Is there a current contig?
331      if (defined $contigID) {      if (defined $contigID) {
332          # Yes. Output the residual chunk.          # Yes. Output the contig itself.
333          $self->OutputChunk($contigID, $ordinal, $chunk);          $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
         # Output the contig itself.  
         $self->OutputContig($contigID, $contigLen);  
334      }      }
335  }  }
336    
# Line 374  Line 377 
377    
378  =head3 OutputContig  =head3 OutputContig
379    
380      $loaderObject->OutputContig($contigID, $contigLen);      $loaderObject->OutputContig($contigID, $contigLen, $chunk, \@chunks);
381    
382  Write out the current contig.  Write out the current contig.
383    
# Line 388  Line 391 
391    
392  Length of the contig in base pairs.  Length of the contig in base pairs.
393    
394    =item chunk
395    
396    Last DNA chunk of the contig.
397    
398    =item chunks
399    
400    Reference to a list of the DNA chunks up to (but not including) the last one.
401    
402  =back  =back
403    
404  =cut  =cut
405    
406  sub OutputContig {  sub OutputContig {
407      # Get the parameters.      # Get the parameters.
408      my ($self, $contigID, $contigLen) = @_;      my ($self, $contigID, $contigLen, $chunk, $chunks) = @_;
409      # Get the sapling object.      # Get the sapling object.
410      my $sap = $self->{sap};      my $sap = $self->{sap};
411        # Get the MD5 computer.
412        my $md5C = $self->{md5};
413        # Output the last chunk.
414        $self->OutputChunk($contigID, scalar @$chunks, $chunk);
415      # Connect the contig to the genome.      # Connect the contig to the genome.
416      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
417        # Compute the MD5.
418        push @$chunks, $chunk;
419        my $contigMD5 = $md5C->ProcessContig($contigID, $chunks);
420      # Output the contig record.      # Output the contig record.
421      $sap->InsertObject('Contig', id => $contigID, length => $contigLen);      $sap->InsertObject('Contig', id => $contigID, length => $contigLen,
422                md5_identifier => $contigMD5);
423      # Record the contig.      # Record the contig.
424      $self->{stats}->Add(contigs => 1);      $self->{stats}->Add(contigs => 1);
425      $self->{stats}->Add(dna => $contigLen);      $self->{stats}->Add(dna => $contigLen);
# Line 605  Line 624 
624                  if ($fidHash{$fid}) {                  if ($fidHash{$fid}) {
625                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
626                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
627                    } else {
628                        # It doesn't exist, so record it in the statistics.
629                        $stats->Add($type => 1);
630                  }                  }
631                  # If this is RNA, the alias list is always empty. Sometimes, the functional                  # If this is RNA, the alias list is always empty. Sometimes, the functional
632                  # assignment is found there.                  # assignment is found there.
# Line 792  Line 814 
814      my $stats = $self->{stats};      my $stats = $self->{stats};
815      # Get the genome ID.      # Get the genome ID.
816      my $genome = $self->{genome};      my $genome = $self->{genome};
817        # Compute the subsystem and binding file names.
818        my $subFileName = "$self->{directory}/Subsystems/subsystems";
819        my $bindFileName = "$self->{directory}/Subsystems/bindings";
820        # Only proceed if both exist.
821        if (! -f $subFileName || ! -f $bindFileName) {
822            Trace("Missing subsystem data for $genome.") if T(1);
823            $stats->Add(noSubsystems => 1);
824        } else {
825      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
826      my %machines;      my %machines;
827      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
# Line 801  Line 831 
831      # We loop through the subsystems, looking for the ones already in the      # We loop through the subsystems, looking for the ones already in the
832      # database. The list is given in the subsystems file of the Subsystems      # database. The list is given in the subsystems file of the Subsystems
833      # directory.      # directory.
834      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");          my $ih = Open(undef, "<$subFileName");
835      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
836      while (! eof $ih) {      while (! eof $ih) {
837          # Get this subsystem.          # Get this subsystem.
# Line 818  Line 848 
848              $subsystems{$subsystem} = 1;              $subsystems{$subsystem} = 1;
849              # Compute this subsystem's MD5.              # Compute this subsystem's MD5.
850              my $subsystemMD5 = ERDB::DigestKey($subsystem);              my $subsystemMD5 = ERDB::DigestKey($subsystem);
851                    my $rolePrefix = "$subsystemMD5:$genome";
852              # Loop through the roles.              # Loop through the roles.
853              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
854                      [$subsystem], "to-link abbreviation");                      [$subsystem], "to-link abbreviation");
855              for my $roleTuple (@roleList) {              for my $roleTuple (@roleList) {
856                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
857                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                      my $machineRoleID = $rolePrefix . '::' . $abbr;
858                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
859              }              }
860              # Next we need the variant code and key.              # Next we need the variant code and key.
# Line 841  Line 872 
872      }      }
873      # Now we go through the bindings file. This file connects the subsystem      # Now we go through the bindings file. This file connects the subsystem
874      # roles to the molecular machines.      # roles to the molecular machines.
875      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");          $ih = Open(undef, "<$bindFileName");
876      # Loop through the bindings.      # Loop through the bindings.
877      while (! eof $ih) {      while (! eof $ih) {
878          # Get the binding data.          # Get the binding data.
# Line 866  Line 897 
897          }          }
898      }      }
899  }  }
900    }
901    
902  =head3 CreateGenome  =head3 CreateGenome
903    
# Line 888  Line 920 
920      my $dir = $self->{directory};      my $dir = $self->{directory};
921      # Get the sapling database.      # Get the sapling database.
922      my $sap = $self->{sap};      my $sap = $self->{sap};
923        # Get the MD5 computer.
924        my $md5C = $self->{md5};
925      # We'll put the genome attributes in here.      # We'll put the genome attributes in here.
926      my %fields;      my %fields;
927      # Check for a basic statistics file.      # Check for a basic statistics file.
# Line 915  Line 949 
949          $fields{'scientific-name'} = $line;          $fields{'scientific-name'} = $line;
950          # Get the taxonomy and extract the domain from it.          # Get the taxonomy and extract the domain from it.
951          $ih = Open(undef, "<$dir/TAXONOMY");          $ih = Open(undef, "<$dir/TAXONOMY");
952          ($fields{domain}) = split /;/, <$ih>, 2;          ($fields{domain}) = split m/;/, <$ih>, 2;
953      }      }
954      # Get the counts from the statistics object.      # Get the counts from the statistics object.
955      my $stats = $self->{stats};      my $stats = $self->{stats};
# Line 934  Line 968 
968      }      }
969      # Use the domain to determine whether or not the genome is prokaryotic.      # Use the domain to determine whether or not the genome is prokaryotic.
970      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
971        # Compute the genome MD5.
972        $fields{md5_identifier} = $md5C->CloseGenome();
973      # Finally, add the genome ID.      # Finally, add the genome ID.
974      $fields{id} = $self->{genome};      $fields{id} = $self->{genome};
975      # Create the genome record.      # Create the genome record.

Legend:
Removed from v.1.17  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3