[Bio] / Sprout / SaplingExpressionLoader.pm Repository:
ViewVC logotype

View of /Sprout/SaplingExpressionLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Sun Feb 13 13:02:30 2011 UTC (9 years ago) by parrello
Branch: MAIN
CVS Tags: mgrast_release_3_0, mgrast_release_3_0_2, mgrast_release_3_0_1
Changes since 1.3: +2 -2 lines
Inverted some relationships to improve performance.

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package SaplingExpressionLoader;

    use strict;
    use Tracer;
    use Stats;
    use SeedUtils;
    use SAPserver;
    use Sapling;
    use base qw(SaplingDataLoader);

=head1 Sapling Subsystem Loader

This class loads Expression data into a Sapling database from an expression directory.
Unlike L<SaplingGenomeLoader>, this version is designed for updating a populated
database only. Links to features and genomes are put in, but not the features and
genomes themselves.

=head2 Main Methods

=head3 Load

    my $stats = SaplingExpressionLoader::Load($sap, $genome, $directory);

Load a genome's expression data from an expression data directory into the sapling database.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item genome

ID of the relevant genome.

=item directory

Name of the directory containing the expression data.

=back

=cut

sub Load {
    # Get the parameters.
    my ($sap, $genome, $directory) = @_;
    # Create the loader object.
    my $loaderObject = SaplingExpressionLoader->new($sap, $genome, $directory);
    # Load the expression data.
    $loaderObject->LoadExperiments();
    # Return the statistics.
    return $loaderObject->{stats};
}

=head3 ClearExpressionData

    my $stats = SaplingExpressionLoader::ClearExpressionData($sap, $expression);

Delete the specified genome's expression data from the specified sapling
database. This method can also be used to clean up after a failed or aborted load.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item genome

ID of the genome whose expression data is to be deleted.

=item RETURN

Returns a statistics object counting the records deleted.

=back

=cut

sub ClearExpressionData {
    # Get the parameters.
    my ($sap, $genome) = @_;
    # Create the statistics object.
    my $stats = Stats->new();
    # Get a list of this genome's chips.
    my @chips = $sap->GetFlat('HadResultsProducedBy', "HadResultsProducedBy(from-link) = ?",
                              [$genome], 'to-link');
    # Delete the chips and their associated experiment data.
    Trace("Deleting chips.") if T(2);
    for my $chip (@chips) {
        my $newStats = $sap->Delete(Chip => $chip);
        $stats->Accumulate($newStats);
    }
    # Now we need to delete all coregulation information for features in this genome.
    Trace("Deleting feature coregulation pairs.") if T(2);
    my $fidPattern = "fig|$genome.%";
    my $coregDeletes = $sap->DeleteLike('IsCoregulatedWith', "IsCoregulatedWith(from-link) LIKE ?",
                                        [$fidPattern]);
    $stats->Add(IsCoregulatedWith => $coregDeletes);
    # Loop through the coregulated set links for features in this genome, keeping the coregulated set
    # IDs.
    Trace("Deleting coregulation sets.") if T(2);
    my %cosets;
    my $q = $sap->Get('IsRegulatedWith', "IsRegulatedWith(from-link) LIKE ?", [$fidPattern]);
    while (my $coset = $q->Fetch()) {
        $cosets{$coset->PrimaryValue('to-link')} = 1;
    }
    # Now delete the coregulated sets found.
    for my $coset (keys %cosets) {
        my $newStats = $sap->Delete(CoregulatedSet => $coset);
        $stats->Accumulate($newStats);
    }
    # Finally, delete the atomic regulons.
    Trace("Deleting atomic regulons.") if T(2);
    # Delete the links that connect the regulons to the features. This prevents the regulon
    # deletes from deleting the features themselves.
    my $atomicFeatureLinks = $sap->DeleteLike('IsFormedOf', "IsFormedOf(to-link) LIKE ?",
                                              [$fidPattern]);
    $stats->Add(IsFormedInto => $atomicFeatureLinks);
    # Now get the list of regulons for this genome.
    my @regulons = $sap->GetFlat('IsConfiguredBy', "IsConfiguredBy(from-link) = ?", [$genome],
                                 'to-link');
    # Loop through the regulons, deleting them.
    for my $regulon (@regulons) {
        my $newStats = $sap->Delete(AtomicRegulon => $regulon);
        $stats->Accumulate($newStats);
    }
    # Return the statistics object.
    return $stats;
}

=head2 Loader Object Methods

=head3 new

    my $loaderObject = SaplingExpressionLoader->new($sap, $genome, $directory);

Create a loader object that can be used to facilitate loading Sapling data from an
expression data directory.

=over 4

=item sap

L<Sapling> object used to access the target database.

=item genome

ID of the genome whose expression data is being loaded.

=item directory

Name of the directory containing the expression data.

=back

The object created contains the following fields.

=over 4

=item supportRecords

A hash of hashes, used to track the support records known to exist in the database.

=item sap

L<Sapling> object used to access the database.

=item stats

L<Stats> object for tracking statistical information about the load.

=item genome

ID of the genome whose data is being loaded.

=item chip

ID of the relevant chip.

=item directory

Name of the directory containing the subsystem data.

=back

=cut

sub new {
    # Get the parameters.
    my ($class, $sap, $genome, $directory) = @_;
    # Create the object.
    my $retVal = SaplingDataLoader::new($class, $sap, qw(experiments));
    # Add our specialized data.
    $retVal->{genome} = $genome;
    $retVal->{chip} = "CNF$genome";
    $retVal->{directory} = $directory;
    # Return the result.
    return $retVal;
}

=head2 Internal Utility Methods

=head3 LoadExperiments

    $loaderObject->LoadExperiments();

Load all the expression data experiments and related information for this genome from the
incoming directory.

=cut

sub LoadExperiments {
    # Get the parameters.
    my ($self) = @_;
    # Get the directory name.
    my $expDataDirectory = $self->{directory};
    # Get the sapling database object.
    my $sap = $self->{sap};
    # Get the statistics object.
    my $stats = $self->{stats};
    # Get the genome ID.
    my $genome = $self->{genome};
    # Check for a chip ID.
    my $chipFileName = "$expDataDirectory/chip";
    if (-f $chipFileName) {
        # Chip file found.
        my $ih = Open(undef, "<$chipFileName");
        # Read the chip ID.
        ($self->{chip}) = Tracer::GetLine($ih);
        # Close the chip file.
        close $ih;
        # Denote we have a custom chip ID.
        $stats->Add(customChipID => 1);
    }
    # Create the chip record.
    $sap->InsertObject('Chip', id => $self->{chip});
    # Connect it to the genome.
    $sap->InsertObject('ProducedResultsFor', from_link => $self->{chip}, to_link => $genome);
    # Do we have experiments?
    my $fileName = "$expDataDirectory/experiment.names";
    if (! -f $fileName) {
        # No, so skip this genome.
        Trace("No experiment names found for $genome.") if T(1);
        $stats->Add(missingExperimentNames => 1);
    } else {
        # Yes. Get the vector of experiment IDs.
        my $ih = Open(undef, "<$fileName");
        # This array will hold the experiment names.
        my @experiments;
        # This hash maps experiment IDs to sequence numbers.
        my %expHash;
        # Loop through the experiments.
        while (! eof $ih) {
            my ($id, $name) = Tracer::GetLine($ih);
            # Note that the experiment numbers are 1-based.
            my $seqNo = $id - 1;
            $experiments[$seqNo] = $name;
            $expHash{$name} = $seqNo;
            # Put the experiments in the experiment table.
            $sap->InsertObject('HasResultsIn', from_link => $self->{chip}, to_link => $name,
                                sequence => $seqNo);
            $sap->InsertObject('Experiment', id => $name, source => '');
            $stats->Add(experiments => 1);
        }
        close $ih;
        # Remember the number of experiments.
        my $expCount = scalar @experiments;
        # Check for a raw data file.
        my $raw_data_file = "$expDataDirectory/rma_normalized.tab";
        if (! -f $raw_data_file) {
            # This means we have a chip and probes, but no experimental results.
            Trace("No raw data file found for $genome expression data.") if T(1);
            $stats->Add(missingRawData => 1);
        } else {
            # Open the raw data file.
            $ih = Open(undef, "<$raw_data_file");
            Trace("Processing expression data from $raw_data_file.") if T(3);
            # Read the list of experiments.
            my @exps = Tracer::GetLine($ih);
            # Spool them into the experiment table if they're not already there.
            for my $exp (@exps) {
                if (! exists $expHash{$exp}) {
                    $sap->InsertObject('HasResultsIn', from_link => $self->{chip},
                                        to_link => $exp, sequence => -1);
                    $sap->InsertObject('Experiment', id => $exp, source => '');
                    $stats->Add(extraExperiment => 1);
                }
            }
            # We'll put the signal values in here. For each feature, we will
            # have a list of signals in the same order as the list of experiments.
            my $corrH = {};
            while (! eof $ih) {
                my ($fid, @signals) = Tracer::GetLine($ih);
                # Store the signals in the hash.
                $corrH->{$fid} = \@signals;
            }
            # Close the raw data file.
            close $ih;
            # Now we need to generate the signal records. This includes the signal values we just
            # read in plus on/off indications from the "final_on_off_calls.txt".
            $ih = Open(undef, "<$expDataDirectory/final_on_off_calls.txt");
            # Discard the experiment list. It will be the same as the one we already have.
            Tracer::GetLine($ih);
            # Loop through the on/off file.
            while (! eof $ih) {
                my ($fid, @onOffs) = Tracer::GetLine($ih);
                # Get the signals for this feature.
                my $signals = $corrH->{$fid};
                if (! $signals) {
                    Trace("No signals found for $fid in experiments for $genome.") if T(1);
                    $stats->Add(noSignals => 1);
                } else {
                    # This will be used to build the levels vector.
                    my @levels;
                    # Generate the signal records.
                    for (my $i = 0; $i <= $#exps; $i++) {
                        if (defined $signals->[$i]) {
                            # Here we have a signal for this experiment.
                            $sap->InsertObject('HasIndicatedSignalFrom', to_link => $exps[$i],
                                                from_link => $fid, rma_value => $signals->[$i],
                                                level => $onOffs[$i]);
                            $stats->Add(signalX => 1);
                        } else {
                            # No signal, so no indication will be built.
                            $stats->Add(signalZ => 1);
                        }
                        # Next, we put the on/off level in the experiment vector.
                        my $i = $expHash{$exps[$i]};
                        if (defined $i) {
                            $levels[$i] = $onOffs[$i];
                        }
                    }
                    # We need to connect the feature to the chip, specifying the level vector.
                    # First, we must fill in the gaps.
                    for (my $i = 0; $i <= $expCount; $i++) {
                        if (! defined $levels[$i]) {
                            $levels[$i] = 0;
                            $stats->Add(fidLevelGap => 1);
                        }
                    }
                    # Output the relationship.
                    $sap->InsertObject('IndicatedLevelsFor', from_link => $self->{chip},
                                        to_link => $fid, level_vector => \@levels);
                }
            }
            # Close the raw data file.
            close $ih;
            # Now we loop through the coregulated sets. These are found in files named
            # "coregulated.<something>". For each pair of features in the coregulated set
            # we compute a Pearson coefficient from their signals.
            Trace("Generating coregulated sets for $genome.") if T(3);
            my @files = grep { $_ =~ /^coregulated/ } Tracer::OpenDir($expDataDirectory, 0);
            for my $file (@files) {
                $ih = Open(undef, "<$expDataDirectory/$file");
                # Loop through the sets in this file.
                while (! eof $ih) {
                    my ($fidList, $desc) = Tracer::GetLine($ih);
                    $stats->Add(coregulations => 1);
                    # Compute the list of features in this set.
                    my @fids = split /,/, $fidList;
                    # Create an ID for it.
                    my $setID = ERDB::DigestKey(join("-", sort @fids));
                    # Create the record for the set, if it's new.
                    my $newSet = $self->InsureEntity(CoregulatedSet => $setID, reason => $desc);
                    if ($newSet) {
                        # It is new, so connect it to the features.
                        for my $fid (@fids) {
                            $sap->InsertObject('IsRegulatedWith', from_link => $fid,
                                                to_link => $setID);
                        }
                        # Compute the Pearson coefficients for these features.
                        my $pcs = compute_pc(\@fids, $corrH);
                        # Loop through the features, generating Pearson coefficients for each
                        # pair.
                        for (my $i = 0; $i <= $#fids; $i++) {
                            my $fid1 = $fids[$i];
                            for (my $j = $i + 1; $j <= $#fids; $j++) {
                                my $fid2 = $fids[$j];
                                # Get the pearson coefficient from the hash. We assume 0 if
                                # the coefficient isn't found.
                                my $pc = $pcs->{$fid1}{$fid2} || 0;
                                # Store both directions of the correlation.
                                $sap->InsertObject('IsCoregulatedWith', from_link => $fid1,
                                                    to_link => $fid2, coefficient => $pc);
                                $sap->InsertObject('IsCoregulatedWith', from_link => $fid2,
                                                    to_link => $fid1, coefficient => $pc);
                            }
                        }
                    }
                }
            }
        }
        # Open the atomic regulon file.
        $ih = Open(undef, "<$expDataDirectory/atomic.regulons");
        # We'll track our regulons in here.
        my %regulons;
        Trace("Generating regulons for $genome.") if T(3);
        # Loop through the regulons.
        while (! eof $ih) {
            my ($id, $fid) = Tracer::GetLine($ih);
            $stats->Add(regulonRecord => 1);
            # Compute the real regulon ID.
            my $realID = "$genome:$id";
            # Is this is a new regulon?
            if (! exists $regulons{$id}) {
                # Yes. Create the regulon record and connect it to the genome.
                $sap->InsertObject('AtomicRegulon', id => $realID);
                $sap->InsertObject('IsConfiguredBy', from_link => $genome,
                                    to_link => $realID);
                # Make sure we know this is handled.
                $regulons{$id} = 1;
            }
            # Connect the regulon to the feature.
            $sap->InsertObject('IsFormedOf', from_link => $realID, to_link => $fid);
        }
        close $ih;
        Trace("Generating expression levels for $genome.") if T(3);
        # Now we can connect the regulons to the experiments.
        $ih = Open(undef, "<$expDataDirectory/ar.vectors");
        # Loop through the experiment vectors.
        while (! eof $ih) {
            my ($id, $levelList) = Tracer::GetLine($ih);
            $stats->Add(vectorRecord => 1);
            # Compute the real regulon ID.
            my $realID = "$genome:$id";
            # Extract the expression levels.
            my @levels = split /\s*,\s*/, $levelList;
            my $levelCount = scalar @levels;
            if ($levelCount != $expCount) {
                Trace("Vector $id for $genome has $levelCount levels for $expCount experiments.") if T(2);
                if ($levelCount < $expCount) {
                    $stats->Add(shortExpressionVector => 1);
                } else {
                    $stats->Add(longExpressionVector => 1);
                }
            }
            # Loop through them, connecting the regulon to experiments.
            for (my $i = 0; $i <= $#levels; $i++) {
                if ($i >= $expCount) {
                    $stats->Add(extraExpressionVectorElement => 1);
                } else {
                    $sap->InsertObject('AffectsLevelOf', from_link => $experiments[$i],
                                        to_link => $realID, level => $levels[$i]);
                }
            }
            # Create the atomic regulon experiment vector.
            $sap->InsertObject('GeneratedLevelsFor', from_link => $self->{chip},
                                to_link => $realID, level_vector => \@levels);
        }
    }
}


=head3 compute_pc

    my $hash = SaplingExpressionLoader::compute_pc(\@gene_ids, \%gxp_hash);

Compute the Pearson coefficients for each pair of features in the list of incoming
gene IDs. The coefficients will indicate the correlation between the features' gene
expression lists from the incoming hash.

=over 4

=item gene_ids

List of feature IDs for which correlation coefficients are desired.

=item gxp_hash

A hash mapping each feature ID to a list of normalized expression values.

=item RETURN

Returns a reference to a hash of hashes keyed by feature ID. Each feature ID
is mapped to a sub-hash that maps the other feature IDs to the appropriate
Pearson coefficients.

=back

=cut

sub compute_pc {
    my ($gene_ids, $gxp_hash) = @_;
    my %values = map { $_ => {} } @$gene_ids;
    require Statistics::Descriptive;
    for (my $i = 0; $i < @$gene_ids-1; $i++) {
        my $stat = Statistics::Descriptive::Full->new();
        $stat->add_data(@{$gxp_hash->{$gene_ids->[$i]}});

        for (my $j = $i+1; $j < @$gene_ids; $j++)
        {
            my ($q, $m, $r, $err) = $stat->least_squares_fit(@{$gxp_hash->{$gene_ids->[$j]}});
            $values{$gene_ids->[$i]}->{$gene_ids->[$j]} = $r;
            $values{$gene_ids->[$j]}->{$gene_ids->[$i]} = $r;
        }
    }
    return \%values;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3