[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

View of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Mon Jan 19 21:46:21 2009 UTC (10 years, 10 months ago) by parrello
Branch: MAIN
CVS Tags: rast_rel_2009_02_05
Changes since 1.5: +65 -18 lines
ERDB 2.0 support

#!/usr/bin/perl -w

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use Tracer;
use ERDB;
use ERDBLoadGroup;
use ERDBGenerate;
use Stats;
use Time::HiRes;


=head1 ERDBLoader Script

    ERDBLoader [options] <database> <group1> <group2> ...

ERDB Database Load Finisher

=head2 Introduction

This script finishes the database load process begun by [[ERDBGeneratorPl]].

[[ERDBGeneratorPl]] divides the source data into sections, and generates a
partial load file for each section of each table. To finish the load process, we
need to combine the partial files into single files and load the resulting
single files into the database tables.

Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related
tables that are loaded at the same time. For each table in a named group that
does not exist in the database, the script first attempts to find a completed
data file. If one does not exist, it attempts to create one by collating section
files. Once the collated section files for a load group are finished, they are
loaded into the database.

=head2 Positional Parameters

=over 4

=item database

Name of the ERDB database. This should be the class name for the subclass used
to access the database.

=back

=head2 Command-Line Options

=over 4

=item trace

Specifies the tracing level. The higher the tracing level, the more messages
will appear in the trace log. Use E to specify emergency tracing.

=item user

Name suffix to be used for log files. If omitted, the PID is used.

=item sql

If specified, turns on tracing of SQL activity.

=item background

Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named C<err>I<User>C<.log> and
C<out>I<User>C<.log>, respectively, where I<User> is the value of the
B<user> option above.

=item help

Display this command's parameters and options.

=item keepSections

If specified, section files (the fragments of data load files created by
[[ERDBGeneratorPl]]) will not be deleted after they are collated.

=item sanityCheck

If specified, no tables will be loaded. Instead, the first I<N> records from the
assembled load files will be displayed so that the file contents can be
visually matched against the column names.

=item warn

Create an event in the RSS feed when an error occurs.

=item phone

Phone number to message when the script is complete.

=item DBD

Name of the DBD file. If specified, the DBD must be in the main FIG directory
(specified in C<$FIG_Config::fig>). This option allows the use of an alternate
DBD during load, so that access to the database by other processes is not
compromised.

=back

=cut

# Main script: for each requested load group, make sure every table has a
# complete data file (collating the section files through a sort when it does
# not), then either load the files into the database or run a sanity check on
# them. Progress is accumulated in a Stats object and traced at the end.

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
                                           {
                                              sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
                                              trace => ["2", "tracing level"],
                                              keepSections => ["", "if specified, section files will not be deleted after being collated"],
                                              phone => ["", "phone number (international format) to call when load finishes"],
                                              DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
                                           },
                                           "<database> <group1> <group2> ...",
                                           @ARGV);
# Set a variable to contain return type information. (Currently unused.)
my $rtype;
# Ensure we catch errors.
eval {
    # Get the parameters.
    my ($database, @groups) = @parameters;
    # Check for an alternate DBD.
    my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);
    # Connect to the database and get its load directory.
    my $erdb = ERDB::GetDatabase($database, $altDBD);
    # Fix the group list.
    my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
    # Get the source object and load directory for this database.
    my $source = $erdb->GetSourceObject();
    my $directory = $erdb->LoadDirectory();
    # Get the list of sections.
    my @sectionList = $erdb->SectionList($source);
    # Create a statistics object to track our progress.
    my $stats = Stats->new();
    # Find out if we're doing a sanity check. An empty string means a real load.
    my $sanityCheck = $options->{sanityCheck} || "";
    # Start a timer.
    my $totalStart = time();
    # Loop through the groups.
    for my $group (@realGroups) {
        # Get the list of tables for this group.
        my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
        # We need to ensure there is a data file for every table. If we fail to find one,
        # we set the following error flag, which prevents us from loading the database.
        my $missingTable = 0;
        # Loop through the tables in this group.
        for my $table (@tableList) {
            Trace("Processing table $table for assembly.") if T(2);
            # Get the section file names.
            my @sectionFiles =
                map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
            # Get the data file name.
            my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            # This flag is set once we know a complete data file exists. Only then
            # is it safe to discard the section files.
            my $dataFileReady = 0;
            # Do we have it?
            if (-f $dataFile) {
                # Yes. This is good news.
                $stats->Add('tables-found' => 1);
                Trace("Table file found for $table.") if T(3);
                $dataFileReady = 1;
            } else {
                # No, we must build it. Verify that we have all the sections.
                my @missingFiles = grep { ! -f $_ } @sectionFiles;
                # Did we find everything?
                if (scalar @missingFiles) {
                    # No! Denote that we have a missing table.
                    $missingTable++;
                    $stats->Add('tables-skipped' => 1);
                    # If the user wants a sanity check, we want to give him some
                    # data anyway.
                    if ($sanityCheck) {
                        # Get some data lines in the sections. Note we stop when we've exceeded
                        # the number of lines expected by the sanity check.
                        my @lines;
                        for my $sectionFile (@sectionFiles) {
                            if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
                                Trace("Reading from $sectionFile for $table.") if T(3);
                                push @lines, Tracer::GetFile($sectionFile);
                            }
                        }
                        # Create a new temporary file.
                        my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
                        my $oh = Open(undef, ">$tmpFile");
                        # Put all the data into it.
                        Trace(scalar(@lines) . " data lines found.") if T(3);
                        print $oh join("\n", @lines);
                        close $oh;
                        # Sanity check the temp file.
                        CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
                        # Clean it up.
                        unlink $tmpFile;
                    } else {
                        # Otherwise tell the user about all the missing files.
                        for my $missingFile (@missingFiles) {
                            $stats->Add('sections-missing' => 1);
                            $stats->AddMessage("Data file $missingFile not found for table $table.");
                        }
                    }
                } else {
                    # We have all the sections. Try to assemble them into a data file.
                    my $sortStart = time();
                    my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
                    Trace("Sort command: $sortCommand") if T(3);
                    # Pipe to the sort command. Note that we turn on autoflush
                    # so there's no buffering.
                    my $oh = Open(undef, "| $sortCommand");
                    select $oh; $| = 1; select STDOUT;
                    # Loop through the sections.
                    for my $sectionFile (@sectionFiles) {
                        Trace("Collating $sectionFile.") if T(3);
                        $stats->Add("$table-sections" => 1);
                        # Loop through the section file.
                        my $ih = Open(undef, "<$sectionFile");
                        while (defined (my $line = <$ih>)) {
                            print $oh $line;
                            $stats->Add("$table-collations" => 1);
                        }
                        close $ih;
                    }
                    # Finish the sort step. Closing a pipe handle waits for the
                    # command and returns false if it exited with an error.
                    Trace("Finishing collate for $table.") if T(3);
                    if (! close $oh) {
                        my $status = $?;
                        # A failed sort leaves an incomplete data file behind. Remove
                        # it so a later run does not mistake it for a finished table,
                        # and keep the section files so the collate can be retried.
                        unlink $dataFile;
                        Confess("Sort command for table $table failed with status $status.");
                    }
                    $stats->Add('tables-collated' => 1);
                    $stats->Add('collate-time' => time() - $sortStart);
                    $dataFileReady = 1;
                }
            }
            # If we know we have a full data file, we can delete the
            # section files to make room in the data directory. The user can
            # turn this behavior off with the keepSections option. We never
            # delete the sections of a table we could not assemble, since they
            # are the only copy of that data.
            if ($dataFileReady && ! $options->{keepSections}) {
                for my $sectionFile (@sectionFiles) {
                    if (-e $sectionFile) {
                        unlink $sectionFile;
                        $stats->Add('files-deleted' => 1);
                    }
                }
                Trace("Section files for $table deleted.") if T(3);
            }
        }
        # Were any tables missing?
        if ($missingTable) {
            # Yes, skip this group.
            $stats->Add('groups-skipped' => 1);
            Trace("Skipping $group group: $missingTable missing tables.") if T(2);
        } else {
            # No! Process this group's files.
            if ($sanityCheck eq "") {
                Trace("Loading group $group into database.") if T(2);
            } else {
                Trace("Sanity check for group $group.") if T(2);
            }
            my $loadStart = time();
            for my $table (@tableList) {
                my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
                # Do we want a real load or a sanity check?
                if ($sanityCheck eq "") {
                    # Real load.
                    my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
                    $stats->Accumulate($newStats);
                    Trace("$fileName loaded into $table.") if T(3);
                } elsif ($sanityCheck > 0) {
                    # Here we want a sanity check. Note that if the check value is 0,
                    # we don't bother. The user just wants to suppress the load step.
                    CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
                }
            }
            $stats->Add("groups-loaded" => 1);
            # Record the time actually spent on this group's load step.
            $stats->Add('load-time' => time() - $loadStart);
        }
    }
    $stats->Add('total-time' => time() - $totalStart);
    # Display the statistics from this run.
    Trace("Statistics for load:\n" . $stats->Show()) if T(2);
};
# Report the outcome. Errors thrown inside the eval block arrive here in $@.
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
# Notify the user by phone if requested. The message is sent whether or not
# the load succeeded.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

=head3 CheckLoadFile

    CheckLoadFile($erdb, $table, $fileName, $count);

Read the first few records of a load file and trace the contents at level
2. This allows the user to visually compare the load file contents with
the database definition.

=over 4

=item erdb

[[ErdbPm]] object describing the database.

=item table

Name of the table to check.

=item fileName

Name of the load file to check.

=item count

Number of records to check.

=back

=cut

sub CheckLoadFile {
    # Unpack the arguments.
    my ($erdb, $table, $fileName, $count) = @_;
    # Pull records off the front of the load file until we have the requested
    # number or the file runs out.
    my $ih = Open(undef, "<$fileName");
    my @records;
    until (eof($ih) || scalar(@records) >= $count) {
        my @values = Tracer::GetLine($ih);
        push @records, \@values;
    }
    my $found = scalar(@records);
    Trace("$found records for $table found in sanity check using $fileName.") if T(2);
    # The field-by-field display only makes sense if we read something.
    if ($found) {
        # The relation descriptor supplies the field names and types.
        my $relationData = $erdb->FindRelation($table);
        if (! defined $relationData) {
            Confess("Relation $table not found in database.");
        }
        my $fieldList = $relationData->{Fields};
        # For a relationship, these are the entities at the FROM and TO ends.
        my ($fromEntity, $toEntity) = $erdb->GetRelationshipEntities($table);
        my %ends = (from => $fromEntity, to => $toEntity);
        # Each field produces one multi-line trace message.
        for my $col (0 .. $#{$fieldList}) {
            my ($name, $type) = @{$fieldList->[$col]}{qw(name type)};
            # A relationship link gets the target table's name appended to its type.
            if (my ($end) = $name =~ /^(from|to)-link$/) {
                $type .= " ($ends{$end})";
            }
            # The message opens with the field name and type.
            my @lines = ("Values for $table($name), type $type:\n");
            # After that comes one line per record read.
            for my $row (0 .. $found - 1) {
                my $value = $records[$row]->[$col];
                my $label = "Record $row";
                if (defined $value && $value ne '') {
                    # Cap the displayed value at 40 characters and note how much
                    # was cut off.
                    my $excess = length($value) - 40;
                    $value = substr($value, 0, 40) . " >> + $excess characters"
                        if $excess > 0;
                    $label .= ": $value";
                } else {
                    # A missing or empty value gets a placeholder.
                    $label .= "= <empty>";
                }
                # Indent the record lines slightly for readability.
                push @lines, "   $label";
            }
            # Emit the message for this field.
            Trace(join("\n", @lines)) if T(2);
        }
    }
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3