[Bio] / FigKernelScripts / Packager.pl Repository:
ViewVC logotype

View of /FigKernelScripts/Packager.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (as text) (annotate)
Thu Dec 6 14:00:37 2007 UTC (11 years, 11 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.6: +26 -26 lines
Changed POD format for better compatability with Wiki.

#!/usr/bin/perl -w
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


=head1 Packager

C<Packager> [I<options>] C<scan>|C<pack>|C<unpack> I<directoryRoot> I<packageFile>

Package all the files in a directory tree for transport or recreate the directory
tree from the packaged files.

This method takes as input three positional parameters.

=over 4

=item command

C<pack> to package the files into a single package file, <unpack> to restore
the files from the package file, or C<scan> to scan the files to determine
file types.

=item directoryRoot

Root directory of the directory tree. If scanning or packing, all files in this
tree will be packaged. If unpacking, the files are unpacked into this tree.

=item packageFile

Name of the package file. If packing, this is the output file. If unpacking,
this is the input file.

=back

File and directory names should be specified using Unix conventions, with a
forward slash (C</>) instead of a backslash (C<\>).

The process of packaging is straightforward. We write out the name of the input file,
its type (text or binary), the file data itself, and an MD5 digest. Each file is
compressed using the PERL C<Compress::Zlib> libary.

This is by no means the optimal way to package files, but it is simple to do and
can be customized for this project's special problems (such as incompatible file
names).

It is important that the packager understand the difference between text files
and binary files. The line end characters in text files need to be translated
to the target platform's characters, while binary files must be moved without
translation. The file's extension is used to determine whether it is binary or
text. When packaging, we will look for a file called C<extensions.txt> in the
root directory of the tree. This file will have a list of binary file extensions,
one per line. The <scan> command can be used to create this file. If the file
is not found when packaging, it is presumed all files are text files.

When packaging, line-end characters inside text files are converted to a single
line feed (C<\n>). When unpackaging, the line feeds are converted to the line-ending
character of the current platform.

=head2 Temporary Files

Temporary files in the directory tree are skipped when scanning or packaging. A file
is considered temporary if its name ends in a tilde (C<~>) or pound sign C<#>,
or it has an extension of C<.bak> or C<.tgz>. In addition, directories with names
ending in C<.old> will be skipped.

=cut

use strict;
use Cwd 'abs_path';
use File::Copy;
use File::Path;
use Fcntl qw(:DEFAULT :seek);
use POSIX;
use Compress::Zlib;
use Digest::MD5;

=head2 Constants

=head3 CHUNK_SIZE

Number of bytes to use when reading a survey chunk during file scans.
We take three chunks from the file of this size. Any binary character
(ASCII code > 127) indicates a binary file. A binary file is also
presumed if the number of non-whitespace control characters is above
a certain percentage.

=cut
my $CHUNK_SIZE = 1024;

=head3 SURVEY_SIZE

Number of bytes in a file survey. A file survey consists of three chunks.

=cut

my $SURVEY_SIZE = $CHUNK_SIZE * 3;

=head3 NON_WHITE

Fraction of the non-white control characters in the survey data that
would classify the file as binary.

=cut

my $NON_WHITE = 0.25;

=head3 MAX_PACKAGE_SIZE

Maximum legal size for a package file.

=cut

my $MAX_PACKAGE_SIZE = 0x7FFFFFFF;

=head3 NL_LEN

Number of characters in a line-end sequence.

=cut

my $NL_LEN = length($/);

=head3 PACK_BLOCK

Number of characters in a package block.

=cut

my $PACK_BLOCK = 4096;

# OPEN CODE

# Get the command, the file directory name, and the package file name.
my ($command, $treeDirectory, $packageFile) = @ARGV;
# Get the absolute path for the directory tree.
my $absDirectory = abs_path($treeDirectory);
# Trim off any trailing slash.
if ($absDirectory =~ m!/$!) {
    chop $absDirectory;
}
# Process the command.
$command = lc $command;
if ($command eq 'scan') {
    ScanTree($absDirectory);
} elsif ($command eq 'pack') {
    PackTree($absDirectory, $packageFile);
} elsif ($command eq 'unpack') {
    UnPackTree($absDirectory, $packageFile);
} else {
    print "Invalid command \"$command\".\n";
}
print "Processing complete.\n";

=head2 Methods

=head3 ScanTree

    ScanTree($directory);

Scan all files in the specified directory and create an C<extensions.txt> file
listing the extensions of files that are binary.

=cut

sub ScanTree {
    my ($directory) = @_;
    # Create a hash of file extensions. The first time a file with a particular
    # extension is found, it is scanned for binary characters. If such characters
    # are found, the file is added to the hash with a value of "b"; otherwise it is
    # added with a value of "t".
    my %extensions = ();
    # Create a directory object.
    my $dirObject = OpenDirectory($directory);
    # Loop through the files in the tree, scanning each one.
    while (my $fileName = GetFileName($dirObject)) {
        # Get the file name relative to the root and the file extension.
        my $relName = $dirObject->{relName};
        my $suffix = $dirObject->{suffix};
        # Only proceed if this is a new file type.
        if (exists $extensions{$suffix}) {
            print "Skipping $relName.\n";
        } else {
            # Get the file size.
            my $size = -s $fileName;
            # If the file is empty, skip it.
            if ($size == 0) {
                print "File $relName is empty: ignored.\n";
            } else {
                # Here the file is worth scanning.
                print "Scanning $relName($size) for type of \"$suffix\"... ";
                # Determine the type of the file.
                my $type = ScanFile($fileName, $size);
                print "$type\n";
                # Add the extension to the hash.
                $extensions{$suffix} = substr($type, 0, 1);
            }
        }
    }
    # Now we write the extension file.
    print "Creating extensions file.\n";
    (open EXTFILE, ">$directory/extensions.txt") ||
        die "Could not open extensions file: $!";
    my $count = 0;
    for my $ext (keys %extensions) {
        if ($extensions{$ext} eq "b") {
            (print EXTFILE "$ext\n") ||
                die "Error writing extensions file: $!";
            $count++;
        }
    }
    close EXTFILE;
    print "$count binary extensions found.\n";
    print InSummary($dirObject);
}

=head3 ScanFile

    my $type = ScanFile($fileName, $size);

Survey the specified file of the specified size to determine whether it is
text or binary. A file is binary if it has a lot of control characters or
any character whose ASCII value is greater than 127.

=cut

sub ScanFile {
    my ($fileName, $size) = @_;
    # Open the file for binary input.
    (sysopen INFILE, $fileName, O_RDONLY + O_BINARY) ||
        die "Could not scan $fileName: $!";
    # Survey the file. We take chunks from the beginning, end, and middle.
    # if the file has fewer than three chunks of characters, we read the
    # whole thing.
    my $survey = "";
    if ($size <= $SURVEY_SIZE) {
        (sysread INFILE, $survey, $SURVEY_SIZE) ||
            die "Error reading $fileName: $!";
    } else {
        (sysread INFILE, $survey, $CHUNK_SIZE) ||
            die "Error reading start of $fileName: $!";
        my $position = int($size - $CHUNK_SIZE / 2);
        (sysseek INFILE, $position, SEEK_SET) ||
            die "Error moving to middle of $fileName: $!";
        (sysread INFILE, $survey, $CHUNK_SIZE, $CHUNK_SIZE) ||
            die "Error reading middle of $fileName: $!";
        (sysseek INFILE, -$CHUNK_SIZE, SEEK_END) ||
            die "Error moving to end of $fileName: $!";
        (sysread INFILE, $survey, 2*$CHUNK_SIZE, $CHUNK_SIZE) ||
            die "Error reading end of $fileName: $!";
    }
    close INFILE;
    my $surveyLen = length $survey;
    # Now $survey contains a bunch of bytes from the file. We run through them
    # counting non-white control characters and looking for binary characters.
    # First, we assume the file is text until we learn otherwise.
    my $retVal = "text";
    # This variable will contain the number of non-white control characters
    # required to prove the file is binary.
    my $controls = ceil($NON_WHITE * $surveyLen);
    # Here we loop through the survey characters.
    while ($retVal eq "text" && (my $chr = chop $survey)) {
        # Process according to the character type.
        if ($chr gt chr(127)) {
            # Here we have a true binary character.
            $retVal = "binary";
        } elsif ($chr lt " " && $chr !~ /\s/) {
            # Here we have a non-white control character.
            $controls--;
            if (! $controls) { $retVal = "binary"; }
        }
    }
    # Return the file type.
    return $retVal;
}

=head3 OpenDirectory

    my $dirObject = OpenDirectory($directoryName);

This method returns an object that can be used to find all the files in a directory
tree. The incoming parameter is the directory tree name, properly cleaned and with
the trailing slash removed.

=cut

sub OpenDirectory {
    my ($directoryName) = @_;
    return {
            # The root name length enables us to separate the relative file name from
            # the absolute file name.
            rootLen => length($directoryName) + 1,
            # This is a stack of the files and directories still to be processed.
            stack => [$directoryName],
            # These are counters for various file types.
            tempCount => 0,
            dirCount => 0,
            dirSkip => 0,
            foundCount => 0
        };
}

=head3 GetFileName

    my $fileName = GetFileName($dirObject);

This method returns the next file name for a directory tree. The incoming parameter is
a directory object returned by OpenDirectory. If all the files have been processed, the
method returns an undefined value.

=cut

sub GetFileName {
    my ($dirObject) = @_;
    # Loop until we find a file name or empty the stack.
    my $retVal;
    my $done = 0;
    while (! $done) {
        # Get the next file name.
        my $nextName = pop @{$dirObject->{stack}};
        if (! $nextName) {
            # The stack is empty, so we're done.
            $done = 1;
        } elsif (! -d $nextName) {
            # Here we've found a real file. Now we need to check for a temporary file name.
            if ($nextName =~ /(~|#|\.bak|\.tgz)$/) {
                # We have a temporary file name, so we count it and keep going.
                $dirObject->{dirSkip}++;
            } else {
                # Here we have a non-temporary name, so we're done.
                $done = 1;
                $retVal = $nextName;
                $dirObject->{foundCount}++;
                # Compute the file's relative name and extension, then stuff them in
                # the object.
                $dirObject->{relName} = substr $nextName, $dirObject->{rootLen};
                my ($path, $suffix) = NameParse($nextName);
                $dirObject->{suffix} = lc $suffix;
            }
        } elsif ($nextName =~ /\.old$/) {
            # Here we have a backup directory.
            $dirObject->{dirSkip}++;
        } else {
            # Here we have a directory. We need to push all its members on the stack.
            # Note that file names beginning with a period are ignored.
            opendir NEXTDIR, $nextName;
            my @files = grep { $_ =~ /^[^.]/ } readdir NEXTDIR;
            closedir NEXTDIR;
            push @{$dirObject->{stack}}, map { "$nextName/$_" } @files;
            $dirObject->{dirCount}++;
        }
    }
    return $retVal;
}

=head3 InSummary

    print InSummary($dirObject);

Return a summary of the files processed by a directory object.

=cut

sub InSummary {
    my ($dirObject) = @_;
    my $dirCount = $dirObject->{dirCount};
    my $tempCount = $dirObject->{tempCount};
    my $foundCount = $dirObject->{foundCount};
    my $dirSkip = $dirObject->{dirSkip};
    return "$foundCount files found, $tempCount temporary files ignored, $dirCount directories processed, $dirSkip directories skipped.\n";
}

=head3 NameParse

    my ($path, $suffix) = NameParse($fileName);

Separate the directory path and suffix out of a file name. Note that if the suffix
is entirely numeric, we return a suffix of "#".

=cut

sub NameParse {
    my ($fileName) = @_;
    # Split the file into path pieces.
    my @pieces = split /\//, $fileName;
    # Peel off the last piece.
    my $baseName = pop @pieces;
    # Form the path from everything remaining.
    my $path = join "/", @pieces;
    # Split the extension off the base name.
    my $suffix = '';
    if ($baseName =~ /\.\d+$/) {
        $suffix = '#';
    } elsif ($baseName =~ /\.([^.]+)$/) {
        $suffix = $1;
    }
    # Return the results.
    return ($path, $suffix);
}

=head3 StartFile

    StartFile($mode, $fileName);

Open the specified file for compression input. The file handle used is CLEARFILE.

=cut

sub StartFile {
    my ($mode, $fileName) = @_;
    # Determine the type of file.
    if ($mode eq 't') {
        # Here we have a text file.
        (open CLEARFILE, "<$fileName") ||
            die "Could not open text input file $fileName: $!";
    } else {
        # Here we have a binary file.
        (sysopen CLEARFILE, $fileName, O_RDONLY | O_BINARY) ||
            die "Could not open binary input file $fileName: $!";
    }
}

=head3 GetCharacters

    my $characters = GetCharacters($mode);

Get some characters to compress from the current input file. If we have reached
end-of-file, returns an empty list.

=cut

sub GetCharacters {
    my ($mode) = @_;
    # Declare the return variable.
    my $retVal;
    # Determine the type of file.
    if ($mode eq 't') {
        # Here we have a text file. We read a line of text. Since it's open as
        # a text file, the line-end will be translated for us.
        $retVal = <CLEARFILE>;
        if (! defined $retVal) {
            # Here we either have end-of-file or a file error.
            if ($?) {
                die "Text input error: $!";
            } else {
                # If it's end-of-file, we close the file.
                close CLEARFILE;
            }
        }
    } else {
        # Here we have a binary file. We read a fixed-length text chunk.
        my $textLine;
        my $result = sysread CLEARFILE, $textLine, $CHUNK_SIZE;
        if (! defined $result) {
            # Here we have a file error.
            die "Binary input error: $!";
        } elsif ($result == 0) {
            # Here we have end-of-file, so we close the file.
            close CLEARFILE;
        } else {
            # Here we have data to return.
            $retVal = $textLine;
        }
    }
    return $retVal;
}

=head3 GetExtensions

    my %extHash = GetExtensions($directory);

Create a hash containing the extensions of the binary files in the specified
directory tree.

=cut

sub GetExtensions {
    my ($directory) = @_;
    # Declare the return variable.
    my %retVal = ();
    # Check for the extensions file.
    my $fileName = "$directory/extensions.txt";
    if (! -e $fileName) {
        print "No extensions file found. All files are treated as text.\n";
    } else {
        # The file exists, so we read it into the hash.
        (open EXTFILE, "<$fileName") ||
            die "Could not open extensions file: $!";
        while (my $line = <EXTFILE>) {
            chomp $line;
            $retVal{$line} = 'b';
        }
    }
    # Return the hash.
    return %retVal;
}

=head3 StartPackaging

    my $packObject = StartPackaging($packageFileName);

Prepare to create the package file.

=cut

sub StartPackaging {
    my ($packageFileName) = @_;
    # Open the package file.
    (sysopen PACKFILE, "$packageFileName", O_WRONLY | O_CREAT | O_TRUNC | O_BINARY) ||
        die "Could not open package file $packageFileName: $!";
    # Return an object telling us how we're doing.
    return {
            packName => $packageFileName,
            fileSize => 0,
            buffer => '',
            bufferSize => 0,
            outCount => 0
           };
}

=head3 WritePackageLine

    WritePackageLine($packObject, $line);

Write a line of text data to the package file. A text line is preceded by a space character
and is terminated by a line feed character.

=cut

sub WritePackageLine {
    my ($packObject, $line) = @_;
    # Write the line to the file.
    (print PACKFILE " $line\n") ||
        die "Error writing to package file: $!";
    # Update the file size.
    $packObject->{fileSize} += length($line) + 2;
}

=head3 FinishPackaging

    FinishPackaging($packObject);

Finish the packaging process. This basically closes the package file.

=cut

sub FinishPackaging {
    close PACKFILE;
}

=head3 StartCompressedData

    StartCompressedData($packObject, $compTable);

Initialize for writing compressed file data. Compressed file data is output in fixed-
length blocks preceded by a C<b> character. (This distinguishes them from text lines,
which begin with a space.) The last block may be short. It is preceded by an C<e>
character and the next four bytes contain the length in decimal. (So, for example,
C<e0012> would indicate a 12-byte final block.)

=cut

sub StartCompressedData {
    my ($packObject, $compTable) = @_;
    # Initialize the buffer data and the output count.
    $packObject->{buffer} = '';
    $packObject->{bufferLen} = 0;
    $packObject->{outCount} = 0;
    # Remember the compression table.
    $packObject->{compTable} = $compTable;
    # Create the checksum.
    $packObject->{digest} = Digest::MD5->new();
}

=head3 WriteCompressedData

    WriteCompressedData($packObject, $data, $status);

Add a block of data to the compressed output. The data is accumulated until we fill
an entire block, at which time it is written to the file with the appropriate marker.

=cut

sub WriteCompressedData {
    my ($packObject, $data, $status) = @_;
    # Check the status code to see if there's an error.
    if ($status != Z_OK) {
        die "Compression error: " . $packObject->{comptable}->msg();
    } else {
        # Get the amount of data we have.
        my $len = length($data);
        # Find out how much room is in the buffer.
        my $room = $PACK_BLOCK - $packObject->{bufferLen};
        # Now we put the data into blocks. At all times, $residual
        # will be the data not transmitted, $room the room left in
        # the block, and $len the length of the residual.
        my $residual = $data;
        while ($room < $len) {
            # Put as much data into the buffer as will fit.
            $packObject->{buffer} .= substr($residual, 0, $room);
            # Remove the data from the residual.
            $residual = substr($residual, $room);
            $len -= $room;
            # Write the buffer.
            print PACKFILE "b" . $packObject->{buffer};
            $packObject->{outCount} += $PACK_BLOCK + 1;
            # Denote that the buffer is now empty.
            $packObject->{bufferLen} = 0;
            $packObject->{buffer} = '';
            $room = $PACK_BLOCK;
        }
        # Put the remaining text into the buffer.
        $packObject->{buffer} .= $residual;
        $packObject->{bufferLen} += $len;
    }
}

=head3 WriteEndMark

    my $outCount = WriteEndMark($packObject);

Write out the last block of a compressed data stream and return the total number of bytes
in the stream.

=cut

sub WriteEndMark {
    my ($packObject) = @_;
    # Get the number of bytes in the buffer.
    my $len = $packObject->{bufferLen};
    # Format the prefix.
    my $prefix = "$len";
    while (length($prefix) < 4) {
        $prefix = "0$prefix";
    }
    # Insert the checksum.
    $prefix .= $packObject->{digest}->hexdigest;
    # Write the prefix and the buffer content.
    print PACKFILE "e$prefix" . $packObject->{buffer};
    # Compute the total length output.
    my $retVal = $len + 37 + $packObject->{outCount};
    # Add it to the package output counter.
    $packObject->{fileSize} += $retVal;
    # Return the total length output.
    return $retVal;
}

=head3 PackTree

    PackTree($directory, $packageFileName);

Package all files in a directory tree into a package file.

The C<extensions.txt> file in the root of the directory tree is used to determine
which file extensions indicate binary files.

=cut

sub PackTree {
    my ($directory, $packageFileName) = @_;
    # Get the extensions hash. This is used to separate text files
    # from binary files.
    my %extHash = GetExtensions($directory);
    # Set up to output to the package files.
    my $packObject = StartPackaging($packageFileName);
    # Create a directory object.
    my $dirObject = OpenDirectory($directory);
    # Create file type counters.
    my %fileCounters = ( t => 0, b => 0 );
    # Loop through the files in the tree, packaging each one.
    while (my $fileName = GetFileName($dirObject)) {
        # Decide whether this file is text or binary. We begin by separating
        # the file name into its path and extension.
        my ($path, $suffix) = NameParse($fileName);
        # Check the extension hash for the suffix.
        my $mode = (exists $extHash{$suffix} ? "b" : "t");
        # Output the relative file name.
        my $relName = $dirObject->{relName};
        WritePackageLine($packObject, "$mode $relName");
        # Update the appropriate file counter.
        $fileCounters{$mode}++;
        # Now we compress the file.
        PackFile($fileName, $packObject, $mode);
    }
    # Close the package output stream.
    FinishPackaging($packObject);
    # Tell the user how much package file we have.
    print $packObject->{fileSize} . " bytes written to package file.\n";
    # Display the text/binary split.
    print "$fileCounters{t} text files, $fileCounters{b} binary files compressed.\n";
    # Write the summary of files processed.
    print InSummary($dirObject);
}

=head3 PackFile

    PackFile($fileName, $packObject, $mode);

Compress the specified file to the specified package file in the specified mode.

=cut

sub PackFile {
    my ($fileName, $packObject, $mode) = @_;
    print "Packaging $fileName ($mode).\n";
    # Create a compression object.
    my $compTable = deflateInit();
    # Initialize for reading the input file.
    StartFile($mode, $fileName);
    # Initialize for writing the output.
    StartCompressedData($packObject, $compTable);
    # Loop through the input.
    while (my $chars = GetCharacters($mode)) {
        # Add the data to the checksum.
        $packObject->{digest}->add($chars);
        # Compress this data.
        my ($output, $status) = $compTable->deflate($chars);
        # Write it to the output.
        WriteCompressedData($packObject, $output, $status);
    }
    # Flush out any remaining data.
    my ($output, $status) = $compTable->flush();
    WriteCompressedData($packObject, $output, $status);
    # Terminate the file.
    my $outCount = WriteEndMark($packObject);
    # Display the compression ratio.
    my $fileSize = $compTable->total_in();
    my $ratio = ($fileSize ? int($outCount * 100 / $fileSize + 0.5) : 0);
    print "$fileSize characters in, $outCount characters out ($ratio%)\n";
}

=head3 UnPackTree

    UnPackTree($directory, $packageFile);

Read file data from a package file and put it into a directory tree.

The package file consists of a series of compressed files. The package file contains
three types of records. A text record begins with a space and is terminated by a
new-line character (C<\n>). A block record has a fixed length and begins with a C<b>.
An end record begins with an C<e> and a four-digit length, followed by data of that
length. A file contains a text record with the file type (C<t> for text and C<b> for
binary) and its name relative to the root of the tree, zero or more block records, and
one end record.

To prevent file system compatibility problems, the name is automatically cleaned
before the file is created: all spaces and colons are converted to underscores,
and question marks are converted to the double letter C<QQ>.

Text files are open in text mode and binary files in binary mode. As a result, the
line-end characters in a text file are automatically translated to the line-end
character of the target operating system.

=cut

sub UnPackTree {
    my ($directory, $packageFile) = @_;
    # Open the package file for input.
    my $packObject = StartUnPacking($packageFile);
    # Create some file counters.
    my %fileCounters = ( b => 0, t => 0 );
    # Loop through the package one file at a time.
    while (my $fileData = GetTextRecord($packObject)) {
        # Extract the file name and mode.
        my $mode = substr($fileData,0,1);
        my $fileName = substr($fileData,2);
        print "Unpacking $fileName ($mode).\n";
        # Create the decompression object.
        my $compObject = inflateInit();
        # Create the output file.
        my $fileObject = CreateFile($fileName, $mode, $directory, $compObject);
        # Loop through the package file, de-compressing the data.
        while (my $block = GetBlock($packObject)) {
            # Decompress the current block.
            my ($data, $status) = $compObject->inflate($block);
            # Write the decompressed data.
            WriteData($fileObject, $data, $status);
        }
        # Verify the checksums.
        my ($expected, $found) = ($packObject->{digest}, $fileObject->{digest}->hexdigest);
        if ($expected ne $found) {
            # Here we have a mismatch.
            die "Invalid checksum found unpacking file $fileName: expected $expected, found $found.";
        }
        # Count this file.
        $fileCounters{$mode}++;
        # Close it.
        close CLEARFILE;
    }
    # Close the package file.
    close PACKFILE;
    # Tell the user what we did.
    print "$fileCounters{t} text files unpacked, $fileCounters{b} binary files unpacked.\n";
    print $packObject->{inCounter} . " package file bytes read.\n";
}

=head3 StartUnPacking

    my $packObject = StartUnPacking($packageFile);

Prepare to read from a package file.

=cut

sub StartUnPacking {
    my ($packageFile) = @_;
    # Open the package file for binary input.
    (sysopen PACKFILE, $packageFile, O_RDONLY | O_BINARY) ||
        die "Could not open $packageFile: $!";
    # Return the package file input object.
    return {
            blockEnd => 0,
            inCounter => 0
           }
}

=head3 GetTextRecord

    my $data = GetTextRecord($packObject);

Read a text record from the package file. The text record begins with a space and
ends with a new-line character.

=cut

sub GetTextRecord {
    my ($packObject) = @_;
    # Declare the return variable.
    my $retVal;
    # Read the next character.
    my $buffer;
    my $rv = read PACKFILE, $buffer, 1;
    # Check to insure we read a space.
    if (! defined $rv) {
        # Here the read failed.
        die "Error reading package file: $!";
    } elsif ($rv == 0) {
        # Here we've reached end-of-file. We'll return an undefined value
        # to the caller.
    } elsif ($buffer ne " ") {
        # Here we didn't find a space: the file is corrupt.
        die "Text record not found when expected in package file.";
    } else {
        # Here we have a real, live text record. We loop until we find a new-line.
        $retVal = "";
        $rv = read PACKFILE, $buffer, 1;
        while (defined $rv && $rv == 1 && $buffer ne "\n") {
            $retVal .= $buffer;
            $rv = read PACKFILE, $buffer, 1;
        }
        # Check for error conditions.
        if (! defined $rv) {
            die "Error reading package file: $!";
        } elsif ($rv == 0) {
            die "End-of-file inside text record in package file.";
        }
        # Update the input counter.
        $packObject->{inCounter} += length($retVal) + 2;
        # Denote we haven't read an end block.
        $packObject->{blockEnd} = 0;
    }
    return $retVal;
}

=head3 GetBlock

    my $block = GetBlock($packObject);

Read a block of compressed data from the package file. If the end of the compressed data
has been reached, will return an undefined value.

=cut

sub GetBlock {
    my ($packObject) = @_;
    # Declare the return value.
    my $retVal;
    # Only proceed if we have NOT reached the end block.
    if (! $packObject->{blockEnd}) {
        # Read the indicator character.
        my $buffer;
        my $rv = read PACKFILE, $buffer, 1;
        # Check for errors.
        if (! defined $rv) {
            die "Error reading package block mark: $!";
        } elsif ($rv == 0) {
            die "End-of-file read where block mark expected in package file.";
        } elsif ($buffer eq 'b') {
            # Here we have a normal block. We read it directly into the
            # return variable.
            $rv = read PACKFILE, $retVal, $PACK_BLOCK;
            # Check for errors.
            if (! defined $rv) {
                die "Error reading package block data: $!";
            } elsif ($rv < $PACK_BLOCK) {
                die "Unexpected end-of-file inside package file block.";
            } else {
                # Record the bytes read.
                $packObject->{inCounter} += $PACK_BLOCK + 1;
            }
        } elsif ($buffer eq 'e') {
            # Here we have an end block. We need to read the block length
            # and the checksum.
            $rv = read PACKFILE, $buffer, 36;
            # Check for errors.
            if (! defined $rv) {
                die "Error reading end block length: $!";
            } elsif ($rv < 36) {
                die "Unexpected end-of-file reading end block length and checksum.";
            } else {
                # Get the length and the checksum.
                my $lengthCode = substr $buffer, 0, 4;
                $packObject->{digest} = substr $buffer, 4, 32;
                # Check for an empty block.
                if ($lengthCode == 0) {
                    # Here the end block is empty. We will return an undefined value,
                    # indicating end-of-file. Record the 36 bytes read.
                    $packObject->{inCounter} += 36;
                } else {
                    # Here we have a valid block length, so we read the block.
                    $rv = read PACKFILE, $retVal, $lengthCode;
                    # Check for errors.
                    if (! defined $rv) {
                        die "Error reading end block: $!";
                    } elsif ($rv < $lengthCode) {
                        die "Unexpected end-of-file inside end block.";
                    } else {
                        # Denote the next read will return end-of-file.
                        $packObject->{blockEnd} = 1;
                        # Record the bytes read.
                        $packObject->{inCounter} += $lengthCode + 37;
                    }
                }
            }
        } else {
            # Here we have an invalid block mark.
            die "Invalid block mark read in package file.";
        }
    }
    return $retVal;
}

=head3 CreateFile

    my $fileObject = CreateFile($fileName, $mode, $directory, $compObject);

Create a file into which packaged data can be unpacked.

=cut

sub CreateFile {
    my ($fileName, $mode, $directory, $compObject) = @_;
    # Clean the file name.
    $fileName =~ tr/: /__/;
    $fileName =~ s/\?/QQ/g;
    # Add the file name to the directory to get the fully-qualified file name.
    my $fullName = "$directory/$fileName";
    # Extract the path.
    my ($path, $suffix) = NameParse($fullName);
    # Insure the path exists.
    if (! -d $path) {
        mkpath($path);
    }
    # Check for the directory problem.
    if (-d $fullName) {
        # Here a directory has the same name as the file. We append a suffix
        # to work around the problem.
        $fullName .= ".$mode";
    }
    # Open the file in the appropriate mode.
    if ($mode eq 't') {
        (open CLEARFILE, ">$fullName") ||
            die "Could not open text file $fullName: $!";
    } else {
        (sysopen CLEARFILE, $fullName, O_WRONLY | O_CREAT | O_BINARY) ||
            die "Could not open binary file $fullName: $!";
    }
    # Create a file manipulation object to return.
    return {
            mode => $mode,
            compObject => $compObject,
            digest => Digest::MD5->new()
           };
}

=head3 WriteData

    WriteData($fileObject, $data, $status);

Write a block of data to the output file.

=cut

sub WriteData {
    my ($fileObject, $data, $status) = @_;
    # Check the status code from the compressor.
    if ($status != Z_OK && $status != Z_STREAM_END) {
        die "Error in decompression: " . $fileObject->{compObject}->msg();
    } else {
        # Here we're okay, so we can write the data.
        print CLEARFILE $data;
        # Add it to the checksum.
        $fileObject->{digest}->add($data);
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3