[Bio] / FigKernelScripts / Packager.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/Packager.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 : olson 1.5 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 : parrello 1.7 #
8 : olson 1.5 # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 : parrello 1.7 # Public License.
11 : olson 1.5 #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : parrello 1.1
20 :     =head1 Packager
21 :    
22 : parrello 1.3 C<Packager> [I<options>] C<scan>|C<pack>|C<unpack> I<directoryRoot> I<packageFile>
23 :    
24 : parrello 1.1 Package all the files in a directory tree for transport or recreate the directory
25 :     tree from the packaged files.
26 :    
27 : parrello 1.3 This method takes as input three positional parameters.
28 :    
29 :     =over 4
30 :    
31 :     =item command
32 :    
33 :     C<pack> to package the files into a single package file, <unpack> to restore
34 :     the files from the package file, or C<scan> to scan the files to determine
35 :     file types.
36 :    
37 :     =item directoryRoot
38 :    
39 :     Root directory of the directory tree. If scanning or packing, all files in this
40 :     tree will be packaged. If unpacking, the files are unpacked into this tree.
41 :    
42 :     =item packageFile
43 :    
44 :     Name of the package file. If packing, this is the output file. If unpacking,
45 :     this is the input file.
46 :    
47 :     =back
48 : parrello 1.1
49 :     File and directory names should be specified using Unix conventions, with a
50 :     forward slash (C</>) instead of a backslash (C<\>).
51 :    
52 :     The process of packaging is straightforward. We write out the name of the input file,
53 :     its type (text or binary), the file data itself, and an MD5 digest. Each file is
54 :     compressed using the PERL C<Compress::Zlib> libary.
55 :    
56 :     This is by no means the optimal way to package files, but it is simple to do and
57 :     can be customized for this project's special problems (such as incompatible file
58 :     names).
59 :    
60 :     It is important that the packager understand the difference between text files
61 :     and binary files. The line end characters in text files need to be translated
62 :     to the target platform's characters, while binary files must be moved without
63 :     translation. The file's extension is used to determine whether it is binary or
64 :     text. When packaging, we will look for a file called C<extensions.txt> in the
65 :     root directory of the tree. This file will have a list of binary file extensions,
66 :     one per line. The <scan> command can be used to create this file. If the file
67 :     is not found when packaging, it is presumed all files are text files.
68 :    
69 :     When packaging, line-end characters inside text files are converted to a single
70 :     line feed (C<\n>). When unpackaging, the line feeds are converted to the line-ending
71 :     character of the current platform.
72 :    
73 :     =head2 Temporary Files
74 :    
75 :     Temporary files in the directory tree are skipped when scanning or packaging. A file
76 :     is considered temporary if its name ends in a tilde (C<~>) or pound sign C<#>,
77 : parrello 1.2 or it has an extension of C<.bak> or C<.tgz>. In addition, directories with names
78 :     ending in C<.old> will be skipped.
79 : parrello 1.1
80 :     =cut
81 :    
82 :     use strict;
83 :     use Cwd 'abs_path';
84 :     use File::Copy;
85 :     use File::Path;
86 :     use Fcntl qw(:DEFAULT :seek);
87 :     use POSIX;
88 :     use Compress::Zlib;
89 :     use Digest::MD5;
90 :    
91 :     =head2 Constants
92 :    
93 :     =head3 CHUNK_SIZE
94 :    
95 :     Number of bytes to use when reading a survey chunk during file scans.
96 :     We take three chunks from the file of this size. Any binary character
97 :     (ASCII code > 127) indicates a binary file. A binary file is also
98 :     presumed if the number of non-whitespace control characters is above
99 :     a certain percentage.
100 :    
101 :     =cut
102 :     my $CHUNK_SIZE = 1024;
103 :    
104 :     =head3 SURVEY_SIZE
105 :    
106 :     Number of bytes in a file survey. A file survey consists of three chunks.
107 :    
108 :     =cut
109 :    
110 :     my $SURVEY_SIZE = $CHUNK_SIZE * 3;
111 :    
112 :     =head3 NON_WHITE
113 :    
114 :     Fraction of the non-white control characters in the survey data that
115 :     would classify the file as binary.
116 :    
117 :     =cut
118 :    
119 :     my $NON_WHITE = 0.25;
120 :    
121 :     =head3 MAX_PACKAGE_SIZE
122 :    
123 :     Maximum legal size for a package file.
124 :    
125 :     =cut
126 :    
127 :     my $MAX_PACKAGE_SIZE = 0x7FFFFFFF;
128 :    
129 :     =head3 NL_LEN
130 :    
131 :     Number of characters in a line-end sequence.
132 :    
133 :     =cut
134 :    
135 :     my $NL_LEN = length($/);
136 :    
137 :     =head3 PACK_BLOCK
138 :    
139 :     Number of characters in a package block.
140 :    
141 :     =cut
142 :    
143 :     my $PACK_BLOCK = 4096;
144 :    
145 :     # OPEN CODE
146 :    
147 :     # Get the command, the file directory name, and the package file name.
148 :     my ($command, $treeDirectory, $packageFile) = @ARGV;
149 :     # Get the absolute path for the directory tree.
150 :     my $absDirectory = abs_path($treeDirectory);
151 :     # Trim off any trailing slash.
152 :     if ($absDirectory =~ m!/$!) {
153 :     chop $absDirectory;
154 :     }
155 :     # Process the command.
156 :     $command = lc $command;
157 :     if ($command eq 'scan') {
158 :     ScanTree($absDirectory);
159 :     } elsif ($command eq 'pack') {
160 :     PackTree($absDirectory, $packageFile);
161 :     } elsif ($command eq 'unpack') {
162 :     UnPackTree($absDirectory, $packageFile);
163 :     } else {
164 :     print "Invalid command \"$command\".\n";
165 :     }
166 :     print "Processing complete.\n";
167 :    
168 :     =head2 Methods
169 :    
170 :     =head3 ScanTree
171 :    
172 : parrello 1.7 ScanTree($directory);
173 : parrello 1.1
174 :     Scan all files in the specified directory and create an C<extensions.txt> file
175 :     listing the extensions of files that are binary.
176 :    
177 :     =cut
178 :    
179 :     sub ScanTree {
180 :     my ($directory) = @_;
181 :     # Create a hash of file extensions. The first time a file with a particular
182 :     # extension is found, it is scanned for binary characters. If such characters
183 :     # are found, the file is added to the hash with a value of "b"; otherwise it is
184 :     # added with a value of "t".
185 :     my %extensions = ();
186 :     # Create a directory object.
187 :     my $dirObject = OpenDirectory($directory);
188 :     # Loop through the files in the tree, scanning each one.
189 :     while (my $fileName = GetFileName($dirObject)) {
190 :     # Get the file name relative to the root and the file extension.
191 :     my $relName = $dirObject->{relName};
192 :     my $suffix = $dirObject->{suffix};
193 :     # Only proceed if this is a new file type.
194 :     if (exists $extensions{$suffix}) {
195 :     print "Skipping $relName.\n";
196 :     } else {
197 :     # Get the file size.
198 :     my $size = -s $fileName;
199 :     # If the file is empty, skip it.
200 :     if ($size == 0) {
201 :     print "File $relName is empty: ignored.\n";
202 :     } else {
203 :     # Here the file is worth scanning.
204 :     print "Scanning $relName($size) for type of \"$suffix\"... ";
205 :     # Determine the type of the file.
206 :     my $type = ScanFile($fileName, $size);
207 :     print "$type\n";
208 :     # Add the extension to the hash.
209 :     $extensions{$suffix} = substr($type, 0, 1);
210 :     }
211 :     }
212 :     }
213 :     # Now we write the extension file.
214 :     print "Creating extensions file.\n";
215 :     (open EXTFILE, ">$directory/extensions.txt") ||
216 :     die "Could not open extensions file: $!";
217 :     my $count = 0;
218 :     for my $ext (keys %extensions) {
219 :     if ($extensions{$ext} eq "b") {
220 :     (print EXTFILE "$ext\n") ||
221 :     die "Error writing extensions file: $!";
222 :     $count++;
223 :     }
224 :     }
225 :     close EXTFILE;
226 :     print "$count binary extensions found.\n";
227 :     print InSummary($dirObject);
228 :     }
229 :    
230 :     =head3 ScanFile
231 :    
232 : parrello 1.7 my $type = ScanFile($fileName, $size);
233 : parrello 1.1
234 :     Survey the specified file of the specified size to determine whether it is
235 :     text or binary. A file is binary if it has a lot of control characters or
236 :     any character whose ASCII value is greater than 127.
237 :    
238 :     =cut
239 :    
240 :     sub ScanFile {
241 :     my ($fileName, $size) = @_;
242 :     # Open the file for binary input.
243 :     (sysopen INFILE, $fileName, O_RDONLY + O_BINARY) ||
244 :     die "Could not scan $fileName: $!";
245 :     # Survey the file. We take chunks from the beginning, end, and middle.
246 :     # if the file has fewer than three chunks of characters, we read the
247 :     # whole thing.
248 :     my $survey = "";
249 :     if ($size <= $SURVEY_SIZE) {
250 :     (sysread INFILE, $survey, $SURVEY_SIZE) ||
251 :     die "Error reading $fileName: $!";
252 :     } else {
253 :     (sysread INFILE, $survey, $CHUNK_SIZE) ||
254 :     die "Error reading start of $fileName: $!";
255 :     my $position = int($size - $CHUNK_SIZE / 2);
256 :     (sysseek INFILE, $position, SEEK_SET) ||
257 :     die "Error moving to middle of $fileName: $!";
258 :     (sysread INFILE, $survey, $CHUNK_SIZE, $CHUNK_SIZE) ||
259 :     die "Error reading middle of $fileName: $!";
260 :     (sysseek INFILE, -$CHUNK_SIZE, SEEK_END) ||
261 :     die "Error moving to end of $fileName: $!";
262 :     (sysread INFILE, $survey, 2*$CHUNK_SIZE, $CHUNK_SIZE) ||
263 :     die "Error reading end of $fileName: $!";
264 :     }
265 :     close INFILE;
266 :     my $surveyLen = length $survey;
267 :     # Now $survey contains a bunch of bytes from the file. We run through them
268 :     # counting non-white control characters and looking for binary characters.
269 : parrello 1.7 # First, we assume the file is text until we learn otherwise.
270 : parrello 1.1 my $retVal = "text";
271 :     # This variable will contain the number of non-white control characters
272 :     # required to prove the file is binary.
273 :     my $controls = ceil($NON_WHITE * $surveyLen);
274 :     # Here we loop through the survey characters.
275 :     while ($retVal eq "text" && (my $chr = chop $survey)) {
276 :     # Process according to the character type.
277 :     if ($chr gt chr(127)) {
278 :     # Here we have a true binary character.
279 :     $retVal = "binary";
280 :     } elsif ($chr lt " " && $chr !~ /\s/) {
281 :     # Here we have a non-white control character.
282 :     $controls--;
283 :     if (! $controls) { $retVal = "binary"; }
284 :     }
285 :     }
286 :     # Return the file type.
287 :     return $retVal;
288 :     }
289 :    
290 :     =head3 OpenDirectory
291 :    
292 : parrello 1.7 my $dirObject = OpenDirectory($directoryName);
293 : parrello 1.1
294 :     This method returns an object that can be used to find all the files in a directory
295 :     tree. The incoming parameter is the directory tree name, properly cleaned and with
296 :     the trailing slash removed.
297 :    
298 :     =cut
299 :    
300 :     sub OpenDirectory {
301 :     my ($directoryName) = @_;
302 :     return {
303 :     # The root name length enables us to separate the relative file name from
304 :     # the absolute file name.
305 :     rootLen => length($directoryName) + 1,
306 :     # This is a stack of the files and directories still to be processed.
307 :     stack => [$directoryName],
308 :     # These are counters for various file types.
309 :     tempCount => 0,
310 :     dirCount => 0,
311 : parrello 1.2 dirSkip => 0,
312 : parrello 1.1 foundCount => 0
313 :     };
314 :     }
315 :    
316 :     =head3 GetFileName
317 :    
318 : parrello 1.7 my $fileName = GetFileName($dirObject);
319 : parrello 1.1
320 :     This method returns the next file name for a directory tree. The incoming parameter is
321 :     a directory object returned by OpenDirectory. If all the files have been processed, the
322 :     method returns an undefined value.
323 :    
324 :     =cut
325 :    
326 :     sub GetFileName {
327 :     my ($dirObject) = @_;
328 :     # Loop until we find a file name or empty the stack.
329 :     my $retVal;
330 :     my $done = 0;
331 :     while (! $done) {
332 :     # Get the next file name.
333 :     my $nextName = pop @{$dirObject->{stack}};
334 :     if (! $nextName) {
335 :     # The stack is empty, so we're done.
336 :     $done = 1;
337 :     } elsif (! -d $nextName) {
338 :     # Here we've found a real file. Now we need to check for a temporary file name.
339 :     if ($nextName =~ /(~|#|\.bak|\.tgz)$/) {
340 :     # We have a temporary file name, so we count it and keep going.
341 : parrello 1.2 $dirObject->{dirSkip}++;
342 : parrello 1.1 } else {
343 :     # Here we have a non-temporary name, so we're done.
344 :     $done = 1;
345 :     $retVal = $nextName;
346 :     $dirObject->{foundCount}++;
347 :     # Compute the file's relative name and extension, then stuff them in
348 :     # the object.
349 :     $dirObject->{relName} = substr $nextName, $dirObject->{rootLen};
350 :     my ($path, $suffix) = NameParse($nextName);
351 :     $dirObject->{suffix} = lc $suffix;
352 :     }
353 : parrello 1.2 } elsif ($nextName =~ /\.old$/) {
354 :     # Here we have a backup directory.
355 :     $dirObject->{dirSkip}++;
356 : parrello 1.1 } else {
357 :     # Here we have a directory. We need to push all its members on the stack.
358 :     # Note that file names beginning with a period are ignored.
359 :     opendir NEXTDIR, $nextName;
360 :     my @files = grep { $_ =~ /^[^.]/ } readdir NEXTDIR;
361 :     closedir NEXTDIR;
362 :     push @{$dirObject->{stack}}, map { "$nextName/$_" } @files;
363 :     $dirObject->{dirCount}++;
364 :     }
365 :     }
366 :     return $retVal;
367 :     }
368 :    
369 :     =head3 InSummary
370 :    
371 : parrello 1.7 print InSummary($dirObject);
372 : parrello 1.1
373 :     Return a summary of the files processed by a directory object.
374 :    
375 :     =cut
376 :    
377 :     sub InSummary {
378 :     my ($dirObject) = @_;
379 :     my $dirCount = $dirObject->{dirCount};
380 :     my $tempCount = $dirObject->{tempCount};
381 :     my $foundCount = $dirObject->{foundCount};
382 : parrello 1.2 my $dirSkip = $dirObject->{dirSkip};
383 :     return "$foundCount files found, $tempCount temporary files ignored, $dirCount directories processed, $dirSkip directories skipped.\n";
384 : parrello 1.1 }
385 :    
386 :     =head3 NameParse
387 :    
388 : parrello 1.7 my ($path, $suffix) = NameParse($fileName);
389 : parrello 1.1
390 :     Separate the directory path and suffix out of a file name. Note that if the suffix
391 :     is entirely numeric, we return a suffix of "#".
392 :    
393 :     =cut
394 :    
395 :     sub NameParse {
396 :     my ($fileName) = @_;
397 :     # Split the file into path pieces.
398 :     my @pieces = split /\//, $fileName;
399 :     # Peel off the last piece.
400 :     my $baseName = pop @pieces;
401 :     # Form the path from everything remaining.
402 :     my $path = join "/", @pieces;
403 :     # Split the extension off the base name.
404 :     my $suffix = '';
405 :     if ($baseName =~ /\.\d+$/) {
406 :     $suffix = '#';
407 :     } elsif ($baseName =~ /\.([^.]+)$/) {
408 :     $suffix = $1;
409 :     }
410 :     # Return the results.
411 :     return ($path, $suffix);
412 :     }
413 :    
414 :     =head3 StartFile
415 :    
416 : parrello 1.7 StartFile($mode, $fileName);
417 : parrello 1.1
418 :     Open the specified file for compression input. The file handle used is CLEARFILE.
419 :    
420 :     =cut
421 :    
422 :     sub StartFile {
423 :     my ($mode, $fileName) = @_;
424 :     # Determine the type of file.
425 :     if ($mode eq 't') {
426 :     # Here we have a text file.
427 :     (open CLEARFILE, "<$fileName") ||
428 :     die "Could not open text input file $fileName: $!";
429 :     } else {
430 :     # Here we have a binary file.
431 :     (sysopen CLEARFILE, $fileName, O_RDONLY | O_BINARY) ||
432 :     die "Could not open binary input file $fileName: $!";
433 :     }
434 :     }
435 :    
436 :     =head3 GetCharacters
437 :    
438 : parrello 1.7 my $characters = GetCharacters($mode);
439 : parrello 1.1
440 :     Get some characters to compress from the current input file. If we have reached
441 :     end-of-file, returns an empty list.
442 :    
443 :     =cut
444 :    
445 :     sub GetCharacters {
446 :     my ($mode) = @_;
447 :     # Declare the return variable.
448 :     my $retVal;
449 :     # Determine the type of file.
450 :     if ($mode eq 't') {
451 :     # Here we have a text file. We read a line of text. Since it's open as
452 :     # a text file, the line-end will be translated for us.
453 :     $retVal = <CLEARFILE>;
454 :     if (! defined $retVal) {
455 :     # Here we either have end-of-file or a file error.
456 :     if ($?) {
457 :     die "Text input error: $!";
458 :     } else {
459 :     # If it's end-of-file, we close the file.
460 :     close CLEARFILE;
461 :     }
462 :     }
463 :     } else {
464 :     # Here we have a binary file. We read a fixed-length text chunk.
465 :     my $textLine;
466 :     my $result = sysread CLEARFILE, $textLine, $CHUNK_SIZE;
467 :     if (! defined $result) {
468 :     # Here we have a file error.
469 :     die "Binary input error: $!";
470 :     } elsif ($result == 0) {
471 :     # Here we have end-of-file, so we close the file.
472 :     close CLEARFILE;
473 :     } else {
474 :     # Here we have data to return.
475 :     $retVal = $textLine;
476 :     }
477 :     }
478 :     return $retVal;
479 :     }
480 :    
481 :     =head3 GetExtensions
482 :    
483 : parrello 1.7 my %extHash = GetExtensions($directory);
484 : parrello 1.1
485 :     Create a hash containing the extensions of the binary files in the specified
486 :     directory tree.
487 :    
488 :     =cut
489 :    
490 :     sub GetExtensions {
491 :     my ($directory) = @_;
492 :     # Declare the return variable.
493 :     my %retVal = ();
494 :     # Check for the extensions file.
495 :     my $fileName = "$directory/extensions.txt";
496 :     if (! -e $fileName) {
497 :     print "No extensions file found. All files are treated as text.\n";
498 :     } else {
499 :     # The file exists, so we read it into the hash.
500 :     (open EXTFILE, "<$fileName") ||
501 :     die "Could not open extensions file: $!";
502 :     while (my $line = <EXTFILE>) {
503 :     chomp $line;
504 :     $retVal{$line} = 'b';
505 :     }
506 :     }
507 :     # Return the hash.
508 :     return %retVal;
509 :     }
510 :    
511 :     =head3 StartPackaging
512 :    
513 : parrello 1.7 my $packObject = StartPackaging($packageFileName);
514 : parrello 1.1
515 :     Prepare to create the package file.
516 :    
517 :     =cut
518 :    
519 :     sub StartPackaging {
520 :     my ($packageFileName) = @_;
521 :     # Open the package file.
522 :     (sysopen PACKFILE, "$packageFileName", O_WRONLY | O_CREAT | O_TRUNC | O_BINARY) ||
523 :     die "Could not open package file $packageFileName: $!";
524 :     # Return an object telling us how we're doing.
525 :     return {
526 :     packName => $packageFileName,
527 :     fileSize => 0,
528 :     buffer => '',
529 :     bufferSize => 0,
530 :     outCount => 0
531 :     };
532 :     }
533 :    
534 :     =head3 WritePackageLine
535 :    
536 : parrello 1.7 WritePackageLine($packObject, $line);
537 : parrello 1.1
538 :     Write a line of text data to the package file. A text line is preceded by a space character
539 :     and is terminated by a line feed character.
540 :    
541 :     =cut
542 :    
543 :     sub WritePackageLine {
544 :     my ($packObject, $line) = @_;
545 :     # Write the line to the file.
546 :     (print PACKFILE " $line\n") ||
547 :     die "Error writing to package file: $!";
548 :     # Update the file size.
549 :     $packObject->{fileSize} += length($line) + 2;
550 :     }
551 :    
552 :     =head3 FinishPackaging
553 :    
554 : parrello 1.7 FinishPackaging($packObject);
555 : parrello 1.1
556 :     Finish the packaging process. This basically closes the package file.
557 :    
558 :     =cut
559 :    
560 :     sub FinishPackaging {
561 :     close PACKFILE;
562 :     }
563 :    
564 :     =head3 StartCompressedData
565 :    
566 : parrello 1.7 StartCompressedData($packObject, $compTable);
567 : parrello 1.1
568 :     Initialize for writing compressed file data. Compressed file data is output in fixed-
569 :     length blocks preceded by a C<b> character. (This distinguishes them from text lines,
570 :     which begin with a space.) The last block may be short. It is preceded by an C<e>
571 :     character and the next four bytes contain the length in decimal. (So, for example,
572 :     C<e0012> would indicate a 12-byte final block.)
573 :    
574 :     =cut
575 :    
576 :     sub StartCompressedData {
577 :     my ($packObject, $compTable) = @_;
578 :     # Initialize the buffer data and the output count.
579 :     $packObject->{buffer} = '';
580 :     $packObject->{bufferLen} = 0;
581 :     $packObject->{outCount} = 0;
582 :     # Remember the compression table.
583 :     $packObject->{compTable} = $compTable;
584 :     # Create the checksum.
585 :     $packObject->{digest} = Digest::MD5->new();
586 :     }
587 :    
588 :     =head3 WriteCompressedData
589 :    
590 : parrello 1.7 WriteCompressedData($packObject, $data, $status);
591 : parrello 1.1
592 :     Add a block of data to the compressed output. The data is accumulated until we fill
593 :     an entire block, at which time it is written to the file with the appropriate marker.
594 :    
595 :     =cut
596 :    
597 :     sub WriteCompressedData {
598 :     my ($packObject, $data, $status) = @_;
599 :     # Check the status code to see if there's an error.
600 :     if ($status != Z_OK) {
601 :     die "Compression error: " . $packObject->{comptable}->msg();
602 :     } else {
603 :     # Get the amount of data we have.
604 :     my $len = length($data);
605 :     # Find out how much room is in the buffer.
606 :     my $room = $PACK_BLOCK - $packObject->{bufferLen};
607 :     # Now we put the data into blocks. At all times, $residual
608 :     # will be the data not transmitted, $room the room left in
609 :     # the block, and $len the length of the residual.
610 :     my $residual = $data;
611 :     while ($room < $len) {
612 :     # Put as much data into the buffer as will fit.
613 :     $packObject->{buffer} .= substr($residual, 0, $room);
614 :     # Remove the data from the residual.
615 :     $residual = substr($residual, $room);
616 :     $len -= $room;
617 :     # Write the buffer.
618 :     print PACKFILE "b" . $packObject->{buffer};
619 :     $packObject->{outCount} += $PACK_BLOCK + 1;
620 :     # Denote that the buffer is now empty.
621 :     $packObject->{bufferLen} = 0;
622 :     $packObject->{buffer} = '';
623 :     $room = $PACK_BLOCK;
624 :     }
625 :     # Put the remaining text into the buffer.
626 :     $packObject->{buffer} .= $residual;
627 :     $packObject->{bufferLen} += $len;
628 :     }
629 :     }
630 :    
631 :     =head3 WriteEndMark
632 :    
633 : parrello 1.7 my $outCount = WriteEndMark($packObject);
634 : parrello 1.1
635 :     Write out the last block of a compressed data stream and return the total number of bytes
636 :     in the stream.
637 :    
638 :     =cut
639 :    
640 :     sub WriteEndMark {
641 :     my ($packObject) = @_;
642 :     # Get the number of bytes in the buffer.
643 :     my $len = $packObject->{bufferLen};
644 :     # Format the prefix.
645 :     my $prefix = "$len";
646 :     while (length($prefix) < 4) {
647 :     $prefix = "0$prefix";
648 :     }
649 :     # Insert the checksum.
650 :     $prefix .= $packObject->{digest}->hexdigest;
651 :     # Write the prefix and the buffer content.
652 :     print PACKFILE "e$prefix" . $packObject->{buffer};
653 :     # Compute the total length output.
654 :     my $retVal = $len + 37 + $packObject->{outCount};
655 :     # Add it to the package output counter.
656 :     $packObject->{fileSize} += $retVal;
657 :     # Return the total length output.
658 :     return $retVal;
659 :     }
660 :    
661 :     =head3 PackTree
662 :    
663 : parrello 1.7 PackTree($directory, $packageFileName);
664 : parrello 1.1
665 :     Package all files in a directory tree into a package file.
666 :    
667 :     The C<extensions.txt> file in the root of the directory tree is used to determine
668 :     which file extensions indicate binary files.
669 :    
670 :     =cut
671 :    
672 :     sub PackTree {
673 :     my ($directory, $packageFileName) = @_;
674 :     # Get the extensions hash. This is used to separate text files
675 :     # from binary files.
676 :     my %extHash = GetExtensions($directory);
677 :     # Set up to output to the package files.
678 :     my $packObject = StartPackaging($packageFileName);
679 :     # Create a directory object.
680 :     my $dirObject = OpenDirectory($directory);
681 :     # Create file type counters.
682 :     my %fileCounters = ( t => 0, b => 0 );
683 :     # Loop through the files in the tree, packaging each one.
684 :     while (my $fileName = GetFileName($dirObject)) {
685 :     # Decide whether this file is text or binary. We begin by separating
686 :     # the file name into its path and extension.
687 :     my ($path, $suffix) = NameParse($fileName);
688 :     # Check the extension hash for the suffix.
689 :     my $mode = (exists $extHash{$suffix} ? "b" : "t");
690 :     # Output the relative file name.
691 :     my $relName = $dirObject->{relName};
692 :     WritePackageLine($packObject, "$mode $relName");
693 :     # Update the appropriate file counter.
694 :     $fileCounters{$mode}++;
695 :     # Now we compress the file.
696 :     PackFile($fileName, $packObject, $mode);
697 :     }
698 :     # Close the package output stream.
699 :     FinishPackaging($packObject);
700 :     # Tell the user how much package file we have.
701 :     print $packObject->{fileSize} . " bytes written to package file.\n";
702 :     # Display the text/binary split.
703 :     print "$fileCounters{t} text files, $fileCounters{b} binary files compressed.\n";
704 :     # Write the summary of files processed.
705 :     print InSummary($dirObject);
706 :     }
707 :    
708 :     =head3 PackFile
709 :    
710 : parrello 1.7 PackFile($fileName, $packObject, $mode);
711 : parrello 1.1
712 :     Compress the specified file to the specified package file in the specified mode.
713 :    
714 :     =cut
715 :    
716 :     sub PackFile {
717 :     my ($fileName, $packObject, $mode) = @_;
718 :     print "Packaging $fileName ($mode).\n";
719 :     # Create a compression object.
720 :     my $compTable = deflateInit();
721 :     # Initialize for reading the input file.
722 :     StartFile($mode, $fileName);
723 :     # Initialize for writing the output.
724 :     StartCompressedData($packObject, $compTable);
725 :     # Loop through the input.
726 :     while (my $chars = GetCharacters($mode)) {
727 :     # Add the data to the checksum.
728 :     $packObject->{digest}->add($chars);
729 :     # Compress this data.
730 :     my ($output, $status) = $compTable->deflate($chars);
731 :     # Write it to the output.
732 :     WriteCompressedData($packObject, $output, $status);
733 :     }
734 :     # Flush out any remaining data.
735 :     my ($output, $status) = $compTable->flush();
736 :     WriteCompressedData($packObject, $output, $status);
737 :     # Terminate the file.
738 :     my $outCount = WriteEndMark($packObject);
739 :     # Display the compression ratio.
740 :     my $fileSize = $compTable->total_in();
741 :     my $ratio = ($fileSize ? int($outCount * 100 / $fileSize + 0.5) : 0);
742 :     print "$fileSize characters in, $outCount characters out ($ratio%)\n";
743 :     }
744 :    
745 :     =head3 UnPackTree
746 :    
747 : parrello 1.7 UnPackTree($directory, $packageFile);
748 : parrello 1.1
749 :     Read file data from a package file and put it into a directory tree.
750 :    
751 :     The package file consists of a series of compressed files. The package file contains
752 :     three types of records. A text record begins with a space and is terminated by a
753 :     new-line character (C<\n>). A block record has a fixed length and begins with a C<b>.
754 :     An end record begins with an C<e> and a four-digit length, followed by data of that
755 :     length. A file contains a text record with the file type (C<t> for text and C<b> for
756 :     binary) and its name relative to the root of the tree, zero or more block records, and
757 :     one end record.
758 :    
759 :     To prevent file system compatibility problems, the name is automatically cleaned
760 : parrello 1.4 before the file is created: all spaces and colons are converted to underscores,
761 :     and question marks are converted to the double letter C<QQ>.
762 : parrello 1.1
763 :     Text files are open in text mode and binary files in binary mode. As a result, the
764 :     line-end characters in a text file are automatically translated to the line-end
765 :     character of the target operating system.
766 :    
767 :     =cut
768 :    
769 :     sub UnPackTree {
770 :     my ($directory, $packageFile) = @_;
771 :     # Open the package file for input.
772 :     my $packObject = StartUnPacking($packageFile);
773 :     # Create some file counters.
774 :     my %fileCounters = ( b => 0, t => 0 );
775 :     # Loop through the package one file at a time.
776 :     while (my $fileData = GetTextRecord($packObject)) {
777 :     # Extract the file name and mode.
778 :     my $mode = substr($fileData,0,1);
779 :     my $fileName = substr($fileData,2);
780 :     print "Unpacking $fileName ($mode).\n";
781 :     # Create the decompression object.
782 :     my $compObject = inflateInit();
783 :     # Create the output file.
784 :     my $fileObject = CreateFile($fileName, $mode, $directory, $compObject);
785 :     # Loop through the package file, de-compressing the data.
786 :     while (my $block = GetBlock($packObject)) {
787 :     # Decompress the current block.
788 :     my ($data, $status) = $compObject->inflate($block);
789 :     # Write the decompressed data.
790 :     WriteData($fileObject, $data, $status);
791 :     }
792 :     # Verify the checksums.
793 :     my ($expected, $found) = ($packObject->{digest}, $fileObject->{digest}->hexdigest);
794 :     if ($expected ne $found) {
795 :     # Here we have a mismatch.
796 :     die "Invalid checksum found unpacking file $fileName: expected $expected, found $found.";
797 :     }
798 :     # Count this file.
799 :     $fileCounters{$mode}++;
800 :     # Close it.
801 :     close CLEARFILE;
802 :     }
803 :     # Close the package file.
804 :     close PACKFILE;
805 :     # Tell the user what we did.
806 :     print "$fileCounters{t} text files unpacked, $fileCounters{b} binary files unpacked.\n";
807 :     print $packObject->{inCounter} . " package file bytes read.\n";
808 :     }
809 :    
810 :     =head3 StartUnPacking
811 :    
812 : parrello 1.7 my $packObject = StartUnPacking($packageFile);
813 : parrello 1.1
814 :     Prepare to read from a package file.
815 :    
816 :     =cut
817 :    
818 :     sub StartUnPacking {
819 :     my ($packageFile) = @_;
820 :     # Open the package file for binary input.
821 :     (sysopen PACKFILE, $packageFile, O_RDONLY | O_BINARY) ||
822 :     die "Could not open $packageFile: $!";
823 :     # Return the package file input object.
824 :     return {
825 :     blockEnd => 0,
826 :     inCounter => 0
827 :     }
828 :     }
829 :    
830 :     =head3 GetTextRecord
831 :    
832 : parrello 1.7 my $data = GetTextRecord($packObject);
833 : parrello 1.1
834 :     Read a text record from the package file. The text record begins with a space and
835 :     ends with a new-line character.
836 :    
837 :     =cut
838 :    
839 :     sub GetTextRecord {
840 :     my ($packObject) = @_;
841 :     # Declare the return variable.
842 :     my $retVal;
843 :     # Read the next character.
844 :     my $buffer;
845 :     my $rv = read PACKFILE, $buffer, 1;
846 :     # Check to insure we read a space.
847 :     if (! defined $rv) {
848 :     # Here the read failed.
849 :     die "Error reading package file: $!";
850 :     } elsif ($rv == 0) {
851 :     # Here we've reached end-of-file. We'll return an undefined value
852 :     # to the caller.
853 :     } elsif ($buffer ne " ") {
854 :     # Here we didn't find a space: the file is corrupt.
855 :     die "Text record not found when expected in package file.";
856 :     } else {
857 :     # Here we have a real, live text record. We loop until we find a new-line.
858 :     $retVal = "";
859 :     $rv = read PACKFILE, $buffer, 1;
860 :     while (defined $rv && $rv == 1 && $buffer ne "\n") {
861 :     $retVal .= $buffer;
862 :     $rv = read PACKFILE, $buffer, 1;
863 :     }
864 :     # Check for error conditions.
865 :     if (! defined $rv) {
866 :     die "Error reading package file: $!";
867 :     } elsif ($rv == 0) {
868 :     die "End-of-file inside text record in package file.";
869 :     }
870 :     # Update the input counter.
871 :     $packObject->{inCounter} += length($retVal) + 2;
872 :     # Denote we haven't read an end block.
873 :     $packObject->{blockEnd} = 0;
874 :     }
875 :     return $retVal;
876 :     }
877 :    
878 :     =head3 GetBlock
879 :    
880 : parrello 1.7 my $block = GetBlock($packObject);
881 : parrello 1.1
882 :     Read a block of compressed data from the package file. If the end of the compressed data
883 :     has been reached, will return an undefined value.
884 :    
885 :     =cut
886 :    
887 :     sub GetBlock {
888 :     my ($packObject) = @_;
889 :     # Declare the return value.
890 :     my $retVal;
891 :     # Only proceed if we have NOT reached the end block.
892 :     if (! $packObject->{blockEnd}) {
893 :     # Read the indicator character.
894 :     my $buffer;
895 :     my $rv = read PACKFILE, $buffer, 1;
896 :     # Check for errors.
897 :     if (! defined $rv) {
898 :     die "Error reading package block mark: $!";
899 :     } elsif ($rv == 0) {
900 :     die "End-of-file read where block mark expected in package file.";
901 :     } elsif ($buffer eq 'b') {
902 :     # Here we have a normal block. We read it directly into the
903 :     # return variable.
904 :     $rv = read PACKFILE, $retVal, $PACK_BLOCK;
905 :     # Check for errors.
906 :     if (! defined $rv) {
907 :     die "Error reading package block data: $!";
908 :     } elsif ($rv < $PACK_BLOCK) {
909 :     die "Unexpected end-of-file inside package file block.";
910 :     } else {
911 :     # Record the bytes read.
912 :     $packObject->{inCounter} += $PACK_BLOCK + 1;
913 :     }
914 :     } elsif ($buffer eq 'e') {
915 :     # Here we have an end block. We need to read the block length
916 :     # and the checksum.
917 :     $rv = read PACKFILE, $buffer, 36;
918 :     # Check for errors.
919 :     if (! defined $rv) {
920 :     die "Error reading end block length: $!";
921 :     } elsif ($rv < 36) {
922 :     die "Unexpected end-of-file reading end block length and checksum.";
923 :     } else {
924 :     # Get the length and the checksum.
925 :     my $lengthCode = substr $buffer, 0, 4;
926 :     $packObject->{digest} = substr $buffer, 4, 32;
927 :     # Check for an empty block.
928 :     if ($lengthCode == 0) {
929 :     # Here the end block is empty. We will return an undefined value,
930 :     # indicating end-of-file. Record the 36 bytes read.
931 :     $packObject->{inCounter} += 36;
932 :     } else {
933 :     # Here we have a valid block length, so we read the block.
934 :     $rv = read PACKFILE, $retVal, $lengthCode;
935 :     # Check for errors.
936 :     if (! defined $rv) {
937 :     die "Error reading end block: $!";
938 :     } elsif ($rv < $lengthCode) {
939 :     die "Unexpected end-of-file inside end block.";
940 :     } else {
941 :     # Denote the next read will return end-of-file.
942 :     $packObject->{blockEnd} = 1;
943 :     # Record the bytes read.
944 :     $packObject->{inCounter} += $lengthCode + 37;
945 :     }
946 :     }
947 :     }
948 :     } else {
949 :     # Here we have an invalid block mark.
950 :     die "Invalid block mark read in package file.";
951 :     }
952 :     }
953 :     return $retVal;
954 :     }
955 :    
956 :     =head3 CreateFile
957 :    
958 : parrello 1.7 my $fileObject = CreateFile($fileName, $mode, $directory, $compObject);
959 : parrello 1.1
960 :     Create a file into which packaged data can be unpacked.
961 :    
962 :     =cut
963 :    
964 :     sub CreateFile {
965 :     my ($fileName, $mode, $directory, $compObject) = @_;
966 : parrello 1.4 # Clean the file name.
967 : parrello 1.1 $fileName =~ tr/: /__/;
968 : parrello 1.4 $fileName =~ s/\?/QQ/g;
969 : parrello 1.1 # Add the file name to the directory to get the fully-qualified file name.
970 :     my $fullName = "$directory/$fileName";
971 :     # Extract the path.
972 :     my ($path, $suffix) = NameParse($fullName);
973 :     # Insure the path exists.
974 :     if (! -d $path) {
975 :     mkpath($path);
976 :     }
977 :     # Check for the directory problem.
978 :     if (-d $fullName) {
979 :     # Here a directory has the same name as the file. We append a suffix
980 :     # to work around the problem.
981 :     $fullName .= ".$mode";
982 :     }
983 :     # Open the file in the appropriate mode.
984 :     if ($mode eq 't') {
985 :     (open CLEARFILE, ">$fullName") ||
986 :     die "Could not open text file $fullName: $!";
987 :     } else {
988 :     (sysopen CLEARFILE, $fullName, O_WRONLY | O_CREAT | O_BINARY) ||
989 :     die "Could not open binary file $fullName: $!";
990 :     }
991 :     # Create a file manipulation object to return.
992 :     return {
993 :     mode => $mode,
994 :     compObject => $compObject,
995 :     digest => Digest::MD5->new()
996 :     };
997 :     }
998 :    
999 :     =head3 WriteData
1000 :    
1001 : parrello 1.7 WriteData($fileObject, $data, $status);
1002 : parrello 1.1
1003 :     Write a block of data to the output file.
1004 :    
1005 :     =cut
1006 :    
1007 :     sub WriteData {
1008 :     my ($fileObject, $data, $status) = @_;
1009 :     # Check the status code from the compressor.
1010 :     if ($status != Z_OK && $status != Z_STREAM_END) {
1011 :     die "Error in decompression: " . $fileObject->{compObject}->msg();
1012 :     } else {
1013 :     # Here we're okay, so we can write the data.
1014 :     print CLEARFILE $data;
1015 :     # Add it to the checksum.
1016 :     $fileObject->{digest}->add($data);
1017 :     }
1018 :     }
1019 :    
1020 : golsen 1.6 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3