[Bio] / Sprout / AttrDBRefresh.pl Repository:
ViewVC logotype

Annotation of /Sprout/AttrDBRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.26 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 AttrDBRefresh
4 :    
5 : parrello 1.5 This script performs useful function on the custom attributes database.
6 : parrello 1.1
7 :     The currently-supported command-line options are as follows.
8 :    
9 :     =over 4
10 :    
11 :     =item user
12 :    
13 :     Name suffix to be used for log files. If omitted, the PID is used.
14 :    
15 :     =item trace
16 :    
17 :     Numeric trace level. A higher trace level causes more messages to appear. The
18 : parrello 1.2 default trace level is C<3>.
19 : parrello 1.1
20 :     =item sql
21 :    
22 :     If specified, turns on tracing of SQL activity.
23 :    
24 :     =item background
25 :    
26 :     Save the standard and error output to files. The files will be created
27 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
28 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
29 :     B<user> option above.
30 :    
31 :     =item h
32 :    
33 :     Display this command's parameters and options.
34 :    
35 :     =item phone
36 :    
37 :     Phone number to message when the script is complete.
38 :    
39 : parrello 1.2 =item migrate
40 :    
41 : parrello 1.10 If specified, a comma-delimited list of attributes to be migrated from the old system to the
42 :     new one. The attributes will be erased before migration.
43 : parrello 1.2
44 : parrello 1.4 =item initializeAndClear
45 : parrello 1.2
46 : parrello 1.4 If specified, then the tables in the attribute database are dropped and re-created.
47 : parrello 1.2
48 : parrello 1.22 =item replace
49 : parrello 1.12
50 : parrello 1.22 If specified, existing keys will be erased before loading the attribute data. This
51 :     option only makes sense if C<load> is specified.
52 : parrello 1.12
53 : parrello 1.5 =item load
54 :    
55 :     If specified, the name of a file containing attribute data to be loaded into the
56 :     system. The file is presumed to be tab-delimited. The first column must be the
57 :     object ID, the second the attribute key name, and the remaining columns the
58 : parrello 1.22 attribute values. Existing attributes will be unchanged unless the C<replace>
59 :     option is specified.
60 : parrello 1.5
61 : parrello 1.24 =item loadKeys
62 : parrello 1.7
63 :     If specified, the name of a tab-delimited file containing attribute key data. For each key,
64 : parrello 1.24 there is a pair of lines. The first line contains the ID, value table name, and
65 :     description of the key. The second line contains the marker C<#GROUPS> followed by zero or
66 :     more group names. The attribute will be connected to all the specified groups.
67 : parrello 1.7
68 : parrello 1.5 =item backup
69 :    
70 :     If specified, the name of a file into which all the attribute data should be
71 : parrello 1.7 dumped. The file itself will receive the attribute data in the format expected
72 : parrello 1.24 by C<load>.
73 :    
74 :     =item backupKeys
75 :    
76 :     If specified, the name of a file into which all the attribute key data should be
77 :     dumped. The file will receive the attribute key data in the format expected by C<loadKey>.
78 : parrello 1.5
79 : parrello 1.10 =item summary
80 :    
81 :     If specified, the name of a file to contain a summary report. The summary report
82 :     contains a table of the attribute keys and the number of values of each. The report
83 :     is formatted as a web page.
84 :    
85 :     =item trimSpaces
86 :    
87 :     If specified, the name of an attribute value backup file. The file will be processed to
88 :     remove excess spaces. The fixed file will have the same name as the incoming backup
89 :     file with the extension <.fixed>. This new file can then be reloaded using the
90 :     C<load> option.
91 :    
92 : parrello 1.13 =item showKeyDef
93 :    
94 :     If specified, the name of an attribute key. The key's descriptive data will be displayed.
95 :    
96 : parrello 1.14 =item mapSubkey
97 :    
98 :     Subkey mapping rule. Consists of a key name, followed by a substitution command enclosed in
99 :     slashes. For example, to remove the word C<ZINC> from the beginning of C<docking_results>
100 :     subkeys, you would code
101 :    
102 :     -mapSubkey=docking_results/^ZINC//
103 :    
104 : parrello 1.17 =item mapObjectID
105 :    
106 :     Object ID mapping rule. Consists of a key name, followed by a substitution command enclosed in
107 :     slashes. For example, to add the prefix C<fig|> to all the object IDs for the C<PRODOM> key,
108 :     you would code
109 :    
110 :     -mapObjectID=PRODOM/^/fig\|/
111 :    
112 : parrello 1.15 =item dockClean
113 :    
114 :     If specified, Predicted docking results will be removed from the attribute database.
115 :    
116 : parrello 1.25 =item dupClean
117 :    
118 :     If specified, duplicate attribute values will be removed from the database. The
119 :     parameter should be an attribute key. All attribute keys whose names are greater than
120 :     or equal to the specified value will be processed. (This is to allow restarting.)
121 :    
122 : parrello 1.19 =item resume
123 :    
124 : parrello 1.21 If specified, key-value pairs already in the database will not be reinserted.
125 :     Specify a number to start checking after the specified number of lines and
126 :     then admit everything after the first line not yet loaded. Specify C<careful>
127 :     to check every single line. Specify C<none> to ignore this option. The default
128 :     is C<none>. So, if you believe that a previous load failed somewhere after 50000
129 :     lines, a resume value of C<50000> would skip 50000 lines in the file, then
130 :     check each line after that until it finds one not already in the database. The
131 :     first such line found and all lines after that will be loaded. On the other
132 :     hand, if you have a file of 100000 records, and some have been loaded and some
133 :     not, you would use the word C<careful>, so that every line would be checked before
134 :     it is inserted. A resume of C<0> will start checking the first line of the
135 :     input file and then begin loading once it finds a line not in the database.
136 :    
137 :     =item chunkSize
138 :    
139 :     Number of lines to load in each burst. The default is 10,000. This option
140 :     is only used if C<load> is specified.
141 :    
142 :     =item mode
143 :    
144 :     C<concurrent> to use concurrent loading in MySQL or C<low_priority> to use
145 : parrello 1.23 low-priority loading in MySQL. If C<normal>, normal loading will be used. The
146 :     default is C<concurrent>. This option is only used if C<load> is specified.
147 : parrello 1.19
148 : parrello 1.1 =back
149 :    
150 :     =cut
151 :    
152 :     use strict;
153 :     use Tracer;
154 :     use Cwd;
155 :     use File::Copy;
156 :     use File::Path;
157 :     use CustomAttributes;
158 : parrello 1.5 use ERDBLoad;
159 : parrello 1.14 use Stats;
160 : parrello 1.26 use CGI qw(-nosticky);
161 : parrello 1.1
162 :     # Get the command-line options and parameters.
163 : parrello 1.25 my ($options, @parameters) = StandardSetup([qw(CustomAttributes DBKernel) ],
164 : parrello 1.1 {
165 : parrello 1.2 trace => [3, "trace level"],
166 : parrello 1.4 initializeAndClear => ["", "if specified, the tables of the attribute database will be re-created"],
167 : parrello 1.1 phone => ["", "phone number (international format) to call when load finishes"],
168 : parrello 1.5 load => ["", "file from which to load attribute data"],
169 : parrello 1.24 loadKeys => ["", "file from which to load attribute key data"],
170 : parrello 1.6 backup => ["", "file to which attribute data should be dumped"],
171 : parrello 1.24 backupKeys => ["", "file to which attribute key data should be dumped"],
172 : parrello 1.10 compare => ["", "name of a file into which a comparison report will be written"],
173 :     summary => ["", "name of a file into which a summary report will be written"],
174 :     trimSpaces => ["", "if specified, the name of a backup file, which will be processed to remove excess spaces"],
175 : parrello 1.22 replace => ["", "if specified, data will be erased before loading from the load file"],
176 : parrello 1.13 showKeyDef => ["", "if specified, the name of a key whose descriptive data is to be displayed"],
177 : parrello 1.14 mapSubkey => ["", "instructions for fixing subkey values"],
178 : parrello 1.17 mapObjectID => ["", "instructions for fixing object ID values"],
179 : parrello 1.15 dockClean => ["", "if specified, Predicted docking results will be removed from the database"],
180 : parrello 1.21 resume => ["", "if specified, key-value pairs already in the database will not be inserted when loading from the load file"],
181 : parrello 1.23 mode => ["concurrent", "MySQL load mode to use"],
182 : parrello 1.25 chunksize => ["", "number of attributes to load in each burst"],
183 :     dupClean => ["", "clean duplicate attributes"]
184 : parrello 1.1 },
185 :     "",
186 :     @ARGV);
187 :     # Set a variable to contain return type information.
188 :     my $rtype;
189 : parrello 1.10 # Create a CGI object.
190 :     my $cgi = CGI->new();
191 : parrello 1.1 # Insure we catch errors.
192 :     eval {
193 : parrello 1.4 # Get the attribute database.
194 : parrello 1.14 Trace("Connecting to local attribute database.") if T(2);
195 : parrello 1.4 my $ca = CustomAttributes->new();
196 :     # Process according to the options selected.
197 : parrello 1.7 if ($options->{backup}) {
198 :     # Back up the attributes to the specified file.
199 :     my $backupFileName = $options->{backup};
200 :     Trace("Backing up attribute data.") if T(2);
201 :     my $stats = $ca->BackupAllAttributes($backupFileName);
202 :     Trace("Attribute backup statistics:\n" . $stats->Show()) if T(2);
203 : parrello 1.24 }
204 :     if ($options->{backupKeys}) {
205 :     # Back up the attribute key data to the specified file.
206 : parrello 1.7 Trace("Backing up key data.") if T(2);
207 : parrello 1.24 my $backupFileName = $options->{backupKeys};
208 :     my $stats = $ca->BackupKeys($backupFileName);
209 : parrello 1.7 Trace("Key backup statistics:\n" . $stats->Show()) if T(2);
210 : parrello 1.5 }
211 : parrello 1.4 if ($options->{initializeAndClear}) {
212 :     # Create the tables.
213 :     $ca->CreateTables();
214 :     Trace("Tables recreated.") if T(2);
215 : parrello 1.2 }
216 : parrello 1.25 if ($options->{dupClean}) {
217 :     # Clean out duplicates. Determine the point at which we should start.
218 :     # The default is at the beginning of the key list.
219 :     my $startPoint = " ";
220 :     # If the user specified a start value, start from there. An unspecified
221 :     # value defaults to 1.
222 :     if ($options->{dupClean} ne "1") {
223 :     $startPoint = $options->{dupClean};
224 :     }
225 :     CleanDuplicates($ca, $startPoint);
226 :     }
227 : parrello 1.10 if ($options->{trimSpaces}) {
228 :     # Here we need to remove unnecessary spaces from an attribute values backup
229 :     # file. First, we open the input backup file.
230 :     my $fileName = $options->{trimSpaces};
231 :     my $ih = Open(undef, "<$fileName");
232 :     # Now we open the proposed output file.
233 :     my $oh = Open(undef, ">$fileName.fixed");
234 : parrello 1.11 # Create a statistics object to track our progress.
235 :     my $stats = Stats->new('lines', 'trims');
236 : parrello 1.10 Trace("Cleaning $fileName and copying to $fileName.fixed.") if T(2);
237 :     # Loop through the input file.
238 :     while (! eof $ih) {
239 :     # Get the next record in the input file.
240 : parrello 1.11 $stats->Add(lines => 1);
241 : parrello 1.10 my ($id, $key, @values) = Tracer::GetLine($ih);
242 :     # Trim the values.
243 : parrello 1.11 for my $value (@values) {
244 :     if ($value =~ /(\S.+\S)\s+/) {
245 :     $value = $1;
246 :     $stats->Add(trims => 1);
247 :     }
248 :     }
249 : parrello 1.10 # Write the result to the output file.
250 :     Tracer::PutLine($oh, [$id, $key, @values]);
251 :     }
252 :     # Close the files.
253 :     close $ih;
254 :     close $oh;
255 : parrello 1.11 Trace("$fileName.fixed is now a cleaned backup.\n" . $stats->Show()) if T(2);
256 : parrello 1.10 }
257 : parrello 1.24 if ($options->{loadKeys}) {
258 : parrello 1.7 # We want to load the attribute data from the specified file, but
259 :     # first we need to verify that the file exists.
260 : parrello 1.24 my $loadFileName = $options->{loadKeys};
261 : parrello 1.7 if (! -f $loadFileName) {
262 :     Confess("Cannot load keys: file \"$loadFileName\" is not found or not a file.");
263 :     } else {
264 :     Trace("Loading key data from $loadFileName.") if T(2);
265 :     my $stats = $ca->RestoreKeys($loadFileName);
266 :     Trace("Load statistics:\n" . $stats->Show()) if T(2);
267 :     }
268 :     }
269 : parrello 1.13 if ($options->{showKeyDef}) {
270 :     # We want to display the identified key's description. Get the key name.
271 :     my $name = $options->{showKeyDef};
272 :     # Look for keys with the same name.
273 :     my %keys = $ca->GetAttributeData('find', $options->{showKeyDef});
274 :     # See if we found the key.
275 :     if (! $keys{$name}) {
276 :     print "Key $name not found.\n";
277 :     } else {
278 :     print "Description for $name.\n\n";
279 :     print $keys{$name}->[1];
280 :     print "\n\n";
281 :     }
282 :     }
283 : parrello 1.5 if ($options->{load}) {
284 :     # We want to load the attribute data from the specified file, but
285 :     # first we need to verify that the file exists.
286 :     my $loadFileName = $options->{load};
287 :     if (! -f $loadFileName) {
288 :     Confess("Cannot load: file \"$loadFileName\" is not found or not a file.");
289 :     } else {
290 : parrello 1.19 # Set up options. We may need to specify the append and resume options, and
291 :     # we need to archive.
292 : parrello 1.12 my %loadOptions;
293 : parrello 1.22 if ($options->{replace}) {
294 :     $loadOptions{append} = 0;
295 :     } else {
296 : parrello 1.12 $loadOptions{append} = 1;
297 :     }
298 : parrello 1.19 if ($options->{resume}) {
299 :     $loadOptions{resume} = 1;
300 :     }
301 : parrello 1.21 if ($options->{mode}) {
302 :     $loadOptions{mode} = $options->{mode};
303 :     }
304 : parrello 1.19 # Insure we have the archive directory available.
305 :     my $archiveDirectory = "$FIG_Config::fig/AttribData";
306 :     Tracer::Insure($archiveDirectory, 0777);
307 :     # Create an archive file name from the current time and the PID.
308 :     my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time());
309 :     $loadOptions{archive} = "$archiveDirectory/attrSave$$.$mon.$mday.$year.$hour.$min.$sec.log";
310 : parrello 1.12 # Now we can load.
311 : parrello 1.5 Trace("Loading attribute data from $loadFileName.") if T(2);
312 : parrello 1.12 my $stats = $ca->LoadAttributesFrom($loadFileName, %loadOptions);
313 : parrello 1.5 Trace("Load statistics:\n" . $stats->Show()) if T(2);
314 :     }
315 :     }
316 : parrello 1.10 if ($options->{summary}) {
317 :     # Here we have a summary report. The value of the option is the name of a file that is to
318 :     # contain an html-formatted report. We start by getting a complete list of the keys and
319 :     # the associated counts.
320 :     my $keyCounts = GetAllKeys($ca);
321 :     # Buffer the lines in the following list.
322 :     my @lines = ();
323 :     # Start the table.
324 : parrello 1.26 push @lines, CGI::start_table({border => 2});
325 :     push @lines, CGI::Tr(CGI::th({align => 'left'}, 'Key Name'), CGI::th({align => 'right'}, 'Values'));
326 : parrello 1.10 # Loop through the key hash, building row data.
327 :     for my $key (sort keys %{$keyCounts}) {
328 : parrello 1.26 push @lines, CGI::Tr(CGI::td({align => 'left'}, $key), CGI::td({align => 'right'}, $keyCounts->{$key}));
329 : parrello 1.10 }
330 :     # Close off the table.
331 : parrello 1.26 push @lines, CGI::end_table();
332 : parrello 1.10 # Output the page.
333 :     WritePage($options->{summary}, 'Attribute Key Summary', \@lines);
334 :     }
335 : parrello 1.17 if ($options->{mapSubkey} || $options->{mapObjectID}) {
336 : parrello 1.14 # Parse out the main key.
337 : parrello 1.17 my $type = ($options->{mapSubkey} ? 'mapSubkey' : 'mapObjectID');
338 :     my $mapThing = $options->{$type};
339 :     my $field = ($options->{mapSubkey} ? 'subkey' : 'to-link');
340 :     if ($mapThing =~ m#([^/]+)(/.+)#) {
341 : parrello 1.14 my ($keyName, $pattern) = ($1, $2);
342 : parrello 1.17 Trace("Processing $type mapping for $keyName with pattern s$pattern.") if T(2);
343 : parrello 1.14 # Create a statistics object.
344 :     my $results = Stats->new();
345 : parrello 1.17 # Get all the field values.
346 :     my %things = map { $_ => 1 } $ca->GetFlat(['HasValueFor'], "HasValueFor(from-link) = ?", [$keyName],
347 :     "HasValueFor($field)");
348 :     my $totalThings = scalar keys %things;
349 :     Trace("$totalThings ${field}s found.") if T(2);
350 :     # Loop through the values, doing updates where necessary.
351 :     for my $thing (keys %things) {
352 : parrello 1.14 # Count this subkey.
353 : parrello 1.17 my $thingCount = $results->Add("${field}s" => 1);
354 : parrello 1.14 # Apply the substitution.
355 : parrello 1.17 my $newThing = $thing;
356 :     eval("\$newThing =~ s$pattern");
357 : parrello 1.14 # If the evaluation resulted in an error, stop immediately.
358 :     if ($@) {
359 :     Confess("Error in substitution pattern: $@");
360 : parrello 1.17 } elsif ($newThing ne $thing) {
361 : parrello 1.14 # Here the substitution worked and it changed the key value.
362 :     # We need to update the database.
363 :     $results->Add(updates => 1);
364 : parrello 1.17 my $count = $ca->UpdateField("HasValueFor($field)", $thing, $newThing,
365 : parrello 1.14 "HasValueFor(from-link) = ?", [$keyName]);
366 :     $results->Add(rowsChanged => $count);
367 :     }
368 :     # Trace our progress.
369 : parrello 1.17 if ($thingCount % 100 == 0) {
370 : parrello 1.18 Trace("$thingCount processed.") if T(3);
371 : parrello 1.14 }
372 :     }
373 :     # Display the statistics.
374 : parrello 1.17 Trace("Statistics from $field update:\n" . $results->Show()) if T(2);
375 : parrello 1.14 } else {
376 :     # Here the incoming parameter was in the wrong format. Mostly this means there
377 :     # was nothing before the slash or no slash was found.
378 : parrello 1.17 Confess("Invalid substitution syntax in map option.");
379 : parrello 1.14 }
380 :     }
381 : parrello 1.15 if ($options->{dockClean}) {
382 :     # Get the list of PDBs with results.
383 :     my @pdbList = sort map { $_->[0] } $ca->GetAttributes(undef, 'has_results');
384 :     # Loop through the PDB IDs.
385 :     for my $pdbID (@pdbList) {
386 :     Trace("Processing $pdbID.") if T(3);
387 :     # Loop until we run out of rows to delete.
388 :     my $thisCount = 1;
389 :     my $totalCount = 0;
390 :     while ($thisCount) {
391 :     # Delete a bunch of rows. To avoid a timeout, we limit the results.
392 : parrello 1.16 $thisCount = $ca->DeleteLike('HasValueFor', 'HasValueFor(to-link) = ? AND HasValueFor(value) LIKE ? LIMIT 10000',
393 : parrello 1.15 [$pdbID, '%Predicted']);
394 :     $totalCount += $thisCount;
395 :     Trace("$thisCount rows deleted in batch. $totalCount total deletions for pdb $pdbID.") if T(3);
396 :     }
397 :     }
398 :     }
399 : parrello 1.2 Trace("Processing complete.") if T(2);
400 : parrello 1.1 };
401 :     if ($@) {
402 :     Trace("Script failed with error: $@") if T(0);
403 :     $rtype = "error";
404 :     } else {
405 :     Trace("Script complete.") if T(2);
406 :     $rtype = "no error";
407 :     }
408 :     if ($options->{phone}) {
409 :     my $msgID = Tracer::SendSMS($options->{phone}, "RefreshAttrDB terminated with $rtype.");
410 :     if ($msgID) {
411 :     Trace("Phone message sent with ID $msgID.") if T(2);
412 :     } else {
413 :     Trace("Phone message not sent.") if T(2);
414 :     }
415 :     }
416 :    
417 : parrello 1.6
418 : parrello 1.10 =head3 GetAllKeys
419 :    
420 : parrello 1.20 my @keys = GetAllKeys($ca);
421 : parrello 1.10
422 :     Return a sorted list of the attribute keys.
423 :    
424 :     =over 4
425 :    
426 :     =item ca
427 :    
428 :     CustomAttributes object used to access the database.
429 :    
430 :     =item RETURN
431 :    
432 :     Returns a sorted list of all the attribute keys.
433 :    
434 :     =back
435 :    
436 :     =cut
437 :    
438 :     sub GetAllKeys {
439 :     # Get the parameters.
440 :     my ($ca) = @_;
441 :     # Get the attribute data.
442 :     my %keyData = $ca->GetAttributeData('name', '');
443 :     # Sort the keys.
444 :     my @retVal = sort keys %keyData;
445 :     # Return the result.
446 :     return @retVal;
447 :     }
448 :    
449 :     =head3 OpenPage
450 :    
451 : parrello 1.20 my $fh = OpenPage($fileName, $title);
452 : parrello 1.10
453 :     Start writing an HTML page to a file and return the file handle.
454 :    
455 :     =over 4
456 :    
457 :     =item fileName
458 :    
459 :     Name of the file to which the page will be written.
460 :    
461 :     =item title
462 :    
463 :     Title for the page.
464 :    
465 :     =item RETURN
466 :    
467 :     Returns the file handle for writing the rest of the page.
468 :    
469 :     =back
470 :    
471 :     =cut
472 :    
473 :     sub OpenPage {
474 :     # Get the parameters.
475 :     my ($fileName, $title) = @_;
476 :     # Open the file.
477 :     my $retVal = Open(undef, ">$fileName");
478 :     # Write out the HTML headers.
479 : parrello 1.26 print $retVal CGI::start_html(-title => $title, -BGCOLOR => 'silver');
480 : parrello 1.10 print $retVal "\n";
481 :     # Return the file handle.
482 :     return $retVal;
483 :     }
484 :    
485 :     =head3 GetNewKeyCounts
486 :    
487 : parrello 1.20 my %counts = GetNewKeyCounts($ca);
488 : parrello 1.10
489 :     Return a hash mapping attribute key names to counts.
490 :    
491 :     =over 4
492 :    
493 :     =item ca
494 :    
495 :     CustomAttributes object for accessing the attribute database.
496 :    
497 :     =item RETURN
498 :    
499 :     Returns a reference to a hash mapping each key name to a count of the key's values.
500 :    
501 :     =back
502 :    
503 :     =cut
504 :    
505 :     sub GetNewKeyCounts {
506 :     # Get the parameters.
507 :     my ($ca) = @_;
508 :     # Declare the return variable.
509 :     my $retVal = {};
510 :     # Get all of the keys.
511 :     my @keys = GetAllKeys($ca);
512 :     # Loop through the list, filling the hash.
513 :     for my $key (@keys) {
514 :     my $count = $ca->GetCount(['HasValueFor'], "HasValueFor(from-link) = ?", [$key]);
515 :     $retVal->{$key} = $count;
516 :     }
517 :     # Return the result.
518 :     return $retVal;
519 :     }
520 :    
521 :     =head3 WritePage
522 :    
523 : parrello 1.20 WritePage($fileName, $title, \@lines);
524 : parrello 1.10
525 :     Write the specified lines to the specified file as a web page. The lines are assumed to be raw
526 :     HTML body text. They will be preceded by a standard HTML header and followed by a standard
527 :     HTML footer.
528 :    
529 :     =over 4
530 :    
531 :     =item fileName
532 :    
533 :     Name of the output file.
534 :    
535 :     =item title
536 :    
537 :     Title for the web page.
538 :    
539 :     =item lines
540 :    
541 :     Reference to a list of lines of HTML.
542 :    
543 :     =back
544 :    
545 :     =cut
546 :    
547 :     sub WritePage {
548 :     # Get the parameters.
549 :     my ($fileName, $title, $lines) = @_;
550 :     # Open the file and write the header to it. The header includes everything up to and including
551 :     # the BODY tag.
552 :     Trace("Writing web page to $fileName.") if T(2);
553 :     my $oh = OpenPage($fileName, $title);
554 :     # Write the lines one at a time.
555 :     for my $line (@{$lines}) {
556 :     print $oh "$line\n";
557 :     }
558 :     # Write the HTML footer.
559 : parrello 1.26 print $oh CGI::end_html();
560 : parrello 1.10 # Close the output file.
561 :     close $oh;
562 :     Trace("Web page created in $fileName.") if T(2);
563 :     }
564 :    
565 : parrello 1.25 =head3 CleanDuplicates
566 :    
567 :     CleanDuplicates($ca, $startPoint);
568 :    
569 :     Remove duplicate attribute values from the attribute database, starting
570 :     with the specified key. This is a long, slow process. We look through all
571 :     the values for a particular key. If duplicate values are found, we delete
572 :     all the matching values and re-insert.
573 :    
574 :     =over 4
575 :    
576 :     =item ca
577 :    
578 :     [[CustomAttributesPm]] object for accessing the attribute database.
579 :    
580 :     =item startPoint
581 :    
582 :     Name of the first key to process. All keys that are lexically equal to or greater than this
583 :     value will be processed.
584 :    
585 :     =back
586 :    
587 :     =cut
588 :    
589 :     sub CleanDuplicates {
590 :     # Get the parameters.
591 :     my ($ca, $startPoint) = @_;
592 :     # Get a statistics object.
593 :     my $stats = Stats->new();
594 :     # Get the attribute keys we'll be wanting to process. For each key we get the
595 :     # key ID and the relevant relationship name.
596 :     my %keyList = map { $_->[0] => $_->[1] } $ca->GetAll(['AttributeKey'],
597 :     "AttributeKey(id) >= ? ORDER BY AttributeKey(id)",
598 :     [$startPoint],
599 :     ['AttributeKey(id)', 'AttributeKey(relationship-name)']);
600 :     # Form the actual keys into a sorted list. We do this so we can more easily trace the number of
601 :     # keys we have to process.
602 :     my @keys = sort keys %keyList;
603 :     my $n = scalar(@keys);
604 :     Trace("$n will be cleaned for duplicates.") if T(2);
605 :     # Loop through the keys.
606 :     for my $key (@keys) {
607 :     Trace("Processing key " . $stats->Add(keys => 1) . " of $n: $key.") if T(3);
608 :     # Get the key's table.
609 :     my $table = $keyList{$key};
610 :     # Now we will loop through the table's values in sequence, checking for duplicates.
611 :     # we will read the values in clumps, one clump for each target object ID. In general
612 :     # the clumps will be small, and we roll them into a hash to identify the duplicates.
613 :     # This next variable holds the current object ID.
614 :     my $objectID = "";
615 :     # This will be the hash used to check for duplicate values.
616 :     my %valueHash;
617 :     # Duplicates found will be put in this list.
618 :     my @dupList = ();
619 :     # Count the values for this key.
620 :     my $keyVals = 0;
621 :     # Now loop through all the entries for this key.
622 :     my $query = $ca->Get([$table], "$table(from-link) = ? ORDER BY $table(from-link), $table(to-link)",
623 :     [$key]);
624 :     while (my $value = $query->Fetch()) {
625 :     # Get the fields for this value.
626 :     my ($myID, $subKey, $value) = $value->Values(["$table(to-link)", "$table(subkey)",
627 :     "$table(value)"]);
628 :     # Count it.
629 :     Trace($stats->Ask('values') . " total values processed.") if $stats->Check(values => 500) && T(3);
630 :     $keyVals++;
631 :     # Is this a new clump?
632 :     if ($myID ne $objectID) {
633 :     # Yes it is. Clear the value hash and save the new object ID.
634 :     %valueHash = ();
635 :     $objectID = $myID;
636 :     $stats->Add(clumps => 1);
637 :     }
638 :     # Now determine if we have a duplicate.
639 :     my $valueKey = "$subKey::$value";
640 :     if (! $valueHash{$valueKey}) {
641 :     # No. Record it for future use.
642 :     $valueHash{$valueKey} = 1;
643 :     } else {
644 :     # Yes. Count it as a duplicate.
645 :     my $count = $valueHash{$valueKey}++;
646 :     $stats->Add(duplicates => 1);
647 :     # Is this our first time for it?
648 :     if ($count == 1) {
649 :     # Yes. Save it in the duplicates list.
650 :     push @dupList, [$key, $objectID, $subKey, $value];
651 :     }
652 :     }
653 :     }
654 :     Trace(scalar(@dupList) . " duplicates found for $key out of $keyVals.") if T(3);
655 :     # Now we've processed the key. Go through deleting and restoring the values found.
656 :     # This next variable contains the filter clause to use.
657 :     my $filter = "$table(from-link) = ? AND $table(to-link) = ? AND $table(subkey) = ? AND $table(value) = ?";
658 :     # This is a counter for tracing.
659 :     my $dupCount = 0;
660 :     # Loop through the duplicates.
661 :     for my $dup (@dupList) {
662 :     # Delete all copies of this duplicate.
663 :     my $count = $ca->DeleteLike($table => $filter, $dup);
664 :     $stats->Add(deleted => $count - 1);
665 :     # Put a single instance back in.
666 :     $ca->InsertObject($table, {'from-link' => $dup->[0], 'to-link' => $dup->[1], subkey => $dup->[2],
667 :     value => $dup->[3]});
668 :     # Count this.
669 :     $dupCount++;
670 :     Trace("$dupCount duplicates processed for $key.") if ($dupCount % 100 == 0) && T(3);
671 :     }
672 :     Trace("Key $key finished. $dupCount duplicates removed.") if T(3);
673 :     }
674 :     Trace("Processing complete:\n" . $stats->Show()) if T(2);
675 :     }
676 :    
677 :    
678 : parrello 1.20 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3