[Bio] / Sprout / AttrDBRefresh.pl Repository:
ViewVC logotype

Diff of /Sprout/AttrDBRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.24, Tue Aug 12 06:06:02 2008 UTC revision 1.25, Wed Sep 3 20:50:24 2008 UTC
# Line 120  Line 120 
120    
121  If specified, Predicted docking results will be removed from the attribute database.  If specified, Predicted docking results will be removed from the attribute database.
122    
123    =item dupClean
124    
125    If specified, duplicate attribute values will be removed from the database. The
126    parameter should be an attribute key. All attribute keys whose names are greater than
127    or equal to the specified value will be processed. (This is to allow restarting.)
128    
129  =item resume  =item resume
130    
131  If specified, key-value pairs already in the database will not be reinserted.  If specified, key-value pairs already in the database will not be reinserted.
# Line 161  Line 167 
167  use CGI;  use CGI;
168    
169  # Get the command-line options and parameters.  # Get the command-line options and parameters.
170  my ($options, @parameters) = StandardSetup([qw(CustomAttributes ERDB DBKernel) ],  my ($options, @parameters) = StandardSetup([qw(CustomAttributes DBKernel) ],
171                                             {                                             {
172                                                trace => [3, "trace level"],                                                trace => [3, "trace level"],
173                                                initializeAndClear => ["", "if specified, the tables of the attribute database will be re-created"],                                                initializeAndClear => ["", "if specified, the tables of the attribute database will be re-created"],
# Line 180  Line 186 
186                                                dockClean => ["", "if specified, Predicted docking results will be removed from the database"],                                                dockClean => ["", "if specified, Predicted docking results will be removed from the database"],
187                                                resume => ["", "if specified, key-value pairs already in the database will not be inserted when loading from the load file"],                                                resume => ["", "if specified, key-value pairs already in the database will not be inserted when loading from the load file"],
188                                                mode => ["concurrent", "MySQL load mode to use"],                                                mode => ["concurrent", "MySQL load mode to use"],
189                                                chunksize => ["", "number of attributes to load in each burst"]                                                chunksize => ["", "number of attributes to load in each burst"],
190                                                  dupClean => ["", "clean duplicate attributes"]
191                                             },                                             },
192                                             "",                                             "",
193                                             @ARGV);                                             @ARGV);
# Line 215  Line 222 
222          $ca->CreateTables();          $ca->CreateTables();
223          Trace("Tables recreated.") if T(2);          Trace("Tables recreated.") if T(2);
224      }      }
225        if ($options->{dupClean}) {
226            # Remove duplicate attribute values. By default the scan starts from a
227            # single space, which is meant to place it at the beginning of the key list.
228            my $startPoint = " ";
229            # Any option value other than "1" is used as the key name to start (or restart)
230            # from. "1" is presumably what a bare -dupClean yields -- TODO confirm.
231            if ($options->{dupClean} ne "1") {
232                $startPoint = $options->{dupClean};
233            }
234            CleanDuplicates($ca, $startPoint);
235        }
236      if ($options->{trimSpaces}) {      if ($options->{trimSpaces}) {
237          # Here we need to remove unnecessary spaces from an attribute values backup          # Here we need to remove unnecessary spaces from an attribute values backup
238          # file. First, we open the input backup file.          # file. First, we open the input backup file.
# Line 789  Line 807 
807      Trace("Web page created in $fileName.") if T(2);      Trace("Web page created in $fileName.") if T(2);
808  }  }
809    
=head3 CleanDuplicates

    CleanDuplicates($ca, $startPoint);

Remove duplicate attribute values from the attribute database, starting
with the specified key. This is a long, slow process. For each key we read
all of its values ordered by target object ID. Within each object, any
(subkey, value) pair that occurs more than once is recorded. After the key
has been scanned, every recorded pair is deleted in full and a single copy
is re-inserted.

Two rows count as duplicates only when they have the same key, the same
target object, the same subkey, B<and> the same value. Rows that share a
value but differ in subkey are left alone.

The delete and the re-insert are separate database calls; nothing in this
method wraps them in a transaction.

=over 4

=item ca

[[CustomAttributesPm]] object for accessing the attribute database.

=item startPoint

Name of the first key to process. All keys that are lexically equal to or greater than this
value will be processed.

=back

=cut

sub CleanDuplicates {
    # Get the parameters.
    my ($ca, $startPoint) = @_;
    # Get a statistics object.
    my $stats = Stats->new();
    # Get the attribute keys we'll be wanting to process. For each key we get the
    # key ID and the name of the relationship table that holds its values.
    my %keyList = map { $_->[0] => $_->[1] } $ca->GetAll(['AttributeKey'],
                                                         "AttributeKey(id) >= ? ORDER BY AttributeKey(id)",
                                                         [$startPoint],
                                                         ['AttributeKey(id)', 'AttributeKey(relationship-name)']);
    # Form the actual keys into a sorted list. We do this so we can more easily trace the number of
    # keys we have to process.
    my @keys = sort keys %keyList;
    my $n = scalar(@keys);
    Trace("$n will be cleaned for duplicates.") if T(2);
    # Loop through the keys.
    for my $key (@keys) {
        Trace("Processing key " . $stats->Add(keys => 1) . " of $n: $key.") if T(3);
        # Get the key's table.
        my $table = $keyList{$key};
        # We read the table's rows in clumps, one clump per target object ID. This
        # depends on the ORDER BY in the query below keeping each object's rows together.
        # This next variable holds the object ID of the current clump.
        my $objectID = "";
        # Occurrence counts for the current clump, indexed by subkey and then by value.
        # A two-level hash is used instead of a joined string so that the subkey always
        # takes part in the comparison and no separator can cause a false match.
        my %seen;
        # Duplicates found will be put in this list, one entry per distinct duplicated row.
        my @dupList = ();
        # Count the values for this key.
        my $keyVals = 0;
        # Now loop through all the entries for this key.
        my $query = $ca->Get([$table], "$table(from-link) = ? ORDER BY $table(from-link), $table(to-link)",
                             [$key]);
        while (my $row = $query->Fetch()) {
            # Get the fields for this row.
            my ($myID, $subKey, $value) = $row->Values(["$table(to-link)", "$table(subkey)",
                                                        "$table(value)"]);
            # Count it.
            Trace($stats->Ask('values') . " total values processed.") if $stats->Check(values => 500) && T(3);
            $keyVals++;
            # Is this a new clump?
            if ($myID ne $objectID) {
                # Yes it is. Forget the previous object's values and save the new object ID.
                %seen = ();
                $objectID = $myID;
                $stats->Add(clumps => 1);
            }
            # Find out how many times we have already seen this subkey/value pair
            # for the current object, then record this occurrence.
            my $priorCount = $seen{$subKey}{$value} || 0;
            $seen{$subKey}{$value} = $priorCount + 1;
            if ($priorCount) {
                # We've seen it before, so it's a duplicate.
                $stats->Add(duplicates => 1);
                # Only the first repeat goes on the list: one delete-and-reinsert
                # pass removes every extra copy.
                if ($priorCount == 1) {
                    push @dupList, [$key, $objectID, $subKey, $value];
                }
            }
        }
        Trace(scalar(@dupList) . " duplicates found for $key out of $keyVals.") if T(3);
        # Now we've processed the key. Go through deleting and restoring the values found.
        # This next variable contains the filter clause to use. Its four parameter marks
        # line up with the four elements of each duplicate-list entry.
        my $filter = "$table(from-link) = ? AND $table(to-link) = ? AND $table(subkey) = ? AND $table(value) = ?";
        # This is a counter for tracing.
        my $dupCount = 0;
        # Loop through the duplicates.
        for my $dup (@dupList) {
            # Delete all copies of this duplicate.
            my $count = $ca->DeleteLike($table => $filter, $dup);
            # One copy is about to be put back, so the net removal is one less.
            $stats->Add(deleted => $count - 1);
            # Put a single instance back in.
            $ca->InsertObject($table, {'from-link' => $dup->[0], 'to-link' => $dup->[1], subkey => $dup->[2],
                                       value => $dup->[3]});
            # Count this.
            $dupCount++;
            Trace("$dupCount duplicates processed for $key.") if ($dupCount % 100 == 0) && T(3);
        }
        Trace("Key $key finished. $dupCount duplicates removed.") if T(3);
    }
    Trace("Processing complete:\n" . $stats->Show()) if T(2);
}
921    
922    
923  1;  1;

Legend:
Removed from v.1.24  
changed lines
  Added in v.1.25

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3