[Bio] / Sprout / FindDuplicate.pl Repository:
ViewVC logotype

Annotation of /Sprout/FindDuplicate.pl

Parent Directory | Revision Log


Revision 1.1 - (view) (download) (as text)

use strict;
use warnings;

use Tracer;
use Stats;

=head1 Find Duplicate Field Values

This script searches a specific column of a tab-delimited file looking for
duplicate values. The default column is the first (1). Use the B<c> option to
specify a different column. The input file name is specified as a positional
parameter. If no input file name is specified, the input is taken from the
standard input.

=head2 Options

=over 4

=item c

The 1-based number of the column to check. It must be a positive integer;
any other value causes the script to terminate with an error message.

=back

=head2 Processing

Records whose chosen column is missing or empty are counted as blanks and are
otherwise ignored. Every second or later occurrence of a value is counted as
a duplicate and reported through the trace facility. When the input is
exhausted, the C<lines>, C<duplicates>, and C<blanks> statistics are traced.

=cut

my ($options, $fileName) = StandardSetup([], { c => [1, "column to check (1-based)"]},
                                         "<fileName>", @ARGV);
# Validate the column number before using it. Without this check, a value of 0
# would become array index -1 and silently select the LAST field of each record.
my $column = $options->{c};
if (! defined $column || $column !~ /\A[1-9][0-9]*\z/) {
    my $shown = (defined $column ? "\"$column\"" : "(undefined)");
    die "Invalid column number $shown: the c option must be a positive integer.\n";
}
# Convert from a 1-based column number to a 0-based array index.
my $col = $column - 1;
# Create a statistics object with the three counters we maintain.
my $stats = Stats->new(qw(lines duplicates blanks));
# Choose the input source. If there is no input file name, we use a hyphen to
# get STDIN. We test for defined-and-non-empty rather than simple truth so that
# a file literally named "0" is not mistaken for a missing name.
if (! defined $fileName || $fileName eq '') {
    $fileName = "-";
    Trace("Reading from standard input.") if T(2);
} else {
    Trace("Reading from $fileName.") if T(2);
}
my $ih = Open(undef, "<$fileName");
# The values already seen are cached in here (value => 1).
my %values;
# Loop through the input file, one record per iteration.
while (! eof $ih) {
    # Get the fields of this record and pull out the key column.
    # NOTE(review): GetLine presumably reads one line and splits it on tabs;
    # confirm against Tracer.
    my @fields = Tracer::GetLine($ih);
    $stats->Add(lines => 1);
    my $key = $fields[$col];
    if (! defined $key || $key eq '') {
        # Ignore records without a field value: the record is too short or the
        # field is empty.
        $stats->Add(blanks => 1);
    } elsif ($values{$key}) {
        # We have seen this value before. The special variable $. holds the
        # current line number of the most recently accessed input handle.
        Trace("Duplicate value \"$key\" found in line " . $. . ".") if T(1);
        $stats->Add(duplicates => 1);
    } else {
        # First occurrence: record this key value.
        $values{$key} = 1;
    }
}
# Display the statistics.
Trace("All done:\n" . $stats->Show()) if T(2);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3