-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdistinct
executable file
·113 lines (89 loc) · 1.86 KB
/
distinct
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use POSIX qw(ceil floor);
use File::Path;
use Pod::Usage;

=head1 NAME

distinct

=head1 SYNOPSIS

 usage: distinct [options] [<filename>]

  -h help : displays this help
  -s summarize : summarizes the distinct values and total number of values
  -k histogram : prints the values and the corresponding number of occurrences
  -c column number (default 1)

 example: distinct -c4 pscalare.txt

 Prints distinct values from a column of a file. Can read from STDIN too.

=head1 DESCRIPTION

=cut

# Command-line options (described in the SYNOPSIS section of the POD).
my $help;              # -h   : show the manual and exit
my $histogram = 0;     # -k   : print each value with its occurrence count
my $summarize = 0;     # -s   : print only the total and distinct counts
my $col       = 1;     # -c N : 1-based number of the column to inspect

# 'bundling' allows single-letter options to be combined and to take their
# value without a separator, e.g. "-c4" or "-sc4".
Getopt::Long::Configure('bundling');
my $options_ok = GetOptions(
    'h'   => \$help,
    's'   => \$summarize,
    'k'   => \$histogram,
    'c=i' => \$col,
);

# pod2usage() terminates the program, so at most one of the calls below runs.
# An explicit request for help takes precedence over any usage error.
pod2usage(-verbose => 2) if $help;

# Usage errors: an option GetOptions rejected, a column number below 1, or
# more than one input file on the command line.
pod2usage(1) if !$options_ok || $col < 1 || scalar(@ARGV) > 1;
# Select the input stream; the rest of the script reads from the IN handle.
# A file name on the command line is opened for reading. With no argument,
# or with the conventional "-", IN becomes an alias for STDIN.
if (scalar(@ARGV) == 1 && $ARGV[0] ne '-') {
    # Three-argument open: the argument is always taken as a literal path, so
    # surrounding whitespace or characters such as '>' and '|' in it are never
    # interpreted as an open mode or as a command to run.
    open(IN, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
}
else {
    *IN = *STDIN;
}
# Occurrence count of every value found in the selected column.
my %VALUES;
# The distinct values, in order of first appearance.
my @orderedValues;

# Zero-based index of the requested (1-based) column.
my $fieldIndex = $col - 1;

while (my $line = <IN>) {
    # Strip the line terminator, with or without a preceding carriage return.
    $line =~ s/\r?\n?$//;

    # Split on tabs only as far as needed to isolate the wanted field: the
    # fields up to and including it, plus one catch-all for the remainder.
    # The limit deliberately does not depend on the first line, so rows that
    # are longer or shorter than the first one are still read correctly.
    my @fields = split(/\t/, $line, $fieldIndex + 2);

    # A row that is too short to contain the column counts as an empty value.
    my $value = $fields[$fieldIndex];
    $value = "" if !defined($value);

    # Record the value the first time it is seen so the original order of
    # appearance can be reproduced later.
    push(@orderedValues, $value) if !exists($VALUES{$value});
    $VALUES{$value}++;
}
# Report the collected values in the format selected on the command line.
if ($summarize) {
    # Summary: total number of values and number of distinct values.
    # $. is the line count of the most recently read filehandle; close()
    # resets it, which is why the handle is closed only after printing.
    print "Number of values: $.\n";
    print "Number of distinct values: " . scalar(keys(%VALUES)) . "\n";
}
elsif ($histogram) {
    # Histogram: one "value<TAB>count" row per distinct value, with the
    # values in default (string) sort order.
    print "value\toccurrences\n";
    for my $value (sort(keys(%VALUES))) {
        print "$value\t$VALUES{$value}\n";
    }
}
else {
    # Default: the distinct values, one per line, in order of first appearance.
    for my $value (@orderedValues) {
        print "$value\n";
    }
}
close(IN);