Skip to content

Commit 513a869

Browse files
committed
! improve uq and add its documents
1 parent e48fac9 commit 513a869

File tree

4 files changed

+394
-20
lines changed

4 files changed

+394
-20
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ source <(curl -fsSL https://raw.githubusercontent.com/oldratlee/useful-scripts/r
5050
彩色`cat`出文件行,方便人眼区分不同的行。
5151
1. [a2l](docs/shell.md#-a2l)
5252
按行彩色输出参数,方便人眼查看。
53+
1. [uq](docs/shell.md#-uq)
54+
无需对输入排序,即可按整行内容完成去重。相比系统的`uniq`命令,增强之处在于可以对不相邻的重复行去重,输入不需要预先排序。
5355
1. [ap and rp](docs/shell.md#-ap-and-rp)
5456
批量转换文件路径为绝对路径/相对路径,会自动跟踪链接并规范化路径。
5557
1. [tcp-connection-state-counter](docs/shell.md#-tcp-connection-state-counter)

bin/helper/uq.awk

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/local/bin/awk -f
2+
3+
# Print the collected lines according to the uq_opt_* option variables.
#
# Parameters:
#   for_lines - array indexed from 0 holding the candidate lines in input order.
#
# The names after the wide gap in the parameter list are locals (awk idiom);
# callers pass only for_lines.
#
# Reads globals: line_count_array, uq_opt_only_unique, uq_opt_only_repeated,
#                uq_opt_repeated_method.
function printResult(for_lines,    total, idx, line, count, has_outputted, outputted) {
    # evaluate the array length once instead of on every iteration
    total = length(for_lines)
    has_outputted = 0

    for (idx = 0; idx < total; idx++) {
        line = for_lines[idx]
        count = line_count_array[storeLine(line)]

        if (uq_opt_only_unique) {
            if (count == 1) printLine(count, line)
            continue
        }

        if (uq_opt_only_repeated && count <= 1) continue

        # Group delimiting. An explicit flag records whether something was
        # already printed: testing the previous line itself for truthiness
        # fails when that line is empty or "0".
        if (uq_opt_repeated_method == "prepend") {
            # empty line before every group, including the first one
            if (!has_outputted || !compareLine(line, outputted)) print ""
        } else if (uq_opt_repeated_method == "separate") {
            # empty line only between groups
            if (has_outputted && !compareLine(line, outputted)) print ""
        }

        printLine(count, line)
        outputted = line
        has_outputted = 1
    }
}
28+
29+
# Emit one line terminated by ORS; when uq_opt_count is set, the line is
# prefixed with its occurrence count, right-aligned in 7 columns.
function printLine(count, line) {
    if (!uq_opt_count) {
        print line
        return
    }
    printf "%7s %s%s", count, line, ORS
}
36+
37+
# Return the key under which a line is counted: the lower-cased line when
# uq_opt_ignore_case is set, otherwise the line unchanged.
function storeLine(line) {
    return uq_opt_ignore_case ? tolower(line) : line
}
44+
45+
# Return 1 when both lines map to the same storeLine() key, otherwise 0.
function compareLine(line1, line2) {
    if (storeLine(line1) == storeLine(line2)) return 1
    return 0
}
48+
49+
50+
BEGIN {
    # with uq_opt_zero_terminated set, records are read and written NUL-delimited
    if (uq_opt_zero_terminated) RS = ORS = "\0"
}
56+
57+
58+
# Per-record rule: remember every line in input order and count occurrences
# per storeLine() key.
{
    # use index to keep lines order
    lines[line_index++] = $0

    store_line = storeLine($0)
    # line_count_array: storeLine() key -> count
    if (++line_count_array[store_line] == 1) {
        # Keep the first occurrence as it appeared in the input, not the
        # case-folded key, so that ignore-case mode does not lower-case the
        # output. printResult() re-derives the key via storeLine().
        deduplicated_lines[deduplicated_line_index++] = $0
    }
}
69+
70+
71+
END {
    # all-repeated mode walks every input line; otherwise only the first
    # occurrence of each distinct line
    if (uq_opt_all_repeated) {
        printResult(lines)
    } else {
        printResult(deduplicated_lines)
    }
}

bin/uq

Lines changed: 193 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,200 @@
11
#!/bin/bash
22
# @Function
3-
# print uniq line keep order, no sorting required
3+
# Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
4+
# same as `uniq` command in core utils,
5+
# but detect repeated lines that are not adjacent, no sorting required.
46
#
7+
# @Usage
8+
# uq [OPTION]... [INPUT [OUTPUT]]
9+
#
10+
# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-2.x/docs/shell.md#-uq
511
# @author Zava Xu (zava.kid at gmail dot com)
12+
# @author Jerry Lee (oldratlee at gmail dot com)
13+
set -eEuo pipefail

# Script name, its resolved path (readlink -f) and the directory containing it;
# PROG_DIR is used to locate helper/uq.awk.
PROG=$(basename "$0")
PROG_PATH=$(readlink -f "$0")
PROG_DIR=$(dirname "$PROG_PATH")

################################################################################
# util functions
################################################################################

# NOTE: $'foo' is the escape sequence syntax of bash
ec=$'\033'      # escape char
eend=$'\033[0m' # escape end
nl=$'\n'        # new line
readonly ec eend nl
27+
28+
# Print all arguments, joined by spaces, on one line.
# The text is wrapped in the bold-red escape sequence only when stdout is a
# terminal. printf is used instead of echo so that a message such as "-n" is
# printed rather than being taken as an echo option.
redEcho() {
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;31m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}
31+
32+
# Print all arguments, joined by spaces, on one line.
# The text is wrapped in the bold-yellow escape sequence only when stdout is a
# terminal. printf is used instead of echo so that a message such as "-n" is
# printed rather than being taken as an echo option.
yellowEcho() {
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;33m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}
35+
36+
# Report "Error: <arguments>" on stderr via redEcho, then exit with status 1.
die() {
  local -r message="Error: $*"
  redEcho "$message" >&2
  exit 1
}
40+
41+
# Print the help text and exit.
# Arguments:
#   $1  - exit code (default 0); a non-zero code sends the output to stderr
#   $2… - optional message printed via redEcho before the help text
usage() {
  local -r exit_code="${1:-0}"
  if (($# > 0)); then
    shift
  fi

  if [ "$exit_code" != 0 ]; then
    local -r out=/dev/stderr
  else
    local -r out=/dev/stdout
  fi

  if (($# > 0)); then
    redEcho "$*$nl" >"$out"
  fi

  cat >"$out" <<EOF
Usage: ${PROG} [OPTION]... [INPUT [OUTPUT]]
Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
Same as \`uniq\` command in core utils,
but detect repeated lines that are not adjacent, no sorting required.

Example:
# only one file, output to stdout
uq in.txt
# more than 1 file, last file argument is output file
uq in.txt out.txt
# when use - as output file, output to stdout
uq in1.txt in2.txt -

Options:
-c, --count prefix lines by the number of occurrences
-d, --repeated only print duplicate lines, one for each group
-D print all duplicate lines
combined with -c/-d option usually
--all-repeated[=METHOD] like -D, but allow separating groups
with an empty line;
METHOD={none(default),prepend,separate}
-u, --unique Only output unique lines
that are not repeated in the input
-i, --ignore-case ignore differences in case when comparing
-z, --zero-terminated line delimiter is NUL, not newline

Miscellaneous:
-h, --help display this help and exit
EOF

  exit "$exit_code"
}
2082

21-
cat "$@" | outputUniqLines
83+
################################################################################
84+
# parse options
85+
################################################################################
86+
87+
# Option defaults; each uq_opt_* value is handed to helper/uq.awk via `awk -v`.
uq_opt_count=0
uq_opt_only_repeated=0
uq_opt_all_repeated=0
uq_opt_repeated_method=none
uq_opt_only_unique=0
uq_opt_ignore_case=0
uq_opt_zero_terminated=0
# positional (non-option) arguments, in command-line order
declare -a argv=()

while (($# > 0)); do
  case "$1" in
  -c | --count)
    uq_opt_count=1
    shift
    ;;
  -d | --repeated)
    uq_opt_only_repeated=1
    shift
    ;;
  -D | --all-repeated)
    # The help text documents METHOD as optional (--all-repeated[=METHOD]),
    # so the bare long option is accepted and keeps the current method.
    uq_opt_all_repeated=1
    shift
    ;;
  --all-repeated=*)
    uq_opt_all_repeated=1
    # everything after the first '='
    uq_opt_repeated_method=${1#*=}
    case "$uq_opt_repeated_method" in
    none | prepend | separate) ;;
    *)
      usage 1 "$PROG: invalid argument ‘${uq_opt_repeated_method}’ for ‘--all-repeated’${nl}Valid arguments are:$nl - ‘none’$nl - ‘prepend’$nl - ‘separate’"
      ;;
    esac
    shift
    ;;
  -u | --unique)
    uq_opt_only_unique=1
    shift
    ;;
  -i | --ignore-case)
    uq_opt_ignore_case=1
    shift
    ;;
  -z | --zero-terminated)
    uq_opt_zero_terminated=1
    shift
    ;;
  -h | --help)
    usage
    ;;
  --)
    shift
    # Append with += so the (possibly empty) array is never expanded; expanding
    # an empty array under `set -u` is an error on bash older than 4.4.
    argv+=("$@")
    break
    ;;
  -)
    # a lone '-' is a positional argument (stdin / stdout), not an option
    argv+=("$1")
    shift
    ;;
  -*)
    usage 2 "${PROG}: unrecognized option '$1'"
    ;;
  *)
    argv+=("$1")
    shift
    ;;
  esac
done
150+
151+
# -u/--unique cannot be combined with any of the "repeated" selectors.
if [[ $uq_opt_only_repeated == 1 && $uq_opt_only_unique == 1 ]]; then
  usage 2 "printing duplicated lines(-d, --repeated) and unique lines(-u, --unique) is meaningless"
fi
if [[ $uq_opt_all_repeated == 1 && $uq_opt_only_unique == 1 ]]; then
  usage 2 "printing all duplicate lines(-D, --all-repeated) and unique lines(-u, --unique) is meaningless"
fi

# All-repeated mode with method "none" and neither -c nor -d only gets a warning.
if [[ $uq_opt_all_repeated == 1 && $uq_opt_repeated_method == none &&
  $uq_opt_count == 0 && $uq_opt_only_repeated == 0 ]]; then
  yellowEcho "[$PROG] WARN: -D/--all-repeated=none option without -c/-d option, just cat input simply!" >&2
fi
158+
159+
# Split positional arguments: with two or more, the last one is the output
# file ('-' means stdout) and the rest are inputs; a single argument is an
# input file; with none, no input files are given and output goes to stdout.
argc=${#argv[@]}

input_files=()
output_file=/dev/stdout

case "$argc" in
0) ;;
1)
  input_files=("${argv[0]}")
  ;;
*)
  input_files=("${argv[@]:0:argc-1}")
  output_file=${argv[argc - 1]}
  if [ "$output_file" = - ]; then
    output_file=/dev/stdout
  fi
  ;;
esac
174+
175+
# Check input file
176+
# Check input file
# (the ${arr[@]:+...} form keeps the expansion safe for an empty array under `set -u`)
for f in ${input_files[@]:+"${input_files[@]}"}; do
  # - is stdin, ok
  if [ "$f" = - ]; then
    continue
  fi

  if [ ! -e "$f" ]; then
    die "input file $f does not exist!"
  fi
  if [ -d "$f" ]; then
    die "input file $f exists, but is a directory!"
  fi
  if [ ! -f "$f" ]; then
    die "input file $f exists, but is not a file!"
  fi
  if [ ! -r "$f" ]; then
    die "input file $f exists, but is not readable!"
  fi
done
185+
186+
################################################################################
187+
# biz logic
188+
################################################################################
189+
190+
# Run the awk helper: every option is passed as an awk variable, followed by
# the input files (none given: awk reads stdin); output goes to $output_file.
awk_options=(
  -v "uq_opt_count=$uq_opt_count"
  -v "uq_opt_only_repeated=$uq_opt_only_repeated"
  -v "uq_opt_all_repeated=$uq_opt_all_repeated"
  -v "uq_opt_repeated_method=$uq_opt_repeated_method"
  -v "uq_opt_only_unique=$uq_opt_only_unique"
  -v "uq_opt_ignore_case=$uq_opt_ignore_case"
  -v "uq_opt_zero_terminated=$uq_opt_zero_terminated"
  -f "$PROG_DIR/helper/uq.awk"
  --
)

awk "${awk_options[@]}" ${input_files[@]:+"${input_files[@]}"} >"$output_file"

0 commit comments

Comments
 (0)