|
1 | 1 | #!/bin/bash
|
2 | 2 | # @Function
|
3 |
| -# print uniq line keep order, no sorting required |
# Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
# Same as the `uniq` command in coreutils,
# but also detects repeated lines that are not adjacent; no sorting is required.
4 | 6 | #
|
| 7 | +# @Usage |
| 8 | +# uq [OPTION]... [INPUT [OUTPUT]] |
| 9 | +# |
| 10 | +# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-2.x/docs/shell.md#-uq |
5 | 11 | # @author Zava Xu (zava.kid at gmail dot com)
|
| 12 | +# @author Jerry Lee (oldratlee at gmail dot com) |
# Strict mode: abort on command failure (errexit), let ERR traps be inherited
# by functions (errtrace), reject unset variables (nounset), and make a
# pipeline fail when any of its stages fails (pipefail).
set -o errexit -o errtrace -o nounset -o pipefail

# PROG      - name this script was invoked as (shown in messages)
# PROG_PATH - canonical path of the script with symlinks resolved
#             (relies on a `readlink` that supports -f)
# PROG_DIR  - directory containing the script
PROG=$(basename "$0")
PROG_PATH=$(readlink -f "$0")
PROG_DIR=$(dirname "$PROG_PATH")
| 18 | + |
| 19 | +################################################################################ |
| 20 | +# util functions |
| 21 | +################################################################################ |
| 22 | + |
# NOTE: $'...' is bash's ANSI-C quoting; backslash escapes inside are expanded.
ec=$'\e'        # escape char (ESC, octal 033)
eend="${ec}[0m" # escape end: reset all text attributes
nl=$'\n'        # new line
readonly ec eend nl
| 27 | + |
# Print all arguments, joined by spaces, as one line on stdout.
# The line is wrapped in the bold-red escape sequence when stdout is a
# terminal and printed plain otherwise.
#
# Globals:   ec, eend (read, only when stdout is a terminal)
# Arguments: $* - message text
redEcho() {
  # if/else instead of `a && b || c`: the plain branch must not run as a
  # fallback when the coloured write fails.
  # printf instead of echo: a message that is exactly "-n" or "-e" would be
  # swallowed by echo as an option.
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;31m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}
| 31 | + |
# Print all arguments, joined by spaces, as one line on stdout.
# The line is wrapped in the bold-yellow escape sequence when stdout is a
# terminal and printed plain otherwise.
#
# Globals:   ec, eend (read, only when stdout is a terminal)
# Arguments: $* - message text
yellowEcho() {
  # if/else instead of `a && b || c`: the plain branch must not run as a
  # fallback when the coloured write fails.
  # printf instead of echo: a message that is exactly "-n" or "-e" would be
  # swallowed by echo as an option.
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;33m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}
| 35 | + |
# Report a fatal error and terminate the script with exit status 1.
# The message is prefixed with "Error: " and sent through redEcho to stderr.
#
# Arguments: $* - description of the failure
die() {
  local -r message="Error: $*"
  redEcho "$message" >&2
  exit 1
}
| 40 | + |
# Print the help text and exit.
#
# Arguments:
#   $1  - exit code (default 0). 0 sends the output to stdout, anything else
#         sends it to stderr.
#   $2… - optional message, printed via redEcho before the help text.
# Globals: PROG, nl (read)
# Exits:   always, with the given exit code.
usage() {
  local -r exit_code="${1:-0}"
  (($# > 0)) && shift

  # Duplicate the already-open descriptor instead of re-opening /dev/stdout or
  # /dev/stderr by path: re-opening a regular file with `>` truncates it, so
  # with `2>err.log` the help text would overwrite the message printed first.
  local out_fd=1
  if [ "$exit_code" != 0 ]; then
    out_fd=2
  fi

  if (($# > 0)); then
    redEcho "$*$nl" >&"$out_fd"
  fi

  cat >&"$out_fd" <<EOF
Usage: ${PROG} [OPTION]... [INPUT [OUTPUT]]
Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
Same as \`uniq\` command in core utils,
but detect repeated lines that are not adjacent, no sorting required.

Example:
  # only one file, output to stdout
  uq in.txt
  # more than 1 file, last file argument is output file
  uq in.txt out.txt
  # when use - as output file, output to stdout
  uq in1.txt in2.txt -

Options:
  -c, --count           prefix lines by the number of occurrences
  -d, --repeated        only print duplicate lines, one for each group
  -D                    print all duplicate lines
                        combined with -c/-d option usually
  --all-repeated[=METHOD]  like -D, but allow separating groups
                        with an empty line;
                        METHOD={none(default),prepend,separate}
  -u, --unique          Only output unique lines
                        that are not repeated in the input
  -i, --ignore-case     ignore differences in case when comparing
  -z, --zero-terminated line delimiter is NUL, not newline

Miscellaneous:
  -h, --help            display this help and exit
EOF

  exit "$exit_code"
}
|
20 | 82 |
|
21 |
| -cat "$@" | outputUniqLines |
| 83 | +################################################################################ |
| 84 | +# parse options |
| 85 | +################################################################################ |
| 86 | + |
# Parse the command line.
#
# Arguments: the script's command-line arguments ("$@").
# Globals written:
#   uq_opt_count, uq_opt_only_repeated, uq_opt_all_repeated,
#   uq_opt_only_unique, uq_opt_ignore_case, uq_opt_zero_terminated
#                           - flags, 0 (off) or 1 (on)
#   uq_opt_repeated_method  - none | prepend | separate
#   argv                    - array of the non-option operands, in order
# Exits through usage() on an unrecognized option or an invalid METHOD.
parse_options() {
  uq_opt_count=0
  uq_opt_only_repeated=0
  uq_opt_all_repeated=0
  uq_opt_repeated_method=none
  uq_opt_only_unique=0
  uq_opt_ignore_case=0
  uq_opt_zero_terminated=0
  argv=()

  while (($# > 0)); do
    case "$1" in
    -c | --count)
      uq_opt_count=1
      shift
      ;;
    -d | --repeated)
      uq_opt_only_repeated=1
      shift
      ;;
    # The help text documents --all-repeated[=METHOD]: METHOD is optional, so
    # the bare long option is accepted and behaves like -D.
    -D | --all-repeated)
      uq_opt_all_repeated=1
      shift
      ;;
    --all-repeated=*)
      uq_opt_all_repeated=1
      # everything after the first '='
      uq_opt_repeated_method=${1#*=}
      [[ $uq_opt_repeated_method == 'none' || $uq_opt_repeated_method == 'prepend' || $uq_opt_repeated_method == 'separate' ]] ||
        usage 1 "$PROG: invalid argument ‘${uq_opt_repeated_method}’ for ‘--all-repeated’${nl}Valid arguments are:$nl - ‘none’$nl - ‘prepend’$nl - ‘separate’"
      shift
      ;;
    -u | --unique)
      uq_opt_only_unique=1
      shift
      ;;
    -i | --ignore-case)
      uq_opt_ignore_case=1
      shift
      ;;
    -z | --zero-terminated)
      uq_opt_zero_terminated=1
      shift
      ;;
    -h | --help)
      usage
      ;;
    --)
      # end of options: everything that follows is an operand.
      # `+=` avoids expanding a possibly empty array, which aborts under
      # `set -u` on bash older than 4.4.
      shift
      argv+=("$@")
      break
      ;;
    -)
      # a lone "-" is an operand (stdin / stdout), not an option
      argv+=("$1")
      shift
      ;;
    -*)
      usage 2 "${PROG}: unrecognized option '$1'"
      ;;
    *)
      argv+=("$1")
      shift
      ;;
    esac
  done
}

parse_options "$@"
| 150 | + |
# -u/--unique cannot be combined with either of the "repeated" selections
# (-d/--repeated, -D/--all-repeated); such a combination is rejected via usage.
if [[ $uq_opt_only_unique == 1 ]]; then
  if [[ $uq_opt_only_repeated == 1 ]]; then
    usage 2 "printing duplicated lines(-d, --repeated) and unique lines(-u, --unique) is meaningless"
  fi
  if [[ $uq_opt_all_repeated == 1 ]]; then
    usage 2 "printing all duplicate lines(-D, --all-repeated) and unique lines(-u, --unique) is meaningless"
  fi
fi

# -D with method "none" and neither -c nor -d: warn on stderr, then carry on.
if [[ $uq_opt_all_repeated == 1 && $uq_opt_repeated_method == none ]]; then
  if [[ $uq_opt_count == 0 && $uq_opt_only_repeated == 0 ]]; then
    yellowEcho "[$PROG] WARN: -D/--all-repeated=none option without -c/-d option, just cat input simply!" >&2
  fi
fi
| 158 | + |
# Split the operands collected in argv into input files and the output file:
#   0 operands  - no input files (awk then reads stdin), output to stdout
#   1 operand   - a single input file, output to stdout
#   2+ operands - the last one is the output file ("-" means stdout),
#                 all preceding ones are input files
argc=${#argv[@]}
input_files=()
output_file=/dev/stdout

case "$argc" in
0) ;;
1)
  input_files=("${argv[0]}")
  ;;
*)
  input_files=("${argv[@]:0:argc-1}")
  # keep the stdout default when the last operand is "-"
  [ "${argv[argc - 1]}" = - ] || output_file=${argv[argc - 1]}
  ;;
esac
| 174 | + |
# Check every input operand before any processing starts; the first operand
# that fails a check terminates the script via die.
# (the ${arr[@]:+...} form keeps an empty array from tripping `set -u`)
for f in ${input_files[@]:+"${input_files[@]}"}; do
  # "-" stands for standard input: nothing to check
  if [ "$f" = - ]; then
    continue
  fi

  if [ ! -e "$f" ]; then
    die "input file $f does not exist!"
  elif [ -d "$f" ]; then
    die "input file $f exists, but is a directory!"
  elif [ ! -f "$f" ]; then
    die "input file $f exists, but is not a file!"
  elif [ ! -r "$f" ]; then
    die "input file $f exists, but is not readable!"
  fi
done
| 185 | + |
| 186 | +################################################################################ |
| 187 | +# biz logic |
| 188 | +################################################################################ |
| 189 | + |
# Run the awk helper that implements the filtering. The parsed options are
# handed over as awk variables of the same name; the input files (none means
# awk reads standard input) follow the "--" separator.
awk_args=(
  -v "uq_opt_count=$uq_opt_count"
  -v "uq_opt_only_repeated=$uq_opt_only_repeated"
  -v "uq_opt_all_repeated=$uq_opt_all_repeated"
  -v "uq_opt_repeated_method=$uq_opt_repeated_method"
  -v "uq_opt_only_unique=$uq_opt_only_unique"
  -v "uq_opt_ignore_case=$uq_opt_ignore_case"
  -v "uq_opt_zero_terminated=$uq_opt_zero_terminated"
  -f "$PROG_DIR/helper/uq.awk"
  --
  # guarded expansion: an empty array must not trip `set -u` on older bash
  ${input_files[@]:+"${input_files[@]}"}
)

# When the result goes to standard output, write to the inherited descriptor.
# Redirecting with `>/dev/stdout` would re-open the target by path and
# truncate it, so `uq in.txt >>log` would wipe the existing log content.
if [ "$output_file" = /dev/stdout ]; then
  awk "${awk_args[@]}"
else
  awk "${awk_args[@]}" >"$output_file"
fi
0 commit comments