-
Notifications
You must be signed in to change notification settings - Fork 2.8k
Expand file tree
/
Copy pathuq
More file actions
executable file
·337 lines (292 loc) · 9.1 KB
/
uq
File metadata and controls
executable file
·337 lines (292 loc) · 9.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/usr/bin/env bash
# @Function
# Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
# Same as the `uniq` command in coreutils,
# but also detects repeated lines that are not adjacent, so no sorting is required.
#
# @Usage
# uq [OPTION]... [INPUT [OUTPUT]]
#
# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-3.x/docs/shell.md#-uq
# @author Zava Xu (zava.kid at gmail dot com)
# @author Jerry Lee (oldratlee at gmail dot com)
# Strict mode:
#   -e           exit on an unhandled non-zero status
#   -E           let functions, command substitutions and subshells inherit the ERR trap
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -eEuo pipefail
# Name this script was invoked as, with any leading directory part stripped.
readonly PROG=${0##*/}
# Version string printed by -V/--version.
readonly PROG_VERSION='3.x-dev'
################################################################################
# util functions
################################################################################
# NOTE: $'foo' is the bash quoting syntax that interprets backslash escape sequences
readonly NL=$'\n' # new line
# Print all arguments, joined into a single line, to stdout.
# The line is wrapped in bold-red escape codes only when stdout is a terminal,
# so piped or redirected output stays free of control sequences.
redPrint() {
  local color_on='' color_off=''
  # '-t 1' is true only when file descriptor 1 is attached to a terminal
  # check isatty in bash https://stackoverflow.com/questions/10022323
  if [[ -t 1 ]]; then
    color_on=$'\e[1;31m'
    color_off=$'\e[0m'
  fi
  printf '%s%s%s\n' "$color_on" "$*" "$color_off"
}
# Print all arguments, joined into a single line, to stdout.
# The line is wrapped in bold-yellow escape codes only when stdout is a terminal.
yellowPrint() {
  local color_on='' color_off=''
  # '-t 1' is true only when file descriptor 1 is attached to a terminal
  if [[ -t 1 ]]; then
    color_on=$'\e[1;33m'
    color_off=$'\e[0m'
  fi
  printf '%s%s%s\n' "$color_on" "$*" "$color_off"
}
# Report an error and terminate the script.
#
# Usage: die [-h] [-s EXIT_STATUS] [MESSAGE...]
#   -h              also print a hint pointing at '--help'
#   -s EXIT_STATUS  exit with this status instead of the default 2
#   MESSAGE...      printed (prefixed with the program name) when present
#
# Everything this function prints goes to stderr (see the redirection on the
# closing brace).
die() {
  local show_hint=false code=2

  # consume leading -h / -s flags; the first other word starts the message
  while (($# > 0)); do
    if [[ $1 == -h ]]; then
      show_hint=true
      shift
    elif [[ $1 == -s ]]; then
      code=$2
      shift 2
    else
      break
    fi
  done

  if (($# > 0)); then
    redPrint "$PROG: $*"
  fi
  if $show_hint; then
    echo "Try '$PROG --help' for more information."
  fi
  exit "$code"
} >&2
# Convert a human readable size into a plain number.
#
# Arguments:
#   $1 - decimal digits optionally followed by one unit suffix:
#        k/K (x 1024), m/M (x 1024^2) or g/G (x 1024^3)
# Outputs:
#   the resulting number on stdout
# Returns:
#   0 on success, 1 when $1 is missing or not of the form above
convertHumanReadableSizeToSize() {
  local human_readable_size=${1-}
  [[ "$human_readable_size" =~ ^([0-9]+)([kmgKMG]?)$ ]] || return 1

  # Force base 10: with a leading zero bash arithmetic would read the digits
  # as octal ("010" -> 8) or reject them outright ("08").
  local size=$((10#${BASH_REMATCH[1]})) unit=${BASH_REMATCH[2]}

  # Use $(( )) assignments rather than (( )) commands: the latter return a
  # non-zero status when the result is 0.
  case "$unit" in
  k | K)
    size=$((size * 1024))
    ;;
  m | M)
    size=$((size * 1024 ** 2))
    ;;
  g | G)
    size=$((size * 1024 ** 3))
    ;;
  esac
  echo "$size"
}
# Print the help text to stdout and terminate the script.
# The exit status is that of the preceding cat (plain 'exit' without argument).
usage() {
# unquoted heredoc delimiter: $PROG is expanded, and the backticks around
# uniq are backslash-escaped so they are printed instead of executed
cat <<EOF
Usage: $PROG [OPTION]... [INPUT [OUTPUT]]
Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
Same as \`uniq\` command in core utils,
but detect repeated lines that are not adjacent, no sorting required.
Example:
# only one file, output to stdout
uq in.txt
# more than 1 file, last file argument is output file
uq in.txt out.txt
# when use - as output file, output to stdout
uq in1.txt in2.txt -
Options:
-c, --count prefix lines by the number of occurrences
-d, --repeated only print duplicate lines, one for each group
-D print all duplicate lines
combined with -c/-d option usually
--all-repeated[=METHOD] like -D, but allow separating groups
with an empty line;
METHOD={none(default),prepend,separate}
-u, --unique Only output unique lines
that are not repeated in the input
-i, --ignore-case ignore differences in case when comparing
-z, --zero-terminated line delimiter is NUL, not newline
-XM, --max-input max input size(count by char), support k,m,g postfix
default is 256m
avoid consuming large memory unexpectedly
-h, --help display this help and exit
-V, --version display version information and exit
EOF
exit
}
# Print "<program name> <version>" on stdout and terminate the script.
progVersion() {
  printf '%s %s\n' "$PROG" "$PROG_VERSION"
  exit
}
################################################################################
# parse options
################################################################################
# Parse the command line into global settings.
#
# Globals written:
#   uq_opt_count, uq_opt_only_repeated, uq_opt_all_repeated,
#   uq_opt_only_unique, uq_opt_ignore_case, uq_opt_zero_terminated
#                                     - 0/1 switches
#   uq_opt_repeated_method            - none | prepend | separate
#   uq_max_input_human_readable_size  - raw value of -XM/--max-input
#   argv                              - non-option arguments, in order
# Arguments:
#   the script arguments ("$@")
# Exits (through die / usage / progVersion) on an unknown option, an invalid
# or missing option value, -h/--help and -V/--version.
parseOptions() {
  uq_opt_count=0
  uq_opt_only_repeated=0
  uq_opt_all_repeated=0
  uq_opt_repeated_method=none
  uq_opt_only_unique=0
  uq_opt_ignore_case=0
  uq_opt_zero_terminated=0
  uq_max_input_human_readable_size=256m
  argv=()

  while (($# > 0)); do
    case "$1" in
    -c | --count)
      uq_opt_count=1
      shift
      ;;
    -d | --repeated)
      uq_opt_only_repeated=1
      shift
      ;;
    # METHOD is optional (see usage): bare --all-repeated behaves like -D
    -D | --all-repeated)
      uq_opt_all_repeated=1
      shift
      ;;
    --all-repeated=*)
      uq_opt_all_repeated=1
      uq_opt_repeated_method=${1#--all-repeated=}
      case "$uq_opt_repeated_method" in
      none | prepend | separate) ;;
      *)
        die -h "invalid argument ‘$uq_opt_repeated_method’ for ‘--all-repeated’${NL}Valid arguments are:$NL - ‘none’$NL - ‘prepend’$NL - ‘separate’"
        ;;
      esac
      shift
      ;;
    -u | --unique)
      uq_opt_only_unique=1
      shift
      ;;
    -i | --ignore-case)
      uq_opt_ignore_case=1
      shift
      ;;
    -z | --zero-terminated)
      uq_opt_zero_terminated=1
      shift
      ;;
    -XM | --max-input)
      # without this guard a missing value trips 'set -u' with a cryptic
      # "unbound variable" message instead of a usage error
      (($# >= 2)) || die -h "option '$1' requires an argument"
      uq_max_input_human_readable_size=$2
      shift 2
      ;;
    --max-input=*)
      uq_max_input_human_readable_size=${1#--max-input=}
      shift
      ;;
    -h | --help)
      usage
      ;;
    -V | --version)
      progVersion
      ;;
    --)
      # everything after -- is a file argument, even if it starts with '-'
      shift
      argv+=("$@")
      break
      ;;
    -)
      # a lone '-' is a file argument (stdin / stdout), not an option
      argv+=("$1")
      shift
      ;;
    -*)
      die -h "unrecognized option '$1'"
      ;;
    *)
      argv+=("$1")
      shift
      ;;
    esac
  done
}
parseOptions "$@"
# Reject option combinations that can never select a line:
# a line cannot be unique (-u) and repeated (-d / -D) at the same time.
if [[ $uq_opt_only_repeated = 1 && $uq_opt_only_unique = 1 ]]; then
  die -h "printing duplicated lines(-d, --repeated) and unique lines(-u, --unique) is meaningless"
fi
if [[ $uq_opt_all_repeated = 1 && $uq_opt_only_unique = 1 ]]; then
  die -h "printing all duplicate lines(-D, --all-repeated) and unique lines(-u, --unique) is meaningless"
fi

# -D without a separating method and without -c/-d prints every input line
# unchanged; warn on stderr but carry on.
if [[ $uq_opt_all_repeated = 1 && $uq_opt_repeated_method = none && $uq_opt_count = 0 && $uq_opt_only_repeated = 0 ]]; then
  yellowPrint "WARN: -D/--all-repeated=none option without -c/-d option, just cat input simply!" >&2
fi

# Assign first and mark readonly afterwards: doing both in one statement would
# hide the exit status of the command substitution.
# more info see https://github.com/koalaman/shellcheck/wiki/SC2155
if ! uq_max_input_size=$(convertHumanReadableSizeToSize "$uq_max_input_human_readable_size"); then
  die -h "illegal value of option -XM/--max-input: $uq_max_input_human_readable_size"
fi
readonly argv uq_max_input_size
readonly argc=${#argv[@]}
# Split the file arguments into inputs and output:
#   0 args -> no input files, output to stdout
#   1 arg  -> it is the input file, output to stdout
#   2+     -> the last one is the output file ('-' means stdout),
#             all the others are input files
input_files=()
output_file=/dev/stdout
if ((argc == 1)); then
  input_files=("${argv[0]}")
elif ((argc > 1)); then
  input_files=("${argv[@]:0:argc-1}")
  [ "${argv[argc - 1]}" = - ] || output_file=${argv[argc - 1]}
fi
readonly output_file

# Check input files: each must be '-' or an existing, readable regular file.
# The ${arr[@]:+...} form keeps an empty array from being expanded directly.
for f in ${input_files[@]:+"${input_files[@]}"}; do
  # - is stdin, ok
  if [ "$f" = - ]; then
    continue
  fi

  if [ ! -e "$f" ]; then
    die "input file $f: No such file or directory!"
  elif [ -d "$f" ]; then
    die "input file $f exists, but is a directory!"
  elif [ ! -f "$f" ]; then
    die "input file $f exists, but is not a file!"
  elif [ ! -r "$f" ]; then
    die "input file $f exists, but is not readable!"
  fi
done
unset f
################################################################################
# biz logic
################################################################################
# uq awk script
#
# Editing it in a separate file (e.g. uq.awk) and then copying it here
# may be more convenient (proper syntax highlighting).
#
# Variables the caller passes with -v:
#   uq_opt_count, uq_opt_only_repeated, uq_opt_all_repeated,
#   uq_opt_only_unique, uq_opt_zero_terminated  - 0/1 switches
#   uq_opt_repeated_method                      - none | prepend | separate
#   IGNORECASE                                  - 0/1, compare lines case-insensitively
#   uq_max_input_size                           - input size limit (a number)
#   uq_max_input_human_readable_size, uq_PROG   - only used in the error message
#
# NOTE: the script is a single-quoted shell string,
# so it must not contain any single quote character.
# shellcheck disable=SC2016
readonly uq_awk_script='
# Print the first line_total entries (indexed from 0) of for_lines,
# filtered and decorated according to the uq_opt_* switches.
# The names after the extra spaces are local variables.
function printResult(for_lines, line_total,    idx, line, key, count) {
  for (idx = 0; idx < line_total + 0; idx++) {
    line = for_lines[idx]
    key = caseAwareLine(line)
    count = line_count_array[key]

    if (uq_opt_only_unique) {
      if (count == 1) printLine(count, line)
      continue
    }
    if (uq_opt_only_repeated && count <= 1) continue

    # prepend:  an empty line in front of every group, the first one included
    # separate: an empty line only between groups
    # An explicit flag is used instead of testing the previous line itself,
    # because an empty line or the line "0" evaluates to false.
    if (uq_opt_repeated_method == "prepend" || (uq_opt_repeated_method == "separate" && has_previous_output)) {
      if (!has_previous_output || key != previous_key) print ""
    }
    printLine(count, line)
    previous_key = key
    has_previous_output = 1
  }
}

# Print one line, prefixed with its occurrence count when -c is on.
function printLine(count, line) {
  if (uq_opt_count) printf "%7s %s%s", count, line, ORS
  else print line
}

# Key used to decide whether two lines are the same line.
function caseAwareLine(line) {
  if (IGNORECASE) return tolower(line)
  else return line
}

BEGIN {
  if (uq_opt_zero_terminated) ORS = RS = "\0"
}

{
  # + 1 accounts for the line terminator
  total_input_size += length($0) + 1
  if (total_input_size > uq_max_input_size) {
    printf("%s: input size exceed max input size %s!\nuse option -XM/--max-input specify a REASONABLE larger value.\n",
      uq_PROG, uq_max_input_human_readable_size) > "/dev/stderr"
    # exit in a main rule still runs the END rule; the flag makes END
    # bail out instead of printing a result computed from partial input
    input_too_large = 1
    exit 1
  }

  # use index to keep lines order
  original_lines[line_index++] = $0

  # line_count_array: comparison key -> count
  if (++line_count_array[caseAwareLine($0)] == 1) {
    # keep the line as first seen (not its lower-cased key), so that
    # --ignore-case only affects the comparison and never the output
    deduplicated_lines[deduplicated_line_index++] = $0
  }
}

END {
  if (input_too_large) exit 1

  if (uq_opt_all_repeated) printResult(original_lines, line_index)
  else printResult(deduplicated_lines, deduplicated_line_index)
}
'
# Hand every setting to the awk program as an awk variable (-v) rather than
# interpolating it into the program text.
awk_settings=(
  -v "uq_opt_count=$uq_opt_count"
  -v "uq_opt_only_repeated=$uq_opt_only_repeated"
  -v "uq_opt_all_repeated=$uq_opt_all_repeated"
  -v "uq_opt_repeated_method=$uq_opt_repeated_method"
  -v "uq_opt_only_unique=$uq_opt_only_unique"
  -v "IGNORECASE=$uq_opt_ignore_case"
  -v "uq_opt_zero_terminated=$uq_opt_zero_terminated"
  -v "uq_max_input_human_readable_size=$uq_max_input_human_readable_size"
  -v "uq_max_input_size=$uq_max_input_size"
  -v "uq_PROG=$PROG"
)

# The program is fed through process substitution as a -f file; with no input
# files awk reads standard input. Results go to the chosen output file.
awk "${awk_settings[@]}" \
  -f <(printf '%s' "$uq_awk_script") \
  -- ${input_files[@]:+"${input_files[@]}"} \
  >"$output_file"