From 92cc28c48045db05ec951a6725583a5d4d8ddfd0 Mon Sep 17 00:00:00 2001
From: Julian Prein
Date: Fri, 5 Jan 2024 17:00:23 +0100
Subject: [PATCH] zsh:finddup(): Replace awk solution with uniq

Replace the custom awk solution with uniq by first swapping the filename
and file size columns so that uniq's `-f` flag can be used (since there is
no inverse of it, i.e. no way to say "only look at field n"). This
increases performance by quite a bit.
---
 .config/zsh/zshrc.d/40-functions.zsh | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/.config/zsh/zshrc.d/40-functions.zsh b/.config/zsh/zshrc.d/40-functions.zsh
index 17fb8b1..362e70a 100644
--- a/.config/zsh/zshrc.d/40-functions.zsh
+++ b/.config/zsh/zshrc.d/40-functions.zsh
@@ -560,18 +560,25 @@ suffix() {
 # Find duplicate files
 finddup() {
 	# find all files, filter the ones out with unique size, calculate md5 and
-	# print duplicates
-	# TODO: Fix duplicate lines output in the awk script that currently `sort
-	# -u` handles
+	# print duplicates. Assumes that no filename contains any tab
+	# characters.
+	#
 	# TODO: Use cksum to calculate faster CRC with custom awk solution to print
 	# duplicates, as `uniq -w32` breaks through the different CRC lengths.
+	# TODO: The second sort call could be optimized, since the files are
+	#       already grouped by size. Instead of re-sorting the whole stream,
+	#       we only need to check whether files of the same size have the
+	#       same hash. Just removing the sort call almost does the trick, but
+	#       breaks for groups of same-sized files whose identical checksums
+	#       do not end up next to each other.
+
 	find "$@" -type f -exec du -b '{}' '+' \
-		| sort \
-		| awk '{ if (!_[$1]) { _[$1] = $0 } else { print _[$1]; print $0; } }' \
-		| sort -u \
-		| cut -d$'\t' -f2- \
+		| awk -F'\t' '{print $2"\t"$1}' \
+		| sort --field-separator=$'\t' -nk2 \
+		| uniq -f1 -D \
+		| cut -d$'\t' -f1 \
 		| xargs -d'\n' md5sum \
-		| sort \
+		| sort -k1,1 \
 		| uniq -w32 --all-repeated=separate \
 		| cut -d' ' -f3-
 }
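
A quick way to sanity-check the reworked pipeline is to run it against a
throwaway directory. The sketch below is not part of the patch; the file
names and contents are made up. Files "a" and "b" share both size and
content, "c" shares only the size, and "d" has a unique size, so it is
filtered out before md5sum ever runs:

	tmp=$(mktemp -d)
	printf 'aaaa'  > "$tmp/a"	# 4 bytes, duplicate content
	printf 'aaaa'  > "$tmp/b"	# 4 bytes, duplicate content
	printf 'bbbb'  > "$tmp/c"	# 4 bytes, unique content
	printf 'ccccc' > "$tmp/d"	# 5 bytes, unique size

	# du -b prints "size<TAB>name"; the awk call swaps the two columns so
	# that `uniq -f1` can skip the name field and compare on the size.
	find "$tmp" -type f -exec du -b '{}' '+' \
		| awk -F'\t' '{print $2"\t"$1}' \
		| sort --field-separator=$'\t' -nk2 \
		| uniq -f1 -D \
		| cut -d$'\t' -f1 \
		| xargs -d'\n' md5sum \
		| sort -k1,1 \
		| uniq -w32 --all-repeated=separate \
		| cut -d' ' -f3-

	rm -r "$tmp"

Only "$tmp/a" and "$tmp/b" should be printed, as a single group.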
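
One possible shape for the second TODO, sketched here as an untested
illustration rather than a proposed change: the tail of the pipeline
(`sort -k1,1 | uniq -w32 | cut`) could be replaced by a single awk pass
that buckets names by digest in memory, avoiding the second sort entirely.
The array names `group` and `count` are made up, and the column offsets
assume GNU md5sum's text-mode output:

	find "$@" -type f -exec du -b '{}' '+' \
		| awk -F'\t' '{print $2"\t"$1}' \
		| sort --field-separator=$'\t' -nk2 \
		| uniq -f1 -D \
		| cut -d$'\t' -f1 \
		| xargs -d'\n' md5sum \
		| awk '{
			# md5sum prints "digest  name": 32 hex characters, two
			# separator characters, so the name starts at column 35.
			hash = substr($0, 1, 32)
			name = substr($0, 35)
			group[hash] = group[hash] name "\n"
			count[hash]++
		}
		END {
			for (h in group)
				if (count[h] > 1)
					# group[h] already ends in "\n"; the extra "\n"
					# acts as the blank separator between groups.
					printf "%s\n", group[h]
		}'

The trade-off is memory for all names versus the cost of the sort, and the
groups come out in awk's arbitrary array-iteration order rather than sorted
by hash.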