From b38e01c72add1faf4ce0eb5c0f5e354e21a0f477 Mon Sep 17 00:00:00 2001
From: Julian Prein <druckdev@protonmail.com>
Date: Wed, 28 Dec 2022 01:43:01 +0100
Subject: [PATCH] zsh:funcs:finddup: Support filenames with spaces

Previously, when filenames contained spaces, the function would break
because `awk '{print $2,$1}'` printed only the part of the filename up to
the first space.

The field swap was a workaround to make `uniq` compare only the sizes,
since `uniq` unfortunately only has a flag to **skip** leading fields and
none to restrict the comparison to the first field.

Fix this issue by using a short awk script that mimics `uniq` but only
with the first field (i.e. the size).

My awk-fu is unfortunately not very good, which is why the one-liner
prints the first line of each group of equal sizes multiple times (once
for every further line with the same size). The `sort -u` pipe afterwards
gets rid of those repeated lines.
---
 .config/zsh/zshrc.d/40-functions.zsh | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.config/zsh/zshrc.d/40-functions.zsh b/.config/zsh/zshrc.d/40-functions.zsh
index abe5193..f5d745c 100644
--- a/.config/zsh/zshrc.d/40-functions.zsh
+++ b/.config/zsh/zshrc.d/40-functions.zsh
@@ -554,15 +554,17 @@ suffix() {
 finddup() {
 	# find all files, filter the ones out with unique size, calculate md5 and
 	# print duplicates
+	# TODO: Make the awk script print each line only once, so that the
+	#       `sort -u` after it is no longer needed
 	find "$@" -type f -exec du '{}' '+' \
-	| awk '{print $2,$1}' \
-	| sort -k2 \
-	| uniq -f1 -D \
-	| awk '{print $1}' \
+	| sort \
+	| awk '{ if (!_[$1]) { _[$1] = $0 } else { print _[$1]; print $0; } }' \
+	| sort -u \
+	| cut -d$'\t' -f2- \
 	| xargs -d'\n' md5sum \
 	| sort \
 	| uniq -w32 --all-repeated=separate \
-	| awk '{print $2}'
+	| cut -d' ' -f3-
 }
 
 # Wrapper around tmsu that searches for .tmsu/db in all parent directories and