smb: client: compress: LZ77 optimizations

This patch implements several micro-optimizations in lz77_compress() with the goal of reducing the number of instructions per [input] byte (a.k.a. IPB).

Changes:
- change the hash table to be u32 (instead of u64)
  - change the hash function to reflect that (adds lz77_hash() and lz77_read32() helpers)
- batch-write literals instead of writing them one by one
  - now that we have a well-defined hot path (match finding) and a cold path (encode literals + match), batch writing makes a significant difference
- implement adaptive skipping of input bytes
  - skip input bytes more aggressively if too few matches are being found
- name some constants for more meaningful context

Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
Signed-off-by: Steve French <stfrench@microsoft.com>
2026-05-25 07:33:19 +02:00 · 2026-04-13 16:07:10 -03:00 · 2026-04-13 16:07:10 -03:00 · 4460e9c68d
commit 4460e9c68d
parent fca46b0e68
2 changed files with 105 additions and 66 deletions
--- a/fs/smb/client/compress/lz77.c
+++ b/fs/smb/client/compress/lz77.c
@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Copyright (C) 2024, SUSE LLC
+ * Copyright (C) 2024-2026, SUSE LLC
 *
 * Authors: Enzo Matsumiya <ematsumiya@suse.de>
 *
@ -16,17 +16,26 @@
 /*
 * Compression parameters.
 */
-#define LZ77_MATCH_MIN_LEN	4
 #define LZ77_MATCH_MAX_DIST	SZ_8K
 #define LZ77_HASH_LOG		15
 #define LZ77_HASH_SIZE		(1 << LZ77_HASH_LOG)
-#define LZ77_STEP_SIZE		sizeof(u64)
+#define LZ77_RSTEP_SIZE		sizeof(u32)
+#define LZ77_MSTEP_SIZE		sizeof(u64)
+#define LZ77_SKIP_TRIGGER	4
+
+#define LZ77_PREFETCH(ptr)	__builtin_prefetch((ptr), 0, 3)
+#define LZ77_FLAG_MAX		32

 static __always_inline u8 lz77_read8(const u8 *ptr)
 {
 	return get_unaligned(ptr);
 }

+static __always_inline u32 lz77_read32(const u32 *ptr)
+{
+	return get_unaligned(ptr);
+}
+
 static __always_inline u64 lz77_read64(const u64 *ptr)
 {
 	return get_unaligned(ptr);
@ -50,14 +59,14 @@ static __always_inline void lz77_write32(u32 *ptr, u32 v)
 static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end)
 {
 	const void *start = cur;
-	u64 diff;

 	/* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
 	do {
-		diff = lz77_read64(cur) ^ lz77_read64(match);
+		const u64 diff = lz77_read64(cur) ^ lz77_read64(match);
+
 		if (!diff) {
-			cur += LZ77_STEP_SIZE;
-			match += LZ77_STEP_SIZE;
+			cur += LZ77_MSTEP_SIZE;
+			match += LZ77_MSTEP_SIZE;

 			continue;
 		}
@ -66,7 +75,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
 		cur += count_trailing_zeros(diff) >> 3;

 		return (cur - start);
-	} while (likely(cur + LZ77_STEP_SIZE <= end));
+	} while (likely(cur + LZ77_MSTEP_SIZE <= end));

 	/* Fallback to byte-by-byte comparison for last <8 bytes. */
 	while (cur < end && lz77_read8(cur) == lz77_read8(match)) {
@ -77,7 +86,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
 	return (cur - start);
 }

-static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len)
+static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len)
 {
 	len -= 3;
 	dist--;
@ -131,94 +140,124 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u
 	return dst + 4;
 }

-noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
+static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst,
+						  long *f, u32 *fc, void **fp)
 {
-	const void *srcp, *end;
+	if (start >= end)
+		return dst;
+
+	do {
+		const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc);
+
+		memcpy(dst, start, len);
+
+		dst += len;
+		start += len;
+
+		*f <<= len;
+		*fc += len;
+		if (*fc == LZ77_FLAG_MAX) {
+			lz77_write32(*fp, *f);
+			*fc = 0;
+			*fp = dst;
+			dst += 4;
+		}
+	} while (start < end);
+
+	return dst;
+}
+
+static __always_inline u32 lz77_hash(const u32 v)
+{
+	return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
+}
+
+noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen)
+{
+	const void *srcp, *rlim, *end, *anchor;
+	u32 *htable, hash, flag_count = 0;
 	void *dstp, *nib, *flag_pos;
-	u32 flag_count = 0;
 	long flag = 0;
-	u64 *htable;

 	/* This is probably a bug, so throw a warning. */
 	if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen)))
 		return -EINVAL;

-	srcp = src;
-	end = src + slen;
+	srcp = anchor = src;
+	end = srcp + slen; /* absolute end */
+	rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */
 	dstp = dst;
-	nib = NULL;
 	flag_pos = dstp;
 	dstp += 4;
+	nib = NULL;

 	htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
 	if (!htable)
 		return -ENOMEM;

-	/* Main loop. */
+	LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE);
+
+	hash = lz77_hash(lz77_read32(srcp++));
+	htable[hash] = 0;
+	hash = lz77_hash(lz77_read32(srcp));
+
+	/*
+	 * Main loop.
+	 *
+	 * @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
+	 *
+	 * This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
+	 * Any attempt to optimize it, or even organize it, can lead to huge performance loss.
+	 */
 	do {
-		u32 dist, len = 0;
-		const void *wnd;
-		u64 hash;
+		const void *match, *next = srcp;
+		u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER;

-		hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
-		wnd = src + htable[hash];
-		htable[hash] = srcp - src;
-		dist = srcp - wnd;
+		/* Match finding (hot path -- don't change the read/check/write order). */
+		do {
+			const u32 cur_hash = hash;

-		if (dist && dist < LZ77_MATCH_MAX_DIST)
-			len = lz77_match_len(wnd, srcp, end);
+			srcp = next;
+			next += step;
+			step = (skip++ >> LZ77_SKIP_TRIGGER);
+			if (unlikely(next > rlim))
+				goto out;

-		if (len < LZ77_MATCH_MIN_LEN) {
-			lz77_write8(dstp, lz77_read8(srcp));
+			hash = lz77_hash(lz77_read32(next));
+			match = src + htable[cur_hash];
+			htable[cur_hash] = srcp - src;
+		} while (likely(match + LZ77_MATCH_MAX_DIST < srcp) ||
+			 lz77_read32(match) != lz77_read32(srcp));

-			dstp++;
-			srcp++;
-
-			flag <<= 1;
-			flag_count++;
-			if (flag_count == 32) {
-				lz77_write32(flag_pos, flag);
-				flag_count = 0;
-				flag_pos = dstp;
-				dstp += 4;
-			}
-
-			continue;
-		}
-
-		dstp = lz77_write_match(dstp, &nib, dist, len);
+		dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
+		len = lz77_match_len(match, srcp, end);
+		dstp = lz77_encode_match(dstp, &nib, srcp - match, len);
 		srcp += len;
+		anchor = srcp;
+
+		LZ77_PREFETCH(srcp);

 		flag = (flag << 1) | 1;
 		flag_count++;
-		if (flag_count == 32) {
+		if (flag_count == LZ77_FLAG_MAX) {
 			lz77_write32(flag_pos, flag);
 			flag_count = 0;
 			flag_pos = dstp;
 			dstp += 4;
 		}
-	} while (likely(srcp + LZ77_STEP_SIZE <= end));

-	while (srcp < end) {
-		u32 c = umin(end - srcp, 32 - flag_count);
+		if (unlikely(srcp > rlim))
+			break;

-		memcpy(dstp, srcp, c);
+		/* Prepare for next loop. */
+		hash = lz77_hash(lz77_read32(srcp));
+	} while (srcp < end);
+out:
+	dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);

-		dstp += c;
-		srcp += c;
-
-		flag <<= c;
-		flag_count += c;
-		if (flag_count == 32) {
-			lz77_write32(flag_pos, flag);
-			flag_count = 0;
-			flag_pos = dstp;
-			dstp += 4;
-		}
-	}
-
-	flag <<= (32 - flag_count);
-	flag |= (1UL << (32 - flag_count)) - 1;
+	flag_count = LZ77_FLAG_MAX - flag_count;
+	flag <<= flag_count;
+	flag |= (1UL << flag_count) - 1;
 	lz77_write32(flag_pos, flag);

 	*dlen = dstp - dst;
--- a/fs/smb/client/compress/lz77.h
+++ b/fs/smb/client/compress/lz77.h
@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2024, SUSE LLC
+ * Copyright (C) 2024-2026, SUSE LLC
 *
 * Authors: Enzo Matsumiya <ematsumiya@suse.de>
 *
@ -39,5 +39,5 @@ static __always_inline u32 lz77_compressed_alloc_size(const u32 size)
 	return size + (size >> 3) + 8;
 }

-int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen);
+int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen);
 #endif /* _SMB_COMPRESS_LZ77_H */