mirror of
https://github.com/torvalds/linux.git
synced 2026-05-25 07:33:19 +02:00
smb: client: compress: LZ77 optimizations
This patch implements several micro-optimizations on lz77_compress() with the goal of reducing the number of instructions per [input] byte (a.k.a. IPB). Changes: - change hashtable to be u32 (instead of u64) -- change the hash function to reflect that (adds lz77_hash() and lz77_read32() helpers) - batch-write literals instead of 1 by 1 -- now that we have a well defined hot path (match finding) and a cold path (encode literals + match), batch writing makes a significant difference - implement adaptive skipping of input bytes -- skip input bytes more aggressively if too few matches are being found - name some constants for more meaningful context Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Signed-off-by: Steve French <stfrench@microsoft.com>
This commit is contained in:
parent
fca46b0e68
commit
4460e9c68d
|
|
@ -1,6 +1,6 @@
|
|||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright (C) 2024, SUSE LLC
|
||||
* Copyright (C) 2024-2026, SUSE LLC
|
||||
*
|
||||
* Authors: Enzo Matsumiya <ematsumiya@suse.de>
|
||||
*
|
||||
|
|
@ -16,17 +16,26 @@
|
|||
/*
|
||||
* Compression parameters.
|
||||
*/
|
||||
#define LZ77_MATCH_MIN_LEN 4
|
||||
#define LZ77_MATCH_MAX_DIST SZ_8K
|
||||
#define LZ77_HASH_LOG 15
|
||||
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
|
||||
#define LZ77_STEP_SIZE sizeof(u64)
|
||||
#define LZ77_RSTEP_SIZE sizeof(u32)
|
||||
#define LZ77_MSTEP_SIZE sizeof(u64)
|
||||
#define LZ77_SKIP_TRIGGER 4
|
||||
|
||||
#define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3)
|
||||
#define LZ77_FLAG_MAX 32
|
||||
|
||||
static __always_inline u8 lz77_read8(const u8 *ptr)
|
||||
{
|
||||
return get_unaligned(ptr);
|
||||
}
|
||||
|
||||
static __always_inline u32 lz77_read32(const u32 *ptr)
|
||||
{
|
||||
return get_unaligned(ptr);
|
||||
}
|
||||
|
||||
static __always_inline u64 lz77_read64(const u64 *ptr)
|
||||
{
|
||||
return get_unaligned(ptr);
|
||||
|
|
@ -50,14 +59,14 @@ static __always_inline void lz77_write32(u32 *ptr, u32 v)
|
|||
static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end)
|
||||
{
|
||||
const void *start = cur;
|
||||
u64 diff;
|
||||
|
||||
/* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
|
||||
do {
|
||||
diff = lz77_read64(cur) ^ lz77_read64(match);
|
||||
const u64 diff = lz77_read64(cur) ^ lz77_read64(match);
|
||||
|
||||
if (!diff) {
|
||||
cur += LZ77_STEP_SIZE;
|
||||
match += LZ77_STEP_SIZE;
|
||||
cur += LZ77_MSTEP_SIZE;
|
||||
match += LZ77_MSTEP_SIZE;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
|
@ -66,7 +75,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
|
|||
cur += count_trailing_zeros(diff) >> 3;
|
||||
|
||||
return (cur - start);
|
||||
} while (likely(cur + LZ77_STEP_SIZE <= end));
|
||||
} while (likely(cur + LZ77_MSTEP_SIZE <= end));
|
||||
|
||||
/* Fallback to byte-by-byte comparison for last <8 bytes. */
|
||||
while (cur < end && lz77_read8(cur) == lz77_read8(match)) {
|
||||
|
|
@ -77,7 +86,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
|
|||
return (cur - start);
|
||||
}
|
||||
|
||||
static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len)
|
||||
static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len)
|
||||
{
|
||||
len -= 3;
|
||||
dist--;
|
||||
|
|
@ -131,94 +140,124 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u
|
|||
return dst + 4;
|
||||
}
|
||||
|
||||
noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
|
||||
static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst,
|
||||
long *f, u32 *fc, void **fp)
|
||||
{
|
||||
const void *srcp, *end;
|
||||
if (start >= end)
|
||||
return dst;
|
||||
|
||||
do {
|
||||
const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc);
|
||||
|
||||
memcpy(dst, start, len);
|
||||
|
||||
dst += len;
|
||||
start += len;
|
||||
|
||||
*f <<= len;
|
||||
*fc += len;
|
||||
if (*fc == LZ77_FLAG_MAX) {
|
||||
lz77_write32(*fp, *f);
|
||||
*fc = 0;
|
||||
*fp = dst;
|
||||
dst += 4;
|
||||
}
|
||||
} while (start < end);
|
||||
|
||||
return dst;
|
||||
}
|
||||
|
||||
static __always_inline u32 lz77_hash(const u32 v)
|
||||
{
|
||||
return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
|
||||
}
|
||||
|
||||
noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen)
|
||||
{
|
||||
const void *srcp, *rlim, *end, *anchor;
|
||||
u32 *htable, hash, flag_count = 0;
|
||||
void *dstp, *nib, *flag_pos;
|
||||
u32 flag_count = 0;
|
||||
long flag = 0;
|
||||
u64 *htable;
|
||||
|
||||
/* This is probably a bug, so throw a warning. */
|
||||
if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen)))
|
||||
return -EINVAL;
|
||||
|
||||
srcp = src;
|
||||
end = src + slen;
|
||||
srcp = anchor = src;
|
||||
end = srcp + slen; /* absolute end */
|
||||
rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */
|
||||
dstp = dst;
|
||||
nib = NULL;
|
||||
flag_pos = dstp;
|
||||
dstp += 4;
|
||||
nib = NULL;
|
||||
|
||||
htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
|
||||
if (!htable)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Main loop. */
|
||||
LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE);
|
||||
|
||||
hash = lz77_hash(lz77_read32(srcp++));
|
||||
htable[hash] = 0;
|
||||
hash = lz77_hash(lz77_read32(srcp));
|
||||
|
||||
/*
|
||||
* Main loop.
|
||||
*
|
||||
* @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
|
||||
*
|
||||
* This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
|
||||
* Any attempt to optimize it, or even organize it, can lead to huge performance loss.
|
||||
*/
|
||||
do {
|
||||
u32 dist, len = 0;
|
||||
const void *wnd;
|
||||
u64 hash;
|
||||
const void *match, *next = srcp;
|
||||
u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER;
|
||||
|
||||
hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
|
||||
wnd = src + htable[hash];
|
||||
htable[hash] = srcp - src;
|
||||
dist = srcp - wnd;
|
||||
/* Match finding (hot path -- don't change the read/check/write order). */
|
||||
do {
|
||||
const u32 cur_hash = hash;
|
||||
|
||||
if (dist && dist < LZ77_MATCH_MAX_DIST)
|
||||
len = lz77_match_len(wnd, srcp, end);
|
||||
srcp = next;
|
||||
next += step;
|
||||
step = (skip++ >> LZ77_SKIP_TRIGGER);
|
||||
if (unlikely(next > rlim))
|
||||
goto out;
|
||||
|
||||
if (len < LZ77_MATCH_MIN_LEN) {
|
||||
lz77_write8(dstp, lz77_read8(srcp));
|
||||
hash = lz77_hash(lz77_read32(next));
|
||||
match = src + htable[cur_hash];
|
||||
htable[cur_hash] = srcp - src;
|
||||
} while (likely(match + LZ77_MATCH_MAX_DIST < srcp) ||
|
||||
lz77_read32(match) != lz77_read32(srcp));
|
||||
|
||||
dstp++;
|
||||
srcp++;
|
||||
|
||||
flag <<= 1;
|
||||
flag_count++;
|
||||
if (flag_count == 32) {
|
||||
lz77_write32(flag_pos, flag);
|
||||
flag_count = 0;
|
||||
flag_pos = dstp;
|
||||
dstp += 4;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
dstp = lz77_write_match(dstp, &nib, dist, len);
|
||||
dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
|
||||
len = lz77_match_len(match, srcp, end);
|
||||
dstp = lz77_encode_match(dstp, &nib, srcp - match, len);
|
||||
srcp += len;
|
||||
anchor = srcp;
|
||||
|
||||
LZ77_PREFETCH(srcp);
|
||||
|
||||
flag = (flag << 1) | 1;
|
||||
flag_count++;
|
||||
if (flag_count == 32) {
|
||||
if (flag_count == LZ77_FLAG_MAX) {
|
||||
lz77_write32(flag_pos, flag);
|
||||
flag_count = 0;
|
||||
flag_pos = dstp;
|
||||
dstp += 4;
|
||||
}
|
||||
} while (likely(srcp + LZ77_STEP_SIZE <= end));
|
||||
|
||||
while (srcp < end) {
|
||||
u32 c = umin(end - srcp, 32 - flag_count);
|
||||
if (unlikely(srcp > rlim))
|
||||
break;
|
||||
|
||||
memcpy(dstp, srcp, c);
|
||||
/* Prepare for next loop. */
|
||||
hash = lz77_hash(lz77_read32(srcp));
|
||||
} while (srcp < end);
|
||||
out:
|
||||
dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);
|
||||
|
||||
dstp += c;
|
||||
srcp += c;
|
||||
|
||||
flag <<= c;
|
||||
flag_count += c;
|
||||
if (flag_count == 32) {
|
||||
lz77_write32(flag_pos, flag);
|
||||
flag_count = 0;
|
||||
flag_pos = dstp;
|
||||
dstp += 4;
|
||||
}
|
||||
}
|
||||
|
||||
flag <<= (32 - flag_count);
|
||||
flag |= (1UL << (32 - flag_count)) - 1;
|
||||
flag_count = LZ77_FLAG_MAX - flag_count;
|
||||
flag <<= flag_count;
|
||||
flag |= (1UL << flag_count) - 1;
|
||||
lz77_write32(flag_pos, flag);
|
||||
|
||||
*dlen = dstp - dst;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright (C) 2024, SUSE LLC
|
||||
* Copyright (C) 2024-2026, SUSE LLC
|
||||
*
|
||||
* Authors: Enzo Matsumiya <ematsumiya@suse.de>
|
||||
*
|
||||
|
|
@ -39,5 +39,5 @@ static __always_inline u32 lz77_compressed_alloc_size(const u32 size)
|
|||
return size + (size >> 3) + 8;
|
||||
}
|
||||
|
||||
int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen);
|
||||
int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen);
|
||||
#endif /* _SMB_COMPRESS_LZ77_H */
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user