mirror of
https://github.com/torvalds/linux.git
synced 2026-05-24 07:03:03 +02:00
Document parts of the code, especially the apparently non-sense parts. Other: - change pointer increment constants to sizeof() values Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de> Signed-off-by: Steve French <stfrench@microsoft.com>
336 lines
8.1 KiB
C
336 lines
8.1 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 2024-2026, SUSE LLC
|
|
*
|
|
* Authors: Enzo Matsumiya <ematsumiya@suse.de>
|
|
*
|
|
* Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec.
|
|
*/
|
|
#include <linux/slab.h>
|
|
#include <linux/sizes.h>
|
|
#include <linux/count_zeros.h>
|
|
#include <linux/unaligned.h>
|
|
|
|
#include "lz77.h"
|
|
|
|
/*
|
|
* Compression parameters.
|
|
*
|
|
* LZ77_MATCH_MAX_DIST: Farthest back a match can be from current position (can be 1 - 8K).
|
|
* LZ77_HASH_LOG:
|
|
* LZ77_HASH_SIZE: ilog2 hash size (recommended to be 13 - 18, default 15 (hash size
|
|
* 32k)).
|
|
* LZ77_RSTEP_SIZE: Number of bytes to read from input buffer for hashing and initial
|
|
* match check (default 4 bytes, this effectivelly makes this the min
|
|
* match len).
|
|
* LZ77_MSTEP_SIZE: Number of bytes to extend-compare a found match (default 8 bytes).
|
|
* LZ77_SKIP_TRIGGER: ilog2 value for adaptive skipping, i.e. to progressively skip input
|
|
* bytes when we can't find matches. Default is 4.
|
|
* Higher values (>0) will decrease compression time, but will result
|
|
* in worse compression ratio. Lower values will give better
|
|
* compression ratio (more matches found), but will increase time.
|
|
*/
|
|
#define LZ77_MATCH_MAX_DIST SZ_8K
|
|
#define LZ77_HASH_LOG 15
|
|
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
|
|
#define LZ77_RSTEP_SIZE sizeof(u32)
|
|
#define LZ77_MSTEP_SIZE sizeof(u64)
|
|
#define LZ77_SKIP_TRIGGER 4
|
|
|
|
#define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3)
|
|
#define LZ77_FLAG_MAX 32
|
|
|
|
static __always_inline u8 lz77_read8(const u8 *ptr)
|
|
{
|
|
return get_unaligned(ptr);
|
|
}
|
|
|
|
static __always_inline u32 lz77_read32(const u32 *ptr)
|
|
{
|
|
return get_unaligned(ptr);
|
|
}
|
|
|
|
static __always_inline u64 lz77_read64(const u64 *ptr)
|
|
{
|
|
return get_unaligned(ptr);
|
|
}
|
|
|
|
static __always_inline void lz77_write8(u8 *ptr, u8 v)
|
|
{
|
|
put_unaligned(v, ptr);
|
|
}
|
|
|
|
static __always_inline void lz77_write16(u16 *ptr, u16 v)
|
|
{
|
|
put_unaligned_le16(v, ptr);
|
|
}
|
|
|
|
static __always_inline void lz77_write32(u32 *ptr, u32 v)
|
|
{
|
|
put_unaligned_le32(v, ptr);
|
|
}
|
|
|
|
static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end)
|
|
{
|
|
const void *start = cur;
|
|
|
|
/* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
|
|
do {
|
|
const u64 diff = lz77_read64(cur) ^ lz77_read64(match);
|
|
|
|
if (!diff) {
|
|
cur += LZ77_MSTEP_SIZE;
|
|
match += LZ77_MSTEP_SIZE;
|
|
|
|
continue;
|
|
}
|
|
|
|
/* This computes the number of common bytes in @diff. */
|
|
cur += count_trailing_zeros(diff) >> 3;
|
|
|
|
return (cur - start);
|
|
} while (likely(cur + LZ77_MSTEP_SIZE <= end));
|
|
|
|
/* Fallback to byte-by-byte comparison for last <8 bytes. */
|
|
while (cur < end && lz77_read8(cur) == lz77_read8(match)) {
|
|
cur++;
|
|
match++;
|
|
}
|
|
|
|
return (cur - start);
|
|
}
|
|
|
|
/**
|
|
* lz77_encode_match() - Match encoding.
|
|
* @dst: compressed buffer
|
|
* @nib: pointer to an address in @dst
|
|
* @dist: match distance
|
|
* @len: match length
|
|
*
|
|
* Assumes all args were previously checked.
|
|
*
|
|
* Return: @dst advanced to new position
|
|
*
|
|
* Ref: MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing"
|
|
*/
|
|
static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len)
|
|
{
|
|
len -= 3;
|
|
dist--;
|
|
dist <<= 3;
|
|
|
|
if (len < 7) {
|
|
lz77_write16(dst, dist + len);
|
|
|
|
return dst + sizeof(u16);
|
|
}
|
|
|
|
dist |= 7;
|
|
lz77_write16(dst, dist);
|
|
dst += sizeof(u16);
|
|
len -= 7;
|
|
|
|
if (!*nib) {
|
|
lz77_write8(dst, umin(len, 15));
|
|
*nib = dst;
|
|
dst++;
|
|
} else {
|
|
u8 *b = *nib;
|
|
|
|
lz77_write8(b, *b | umin(len, 15) << 4);
|
|
*nib = NULL;
|
|
}
|
|
|
|
if (len < 15)
|
|
return dst;
|
|
|
|
len -= 15;
|
|
if (len < 255) {
|
|
lz77_write8(dst, len);
|
|
|
|
return dst + 1;
|
|
}
|
|
|
|
lz77_write8(dst, 0xff);
|
|
dst++;
|
|
len += 7 + 15;
|
|
if (len <= 0xffff) {
|
|
lz77_write16(dst, len);
|
|
|
|
return dst + sizeof(u16);
|
|
}
|
|
|
|
lz77_write16(dst, 0);
|
|
dst += sizeof(u16);
|
|
lz77_write32(dst, len);
|
|
|
|
return dst + sizeof(u32);
|
|
}
|
|
|
|
/**
|
|
* lz77_encode_literals() - Literals encoding.
|
|
* @start: where to start copying literals (uncompressed buffer)
|
|
* @end: when to stop copying (uncompressed buffer)
|
|
* @dst: compressed buffer
|
|
* @f: pointer to current flag value
|
|
* @fc: pointer to current flag count
|
|
* @fp: pointer to current flag address
|
|
*
|
|
* Batch copy literals from @start to @dst, updating flag values accordingly.
|
|
* Assumes all args were previously checked.
|
|
*
|
|
* Return: @dst advanced to new position
|
|
*
|
|
* MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing"
|
|
*/
|
|
static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst,
|
|
long *f, u32 *fc, void **fp)
|
|
{
|
|
if (start >= end)
|
|
return dst;
|
|
|
|
do {
|
|
const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc);
|
|
|
|
memcpy(dst, start, len);
|
|
|
|
dst += len;
|
|
start += len;
|
|
|
|
*f <<= len;
|
|
*fc += len;
|
|
if (*fc == LZ77_FLAG_MAX) {
|
|
lz77_write32(*fp, *f);
|
|
*fc = 0;
|
|
*fp = dst;
|
|
dst += sizeof(u32);
|
|
}
|
|
} while (start < end);
|
|
|
|
return dst;
|
|
}
|
|
|
|
static __always_inline u32 lz77_hash(const u32 v)
|
|
{
|
|
return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
|
|
}
|
|
|
|
noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen)
|
|
{
|
|
const void *srcp, *rlim, *end, *anchor;
|
|
u32 *htable, hash, flag_count = 0;
|
|
void *dstp, *nib, *flag_pos;
|
|
long flag = 0;
|
|
|
|
/* This is probably a bug, so throw a warning. */
|
|
if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen)))
|
|
return -EINVAL;
|
|
|
|
srcp = anchor = src;
|
|
end = srcp + slen; /* absolute end */
|
|
rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */
|
|
dstp = dst;
|
|
flag_pos = dstp;
|
|
dstp += sizeof(u32);
|
|
nib = NULL;
|
|
|
|
htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
|
|
if (!htable)
|
|
return -ENOMEM;
|
|
|
|
LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE);
|
|
|
|
/*
|
|
* Adjust @srcp so we don't get a false positive match on first iteration.
|
|
* Then prepare hash for first loop iteration (don't advance @srcp again).
|
|
*/
|
|
hash = lz77_hash(lz77_read32(srcp++));
|
|
htable[hash] = 0;
|
|
hash = lz77_hash(lz77_read32(srcp));
|
|
|
|
/*
|
|
* Main loop.
|
|
*
|
|
* @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
|
|
*
|
|
* This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
|
|
* Any attempt to optimize it, or even organize it, can lead to huge performance loss.
|
|
*/
|
|
do {
|
|
const void *match, *next = srcp;
|
|
u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER;
|
|
|
|
/* Match finding (hot path -- don't change the read/check/write order). */
|
|
do {
|
|
const u32 cur_hash = hash;
|
|
|
|
srcp = next;
|
|
next += step;
|
|
|
|
/*
|
|
* Adaptive skipping.
|
|
*
|
|
* Increment @step every (1 << LZ77_SKIP_TRIGGER, 16 in our case) bytes
|
|
* without a match.
|
|
* Reset to 1 when a match is found.
|
|
*/
|
|
step = (skip++ >> LZ77_SKIP_TRIGGER);
|
|
if (unlikely(next > rlim))
|
|
goto out;
|
|
|
|
hash = lz77_hash(lz77_read32(next));
|
|
match = src + htable[cur_hash];
|
|
htable[cur_hash] = srcp - src;
|
|
} while (likely(match + LZ77_MATCH_MAX_DIST < srcp) ||
|
|
lz77_read32(match) != lz77_read32(srcp));
|
|
|
|
/*
|
|
* Match found. Warm/cold path; begin parsing @srcp and writing to @dstp:
|
|
* - flush literals
|
|
* - compute match length (*)
|
|
* - encode match
|
|
*
|
|
* (*) Current minimum match length is defined by the memory read size above, so
|
|
* here we already know that we have 4 matching bytes, but it's just faster to
|
|
* redundantly compute it again in lz77_match_len() than to adjust pointers/len.
|
|
*/
|
|
dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
|
|
len = lz77_match_len(match, srcp, end);
|
|
dstp = lz77_encode_match(dstp, &nib, srcp - match, len);
|
|
srcp += len;
|
|
anchor = srcp;
|
|
|
|
LZ77_PREFETCH(srcp);
|
|
|
|
flag = (flag << 1) | 1;
|
|
flag_count++;
|
|
if (flag_count == LZ77_FLAG_MAX) {
|
|
lz77_write32(flag_pos, flag);
|
|
flag_count = 0;
|
|
flag_pos = dstp;
|
|
dstp += sizeof(u32);
|
|
}
|
|
|
|
if (unlikely(srcp > rlim))
|
|
break;
|
|
|
|
/* Prepare for next loop. */
|
|
hash = lz77_hash(lz77_read32(srcp));
|
|
} while (srcp < end);
|
|
out:
|
|
dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);
|
|
|
|
flag_count = LZ77_FLAG_MAX - flag_count;
|
|
flag <<= flag_count;
|
|
flag |= (1UL << flag_count) - 1;
|
|
lz77_write32(flag_pos, flag);
|
|
|
|
*dlen = dstp - dst;
|
|
kvfree(htable);
|
|
|
|
if (*dlen < slen)
|
|
return 0;
|
|
|
|
return -EMSGSIZE;
|
|
}
|