From 81b35e7cad790eecf9f359662804bb26055ac7e8 Mon Sep 17 00:00:00 2001 From: Jie Meng Date: Fri, 7 Oct 2022 13:23:47 -0700 Subject: [PATCH 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx x64 JIT produces redundant instructions when a shift operation's destination register is BPF_REG_4/ecx and this patch removes them. Specifically, when dest reg is BPF_REG_4 but the src isn't, we needn't push and pop ecx around shift only to get it overwritten by r11 immediately afterwards. In the rare case when both dest and src registers are BPF_REG_4, a single shift instruction is sufficient and we don't need the two MOV instructions around the shift. To summarize using shift left as an example, without patch: ------------------------------------------------- | dst == ecx | dst != ecx ================================================= src == ecx | mov r11, ecx | shl dst, cl | shl r11, ecx | | mov ecx, r11 | ------------------------------------------------- src != ecx | mov r11, ecx | push ecx | push ecx | mov ecx, src | mov ecx, src | shl dst, cl | shl r11, cl | pop ecx | pop ecx | | mov ecx, r11 | ------------------------------------------------- With patch: ------------------------------------------------- | dst == ecx | dst != ecx ================================================= src == ecx | shl ecx, cl | shl dst, cl ------------------------------------------------- src != ecx | mov r11, ecx | push ecx | mov ecx, src | mov ecx, src | shl r11, cl | shl dst, cl | mov ecx, r11 | pop ecx ------------------------------------------------- Signed-off-by: Jie Meng Link: https://lore.kernel.org/r/20221007202348.1118830-2-jmeng@fb.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0abd082786e7..d926ca637d8d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1138,16 +1138,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: - /* Check for bad case when dst_reg == rcx */ - if (dst_reg == BPF_REG_4) { - /* mov r11, dst_reg */ - EMIT_mov(AUX_REG, dst_reg); - dst_reg = AUX_REG; - } - if (src_reg != BPF_REG_4) { /* common case */ - EMIT1(0x51); /* push rcx */ - + /* Check for bad case when dst_reg == rcx */ + if (dst_reg == BPF_REG_4) { + /* mov r11, dst_reg */ + EMIT_mov(AUX_REG, dst_reg); + dst_reg = AUX_REG; + } else { + EMIT1(0x51); /* push rcx */ + } /* mov rcx, src_reg */ EMIT_mov(BPF_REG_4, src_reg); } @@ -1159,12 +1158,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); - if (src_reg != BPF_REG_4) - EMIT1(0x59); /* pop rcx */ + if (src_reg != BPF_REG_4) { + if (insn->dst_reg == BPF_REG_4) + /* mov dst_reg, r11 */ + EMIT_mov(insn->dst_reg, AUX_REG); + else + EMIT1(0x59); /* pop rcx */ + } - if (insn->dst_reg == BPF_REG_4) - /* mov dst_reg, r11 */ - EMIT_mov(insn->dst_reg, AUX_REG); break; case BPF_ALU | BPF_END | BPF_FROM_BE: From 77d8f5d47bfbb5f0a8630102838ffb22cd70d6f5 Mon Sep 17 00:00:00 2001 From: Jie Meng Date: Fri, 7 Oct 2022 13:23:48 -0700 Subject: [PATCH 2/3] bpf,x64: use shrx/sarx/shlx when available BMI2 provides 3 shift instructions (shrx, sarx and shlx) that use VEX encoding but target general purpose registers [1]. They allow the shift count in any general purpose register and have the same performance as non BMI2 shift instructions [2]. Instead of shr/sar/shl that implicitly use %cl (lowest 8 bit of %rcx), emit their more flexible alternatives provided in BMI2 when advantageous; keep using the non BMI2 instructions when shift count is already in BPF_REG_4/%rcx as non BMI2 instructions are shorter. To summarize, when BMI2 is available: ------------------------------------------------- | arbitrary dst ================================================= src == ecx | shl dst, cl ------------------------------------------------- src != ecx | shlx dst, dst, src ------------------------------------------------- And no additional register shuffling is needed. A concrete example between non BMI2 and BMI2 codegen. To shift %rsi by %rdi: Without BMI2: ef3: push %rcx 51 ef4: mov %rdi,%rcx 48 89 f9 ef7: shl %cl,%rsi 48 d3 e6 efa: pop %rcx 59 With BMI2: f0b: shlx %rdi,%rsi,%rsi c4 e2 c1 f7 f6 [1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set [2] https://www.agner.org/optimize/instruction_tables.pdf Signed-off-by: Jie Meng Link: https://lore.kernel.org/r/20221007202348.1118830-3-jmeng@fb.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 81 +++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d926ca637d8d..d7dd8e0db8da 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -891,6 +891,65 @@ static void emit_nops(u8 **pprog, int len) *pprog = prog; } +/* emit the 3-byte VEX prefix + * + * r: same as rex.r, extra bit for ModRM reg field + * x: same as rex.x, extra bit for SIB index field + * b: same as rex.b, extra bit for ModRM r/m, or SIB base + * m: opcode map select, encoding escape bytes e.g. 0x0f38 + * w: same as rex.w (32 bit or 64 bit) or opcode specific + * src_reg2: additional source reg (encoded as BPF reg) + * l: vector length (128 bit or 256 bit) or reserved + * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3) + */ +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m, + bool w, u8 src_reg2, bool l, u8 pp) +{ + u8 *prog = *pprog; + const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */ + u8 b1, b2; + u8 vvvv = reg2hex[src_reg2]; + + /* reg2hex gives only the lower 3 bit of vvvv */ + if (is_ereg(src_reg2)) + vvvv |= 1 << 3; + + /* + * 2nd byte of 3-byte VEX prefix + * ~ means bit inverted encoding + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * |~R |~X |~B | m | + * +---+---+---+---+---+---+---+---+ + */ + b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f); + /* + * 3rd byte of 3-byte VEX prefix + * + * 7 0 + * +---+---+---+---+---+---+---+---+ + * | W | ~vvvv | L | pp | + * +---+---+---+---+---+---+---+---+ + */ + b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3); + + EMIT3(b0, b1, b2); + *pprog = prog; +} + +/* emit BMI2 shift instruction */ +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op) +{ + u8 *prog = *pprog; + bool r = is_ereg(dst_reg); + u8 m = 2; /* escape code 0f38 */ + + emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op); + EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg)); + *pprog = prog; +} + #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, @@ -1137,6 +1196,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image case BPF_ALU64 | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_RSH | BPF_X: case BPF_ALU64 | BPF_ARSH | BPF_X: + /* BMI2 shifts aren't better when shift count is already in rcx */ + if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) { + /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */ + bool w = (BPF_CLASS(insn->code) == BPF_ALU64); + u8 op; + + switch (BPF_OP(insn->code)) { + case BPF_LSH: + op = 1; /* prefix 0x66 */ + break; + case BPF_RSH: + op = 3; /* prefix 0xf2 */ + break; + case BPF_ARSH: + op = 2; /* prefix 0xf3 */ + break; + } + + emit_shiftx(&prog, dst_reg, src_reg, w, op); + + break; + } if (src_reg != BPF_REG_4) { /* common case */ /* Check for bad case when dst_reg == rcx */ From 8662de2321496499a21841486660817f72ac9456 Mon Sep 17 00:00:00 2001 From: Jie Meng Date: Fri, 7 Oct 2022 13:23:49 -0700 Subject: [PATCH 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Current tests cover only shifts with an immediate as the source operand/shift counts; add a new test case to cover register operand. Signed-off-by: Jie Meng Link: https://lore.kernel.org/r/20221007202348.1118830-4-jmeng@fb.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c index 79021c30e51e..8bf37e5207f1 100644 --- a/tools/testing/selftests/bpf/verifier/jit.c +++ b/tools/testing/selftests/bpf/verifier/jit.c @@ -20,6 +20,30 @@ .result = ACCEPT, .retval = 2, }, +{ + "jit: lsh, rsh, arsh by reg", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_MOV64_IMM(BPF_REG_1, 0xff), + BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_0), + BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_4), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_1), + BPF_ALU32_REG(BPF_RSH, BPF_REG_4, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xff, 1), + BPF_EXIT_INSN(), + BPF_ALU64_REG(BPF_ARSH, BPF_REG_4, BPF_REG_4), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 2, +}, { "jit: mov32 for ldimm64, 1", .insns = {