From c239bfc2e4aca29bfd02e9f63d32763f528bbfdf Mon Sep 17 00:00:00 2001 From: Romain Naour Date: Tue, 20 Apr 2021 22:12:10 +0100 Subject: [PATCH 01/29] mips: Do not include hi and lo in clobber list for R6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1d7ba0165d8206ac073f7ac3b14fc0836b66eae7 upstream. From [1] "GCC 10 (PR 91233) won't silently allow registers that are not architecturally available to be present in the clobber list anymore, resulting in build failure for mips*r6 targets in form of: ... .../sysdep.h:146:2: error: the register ‘lo’ cannot be clobbered in ‘asm’ for the current target 146 | __asm__ volatile ( \ | ^~~~~~~ This is because base R6 ISA doesn't define hi and lo registers w/o DSP extension. This patch provides the alternative clobber list for r6 targets that won't include those registers." Since kernel 5.4 and mips support for generic vDSO [2], the kernel fail to build for mips r6 cpus with gcc 10 for the same reason as glibc. [1] https://sourceware.org/git/?p=glibc.git;a=commit;h=020b2a97bb15f807c0482f0faee2184ed05bcad8 [2] '24640f233b46 ("mips: Add support for generic vDSO")' Signed-off-by: Romain Naour Signed-off-by: Sudip Mukherjee Signed-off-by: Thomas Bogendoerfer Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/vdso/gettimeofday.h | 26 ++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/mips/include/asm/vdso/gettimeofday.h b/arch/mips/include/asm/vdso/gettimeofday.h index 2203e2d0ae2a..44a45f3fa4b0 100644 --- a/arch/mips/include/asm/vdso/gettimeofday.h +++ b/arch/mips/include/asm/vdso/gettimeofday.h @@ -20,6 +20,12 @@ #define VDSO_HAS_CLOCK_GETRES 1 +#if MIPS_ISA_REV < 6 +#define VDSO_SYSCALL_CLOBBERS "hi", "lo", +#else +#define VDSO_SYSCALL_CLOBBERS +#endif + static __always_inline long gettimeofday_fallback( struct __kernel_old_timeval *_tv, struct timezone *_tz) @@ -35,7 +41,9 @@ static __always_inline long gettimeofday_fallback( : "=r" (ret), "=r" (error) : "r" (tv), "r" (tz), "r" (nr) : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); + "$14", "$15", "$24", "$25", + VDSO_SYSCALL_CLOBBERS + "memory"); return error ? -ret : ret; } @@ -59,7 +67,9 @@ static __always_inline long clock_gettime_fallback( : "=r" (ret), "=r" (error) : "r" (clkid), "r" (ts), "r" (nr) : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); + "$14", "$15", "$24", "$25", + VDSO_SYSCALL_CLOBBERS + "memory"); return error ? -ret : ret; } @@ -83,7 +93,9 @@ static __always_inline int clock_getres_fallback( : "=r" (ret), "=r" (error) : "r" (clkid), "r" (ts), "r" (nr) : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); + "$14", "$15", "$24", "$25", + VDSO_SYSCALL_CLOBBERS + "memory"); return error ? -ret : ret; } @@ -105,7 +117,9 @@ static __always_inline long clock_gettime32_fallback( : "=r" (ret), "=r" (error) : "r" (clkid), "r" (ts), "r" (nr) : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); + "$14", "$15", "$24", "$25", + VDSO_SYSCALL_CLOBBERS + "memory"); return error ? -ret : ret; } @@ -125,7 +139,9 @@ static __always_inline int clock_getres32_fallback( : "=r" (ret), "=r" (error) : "r" (clkid), "r" (ts), "r" (nr) : "$1", "$3", "$8", "$9", "$10", "$11", "$12", "$13", - "$14", "$15", "$24", "$25", "hi", "lo", "memory"); + "$14", "$15", "$24", "$25", + VDSO_SYSCALL_CLOBBERS + "memory"); return error ? -ret : ret; } From d3598eb3915cc0c0d8cab42f4a6258ff44c4033e Mon Sep 17 00:00:00 2001 From: Jonathon Reinhart Date: Mon, 12 Apr 2021 00:24:53 -0400 Subject: [PATCH 02/29] netfilter: conntrack: Make global sysctls readonly in non-init netns commit 2671fa4dc0109d3fb581bc3078fdf17b5d9080f6 upstream. These sysctls point to global variables: - NF_SYSCTL_CT_MAX (&nf_conntrack_max) - NF_SYSCTL_CT_EXPECT_MAX (&nf_ct_expect_max) - NF_SYSCTL_CT_BUCKETS (&nf_conntrack_htable_size_user) Because their data pointers are not updated to point to per-netns structures, they must be marked read-only in a non-init_net ns. Otherwise, changes in any net namespace are reflected in (leaked into) all other net namespaces. This problem has existed since the introduction of net namespaces. The current logic marks them read-only only if the net namespace is owned by an unprivileged user (other than init_user_ns). Commit d0febd81ae77 ("netfilter: conntrack: re-visit sysctls in unprivileged namespaces") "exposes all sysctls even if the namespace is unpriviliged." Since we need to mark them readonly in any case, we can forego the unprivileged user check altogether. Fixes: d0febd81ae77 ("netfilter: conntrack: re-visit sysctls in unprivileged namespaces") Signed-off-by: Jonathon Reinhart Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_standalone.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index c6c0cb465664..313d1c8ff066 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1060,16 +1060,10 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); - /* Don't allow unprivileged users to alter certain sysctls */ - if (net->user_ns != &init_user_ns) { + /* Don't allow non-init_net ns to alter global sysctls */ + if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; - table[NF_SYSCTL_CT_HELPER].mode = 0444; -#ifdef CONFIG_NF_CONNTRACK_EVENTS - table[NF_SYSCTL_CT_EVENTS].mode = 0444; -#endif - table[NF_SYSCTL_CT_BUCKETS].mode = 0444; - } else if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } From 2e68890993d0940cedc1f0369282256854293a93 Mon Sep 17 00:00:00 2001 From: Phillip Potter Date: Thu, 1 Apr 2021 23:36:07 +0100 Subject: [PATCH 03/29] net: usb: ax88179_178a: initialize local variables before use commit bd78980be1a68d14524c51c4b4170782fada622b upstream. Use memset to initialize local array in drivers/net/usb/ax88179_178a.c, and also set a local u16 and u32 variable to 0. Fixes a KMSAN found uninit-value bug reported by syzbot at: https://syzkaller.appspot.com/bug?id=00371c73c72f72487c1d0bfe0cc9d00de339d5aa Reported-by: syzbot+4993e4a0e237f1b53747@syzkaller.appspotmail.com Signed-off-by: Phillip Potter Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/ax88179_178a.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 5541f3faedbc..b77b0a33d697 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -296,12 +296,12 @@ static int ax88179_read_cmd(struct usbnet *dev, u8 cmd, u16 value, u16 index, int ret; if (2 == size) { - u16 buf; + u16 buf = 0; ret = __ax88179_read_cmd(dev, cmd, value, index, size, &buf, 0); le16_to_cpus(&buf); *((u16 *)data) = buf; } else if (4 == size) { - u32 buf; + u32 buf = 0; ret = __ax88179_read_cmd(dev, cmd, value, index, size, &buf, 0); le32_to_cpus(&buf); *((u32 *)data) = buf; @@ -1296,6 +1296,8 @@ static void ax88179_get_mac_addr(struct usbnet *dev) { u8 mac[ETH_ALEN]; + memset(mac, 0, sizeof(mac)); + /* Maybe the boot loader passed the MAC address via device tree */ if (!eth_platform_get_mac_address(&dev->udev->dev, mac)) { netif_dbg(dev, ifup, dev->net, From a41c193d004eac7c4daf5166030001d9763a6ebe Mon Sep 17 00:00:00 2001 From: Nick Lowe Date: Mon, 21 Dec 2020 22:25:02 +0000 Subject: [PATCH 04/29] igb: Enable RSS for Intel I211 Ethernet Controller commit 6e6026f2dd2005844fb35c3911e8083c09952c6c upstream. The Intel I211 Ethernet Controller supports 2 Receive Side Scaling (RSS) queues. It should not be excluded from having this feature enabled. Via commit c883de9fd787 ("igb: rename igb define to be more generic") E1000_MRQC_ENABLE_RSS_4Q was renamed to E1000_MRQC_ENABLE_RSS_MQ to indicate that this is a generic bit flag to enable queues and not a flag that is specific to devices that support 4 queues The bit flag enables 2, 4 or 8 queues appropriately depending on the part. Tested with a multicore CPU and frames were then distributed as expected. This issue appears to have been introduced because of confusion caused by the prior name. Signed-off-by: Nick Lowe Tested-by: David Switzer Signed-off-by: Tony Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fecfcfcf161c..368f0aac5e1d 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4482,8 +4482,7 @@ static void igb_setup_mrqc(struct igb_adapter *adapter) else mrqc |= E1000_MRQC_ENABLE_VMDQ; } else { - if (hw->mac.type != e1000_i211) - mrqc |= E1000_MRQC_ENABLE_RSS_MQ; + mrqc |= E1000_MRQC_ENABLE_RSS_MQ; } igb_vmm_control(adapter); From 2cfa537674cd1051a3b8111536d77d0558f33d5d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 30 Apr 2021 16:21:46 +0200 Subject: [PATCH 05/29] bpf: Fix masking negation logic upon negative dst register commit b9b34ddbe2076ade359cd5ce7537d5ed019e9807 upstream. The negation logic for the case where the off_reg is sitting in the dst register is not correct given then we cannot just invert the add to a sub or vice versa. As a fix, perform the final bitwise and-op unconditionally into AX from the off_reg, then move the pointer from the src to dst and finally use AX as the source for the original pointer arithmetic operation such that the inversion yields a correct result. The single non-AX mov in between is possible given constant blinding is retaining it as it's not an immediate based operation. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Signed-off-by: Daniel Borkmann Tested-by: Piotr Krysiuk Reviewed-by: Piotr Krysiuk Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b9180509917e..a32c5962c344 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11403,14 +11403,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - if (issrc) { - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, - off_reg); - insn->src_reg = BPF_REG_AX; - } else { - *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, - BPF_REG_AX); - } + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + if (!issrc) + *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); + insn->src_reg = BPF_REG_AX; if (isneg) insn->code = insn->code == code_add ? code_sub : code_add; From 2fa15d61e4cbaaa1d1250e67b251ff96952fa614 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Apr 2021 15:19:37 +0000 Subject: [PATCH 06/29] bpf: Fix leakage of uninitialized bpf stack under speculation commit 801c6058d14a82179a7ee17a4b532cac6fad067f upstream. The current implemented mechanisms to mitigate data disclosure under speculation mainly address stack and map value oob access from the speculative domain. However, Piotr discovered that uninitialized BPF stack is not protected yet, and thus old data from the kernel stack, potentially including addresses of kernel structures, could still be extracted from that 512 bytes large window. The BPF stack is special compared to map values since it's not zero initialized for every program invocation, whereas map values /are/ zero initialized upon their initial allocation and thus cannot leak any prior data in either domain. In the non-speculative domain, the verifier ensures that every stack slot read must have a prior stack slot write by the BPF program to avoid such data leaking issue. However, this is not enough: for example, when the pointer arithmetic operation moves the stack pointer from the last valid stack offset to the first valid offset, the sanitation logic allows for any intermediate offsets during speculative execution, which could then be used to extract any restricted stack content via side-channel. Given for unprivileged stack pointer arithmetic the use of unknown but bounded scalars is generally forbidden, we can simply turn the register-based arithmetic operation into an immediate-based arithmetic operation without the need for masking. This also gives the benefit of reducing the needed instructions for the operation. Given after the work in 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask"), the aux->alu_limit already holds the final immediate value for the offset register with the known scalar. Thus, a simple mov of the immediate to AX register with using AX as the source for the original instruction is sufficient and possible now in this case. Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Tested-by: Piotr Krysiuk Reviewed-by: Piotr Krysiuk Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf_verifier.h | 5 +++-- kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 85bac3191e12..2739a6431b9e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -291,10 +291,11 @@ struct bpf_verifier_state_list { }; /* Possible states for alu_state member. */ -#define BPF_ALU_SANITIZE_SRC 1U -#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_SANITIZE_SRC (1U << 0) +#define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) +#define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a32c5962c344..b6656d181c9e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5755,6 +5755,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; + bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); @@ -5785,6 +5786,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_limit = abs(tmp_aux->alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; } @@ -11383,7 +11385,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; struct bpf_insn insn_buf[16]; struct bpf_insn *patch = &insn_buf[0]; - bool issrc, isneg; + bool issrc, isneg, isimm; u32 off_reg; aux = &env->insn_aux_data[i + delta]; @@ -11394,16 +11396,21 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == BPF_ALU_SANITIZE_SRC; + isimm = aux->alu_state & BPF_ALU_IMMEDIATE; off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + if (isimm) { + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + } else { + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + } if (!issrc) *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); insn->src_reg = BPF_REG_AX; @@ -11411,7 +11418,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = insn->code == code_add ? code_sub : code_add; *patch++ = *insn; - if (issrc && isneg) + if (issrc && isneg && !isimm) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); cnt = patch - insn_buf; From 48ec949ac979b4b42d740f67b6177797af834f80 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Wed, 21 Apr 2021 10:40:07 -0700 Subject: [PATCH 07/29] net: qrtr: Avoid potential use after free in MHI send commit 47a017f33943278570c072bc71681809b2567b3a upstream. It is possible that the MHI ul_callback will be invoked immediately following the queueing of the skb for transmission, leading to the callback decrementing the refcount of the associated sk and freeing the skb. As such the dereference of skb and the increment of the sk refcount must happen before the skb is queued, to avoid the skb to be used after free and potentially the sk to drop its last refcount.. Fixes: 6e728f321393 ("net: qrtr: Add MHI transport layer") Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/qrtr/mhi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index ff0c41467fc1..0fe4bf40919d 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -50,6 +50,9 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); int rc; + if (skb->sk) + sock_hold(skb->sk); + rc = skb_linearize(skb); if (rc) goto free_skb; @@ -59,12 +62,11 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto free_skb; - if (skb->sk) - sock_hold(skb->sk); - return rc; free_skb: + if (skb->sk) + sock_put(skb->sk); kfree_skb(skb); return rc; From b571a6302a64fd1d516a1843afc893f0dfdf23d5 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 15 Apr 2021 16:34:16 +0800 Subject: [PATCH 08/29] perf data: Fix error return code in perf_data__create_dir() [ Upstream commit f2211881e737cade55e0ee07cf6a26d91a35a6fe ] Although 'ret' has been initialized to -1, but it will be reassigned by the "ret = open(...)" statement in the for loop. So that, the value of 'ret' is unknown when asprintf() failed. Reported-by: Hulk Robot Signed-off-by: Zhen Lei Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210415083417.3740-1-thunder.leizhen@huawei.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/util/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c index c47aa34fdc0a..5d97b3e45fbb 100644 --- a/tools/perf/util/data.c +++ b/tools/perf/util/data.c @@ -35,7 +35,7 @@ void perf_data__close_dir(struct perf_data *data) int perf_data__create_dir(struct perf_data *data, int nr) { struct perf_data_file *files = NULL; - int i, ret = -1; + int i, ret; if (WARN_ON(!data->is_dir)) return -EINVAL; @@ -51,7 +51,8 @@ int perf_data__create_dir(struct perf_data *data, int nr) for (i = 0; i < nr; i++) { struct perf_data_file *file = &files[i]; - if (asprintf(&file->path, "%s/data.%d", data->path, i) < 0) + ret = asprintf(&file->path, "%s/data.%d", data->path, i); + if (ret < 0) goto out_err; ret = open(file->path, O_RDWR|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR); From fb4c1c2e9fd1adb19452bbaa75f0f2bb2826ac0d Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 20 Apr 2021 08:43:34 -0500 Subject: [PATCH 09/29] capabilities: require CAP_SETFCAP to map uid 0 [ Upstream commit db2e718a47984b9d71ed890eb2ea36ecf150de18 ] cap_setfcap is required to create file capabilities. Since commit 8db6c34f1dbc ("Introduce v3 namespaced file capabilities"), a process running as uid 0 but without cap_setfcap is able to work around this as follows: unshare a new user namespace which maps parent uid 0 into the child namespace. While this task will not have new capabilities against the parent namespace, there is a loophole due to the way namespaced file capabilities are represented as xattrs. File capabilities valid in userns 1 are distinguished from file capabilities valid in userns 2 by the kuid which underlies uid 0. Therefore the restricted root process can unshare a new self-mapping namespace, add a namespaced file capability onto a file, then use that file capability in the parent namespace. To prevent that, do not allow mapping parent uid 0 if the process which opened the uid_map file does not have CAP_SETFCAP, which is the capability for setting file capabilities. As a further wrinkle: a task can unshare its user namespace, then open its uid_map file itself, and map (only) its own uid. In this case we do not have the credential from before unshare, which was potentially more restricted. So, when creating a user namespace, we record whether the creator had CAP_SETFCAP. Then we can use that during map_write(). With this patch: 1. Unprivileged user can still unshare -Ur ubuntu@caps:~$ unshare -Ur root@caps:~# logout 2. Root user can still unshare -Ur ubuntu@caps:~$ sudo bash root@caps:/home/ubuntu# unshare -Ur root@caps:/home/ubuntu# logout 3. Root user without CAP_SETFCAP cannot unshare -Ur: root@caps:/home/ubuntu# /sbin/capsh --drop=cap_setfcap -- root@caps:/home/ubuntu# /sbin/setcap cap_setfcap=p /sbin/setcap unable to set CAP_SETFCAP effective capability: Operation not permitted root@caps:/home/ubuntu# unshare -Ur unshare: write failed /proc/self/uid_map: Operation not permitted Note: an alternative solution would be to allow uid 0 mappings by processes without CAP_SETFCAP, but to prevent such a namespace from writing any file capabilities. This approach can be seen at [1]. Background history: commit 95ebabde382 ("capabilities: Don't allow writing ambiguous v3 file capabilities") tried to fix the issue by preventing v3 fscaps to be written to disk when the root uid would map to the same uid in nested user namespaces. This led to regressions for various workloads. For example, see [2]. Ultimately this is a valid use-case we have to support meaning we had to revert this change in 3b0c2d3eaa83 ("Revert 95ebabde382c ("capabilities: Don't allow writing ambiguous v3 file capabilities")"). Link: https://git.kernel.org/pub/scm/linux/kernel/git/sergeh/linux.git/log/?h=2021-04-15/setfcap-nsfscaps-v4 [1] Link: https://github.com/containers/buildah/issues/3071 [2] Signed-off-by: Serge Hallyn Reviewed-by: Andrew G. Morgan Tested-by: Christian Brauner Reviewed-by: Christian Brauner Tested-by: Giuseppe Scrivano Cc: Eric Biederman Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/user_namespace.h | 3 ++ include/uapi/linux/capability.h | 3 +- kernel/user_namespace.c | 65 +++++++++++++++++++++++++++++++-- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 6ef1c7109fc4..7616c7bf4b24 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -64,6 +64,9 @@ struct user_namespace { kgid_t group; struct ns_common ns; unsigned long flags; + /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP + * in its effective capability set at the child ns creation time. */ + bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index c6ca33034147..2ddb4226cd23 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -335,7 +335,8 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_CONTROL 30 -/* Set or remove capabilities on files */ +/* Set or remove capabilities on files. + Map uid=0 into a child user namespace. */ #define CAP_SETFCAP 31 diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e703d5d9cbe8..ce396ea4de60 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -106,6 +106,7 @@ int create_user_ns(struct cred *new) if (!ns) goto fail_dec; + ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; @@ -841,6 +842,60 @@ static int sort_idmaps(struct uid_gid_map *map) return 0; } +/** + * verify_root_map() - check the uid 0 mapping + * @file: idmapping file + * @map_ns: user namespace of the target process + * @new_map: requested idmap + * + * If a process requests mapping parent uid 0 into the new ns, verify that the + * process writing the map had the CAP_SETFCAP capability as the target process + * will be able to write fscaps that are valid in ancestor user namespaces. + * + * Return: true if the mapping is allowed, false if not. + */ +static bool verify_root_map(const struct file *file, + struct user_namespace *map_ns, + struct uid_gid_map *new_map) +{ + int idx; + const struct user_namespace *file_ns = file->f_cred->user_ns; + struct uid_gid_extent *extent0 = NULL; + + for (idx = 0; idx < new_map->nr_extents; idx++) { + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent0 = &new_map->extent[idx]; + else + extent0 = &new_map->forward[idx]; + if (extent0->lower_first == 0) + break; + + extent0 = NULL; + } + + if (!extent0) + return true; + + if (map_ns == file_ns) { + /* The process unshared its ns and is writing to its own + * /proc/self/uid_map. User already has full capabilites in + * the new namespace. Verify that the parent had CAP_SETFCAP + * when it unshared. + * */ + if (!file_ns->parent_could_setfcap) + return false; + } else { + /* Process p1 is writing to uid_map of p2, who is in a child + * user namespace to p1's. Verify that the opener of the map + * file has CAP_SETFCAP against the parent of the new map + * namespace */ + if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) + return false; + } + + return true; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -848,7 +903,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; - struct user_namespace *ns = seq->private; + struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; @@ -895,7 +950,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) + if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ @@ -965,7 +1020,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; @@ -1086,6 +1141,10 @@ static bool new_idmap_permitted(const struct file *file, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; + + if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) + return false; + /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ From a7c37332afa8916ac55509e3df6a4cfdbb570a4e Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 21 Apr 2021 14:04:00 +0200 Subject: [PATCH 10/29] perf ftrace: Fix access to pid in array when setting a pid filter [ Upstream commit 671b60cb6a897a5b3832fe57657152f2c3995e25 ] Command 'perf ftrace -v -- ls' fails in s390 (at least 5.12.0rc6). The root cause is a missing pointer dereference which causes an array element address to be used as PID. Fix this by extracting the PID. Output before: # ./perf ftrace -v -- ls function_graph tracer is used write '-263732416' to tracing/set_ftrace_pid failed: Invalid argument failed to set ftrace pid # Output after: ./perf ftrace -v -- ls function_graph tracer is used # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 4) | rcu_read_lock_sched_held() { 4) 0.552 us | rcu_lockdep_current_cpu_online(); 4) 6.124 us | } Reported-by: Alexander Schmidt Signed-off-by: Thomas Richter Acked-by: Namhyung Kim Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210421120400.2126433-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Sasha Levin --- tools/perf/builtin-ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 9366fad591dc..eecc70fc3b19 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -289,7 +289,7 @@ static int set_tracing_pid(struct perf_ftrace *ftrace) for (i = 0; i < perf_thread_map__nr(ftrace->evlist->core.threads); i++) { scnprintf(buf, sizeof(buf), "%d", - ftrace->evlist->core.threads->map[i]); + perf_thread_map__pid(ftrace->evlist->core.threads, i)); if (append_tracing_file("set_ftrace_pid", buf) < 0) return -1; } From 6995512a472f3b57d2838dcae725c01ceb0bb4f5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 23 Apr 2021 14:29:03 -0700 Subject: [PATCH 11/29] tools/cgroup/slabinfo.py: updated to work on current kernel [ Upstream commit 1974c45dd7745e999b9387be3d8fdcb27a5b1721 ] slabinfo.py script does not work with actual kernel version. First, it was unable to recognise SLUB susbsytem, and when I specified it manually it failed again with AttributeError: 'struct page' has no member 'obj_cgroups' .. and then again with File "tools/cgroup/memcg_slabinfo.py", line 221, in main memcg.kmem_caches.address_of_(), AttributeError: 'struct mem_cgroup' has no member 'kmem_caches' Link: https://lkml.kernel.org/r/cec1a75e-43b4-3d64-2084-d9f98fda037f@virtuozzo.com Signed-off-by: Vasily Averin Tested-by: Roman Gushchin Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- tools/cgroup/memcg_slabinfo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py index c4225ed63565..1600b17dbb8a 100644 --- a/tools/cgroup/memcg_slabinfo.py +++ b/tools/cgroup/memcg_slabinfo.py @@ -128,9 +128,9 @@ def detect_kernel_config(): cfg['nr_nodes'] = prog['nr_online_nodes'].value_() - if prog.type('struct kmem_cache').members[1][1] == 'flags': + if prog.type('struct kmem_cache').members[1].name == 'flags': cfg['allocator'] = 'SLUB' - elif prog.type('struct kmem_cache').members[1][1] == 'batchcount': + elif prog.type('struct kmem_cache').members[1].name == 'batchcount': cfg['allocator'] = 'SLAB' else: err('Can\'t determine the slab allocator') @@ -193,7 +193,7 @@ def main(): # look over all slab pages, belonging to non-root memcgs # and look for objects belonging to the given memory cgroup for page in for_each_slab_page(prog): - objcg_vec_raw = page.obj_cgroups.value_() + objcg_vec_raw = page.memcg_data.value_() if objcg_vec_raw == 0: continue cache = page.slab_cache @@ -202,7 +202,7 @@ def main(): addr = cache.value_() caches[addr] = cache # clear the lowest bit to get the true obj_cgroups - objcg_vec = Object(prog, page.obj_cgroups.type_, + objcg_vec = Object(prog, 'struct obj_cgroup **', value=objcg_vec_raw & ~1) if addr not in stats: From 2e8b3b0b8e2d3b56a0b653980c62cbcb374ffedf Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:07 +0000 Subject: [PATCH 12/29] driver core: add a min_align_mask field to struct device_dma_parameters commit: 36950f2da1ea4cb683be174f6f581e25b2d33e71 Some devices rely on the address offset in a page to function correctly (NVMe driver as an example). These devices may use a different page size than the Linux kernel. The address offset has to be preserved upon mapping, and in order to do so, we need to record the page_offset_mask first. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 1 + include/linux/dma-mapping.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/device.h b/include/linux/device.h index 2b39de35525a..75a24b32fee8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -291,6 +291,7 @@ struct device_dma_parameters { * sg limitations. */ unsigned int max_segment_size; + unsigned int min_align_mask; unsigned long segment_boundary_mask; }; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..a7d70cdee25e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -500,6 +500,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline unsigned int dma_get_min_align_mask(struct device *dev) +{ + if (dev->dma_parms) + return dev->dma_parms->min_align_mask; + return 0; +} + +static inline int dma_set_min_align_mask(struct device *dev, + unsigned int min_align_mask) +{ + if (WARN_ON_ONCE(!dev->dma_parms)) + return -EIO; + dev->dma_parms->min_align_mask = min_align_mask; + return 0; +} + static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN From 22163a8ec863f98bc4a096ac08b2dfe9edc5ddf1 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:08 +0000 Subject: [PATCH 13/29] swiotlb: add a IO_TLB_SIZE define commit: b5d7ccb7aac3895c2138fe0980a109116ce15eff Add a new IO_TLB_SIZE define instead open coding it using IO_TLB_SHIFT all over. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- include/linux/swiotlb.h | 1 + kernel/dma/swiotlb.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index fbdc65782195..5d2dbe7e04c3 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,7 @@ enum swiotlb_force { * controllable. */ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 781b9dca197c..fb88c2982645 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -475,20 +475,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -582,7 +582,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; @@ -633,7 +633,7 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: @@ -691,7 +691,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) From 1bbcc985d19524114bd2e0db1d826807043348cd Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:09 +0000 Subject: [PATCH 14/29] swiotlb: factor out an io_tlb_offset helper commit: c7fbeca757fe74135d8b6a4c8ddaef76f5775d68 Replace the very genericly named OFFSET macro with a little inline helper that hardcodes the alignment to the only value ever passed. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index fb88c2982645..f55248f16366 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -176,6 +173,11 @@ void swiotlb_print_info(void) bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -225,7 +227,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -359,7 +361,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) goto cleanup4; for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; @@ -530,7 +532,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, for (i = index; i < (int) (index + nslots); i++) io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); @@ -616,7 +620,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * Step 2: merge the returned slots with the preceding slots, * if available (non zero) */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) io_tlb_list[i] = ++count; io_tlb_used -= nslots; From 1f2ef5a0f771f69f71549bb1917c6e8cb23e6818 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:10 +0000 Subject: [PATCH 15/29] swiotlb: factor out a nr_slots helper commit: c32a77fd18780a5192dfb6eec69f239faebf28fd Factor out a helper to find the number of slots for a given size. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f55248f16366..f0be199da527 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -178,6 +178,11 @@ static inline unsigned long io_tlb_offset(unsigned long val) return val & (IO_TLB_SEGSIZE - 1); } +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -477,20 +482,20 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, tbl_dma_addr &= mask; - offset_slots = ALIGN(tbl_dma_addr, IO_TLB_SIZE) >> IO_TLB_SHIFT; + offset_slots = nr_slots(tbl_dma_addr); /* * Carefully handle integer overflow which can occur when mask == ~0UL. */ max_slots = mask + 1 - ? ALIGN(mask + 1, IO_TLB_SIZE) >> IO_TLB_SHIFT + ? nr_slots(mask + 1) : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* * For mappings greater than or equal to a page, we limit the stride * (and hence alignment) to a page size. */ - nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + nslots = nr_slots(alloc_size); if (alloc_size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else @@ -586,7 +591,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, IO_TLB_SIZE) >> IO_TLB_SHIFT; + int i, count, nslots = nr_slots(alloc_size); int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; From 9efd5df078a7e1aa2a825855688025b586ed880f Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:11 +0000 Subject: [PATCH 16/29] swiotlb: clean up swiotlb_tbl_unmap_single commit: ca10d0f8e530600ec63c603dbace2c30927d70b7 swiotlb: clean up swiotlb_tbl_unmap_single Remove a layer of pointless indentation, replace a hard to follow ternary expression with a plain if/else. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f0be199da527..f5530336d7ca 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -610,28 +610,29 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } From 25ed8827cfbf017ffcd195258643c54b55db08c5 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:12 +0000 Subject: [PATCH 17/29] swiotlb: refactor swiotlb_tbl_map_single commit: 26a7e094783d482f3e125f09945a5bb1d867b2e6 Split out a bunch of a self-contained helpers to make the function easier to follow. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 205 +++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 103 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f5530336d7ca..243750453b3d 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -452,19 +452,99 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) + +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} + +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} + +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int index, wrap, count = 0, i; + unsigned long flags; + + BUG_ON(!nslots); + + /* + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. + */ + if (alloc_size >= PAGE_SIZE) + stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + + spin_lock_irqsave(&io_tlb_lock, flags); + if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + goto not_found; + + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + do { + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; + } + index = wrap_index(index + stride); + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + return -1; + +found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. + */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; + io_tlb_used += nslots; + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); - unsigned long flags; + unsigned int index, i; phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -473,113 +553,32 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", mapping_size, alloc_size); return (phys_addr_t)DMA_MAPPING_ERROR; } - mask = dma_get_seg_boundary(hwdev); - - tbl_dma_addr &= mask; - - offset_slots = nr_slots(tbl_dma_addr); - - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? nr_slots(mask + 1) - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - - /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. - */ - nslots = nr_slots(alloc_size); - if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) - goto not_found; - - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - - do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - -not_found: - tmp_io_tlb_used = io_tlb_used; - - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; -found: - io_tlb_used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + index = find_slots(dev, alloc_size); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + for (i = 0; i < nr_slots(alloc_size); i++) + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + + tlb_addr = slot_addr(io_tlb_start, index); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - return tlb_addr; } From 85a5a6875ca93dc4efbf20df942ba41d27a917e3 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:13 +0000 Subject: [PATCH 18/29] swiotlb: don't modify orig_addr in swiotlb_tbl_sync_single commit: 16fc3cef33a04632ab6b31758abdd77563a20759 swiotlb_tbl_map_single currently nevers sets a tlb_addr that is not aligned to the tlb bucket size. But we're going to add such a case soon, for which this adjustment would be bogus. Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 243750453b3d..b8f82f96c4c2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -644,7 +644,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & (IO_TLB_SIZE - 1); switch (target) { case SYNC_FOR_CPU: From f8e71c667ee12094b50f5a935133d080740cd66b Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:14 +0000 Subject: [PATCH 19/29] swiotlb: respect min_align_mask commit: 1f221a0d0dbf0e48ef3a9c62871281d6a7819f05 swiotlb: respect min_align_mask Respect the min_align_mask in struct device_dma_parameters in swiotlb. There are two parts to it: 1) for the lower bits of the alignment inside the io tlb slot, just extent the size of the allocation and leave the start of the slot empty 2) for the high bits ensure we find a slot that matches the high bits of the alignment to avoid wasting too much memory Based on an earlier patch from Jianxiong Gao . Signed-off-by: Christoph Hellwig Acked-by: Jianxiong Gao Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jianxiong Gao Signed-off-by: Greg Kroah-Hartman --- kernel/dma/swiotlb.c | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b8f82f96c4c2..ba4055a192e4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -454,6 +454,14 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + /* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ @@ -475,24 +483,29 @@ static unsigned int wrap_index(unsigned int index) * Find a suitable number of IO TLB entries size that will fit this request and * allocate a buffer from that IO TLB pool. */ -static int find_slots(struct device *dev, size_t alloc_size) +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) { unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int nslots = nr_slots(alloc_size), stride = 1; + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; unsigned int index, wrap, count = 0, i; unsigned long flags; BUG_ON(!nslots); /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. */ + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) - stride <<= (PAGE_SHIFT - IO_TLB_SHIFT); + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); spin_lock_irqsave(&io_tlb_lock, flags); if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) @@ -500,6 +513,12 @@ static int find_slots(struct device *dev, size_t alloc_size) index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); do { + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; + } + /* * If we find a slot that indicates we have 'nslots' number of * contiguous buffers, we allocate the buffers from that slot @@ -543,6 +562,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; @@ -558,7 +578,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = find_slots(dev, alloc_size); + index = find_slots(dev, orig_addr, alloc_size + offset); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -572,10 +592,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nr_slots(alloc_size); i++) + for (i = 0; i < nr_slots(alloc_size + offset); i++) io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - tlb_addr = slot_addr(io_tlb_start, index); + tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); @@ -590,8 +610,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = nr_slots(alloc_size); - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int i, count, nslots = nr_slots(alloc_size + offset); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; /* From 2fa0387fa2d00924aca0a5fa4c48eb6a4b3f8731 Mon Sep 17 00:00:00 2001 From: Jianxiong Gao Date: Thu, 29 Apr 2021 17:33:15 +0000 Subject: [PATCH 20/29] nvme-pci: set min_align_mask commit: 3d2d861eb03e8ee96dc430a54361c900cbe28afd The PRP addressing scheme requires all PRP entries except for the first one to have a zero offset into the NVMe controller pages (which can be different from the Linux PAGE_SIZE). Use the min_align_mask device parameter to ensure that swiotlb does not change the address of the buffer modulo the device page size to ensure that the PRPs won't be malformed. Signed-off-by: Jianxiong Gao Signed-off-by: Christoph Hellwig Tested-by: Jianxiong Gao Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4dca58f4afdf..716039ea4450 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2634,6 +2634,7 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. */ dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); From 71d58457a8afc650da5d3292a7f7029317654d95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Mon, 29 Mar 2021 18:49:07 +0200 Subject: [PATCH 21/29] ovl: fix leaked dentry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit eaab1d45cdb4bb0c846bd23c3d666d5b90af7b41 upstream. Since commit 6815f479ca90 ("ovl: use only uppermetacopy state in ovl_lookup()"), overlayfs doesn't put temporary dentry when there is a metacopy error, which leads to dentry leaks when shutting down the related superblock: overlayfs: refusing to follow metacopy origin for (/file0) ... BUG: Dentry (____ptrval____){i=3f33,n=file3} still in use (1) [unmount of overlay overlay] ... WARNING: CPU: 1 PID: 432 at umount_check.cold+0x107/0x14d CPU: 1 PID: 432 Comm: unmount-overlay Not tainted 5.12.0-rc5 #1 ... RIP: 0010:umount_check.cold+0x107/0x14d ... Call Trace: d_walk+0x28c/0x950 ? dentry_lru_isolate+0x2b0/0x2b0 ? __kasan_slab_free+0x12/0x20 do_one_tree+0x33/0x60 shrink_dcache_for_umount+0x78/0x1d0 generic_shutdown_super+0x70/0x440 kill_anon_super+0x3e/0x70 deactivate_locked_super+0xc4/0x160 deactivate_super+0xfa/0x140 cleanup_mnt+0x22e/0x370 __cleanup_mnt+0x1a/0x30 task_work_run+0x139/0x210 do_exit+0xb0c/0x2820 ? __kasan_check_read+0x1d/0x30 ? find_held_lock+0x35/0x160 ? lock_release+0x1b6/0x660 ? mm_update_next_owner+0xa20/0xa20 ? reacquire_held_locks+0x3f0/0x3f0 ? __sanitizer_cov_trace_const_cmp4+0x22/0x30 do_group_exit+0x135/0x380 __do_sys_exit_group.isra.0+0x20/0x20 __x64_sys_exit_group+0x3c/0x50 do_syscall_64+0x45/0x70 entry_SYSCALL_64_after_hwframe+0x44/0xae ... VFS: Busy inodes after unmount of overlay. Self-destruct in 5 seconds. Have a nice day... This fix has been tested with a syzkaller reproducer. Cc: Amir Goldstein Cc: # v5.8+ Reported-by: syzbot Fixes: 6815f479ca90 ("ovl: use only uppermetacopy state in ovl_lookup()") Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20210329164907.2133175-1-mic@digikod.net Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index a6162c4076db..f3309e044f07 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -913,6 +913,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, continue; if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { + dput(this); err = -EPERM; pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; From 27c1936af5068b5367078a65df6a3d4de3e94e9a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 Apr 2021 12:00:37 +0200 Subject: [PATCH 22/29] ovl: allow upperdir inside lowerdir commit 708fa01597fa002599756bf56a96d0de1677375c upstream. Commit 146d62e5a586 ("ovl: detect overlapping layers") made sure we don't have overlapping layers, but it also broke the arguably valid use case of mount -olowerdir=/,upperdir=/subdir,.. where upperdir overlaps lowerdir on the same filesystem. This has been causing regressions. Revert the check, but only for the specific case where upperdir and/or workdir are subdirectories of lowerdir. Any other overlap (e.g. lowerdir is subdirectory of upperdir, etc) case is crazy, so leave the check in place for those. Overlaps are detected at lookup time too, so reverting the mount time check should be safe. Fixes: 146d62e5a586 ("ovl: detect overlapping layers") Cc: # v5.2 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/super.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 50529a4e7bf3..77f08ac04d1f 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1759,7 +1759,8 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, * - upper/work dir of any overlayfs instance */ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, - struct dentry *dentry, const char *name) + struct dentry *dentry, const char *name, + bool is_lower) { struct dentry *next = dentry, *parent; int err = 0; @@ -1771,7 +1772,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, /* Walk back ancestors to root (inclusive) looking for traps */ while (!err && parent != next) { - if (ovl_lookup_trap_inode(sb, parent)) { + if (is_lower && ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { @@ -1797,7 +1798,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb, if (ovl_upper_mnt(ofs)) { err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root, - "upperdir"); + "upperdir", false); if (err) return err; @@ -1808,7 +1809,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb, * workbasedir. In that case, we already have their traps in * inode cache and we will catch that case on lookup. */ - err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir"); + err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir", + false); if (err) return err; } @@ -1816,7 +1818,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb, for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, ofs->layers[i].mnt->mnt_root, - "lowerdir"); + "lowerdir", true); if (err) return err; } From 59b3f88386b5f5b4ab6622cd7307be40e871684c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 7 Apr 2021 16:45:49 +0200 Subject: [PATCH 23/29] ALSA: usb-audio: Add MIDI quirk for Vox ToneLab EX commit 64f40f9be14106e7df0098c427cb60be645bddb7 upstream. ToneLab EX guitar pedal device requires the same quirk like ToneLab ST for supporting the MIDI. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212593 Cc: Link: https://lore.kernel.org/r/20210407144549.1530-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks-table.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 3c1697f6b60c..5728bf722c88 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -2376,6 +2376,16 @@ YAMAHA_DEVICE(0x7010, "UB99"), } }, +{ + USB_DEVICE_VENDOR_SPEC(0x0944, 0x0204), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { + .vendor_name = "KORG, Inc.", + /* .product_name = "ToneLab EX", */ + .ifnum = 3, + .type = QUIRK_MIDI_STANDARD_INTERFACE, + } +}, + /* AKAI devices */ { USB_DEVICE(0x09e8, 0x0062), From d844aaa49ac8b3225e906a150f89d2b805ad8cfc Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 12 Apr 2021 21:54:53 +0800 Subject: [PATCH 24/29] USB: Add LPM quirk for Lenovo ThinkPad USB-C Dock Gen2 Ethernet commit 8f23fe35ff1e5491b4d279323a8209a31f03ae65 upstream. This is another branded 8153 device that doesn't work well with LPM enabled: [ 400.597506] r8152 5-1.1:1.0 enx482ae3a2a6f0: Tx status -71 So disable LPM to resolve the issue. Signed-off-by: Kai-Heng Feng BugLink: https://bugs.launchpad.net/bugs/1922651 Link: https://lore.kernel.org/r/20210412135455.791971-1-kai.heng.feng@canonical.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 76ac5d6555ae..6114cf83bb44 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -438,6 +438,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, + /* Lenovo ThinkPad USB-C Dock Gen2 Ethernet (RTL8153 GigE) */ + { USB_DEVICE(0x17ef, 0xa387), .driver_info = USB_QUIRK_NO_LPM }, + /* BUILDWIN Photo Frame */ { USB_DEVICE(0x1908, 0x1315), .driver_info = USB_QUIRK_HONOR_BNUMINTERFACES }, From ac2cd82c76099f579be39087125010a201dac01a Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Wed, 21 Apr 2021 01:46:51 +0800 Subject: [PATCH 25/29] USB: Add reset-resume quirk for WD19's Realtek Hub commit ca91fd8c7643d93bfc18a6fec1a0d3972a46a18a upstream. Realtek Hub (0bda:5487) in Dell Dock WD19 sometimes fails to work after the system resumes from suspend with remote wakeup enabled device connected: [ 1947.640907] hub 5-2.3:1.0: hub_ext_port_status failed (err = -71) [ 1947.641208] usb 5-2.3-port5: cannot disable (err = -71) [ 1947.641401] hub 5-2.3:1.0: hub_ext_port_status failed (err = -71) [ 1947.641450] usb 5-2.3-port4: cannot reset (err = -71) Information of this hub: T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 10 Spd=480 MxCh= 5 D: Ver= 2.10 Cls=09(hub ) Sub=00 Prot=02 MxPS=64 #Cfgs= 1 P: Vendor=0bda ProdID=5487 Rev= 1.47 S: Manufacturer=Dell Inc. S: Product=Dell dock C:* #Ifs= 1 Cfg#= 1 Atr=e0 MxPwr= 0mA I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=01 Driver=hub E: Ad=81(I) Atr=03(Int.) MxPS= 1 Ivl=256ms I:* If#= 0 Alt= 1 #EPs= 1 Cls=09(hub ) Sub=00 Prot=02 Driver=hub E: Ad=81(I) Atr=03(Int.) MxPS= 1 Ivl=256ms The failure results from the ETIMEDOUT by chance when turning on the suspend feature for the specified port of the hub. The port seems to be in an unknown state so the hub_activate during resume fails the hub_port_status, then the hub will fail to work. The quirky hub needs the reset-resume quirk to function correctly. Acked-by: Alan Stern Signed-off-by: Chris Chiu Cc: stable Link: https://lore.kernel.org/r/20210420174651.6202-1-chris.chiu@canonical.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 6114cf83bb44..21e7522655ac 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -406,6 +406,7 @@ static const struct usb_device_id usb_quirk_list[] = { /* Realtek hub in Dell WD19 (Type-C) */ { USB_DEVICE(0x0bda, 0x0487), .driver_info = USB_QUIRK_NO_LPM }, + { USB_DEVICE(0x0bda, 0x5487), .driver_info = USB_QUIRK_RESET_RESUME }, /* Generic RTL8153 based ethernet adapters */ { USB_DEVICE(0x0bda, 0x8153), .driver_info = USB_QUIRK_NO_LPM }, From 399f9c18473cefe76420a5764253b74b3add7f8f Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Wed, 7 Apr 2021 17:20:15 -0400 Subject: [PATCH 26/29] platform/x86: thinkpad_acpi: Correct thermal sensor allocation commit 6759e18e5cd8745a5dfc5726e4a3db5281ec1639 upstream. On recent Thinkpad platforms it was reported that temp sensor 11 was always incorrectly displaying 66C. It turns out the reason for this is that this location in EC RAM is not a temperature sensor but is the power supply ID (offset 0xC2). Based on feedback from the Lenovo firmware team the EC RAM version can be determined and for the current version (3) only the 0x78 to 0x7F range is used for temp sensors. I don't have any details for earlier versions so I have left the implementation unaltered there. Note - in this block only 0x78 and 0x79 are officially designated (CPU & GPU sensors). The use of the other locations in the block will vary from platform to platform; but the existing logic to detect a sensor presence holds. Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20210407212015.298222-1-markpearson@lenovo.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/thinkpad_acpi.c | 31 ++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 3b0acaeb20cf..1c25af28a723 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -6258,6 +6258,7 @@ enum thermal_access_mode { enum { /* TPACPI_THERMAL_TPEC_* */ TP_EC_THERMAL_TMP0 = 0x78, /* ACPI EC regs TMP 0..7 */ TP_EC_THERMAL_TMP8 = 0xC0, /* ACPI EC regs TMP 8..15 */ + TP_EC_FUNCREV = 0xEF, /* ACPI EC Functional revision */ TP_EC_THERMAL_TMP_NA = -128, /* ACPI EC sensor not available */ TPACPI_THERMAL_SENSOR_NA = -128000, /* Sensor not available */ @@ -6456,7 +6457,7 @@ static const struct attribute_group thermal_temp_input8_group = { static int __init thermal_init(struct ibm_init_struct *iibm) { - u8 t, ta1, ta2; + u8 t, ta1, ta2, ver = 0; int i; int acpi_tmp7; int res; @@ -6471,7 +6472,14 @@ static int __init thermal_init(struct ibm_init_struct *iibm) * 0x78-0x7F, 0xC0-0xC7. Registers return 0x00 for * non-implemented, thermal sensors return 0x80 when * not available + * The above rule is unfortunately flawed. This has been seen with + * 0xC2 (power supply ID) causing thermal control problems. + * The EC version can be determined by offset 0xEF and at least for + * version 3 the Lenovo firmware team confirmed that registers 0xC0-0xC7 + * are not thermal registers. */ + if (!acpi_ec_read(TP_EC_FUNCREV, &ver)) + pr_warn("Thinkpad ACPI EC unable to access EC version\n"); ta1 = ta2 = 0; for (i = 0; i < 8; i++) { @@ -6481,11 +6489,13 @@ static int __init thermal_init(struct ibm_init_struct *iibm) ta1 = 0; break; } - if (acpi_ec_read(TP_EC_THERMAL_TMP8 + i, &t)) { - ta2 |= t; - } else { - ta1 = 0; - break; + if (ver < 3) { + if (acpi_ec_read(TP_EC_THERMAL_TMP8 + i, &t)) { + ta2 |= t; + } else { + ta1 = 0; + break; + } } } if (ta1 == 0) { @@ -6498,9 +6508,12 @@ static int __init thermal_init(struct ibm_init_struct *iibm) thermal_read_mode = TPACPI_THERMAL_NONE; } } else { - thermal_read_mode = - (ta2 != 0) ? - TPACPI_THERMAL_TPEC_16 : TPACPI_THERMAL_TPEC_8; + if (ver >= 3) + thermal_read_mode = TPACPI_THERMAL_TPEC_8; + else + thermal_read_mode = + (ta2 != 0) ? + TPACPI_THERMAL_TPEC_16 : TPACPI_THERMAL_TPEC_8; } } else if (acpi_tmp7) { if (tpacpi_is_ibm() && From 4348d3b5027bc3ff6336368b6c60605d4ef8e1ce Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 24 Feb 2021 22:56:28 +0100 Subject: [PATCH 27/29] perf/core: Fix unconditional security_locked_down() call commit 08ef1af4de5fe7de9c6d69f1e22e51b66e385d9b upstream. Currently, the lockdown state is queried unconditionally, even though its result is used only if the PERF_SAMPLE_REGS_INTR bit is set in attr.sample_type. While that doesn't matter in case of the Lockdown LSM, it causes trouble with the SELinux's lockdown hook implementation. SELinux implements the locked_down hook with a check whether the current task's type has the corresponding "lockdown" class permission ("integrity" or "confidentiality") allowed in the policy. This means that calling the hook when the access control decision would be ignored generates a bogus permission check and audit record. Fix this by checking sample_type first and only calling the hook when its result would be honored. Fixes: b0c8fdc7fdb7 ("lockdown: Lock down perf when in confidentiality mode") Signed-off-by: Ondrej Mosnacek Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul Moore Link: https://lkml.kernel.org/r/20210224215628.192519-1-omosnace@redhat.com Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 4af161b3f322..8e1b8126c0e4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11705,12 +11705,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; } - err = security_locked_down(LOCKDOWN_PERF); - if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) - /* REGS_INTR can leak data, lockdown must prevent this */ - return err; - - err = 0; + /* REGS_INTR can leak data, lockdown must prevent this */ + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { + err = security_locked_down(LOCKDOWN_PERF); + if (err) + return err; + } /* * In cgroup mode, the pid argument is used to pass the fd From 94c76056fc3f7e0f0cbdb6b92935d18752eae8df Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 4 Mar 2021 21:30:03 -0400 Subject: [PATCH 28/29] vfio: Depend on MMU commit b2b12db53507bc97d96f6b7cb279e831e5eafb00 upstream. VFIO_IOMMU_TYPE1 does not compile with !MMU: ../drivers/vfio/vfio_iommu_type1.c: In function 'follow_fault_pfn': ../drivers/vfio/vfio_iommu_type1.c:536:22: error: implicit declaration of function 'pte_write'; did you mean 'vfs_write'? [-Werror=implicit-function-declaration] So require it. Suggested-by: Cornelia Huck Signed-off-by: Jason Gunthorpe Message-Id: <0-v1-02cb5500df6e+78-vfio_no_mmu_jgg@nvidia.com> Signed-off-by: Alex Williamson Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 90c0525b1e0c..67d0bf4efa16 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,7 @@ config VFIO_VIRQFD menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API - select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. From f53a3a4808625f876aebc5a0bfb354480bbf0c21 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 7 May 2021 11:04:33 +0200 Subject: [PATCH 29/29] Linux 5.10.35 Tested-By: Patrick McCormick Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Fox Chen Tested-by: Shuah Khan Tested-by: Salvatore Bonaccorso Tested-by: Jason Self Tested-by: Guenter Roeck Tested-by: Hulk Robot Tested-by: Linux Kernel Functional Testing Tested-by: Pavel Machek (CIP) Tested-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20210505112326.195493232@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ac2f14a067d3..6ca39f3aa4e9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 10 -SUBLEVEL = 34 +SUBLEVEL = 35 EXTRAVERSION = NAME = Dare mighty things