From 06fde695ee76429634c1e8c8c1154035aa61191e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 18 Dec 2020 14:00:39 +0800 Subject: [PATCH 001/135] genirq/msi: Initialize msi_alloc_info before calling msi_domain_prepare_irqs() Since commit 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device"), some of the devices are wrongly marked as "shared" by the ITS driver on systems equipped with the ITS(es). The problem is that the @info->flags may not be initialized anywhere and we end up looking at random bits on the stack. That's obviously not good. We can perform the initialization in the IRQ core layer before calling msi_domain_prepare_irqs(), which is neat enough. Fixes: 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device") Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201218060039.1770-1-yuzenghui@huawei.com --- kernel/irq/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2c0c4d6d0f83..dc0e2d7fbdfd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -402,7 +402,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; struct irq_data *irq_data; struct msi_desc *desc; - msi_alloc_info_t arg; + msi_alloc_info_t arg = { }; int i, ret, virq; bool can_reserve; From e90f55e0196a66f8e9e445f7f33f876dd889be9a Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 14 Dec 2020 21:35:30 +0800 Subject: [PATCH 002/135] irqchip/irq-sl28cpld: Convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201214133530.3783-1-zhengyongjun3@huawei.com --- drivers/irqchip/irq-sl28cpld.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sl28cpld.c b/drivers/irqchip/irq-sl28cpld.c index 0aa50d025ef6..fbb354413ffa 100644 --- a/drivers/irqchip/irq-sl28cpld.c +++ b/drivers/irqchip/irq-sl28cpld.c @@ -66,7 +66,7 @@ static int sl28cpld_intc_probe(struct platform_device *pdev) irqchip->chip.num_regs = 1; irqchip->chip.status_base = base + INTC_IP; irqchip->chip.mask_base = base + INTC_IE; - irqchip->chip.mask_invert = true, + irqchip->chip.mask_invert = true; irqchip->chip.ack_base = base + INTC_IP; return devm_regmap_add_irq_chip_fwnode(dev, dev_fwnode(dev), From d7f39c40ebb6986e7371510d1c20a4efee4a7f0d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Dec 2020 18:03:46 +0000 Subject: [PATCH 003/135] irqchip/bcm2836: Fix IPI acknowledgement after conversion to handle_percpu_devid_irq It appears that despite its name, the bcm2836_arm_irqchip_ipi_eoi() callback is an acknowledgement, and not an EOI. This means that we lose IPIs that are made pending between the handling of the IPI and the write to LOCAL_MAILBOX0_CLR0. With the right timing, things fail nicely. This used to work with handle_percpu_devid_fasteoi_ipi(), which started by eoi-ing the interrupt. With the standard fasteoi flow, this doesn't work anymore. So let's use this callback for what it is, an ack. Your favourite RPi-2/3 is back up and running. Fixes: ffdad793d579 ("irqchip/bcm2836: Make IPIs use handle_percpu_devid_irq()") Cc: Valentin Schneider Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/c9fb4ab3-a5cb-648c-6de3-c6a871e60870@roeck-us.net --- drivers/irqchip/irq-bcm2836.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 5f5eb8877c41..25c9a9c06e41 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -167,7 +167,7 @@ static void bcm2836_arm_irqchip_handle_ipi(struct irq_desc *desc) chained_irq_exit(chip, desc); } -static void bcm2836_arm_irqchip_ipi_eoi(struct irq_data *d) +static void bcm2836_arm_irqchip_ipi_ack(struct irq_data *d) { int cpu = smp_processor_id(); @@ -195,7 +195,7 @@ static struct irq_chip bcm2836_arm_irqchip_ipi = { .name = "IPI", .irq_mask = bcm2836_arm_irqchip_dummy_op, .irq_unmask = bcm2836_arm_irqchip_dummy_op, - .irq_eoi = bcm2836_arm_irqchip_ipi_eoi, + .irq_ack = bcm2836_arm_irqchip_ipi_ack, .ipi_send_mask = bcm2836_arm_irqchip_ipi_send_mask, }; From e1dc20995cb9fa04b46e8f37113a7203c906d2bf Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 21 Dec 2020 22:30:55 +0800 Subject: [PATCH 004/135] driver core: platform: Add extra error check in devm_platform_get_irqs_affinity() The current check of nvec < minvec for nvec returned from platform_irq_count() will not detect a negative error code in nvec. This is because minvec is unsigned, and, as such, nvec is promoted to unsigned in that check, which will make it a huge number (if it contained -EPROBE_DEFER). In practice, an error should not occur in nvec for the only in-tree user, but add a check anyway. Fixes: e15f2fa959f2 ("driver core: platform: Add devm_platform_get_irqs_affinity()") Reported-by: Dan Carpenter Signed-off-by: John Garry Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1608561055-231244-1-git-send-email-john.garry@huawei.com --- drivers/base/platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index ea8add164b89..74c97b65048c 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -351,6 +351,8 @@ int devm_platform_get_irqs_affinity(struct platform_device *dev, return -ERANGE; nvec = platform_irq_count(dev); + if (nvec < 0) + return nvec; if (nvec < minvec) return -ENOSPC; From 54ca955b5a4024e2ce0f206b03adb7109bc4da26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 23 Dec 2020 20:19:31 +0100 Subject: [PATCH 005/135] serial: mvebu-uart: fix tx lost characters at power off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c685af1108d7 ("serial: mvebu-uart: fix tx lost characters") fixed tx lost characters at low baud rates but started causing tx lost characters when kernel is going to power off or reboot. TX_EMP tells us when transmit queue is empty therefore all characters were transmitted. TX_RDY tells us when CPU can send a new character. Therefore we need to use different check prior transmitting new character and different check after all characters were sent. This patch splits polling code into two functions: wait_for_xmitr() which waits for TX_RDY and wait_for_xmite() which waits for TX_EMP. When rebooting A3720 platform without this patch on UART is print only: [ 42.699� And with this patch on UART is full output: [ 39.530216] reboot: Restarting system Fixes: c685af1108d7 ("serial: mvebu-uart: fix tx lost characters") Signed-off-by: Pali Rohár Cc: stable Link: https://lore.kernel.org/r/20201223191931.18343-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/mvebu-uart.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c index 118b29912289..e0c00a1b0763 100644 --- a/drivers/tty/serial/mvebu-uart.c +++ b/drivers/tty/serial/mvebu-uart.c @@ -648,6 +648,14 @@ static void wait_for_xmitr(struct uart_port *port) (val & STAT_TX_RDY(port)), 1, 10000); } +static void wait_for_xmite(struct uart_port *port) +{ + u32 val; + + readl_poll_timeout_atomic(port->membase + UART_STAT, val, + (val & STAT_TX_EMP), 1, 10000); +} + static void mvebu_uart_console_putchar(struct uart_port *port, int ch) { wait_for_xmitr(port); @@ -675,7 +683,7 @@ static void mvebu_uart_console_write(struct console *co, const char *s, uart_console_write(port, s, count, mvebu_uart_console_putchar); - wait_for_xmitr(port); + wait_for_xmite(port); if (ier) writel(ier, port->membase + UART_CTRL(port)); From ef019c5daf032dce0b95ed4d45bfec93c4fbcb9f Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 22 Dec 2020 13:10:46 +0000 Subject: [PATCH 006/135] PHY: Ingenic: fix unconditional build of phy-ingenic-usb Currently drivers/phy/ingenic/Makefile adds phy-ingenic-usb to targets not depending on actual Kconfig symbol CONFIG_PHY_INGENIC_USB, so this driver always gets built[-in] on every system. Add missing dependency. Fixes: 31de313dfdcf ("PHY: Ingenic: Add USB PHY driver using generic PHY framework.") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20201222131021.4751-1-alobakin@pm.me Signed-off-by: Vinod Koul --- drivers/phy/ingenic/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/phy/ingenic/Makefile b/drivers/phy/ingenic/Makefile index 65d5ea00fc9d..1cb158d7233f 100644 --- a/drivers/phy/ingenic/Makefile +++ b/drivers/phy/ingenic/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += phy-ingenic-usb.o +obj-$(CONFIG_PHY_INGENIC_USB) += phy-ingenic-usb.o From 92cbdb923c17544684c2dd3be9f8636617898a44 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Thu, 10 Dec 2020 21:31:36 +0800 Subject: [PATCH 007/135] usb: cdns3: imx: fix writing read-only memory issue The memory for struct clk_bulk_data should not be static which will be written during the clk_bulk_get. It fixed below oops when loading cdns3-imx as module. [ 17.272605] Unable to handle kernel write to read-only memory at virtual address ffff8000092a5398 [ 17.299730] Mem abort info: [ 17.313542] unregister ISI channel: mxc_isi.4 [ 17.324076] ESR = 0x9600004f [ 17.344658] EC = 0x25: DABT (current EL), IL = 32 bits [ 17.402055] SET = 0, FnV = 0 [ 17.404321] mxs_phy 5b100000.usbphy: supply phy-3p0 not found, using dummy regulator [ 17.405121] EA = 0, S1PTW = 0 [ 17.405133] Data abort info: [ 17.496231] ISV = 0, ISS = 0x0000004f [ 17.510871] CM = 0, WnR = 1 [ 17.533542] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000081ea5000 [ 17.545709] [ffff8000092a5398] pgd=00000008bffff003, p4d=00000008bffff003, pud=00000008bfffe003, pmd=0000000885041003, pte=006000088513b783 [ 17.573521] Internal error: Oops: 9600004f [#1] PREEMPT SMP [ 17.579113] Modules linked in: usbmisc_imx phy_mxs_usb phy_cadence_salvo cdns3_imx(+) tcpci imx8_media_dev(C) caam error [ 17.590044] CPU: 2 PID: 253 Comm: systemd-udevd Tainted: G C 5.10.0-rc4-04445-g11f3c3a29d0-dirty #19 [ 17.600488] Hardware name: Freescale i.MX8QXP MEK (DT) [ 17.605633] pstate: 20000005 (nzCv daif -PAN -UAO -TCO BTYPE=--) [ 17.611662] pc : __clk_bulk_get+0x48/0x130 [ 17.615786] lr : clk_bulk_get+0x18/0x20 [ 17.619634] sp : ffff80001369b880 [ 17.622953] x29: ffff80001369b880 x28: 0000000000000013 [ 17.628277] x27: 0000000000000100 x26: ffff00080553b100 [ 17.633602] x25: ffff80001229b4d8 x24: 0000000000000000 [ 17.638928] x23: ffff000800665410 x22: 0000000000000005 [ 17.644275] x21: ffff8000092a5390 x20: ffff000800665400 [ 17.649605] x19: ffff000804e6f980 x18: 000000005b110000 [ 17.654946] x17: 0000000000000000 x16: 0000000000000000 [ 17.660274] x15: ffff800011989100 x14: 0000000000000000 [ 17.665599] x13: ffff800013ce1000 x12: ffff800013ca1000 [ 17.670924] x11: 000000005b110000 x10: 0000000000000000 [ 17.676249] x9 : ffff8000106c5a30 x8 : ffff000804e6fa00 [ 17.681575] x7 : 0000000000000000 x6 : 000000000000003f [ 17.686901] x5 : 0000000000000040 x4 : ffff80001369b8b0 [ 17.692228] x3 : ffff8000092a5398 x2 : ffff8000092a5390 [ 17.697574] x1 : ffff8000092a53e8 x0 : 0000000000000004 [ 17.702905] Call trace: [ 17.705366] __clk_bulk_get+0x48/0x130 [ 17.709125] clk_bulk_get+0x18/0x20 [ 17.712620] devm_clk_bulk_get+0x58/0xb8 [ 17.716563] cdns_imx_probe+0x84/0x1f0 [cdns3_imx] [ 17.721363] platform_drv_probe+0x58/0xa8 [ 17.725381] really_probe+0xec/0x4c8 [ 17.728967] driver_probe_device+0xf4/0x160 [ 17.733160] device_driver_attach+0x74/0x80 [ 17.737355] __driver_attach+0xa4/0x170 [ 17.741202] bus_for_each_dev+0x74/0xc8 [ 17.745043] driver_attach+0x28/0x30 [ 17.748620] bus_add_driver+0x144/0x228 [ 17.752462] driver_register+0x68/0x118 [ 17.756308] __platform_driver_register+0x4c/0x58 [ 17.761022] cdns_imx_driver_init+0x24/0x1000 [cdns3_imx] [ 17.766434] do_one_initcall+0x48/0x2c0 [ 17.770280] do_init_module+0x5c/0x220 [ 17.774029] load_module+0x210c/0x2858 [ 17.777784] __do_sys_finit_module+0xb8/0x120 [ 17.782148] __arm64_sys_finit_module+0x24/0x30 [ 17.786691] el0_svc_common.constprop.0+0x70/0x168 [ 17.791497] do_el0_svc+0x28/0x88 [ 17.794822] el0_sync_handler+0x158/0x160 [ 17.798833] el0_sync+0x140/0x180 [ 17.802158] Code: aa0203f5 91002043 8b205021 a90153f3 (f801047f) Cc: Fixes: 1e056efab993 ("usb: cdns3: add NXP imx8qm glue layer") Signed-off-by: Peter Chen --- drivers/usb/cdns3/cdns3-imx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/cdns3-imx.c b/drivers/usb/cdns3/cdns3-imx.c index 22a56c4dce67..4d3fedc86753 100644 --- a/drivers/usb/cdns3/cdns3-imx.c +++ b/drivers/usb/cdns3/cdns3-imx.c @@ -185,7 +185,11 @@ static int cdns_imx_probe(struct platform_device *pdev) } data->num_clks = ARRAY_SIZE(imx_cdns3_core_clks); - data->clks = (struct clk_bulk_data *)imx_cdns3_core_clks; + data->clks = devm_kmemdup(dev, imx_cdns3_core_clks, + sizeof(imx_cdns3_core_clks), GFP_KERNEL); + if (!data->clks) + return -ENOMEM; + ret = devm_clk_bulk_get(dev, data->num_clks, data->clks); if (ret) return ret; From 2ef02b846ee2526249a562a66d6dcb25fcbca9d8 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Thu, 10 Dec 2020 21:31:37 +0800 Subject: [PATCH 008/135] usb: cdns3: imx: fix can't create core device the second time issue The cdns3 core device is populated by calling of_platform_populate, the flag OF_POPULATED is set for core device node, if this flag is not cleared, when calling of_platform_populate the second time after loading parent module again, the OF code will not try to create platform device for core device. To fix it, it uses of_platform_depopulate to depopulate the core device which the parent created, and the flag OF_POPULATED for core device node will be cleared accordingly. Cc: Fixes: 1e056efab993 ("usb: cdns3: add NXP imx8qm glue layer") Signed-off-by: Peter Chen --- drivers/usb/cdns3/cdns3-imx.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/usb/cdns3/cdns3-imx.c b/drivers/usb/cdns3/cdns3-imx.c index 4d3fedc86753..6b358e8be579 100644 --- a/drivers/usb/cdns3/cdns3-imx.c +++ b/drivers/usb/cdns3/cdns3-imx.c @@ -218,20 +218,11 @@ static int cdns_imx_probe(struct platform_device *pdev) return ret; } -static int cdns_imx_remove_core(struct device *dev, void *data) -{ - struct platform_device *pdev = to_platform_device(dev); - - platform_device_unregister(pdev); - - return 0; -} - static int cdns_imx_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; - device_for_each_child(dev, NULL, cdns_imx_remove_core); + of_platform_depopulate(dev); platform_set_drvdata(pdev, NULL); return 0; From d1357119157c4662d43143885f3691f9a766369a Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Thu, 10 Dec 2020 21:33:21 +0800 Subject: [PATCH 009/135] usb: cdns3: imx: improve driver .remove API Keep the runtime active during the remove operation, and disable related clocks. Signed-off-by: Peter Chen --- drivers/usb/cdns3/cdns3-imx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/cdns3/cdns3-imx.c b/drivers/usb/cdns3/cdns3-imx.c index 6b358e8be579..7990fee03fe4 100644 --- a/drivers/usb/cdns3/cdns3-imx.c +++ b/drivers/usb/cdns3/cdns3-imx.c @@ -221,8 +221,13 @@ static int cdns_imx_probe(struct platform_device *pdev) static int cdns_imx_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct cdns_imx *data = dev_get_drvdata(dev); + pm_runtime_get_sync(dev); of_platform_depopulate(dev); + clk_bulk_disable_unprepare(data->num_clks, data->clks); + pm_runtime_disable(dev); + pm_runtime_put_noidle(dev); platform_set_drvdata(pdev, NULL); return 0; From 65403ff98ebb86caf498e020d572819bb61860ad Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 18 Dec 2020 12:57:36 +0200 Subject: [PATCH 010/135] MAINTAINERS: Update address for Cadence USB3 driver Updates my email address for Cadence USB3 driver. Signed-off-by: Roger Quadros Signed-off-by: Peter Chen --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..afe3a5c66bc9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3883,7 +3883,7 @@ F: drivers/mtd/nand/raw/cadence-nand-controller.c CADENCE USB3 DRD IP DRIVER M: Peter Chen M: Pawel Laszczak -M: Roger Quadros +R: Roger Quadros R: Aswath Govindraju L: linux-usb@vger.kernel.org S: Maintained From 764257d9069a9c19758b626cc1ba4ae079335d9e Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 30 Dec 2020 12:21:05 +0200 Subject: [PATCH 011/135] phy: cpcap-usb: Fix warning for missing regulator_disable On deferred probe, we will get the following splat: cpcap-usb-phy cpcap-usb-phy.0: could not initialize VBUS or ID IIO: -517 WARNING: CPU: 0 PID: 21 at drivers/regulator/core.c:2123 regulator_put+0x68/0x78 ... (regulator_put) from [] (release_nodes+0x1b4/0x1fc) (release_nodes) from [] (really_probe+0x104/0x4a0) (really_probe) from [] (driver_probe_device+0x58/0xb4) Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20201230102105.11826-1-tony@atomide.com Signed-off-by: Vinod Koul --- drivers/phy/motorola/phy-cpcap-usb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/phy/motorola/phy-cpcap-usb.c b/drivers/phy/motorola/phy-cpcap-usb.c index 442522ba487f..4728e2bff662 100644 --- a/drivers/phy/motorola/phy-cpcap-usb.c +++ b/drivers/phy/motorola/phy-cpcap-usb.c @@ -662,35 +662,42 @@ static int cpcap_usb_phy_probe(struct platform_device *pdev) generic_phy = devm_phy_create(ddata->dev, NULL, &ops); if (IS_ERR(generic_phy)) { error = PTR_ERR(generic_phy); - return PTR_ERR(generic_phy); + goto out_reg_disable; } phy_set_drvdata(generic_phy, ddata); phy_provider = devm_of_phy_provider_register(ddata->dev, of_phy_simple_xlate); - if (IS_ERR(phy_provider)) - return PTR_ERR(phy_provider); + if (IS_ERR(phy_provider)) { + error = PTR_ERR(phy_provider); + goto out_reg_disable; + } error = cpcap_usb_init_optional_pins(ddata); if (error) - return error; + goto out_reg_disable; cpcap_usb_init_optional_gpios(ddata); error = cpcap_usb_init_iio(ddata); if (error) - return error; + goto out_reg_disable; error = cpcap_usb_init_interrupts(pdev, ddata); if (error) - return error; + goto out_reg_disable; usb_add_phy_dev(&ddata->phy); atomic_set(&ddata->active, 1); schedule_delayed_work(&ddata->detect_work, msecs_to_jiffies(1)); return 0; + +out_reg_disable: + regulator_disable(ddata->vusb); + + return error; } static int cpcap_usb_phy_remove(struct platform_device *pdev) From d092bd9110494de3372722b317510b3692f1b2fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 14:55:17 +0100 Subject: [PATCH 012/135] phy: mediatek: allow compile-testing the dsi phy Randconfig builds show another broken dependency: WARNING: unmet direct dependencies detected for PHY_MTK_MIPI_DSI Depends on [n]: ARCH_MEDIATEK [=n] && OF [=y] Selected by [m]: - DRM_MEDIATEK [=m] && HAS_IOMEM [=y] && DRM [=m] && (ARCH_MEDIATEK [=n] || ARM [=y] && COMPILE_TEST [=y]) && COMMON_CLK [=y] && HAVE_ARM_SMCCC [=y] && OF [=y] && MTK_MMSYS [=y] This is similar to the hdmi driver I fixed earlier, and I guess the common-clk bug would sooner or later also manifest here, so just use the exact same solution I chose for the other driver, and hope that any future drivers just copy it from here. Fixes: 90f80d95992f ("phy: mediatek: Move mtk_mipi_dsi_phy driver into drivers/phy/mediatek folder") Fixes: f5f6e01f9164 ("phy: mediatek: allow compile-testing the hdmi phy") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210103135524.3678664-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/phy/mediatek/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/phy/mediatek/Kconfig b/drivers/phy/mediatek/Kconfig index d38def43b1bf..55f8e6c048ab 100644 --- a/drivers/phy/mediatek/Kconfig +++ b/drivers/phy/mediatek/Kconfig @@ -49,7 +49,9 @@ config PHY_MTK_HDMI config PHY_MTK_MIPI_DSI tristate "MediaTek MIPI-DSI Driver" - depends on ARCH_MEDIATEK && OF + depends on ARCH_MEDIATEK || COMPILE_TEST + depends on COMMON_CLK + depends on OF select GENERIC_PHY help Support MIPI DSI for Mediatek SoCs. From 4cc99d03757df10a4064ba28bf6021406b04d6a9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 5 Jan 2021 10:56:51 +0800 Subject: [PATCH 013/135] irqchip/loongson-liointc: Fix build warnings Fix build warnings as below: >> drivers/irqchip/irq-loongson-liointc.c:134:12: warning: no previous prototype for 'liointc_of_init' [-Wmissing-prototypes] 134 | int __init liointc_of_init(struct device_node *node, | ^~~~~~~~~~~~~~~ Fixes: dbb152267908c4b2c3639492a94 ("irqchip: Add driver for Loongson I/O Local Interrupt Controller") Reported-by: kernel test robot Signed-off-by: Huacai Chen Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210105025651.772024-1-chenhuacai@loongson.cn --- drivers/irqchip/irq-loongson-liointc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-loongson-liointc.c b/drivers/irqchip/irq-loongson-liointc.c index 9ed1bc473663..09b91b81851c 100644 --- a/drivers/irqchip/irq-loongson-liointc.c +++ b/drivers/irqchip/irq-loongson-liointc.c @@ -142,8 +142,8 @@ static void liointc_resume(struct irq_chip_generic *gc) static const char * const parent_names[] = {"int0", "int1", "int2", "int3"}; -int __init liointc_of_init(struct device_node *node, - struct device_node *parent) +static int __init liointc_of_init(struct device_node *node, + struct device_node *parent) { struct irq_chip_generic *gc; struct irq_domain *domain; From f477a538c14d07f8c45e554c8c5208d588514e98 Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Thu, 17 Sep 2020 18:45:48 +0300 Subject: [PATCH 014/135] sh: dma: fix kconfig dependency for G2_DMA When G2_DMA is enabled and SH_DMA is disabled, it results in the following Kbuild warning: WARNING: unmet direct dependencies detected for SH_DMA_API Depends on [n]: SH_DMA [=n] Selected by [y]: - G2_DMA [=y] && SH_DREAMCAST [=y] The reason is that G2_DMA selects SH_DMA_API without depending on or selecting SH_DMA while SH_DMA_API depends on SH_DMA. When G2_DMA was first introduced with commit 40f49e7ed77f ("sh: dma: Make G2 DMA configurable."), this wasn't an issue since SH_DMA_API didn't have such dependency, and this way was the only way to enable it since SH_DMA_API was non-visible. However, later SH_DMA_API was made visible and dependent on SH_DMA with commit d8902adcc1a9 ("dmaengine: sh: Add Support SuperH DMA Engine driver"). Let G2_DMA depend on SH_DMA_API instead to avoid Kbuild issues. Fixes: d8902adcc1a9 ("dmaengine: sh: Add Support SuperH DMA Engine driver") Signed-off-by: Necip Fazil Yildiran Signed-off-by: Rich Felker --- arch/sh/drivers/dma/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig index d0de378beefe..7d54f284ce10 100644 --- a/arch/sh/drivers/dma/Kconfig +++ b/arch/sh/drivers/dma/Kconfig @@ -63,8 +63,7 @@ config PVR2_DMA config G2_DMA tristate "G2 Bus DMA support" - depends on SH_DREAMCAST - select SH_DMA_API + depends on SH_DREAMCAST && SH_DMA_API help This enables support for the DMA controller for the Dreamcast's G2 bus. Drivers that want this will generally enable this on From 7fb0a1a5e56779c427b409d6e53889d46519755e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 17 Sep 2020 19:14:04 -0700 Subject: [PATCH 015/135] arch/sh: hyphenate Non-Uniform in Kconfig prompt Hyphenate Non-Uniform in the NUMA kconfig prompt. Signed-off-by: Randy Dunlap Cc: Yoshinori Sato Cc: Rich Felker Cc: linux-sh@vger.kernel.org Cc: Jiri Kosina Signed-off-by: Rich Felker --- arch/sh/mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 703d3069997c..77aa2f802d8d 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -105,7 +105,7 @@ config VSYSCALL (the default value) say Y. config NUMA - bool "Non Uniform Memory Access (NUMA) Support" + bool "Non-Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n From 5c5dc5f8dccbafaacc8c97bbe7762986bdda6f63 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 19 Sep 2020 10:52:06 +0800 Subject: [PATCH 016/135] sh: intc: Convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Qinglang Miao Signed-off-by: Rich Felker --- drivers/sh/intc/virq-debugfs.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/sh/intc/virq-debugfs.c b/drivers/sh/intc/virq-debugfs.c index 9e62ba9311f0..939915a07d99 100644 --- a/drivers/sh/intc/virq-debugfs.c +++ b/drivers/sh/intc/virq-debugfs.c @@ -16,7 +16,7 @@ #include #include "internals.h" -static int intc_irq_xlate_debug(struct seq_file *m, void *priv) +static int intc_irq_xlate_show(struct seq_file *m, void *priv) { int i; @@ -37,17 +37,7 @@ static int intc_irq_xlate_debug(struct seq_file *m, void *priv) return 0; } -static int intc_irq_xlate_open(struct inode *inode, struct file *file) -{ - return single_open(file, intc_irq_xlate_debug, inode->i_private); -} - -static const struct file_operations intc_irq_xlate_fops = { - .open = intc_irq_xlate_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(intc_irq_xlate); static int __init intc_irq_xlate_init(void) { From a1153636e904faf2b30fae3fb6ee3f4f4d0175c8 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Sat, 19 Sep 2020 10:52:05 +0800 Subject: [PATCH 017/135] sh: mm: Convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Qinglang Miao Signed-off-by: Rich Felker --- arch/sh/mm/asids-debugfs.c | 15 ++------------- arch/sh/mm/cache-debugfs.c | 15 ++------------- arch/sh/mm/pmb.c | 15 ++------------- 3 files changed, 6 insertions(+), 39 deletions(-) diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index 4c1ca197e9c5..d16d6f5ec774 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -26,7 +26,7 @@ #include #include -static int asids_seq_show(struct seq_file *file, void *iter) +static int asids_debugfs_show(struct seq_file *file, void *iter) { struct task_struct *p; @@ -48,18 +48,7 @@ static int asids_seq_show(struct seq_file *file, void *iter) return 0; } -static int asids_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, asids_seq_show, inode->i_private); -} - -static const struct file_operations asids_debugfs_fops = { - .owner = THIS_MODULE, - .open = asids_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(asids_debugfs); static int __init asids_debugfs_init(void) { diff --git a/arch/sh/mm/cache-debugfs.c b/arch/sh/mm/cache-debugfs.c index 17d780794497..b0f185169dfa 100644 --- a/arch/sh/mm/cache-debugfs.c +++ b/arch/sh/mm/cache-debugfs.c @@ -22,7 +22,7 @@ enum cache_type { CACHE_TYPE_UNIFIED, }; -static int cache_seq_show(struct seq_file *file, void *iter) +static int cache_debugfs_show(struct seq_file *file, void *iter) { unsigned int cache_type = (unsigned int)file->private; struct cache_info *cache; @@ -94,18 +94,7 @@ static int cache_seq_show(struct seq_file *file, void *iter) return 0; } -static int cache_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, cache_seq_show, inode->i_private); -} - -static const struct file_operations cache_debugfs_fops = { - .owner = THIS_MODULE, - .open = cache_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(cache_debugfs); static int __init cache_debugfs_init(void) { diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index b20aba6e1b37..68eb7cc6e564 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -812,7 +812,7 @@ bool __in_29bit_mode(void) return (__raw_readl(PMB_PASCR) & PASCR_SE) == 0; } -static int pmb_seq_show(struct seq_file *file, void *iter) +static int pmb_debugfs_show(struct seq_file *file, void *iter) { int i; @@ -846,18 +846,7 @@ static int pmb_seq_show(struct seq_file *file, void *iter) return 0; } -static int pmb_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, pmb_seq_show, NULL); -} - -static const struct file_operations pmb_debugfs_fops = { - .owner = THIS_MODULE, - .open = pmb_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(pmb_debugfs); static int __init pmb_debugfs_init(void) { From b7aaf16d10bd9f1fbc5beefb9496e029fd1424ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Sep 2020 06:31:39 +0200 Subject: [PATCH 018/135] sh: remove CONFIG_IDE from most defconfig Remove CONFIG_IDE from defconfigs that did not actually select chipset drivers, and switch ones that have libata drivers to libata. Signed-off-by: Christoph Hellwig Signed-off-by: Rich Felker --- arch/sh/configs/landisk_defconfig | 9 ++++----- arch/sh/configs/microdev_defconfig | 2 -- arch/sh/configs/sdk7780_defconfig | 6 ++---- arch/sh/configs/sdk7786_defconfig | 3 --- arch/sh/configs/se7750_defconfig | 1 - arch/sh/configs/sh03_defconfig | 3 --- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig index ba6ec042606f..e6c5ddf070c0 100644 --- a/arch/sh/configs/landisk_defconfig +++ b/arch/sh/configs/landisk_defconfig @@ -27,13 +27,12 @@ CONFIG_NETFILTER=y CONFIG_ATALK=m CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_OFFBOARD=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_AEC62XX=y +CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_ATP867X=y CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y CONFIG_SCSI_MULTI_LUN=y CONFIG_MD=y CONFIG_BLK_DEV_MD=m diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig index c65667d00313..e9825196dd66 100644 --- a/arch/sh/configs/microdev_defconfig +++ b/arch/sh/configs/microdev_defconfig @@ -20,8 +20,6 @@ CONFIG_IP_PNP=y # CONFIG_IPV6 is not set # CONFIG_FW_LOADER is not set CONFIG_BLK_DEV_RAM=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y CONFIG_NETDEVICES=y CONFIG_NET_ETHERNET=y CONFIG_SMC91X=y diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index d10a0414123a..d00376eb044f 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -44,16 +44,14 @@ CONFIG_NET_SCHED=y CONFIG_PARPORT=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_PLATFORM=y -CONFIG_BLK_DEV_GENERIC=y CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_CHR_DEV_SG=y CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_FC_ATTRS=y CONFIG_ATA=y +CONFIG_ATA_GENERIC=y +CONFIG_PATA_PLATFORM=y CONFIG_MD=y CONFIG_BLK_DEV_DM=y CONFIG_NETDEVICES=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 61bec46ebd66..4a44cac640bc 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -116,9 +116,6 @@ CONFIG_MTD_UBI_GLUEBI=m CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_RAM=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_PLATFORM=y CONFIG_BLK_DEV_SD=y CONFIG_BLK_DEV_SR=y CONFIG_SCSI_MULTI_LUN=y diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig index 3f1c13799d79..4defc7628a49 100644 --- a/arch/sh/configs/se7750_defconfig +++ b/arch/sh/configs/se7750_defconfig @@ -29,7 +29,6 @@ CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_AMDSTD=y CONFIG_MTD_ROM=y -CONFIG_IDE=y CONFIG_SCSI=y CONFIG_NETDEVICES=y CONFIG_NET_ETHERNET=y diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig index f0073ed39947..48b457d59e79 100644 --- a/arch/sh/configs/sh03_defconfig +++ b/arch/sh/configs/sh03_defconfig @@ -39,9 +39,6 @@ CONFIG_IP_PNP_RARP=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_NBD=y CONFIG_BLK_DEV_RAM=y -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=m -CONFIG_BLK_DEV_IDETAPE=m CONFIG_SCSI=m CONFIG_BLK_DEV_SD=m CONFIG_BLK_DEV_SR=m From 19170492735be935747b0545b7eed8bb40cc1209 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Mon, 12 Oct 2020 11:50:24 +0800 Subject: [PATCH 019/135] sh: Remove unused HAVE_COPY_THREAD_TLS macro Fixes: e1cc9d8d596e ("sh: switch to copy_thread_tls()") Signed-off-by: Jinyang He Signed-off-by: Rich Felker --- arch/sh/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5fa580219a86..52646f52f130 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -29,7 +29,6 @@ config SUPERH select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK - select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DYNAMIC_FTRACE From 542baf5108e052684c3abdeea57861f12f89a6b9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 12 Oct 2020 18:40:50 +0300 Subject: [PATCH 020/135] sh: Drop ARCH_NR_GPIOS definition The default by generic header is the same, hence drop unnecessary definition. Signed-off-by: Andy Shevchenko Signed-off-by: Rich Felker --- arch/sh/include/asm/gpio.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/include/asm/gpio.h b/arch/sh/include/asm/gpio.h index 351918894e86..d643250f0a0f 100644 --- a/arch/sh/include/asm/gpio.h +++ b/arch/sh/include/asm/gpio.h @@ -16,7 +16,6 @@ #include #endif -#define ARCH_NR_GPIOS 512 #include #ifdef CONFIG_GPIOLIB From 7a202ec74c151e30edc1d17e3209fe6d6fe50eee Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 9 Nov 2020 10:45:51 +0800 Subject: [PATCH 021/135] arch: sh: remove duplicate include Remove duplicate header which is included twice. Signed-off-by: Wang Qing Signed-off-by: Rich Felker --- arch/sh/kernel/cpu/sh3/entry.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S index 25eb80905416..e48b3dd996f5 100644 --- a/arch/sh/kernel/cpu/sh3/entry.S +++ b/arch/sh/kernel/cpu/sh3/entry.S @@ -14,7 +14,6 @@ #include #include #include -#include ! NOTE: ! GNU as (as of 2.9.1) changes bf/s into bt/s and bra, when the address From a118584e7e60fa72ee441055b33b41c3354dba7e Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 9 Nov 2020 10:35:01 +0800 Subject: [PATCH 022/135] sh: mach-sh03: remove duplicate include Remove duplicate header which is included twice. Signed-off-by: Wang Qing Signed-off-by: Rich Felker --- arch/sh/boards/mach-sh03/rtc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/boards/mach-sh03/rtc.c b/arch/sh/boards/mach-sh03/rtc.c index 8b23ed7c201c..7fb474844a2d 100644 --- a/arch/sh/boards/mach-sh03/rtc.c +++ b/arch/sh/boards/mach-sh03/rtc.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From b89bc060b53e7054e5c8ca11feea4bc884d83611 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Nov 2020 16:49:39 +0100 Subject: [PATCH 023/135] sh/intc: Restore devm_ioremap() alignment Restore alignment of the continuation of the devm_ioremap() call in register_intc_controller(). Fixes: 4bdc0d676a643140 ("remove ioremap_nocache and devm_ioremap_nocache") Signed-off-by: Geert Uytterhoeven Signed-off-by: Rich Felker --- drivers/sh/intc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c index f8e070d67fa3..a14684ffe4c1 100644 --- a/drivers/sh/intc/core.c +++ b/drivers/sh/intc/core.c @@ -214,7 +214,7 @@ int __init register_intc_controller(struct intc_desc *desc) d->window[k].phys = res->start; d->window[k].size = resource_size(res); d->window[k].virt = ioremap(res->start, - resource_size(res)); + resource_size(res)); if (!d->window[k].virt) goto err2; } From c14556fc0c7c115ffb4a287560e1ec9f7869aac3 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 17 Dec 2020 12:39:11 +0300 Subject: [PATCH 024/135] thunderbolt: Drop duplicated 0x prefix from format string The tb_dbg() call is using %#x that already adds the 0x prefix so don't duplicate it. Fixes: 9039387e166e ("thunderbolt: Add USB4 router operation proxy for firmware connection manager") Signed-off-by: Mika Westerberg Reviewed-by: Yehezkel Bernat --- drivers/thunderbolt/icm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thunderbolt/icm.c b/drivers/thunderbolt/icm.c index 8b7f941a9bb7..b8c4159bc32d 100644 --- a/drivers/thunderbolt/icm.c +++ b/drivers/thunderbolt/icm.c @@ -2316,7 +2316,7 @@ static int icm_usb4_switch_nvm_authenticate_status(struct tb_switch *sw, if (auth && auth->reply.route_hi == sw->config.route_hi && auth->reply.route_lo == sw->config.route_lo) { - tb_dbg(tb, "NVM_AUTH found for %llx flags 0x%#x status %#x\n", + tb_dbg(tb, "NVM_AUTH found for %llx flags %#x status %#x\n", tb_route(sw), auth->reply.hdr.flags, auth->reply.status); if (auth->reply.hdr.flags & ICM_FLAGS_ERROR) ret = -EIO; From 491b1bea00040233b791dc8fea1608ac6a7003bc Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Fri, 8 Jan 2021 08:35:30 +0800 Subject: [PATCH 025/135] MAINTAINERS: update Peter Chen's email address Using kernel.org as my email address. Signed-off-by: Peter Chen --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index afe3a5c66bc9..c16a2e83e176 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3881,7 +3881,7 @@ F: Documentation/devicetree/bindings/mtd/cadence-nand-controller.txt F: drivers/mtd/nand/raw/cadence-nand-controller.c CADENCE USB3 DRD IP DRIVER -M: Peter Chen +M: Peter Chen M: Pawel Laszczak R: Roger Quadros R: Aswath Govindraju @@ -4163,7 +4163,7 @@ S: Maintained F: Documentation/translations/zh_CN/ CHIPIDEA USB HIGH SPEED DUAL ROLE CONTROLLER -M: Peter Chen +M: Peter Chen L: linux-usb@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/peter.chen/usb.git @@ -18404,7 +18404,7 @@ F: Documentation/usb/ohci.rst F: drivers/usb/host/ohci* USB OTG FSM (Finite State Machine) -M: Peter Chen +M: Peter Chen L: linux-usb@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/peter.chen/usb.git From d0243bbd5dd3ebbd49dafa8b56bb911d971131d0 Mon Sep 17 00:00:00 2001 From: Meng Li Date: Tue, 5 Jan 2021 15:09:27 +0800 Subject: [PATCH 026/135] drivers core: Free dma_range_map when driver probe failed There will be memory leak if driver probe failed. Trace as below: backtrace: [<000000002415258f>] kmemleak_alloc+0x3c/0x50 [<00000000f447ebe4>] __kmalloc+0x208/0x530 [<0000000048bc7b3a>] of_dma_get_range+0xe4/0x1b0 [<0000000041e39065>] of_dma_configure_id+0x58/0x27c [<000000006356866a>] platform_dma_configure+0x2c/0x40 ...... [<000000000afcf9b5>] ret_from_fork+0x10/0x3c This issue is introduced by commit e0d072782c73("dma-mapping: introduce DMA range map, supplanting dma_pfn_offset "). It doesn't free dma_range_map when driver probe failed and cause above memory leak. So, add code to free it in error path. Fixes: e0d072782c73 ("dma-mapping: introduce DMA range map, supplanting dma_pfn_offset ") Cc: stable@vger.kernel.org Signed-off-by: Meng Li Link: https://lore.kernel.org/r/20210105070927.14968-1-Meng.Li@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 2f32f38a11ed..0b76b5423778 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -619,6 +619,8 @@ static int really_probe(struct device *dev, struct device_driver *drv) else if (drv->remove) drv->remove(dev); probe_failed: + kfree(dev->dma_range_map); + dev->dma_range_map = NULL; if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); From 29f7c54b253fc18bff9bf7e9f303b75deb285c7a Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 21 Dec 2020 22:30:55 +0800 Subject: [PATCH 027/135] Driver core: platform: Add extra error check in devm_platform_get_irqs_affinity() The current check of nvec < minvec for nvec returned from platform_irq_count() will not detect a negative error code in nvec. This is because minvec is unsigned, and, as such, nvec is promoted to unsigned in that check, which will make it a huge number (if it contained -EPROBE_DEFER). In practice, an error should not occur in nvec for the only in-tree user, but add a check anyway. Fixes: e15f2fa959f2 ("driver core: platform: Add devm_platform_get_irqs_affinity()") Reported-by: Dan Carpenter Signed-off-by: John Garry Link: https://lore.kernel.org/r/1608561055-231244-1-git-send-email-john.garry@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 95fd1549f87d..8456d8384ac8 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -366,6 +366,8 @@ int devm_platform_get_irqs_affinity(struct platform_device *dev, return -ERANGE; nvec = platform_irq_count(dev); + if (nvec < 0) + return nvec; if (nvec < minvec) return -ENOSPC; From b8e594fa20d2e33d40c7a8c7c106549a35c38972 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Fri, 8 Jan 2021 10:29:01 -0600 Subject: [PATCH 028/135] irqchip/pruss: Simplify the TI_PRUSS_INTC Kconfig The TI PRUSS INTC irqchip driver handles the local interrupt controller which is a child device of it's parent PRUSS/ICSSG device. The driver was upstreamed in parallel with the PRUSS platform driver, and was configurable independently previously. The PRUSS interrupt controller is an integral part of the overall PRUSS software architecture, and is not useful at all by itself. Simplify the TI_PRUSS_INTC Kconfig dependencies by making it silent and selected automatically when the TI_PRUSS platform driver is enabled. Signed-off-by: Suman Anna Reviewed-by: David Lechner Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210108162901.6003-1-s-anna@ti.com --- drivers/irqchip/Kconfig | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 94920a51c628..b147f22a78f4 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -493,8 +493,9 @@ config TI_SCI_INTA_IRQCHIP TI System Controller, say Y here. Otherwise, say N. config TI_PRUSS_INTC - tristate "TI PRU-ICSS Interrupt Controller" - depends on ARCH_DAVINCI || SOC_AM33XX || SOC_AM43XX || SOC_DRA7XX || ARCH_KEYSTONE || ARCH_K3 + tristate + depends on TI_PRUSS + default TI_PRUSS select IRQ_DOMAIN help This enables support for the PRU-ICSS Local Interrupt Controller From 599b3063adf4bf041a87a69244ee36aded0d878f Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Thu, 7 Jan 2021 22:36:03 +0100 Subject: [PATCH 029/135] irqchip/mips-cpu: Set IPI domain parent chip Since commit 55567976629e ("genirq/irqdomain: Allow partial trimming of irq_data hierarchy") the irq_data chain is valided. The irq_domain_trim_hierarchy() function doesn't consider the irq + ipi domain hierarchy as valid, since the ipi domain has the irq domain set as parent, but the parent domain has no chip set. Hence the boot ends in a kernel panic. Set the chip for the parent domain as it is done in the mips gic irq driver, to have a valid irq_data chain. Fixes: 3838a547fda2 ("irqchip: mips-cpu: Introduce IPI IRQ domain support") Cc: # v5.10+ Signed-off-by: Mathias Kresin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210107213603.1637781-1-dev@kresin.me --- drivers/irqchip/irq-mips-cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/irqchip/irq-mips-cpu.c b/drivers/irqchip/irq-mips-cpu.c index 95d4fd8f7a96..0bbb0b2d0dd5 100644 --- a/drivers/irqchip/irq-mips-cpu.c +++ b/drivers/irqchip/irq-mips-cpu.c @@ -197,6 +197,13 @@ static int mips_cpu_ipi_alloc(struct irq_domain *domain, unsigned int virq, if (ret) return ret; + ret = irq_domain_set_hwirq_and_chip(domain->parent, virq + i, hwirq, + &mips_mt_cpu_irq_controller, + NULL); + + if (ret) + return ret; + ret = irq_set_irq_type(virq + i, IRQ_TYPE_LEVEL_HIGH); if (ret) return ret; From adc5d8757288a3a5628436d16e78fb696d802e39 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 7 Dec 2020 01:02:52 +0100 Subject: [PATCH 030/135] signal: Add missing __user annotation to copy_siginfo_from_user_any copy_siginfo_from_user_any() takes a userspace pointer as second argument; annotate the parameter type accordingly. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20201207000252.138564-1-jannh@google.com Signed-off-by: Christian Brauner --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index 5736c55aaa1a..546b860c6514 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3701,7 +3701,8 @@ static bool access_pidfd_pidns(struct pid *pid) return true; } -static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, + siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* From 96e1e9846c6691f90009ae4d8e486e0ce5c628a7 Mon Sep 17 00:00:00 2001 From: Alexander Guril Date: Sat, 26 Dec 2020 12:40:21 +0100 Subject: [PATCH 031/135] Kernel: fork.c: Fix coding style: Do not use {} around single-line statements Fixed two coding style issues in kernel/fork.c Do not use {} around single-line statements. Cc: linux-kernel@vger.kernel.org Acked-by: Christian Brauner Signed-off-by: Alexander Guril Link: https://lore.kernel.org/r/20201226114021.2589-1-alexander.guril02@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 37720a6d04ea..d66cd1014211 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,9 +819,8 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - } #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", @@ -1654,9 +1653,8 @@ static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; - for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); - } } static inline void From cb5021ca622fe83923e0789f99fe7227cbcd3f68 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Mon, 11 Jan 2021 18:48:07 +0800 Subject: [PATCH 032/135] kthread: remove comments about old _do_fork() helper The old _do_fork() helper has been removed in favor of kernel_clone(). Here correct some comments which still contain _do_fork() Link: https://lore.kernel.org/r/20210111104807.18022-1-yanfei.xu@windriver.com Cc: christian@brauner.io Cc: linux-kernel@vger.kernel.org Acked-by: Christian Brauner Signed-off-by: Yanfei Xu Signed-off-by: Christian Brauner --- include/trace/events/sched.h | 2 +- kernel/kthread.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5039af667645..cbe3e152d24c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -366,7 +366,7 @@ TRACE_EVENT(sched_process_wait, ); /* - * Tracepoint for do_fork: + * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, diff --git a/kernel/kthread.c b/kernel/kthread.c index a5eceecd4513..fb9c3dcbb68d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -294,7 +294,7 @@ static int kthread(void *_create) do_exit(ret); } -/* called from do_fork() to get node information for about to be created task */ +/* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA From 7024f60d655272bd2ca1d3a4c9e0a63319b1eea1 Mon Sep 17 00:00:00 2001 From: "Hyunwook (Wooky) Baek" Date: Sat, 9 Jan 2021 23:11:02 -0800 Subject: [PATCH 033/135] x86/sev-es: Handle string port IO to kernel memory properly Don't assume dest/source buffers are userspace addresses when manually copying data for string I/O or MOVS MMIO, as {get,put}_user() will fail if handed a kernel address and ultimately lead to a kernel panic. When invoking INSB/OUTSB instructions in kernel space in a SEV-ES-enabled VM, the kernel crashes with the following message: "SEV-ES: Unsupported exception in #VC instruction emulation - can't continue" Handle that case properly. [ bp: Massage commit message. ] Fixes: f980f9c31a92 ("x86/sev-es: Compile early handler code into kernel image") Signed-off-by: Hyunwook (Wooky) Baek Signed-off-by: Borislav Petkov Acked-by: David Rientjes Link: https://lkml.kernel.org/r/20210110071102.2576186-1-baekhw@google.com --- arch/x86/kernel/sev-es.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 0bd1a0fc587e..ab31c34ba508 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -286,6 +286,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(target, size)) { + memcpy(dst, buf, size); + return ES_OK; + } + switch (size) { case 1: memcpy(&d1, buf, 1); @@ -335,6 +341,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(s, size)) { + memcpy(buf, src, size); + return ES_OK; + } + switch (size) { case 1: if (get_user(d1, s)) From 76e2fc63ca40977af893b724b00cc2f8e9ce47a4 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 11 Jan 2021 11:04:29 +0100 Subject: [PATCH 034/135] x86/cpu/amd: Set __max_die_per_package on AMD Set the maximum DIE per package variable on AMD using the NodesPerProcessor topology value. This will be used by RAPL, among others, to determine the maximum number of DIEs on the system in order to do per-DIE manipulations. [ bp: Productize into a proper patch. ] Fixes: 028c221ed190 ("x86/CPU/AMD: Save AMD NodeId as cpu_die_id") Reported-by: Johnathan Smithinovic Reported-by: Rafael Kitover Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov Tested-by: Johnathan Smithinovic Tested-by: Rafael Kitover Link: https://bugzilla.kernel.org/show_bug.cgi?id=210939 Link: https://lkml.kernel.org/r/20210106112106.GE5729@zn.tnic Link: https://lkml.kernel.org/r/20210111101455.1194-1-bp@alien8.de --- arch/x86/kernel/cpu/amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f8ca66f3d861..347a956f71ca 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -542,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && From a9d4ef643430d638de1910377f50e0d492d85a43 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 11 Jan 2021 13:49:38 +0200 Subject: [PATCH 035/135] habanalabs: fix dma_addr passed to dma_mmap_coherent When doing dma_alloc_coherent in the driver, we add a certain hard-coded offset to the DMA address before returning to the callee function. This offset is needed when our device use this DMA address to perform outbound transactions to the host. However, if we want to map the DMA'able memory to the user via dma_mmap_coherent(), we need to pass the original dma address, without this offset. Otherwise, we will get erronouos mapping. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 3 ++- drivers/misc/habanalabs/goya/goya.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 8c09e4466af8..b328ddaa64ee 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -4002,7 +4002,8 @@ static int gaudi_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; - rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size); + rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, + (dma_addr - HOST_PHYS_BASE), size); if (rc) dev_err(hdev->dev, "dma_mmap_coherent error %d", rc); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index b8b4aa636b7c..63679a747d2c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -2719,7 +2719,8 @@ static int goya_cb_mmap(struct hl_device *hdev, struct vm_area_struct *vma, vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; - rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, dma_addr, size); + rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, + (dma_addr - HOST_PHYS_BASE), size); if (rc) dev_err(hdev->dev, "dma_mmap_coherent error %d", rc); From aa6df6533b8f9ead98889baa92e2b19793b1c77e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 11 Jan 2021 15:00:38 +0200 Subject: [PATCH 036/135] habanalabs: fix reset process in case of failures There are some points in the reset process where if the code fails for some reason, and the system admin tries to initiate the reset process again we will get a kernel panic. This is because there aren't any protections in different fini functions that are called during the reset process. The protections that are added in this patch make sure that if the fini functions are called multiple times, without calling init functions between them, there won't be double release of already released resources. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 2 +- drivers/misc/habanalabs/common/mmu_v1.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1456eabf9601..1ea57d86caa3 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1037,7 +1037,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, if (hard_reset) { /* Release kernel context */ - if (hl_ctx_put(hdev->kernel_ctx) == 1) + if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) hdev->kernel_ctx = NULL; hl_vm_fini(hdev); hl_mmu_fini(hdev); diff --git a/drivers/misc/habanalabs/common/mmu_v1.c b/drivers/misc/habanalabs/common/mmu_v1.c index 2ce6ea89d4fa..06d8a44dd5d4 100644 --- a/drivers/misc/habanalabs/common/mmu_v1.c +++ b/drivers/misc/habanalabs/common/mmu_v1.c @@ -467,8 +467,16 @@ static void hl_mmu_v1_fini(struct hl_device *hdev) { /* MMU H/W fini was already done in device hw_fini() */ - kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.hr.mmu_shadow_hop0)) { + kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + } + + /* Make sure that if we arrive here again without init was called we + * won't cause kernel panic. This can happen for example if we fail + * during hard reset code at certain points + */ + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; } /** From 9488307a5559255f2fc9a3ab61e1c31e243ca7c6 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 11 Jan 2021 17:49:30 +0200 Subject: [PATCH 037/135] habanalabs: prevent soft lockup during unmap When using Deep learning framework such as tensorflow or pytorch, there are tens of thousands of host memory mappings. When the user frees all those mappings at the same time, the process of unmapping and unpinning them can take a long time, which may cause a soft lockup bug. To prevent this, we need to free the core to do other things during the unmapping process. For now, we chose to do it every 32K unmappings (each unmap is a single 4K page). Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 1 + drivers/misc/habanalabs/common/memory.c | 10 ++++++++-- drivers/misc/habanalabs/common/mmu.c | 6 +++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index e0d7f5fbaa5c..60e16dc4bcac 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2182,6 +2182,7 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops); +bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, void __iomem *dst, u32 src_offset, u32 size); diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index cbe9da4e0211..5d4fbdcaefe3 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -886,8 +886,10 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, { struct hl_device *hdev = ctx->hdev; u64 next_vaddr, i; + bool is_host_addr; u32 page_size; + is_host_addr = !hl_is_dram_va(hdev, vaddr); page_size = phys_pg_pack->page_size; next_vaddr = vaddr; @@ -900,9 +902,13 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, /* * unmapping on Palladium can be really long, so avoid a CPU * soft lockup bug by sleeping a little between unmapping pages + * + * In addition, when unmapping host memory we pass through + * the Linux kernel to unpin the pages and that takes a long + * time. Therefore, sleep every 32K pages to avoid soft lockup */ - if (hdev->pldm) - usleep_range(500, 1000); + if (hdev->pldm || (is_host_addr && (i & 0x7FFF) == 0)) + usleep_range(50, 200); } } diff --git a/drivers/misc/habanalabs/common/mmu.c b/drivers/misc/habanalabs/common/mmu.c index 33ae953d3a36..28a4638741d8 100644 --- a/drivers/misc/habanalabs/common/mmu.c +++ b/drivers/misc/habanalabs/common/mmu.c @@ -9,7 +9,7 @@ #include "habanalabs.h" -static bool is_dram_va(struct hl_device *hdev, u64 virt_addr) +bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -156,7 +156,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, if (!hdev->mmu_enable) return 0; - is_dram_addr = is_dram_va(hdev, virt_addr); + is_dram_addr = hl_is_dram_va(hdev, virt_addr); if (is_dram_addr) mmu_prop = &prop->dmmu; @@ -236,7 +236,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, if (!hdev->mmu_enable) return 0; - is_dram_addr = is_dram_va(hdev, virt_addr); + is_dram_addr = hl_is_dram_va(hdev, virt_addr); if (is_dram_addr) mmu_prop = &prop->dmmu; From 280a9045bb18833db921b316a5527d2b565e9f2e Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Sun, 10 Jan 2021 20:36:09 +0300 Subject: [PATCH 038/135] ehci: fix EHCI host controller initialization sequence According to EHCI spec, EHCI HC clears USBSTS.HCHalted whenever USBCMD.RS=1. However, it is a good practice to wait some time after setting USBCMD.RS (approximately 100ms) until USBSTS.HCHalted become zero. Without this waiting, VirtualBox's EHCI virtual HC accidentally hangs (see BugLink). BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=211095 Acked-by: Alan Stern Signed-off-by: Eugene Korenevsky Cc: stable Link: https://lore.kernel.org/r/20210110173609.GA17313@himera.home Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hcd.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index e358ae17d51e..1926b328b6aa 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -574,6 +574,7 @@ static int ehci_run (struct usb_hcd *hcd) struct ehci_hcd *ehci = hcd_to_ehci (hcd); u32 temp; u32 hcc_params; + int rc; hcd->uses_new_polling = 1; @@ -629,9 +630,20 @@ static int ehci_run (struct usb_hcd *hcd) down_write(&ehci_cf_port_reset_rwsem); ehci->rh_state = EHCI_RH_RUNNING; ehci_writel(ehci, FLAG_CF, &ehci->regs->configured_flag); + + /* Wait until HC become operational */ ehci_readl(ehci, &ehci->regs->command); /* unblock posted writes */ msleep(5); + rc = ehci_handshake(ehci, &ehci->regs->status, STS_HALT, 0, 100 * 1000); + up_write(&ehci_cf_port_reset_rwsem); + + if (rc) { + ehci_err(ehci, "USB %x.%x, controller refused to start: %d\n", + ((ehci->sbrn & 0xf0)>>4), (ehci->sbrn & 0x0f), rc); + return rc; + } + ehci->last_periodic_enable = ktime_get_real(); temp = HC_VERSION(ehci, ehci_readl(ehci, &ehci->caps->hc_capbase)); From 643a4df7fe3f6831d14536fd692be85f92670a52 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Tue, 12 Jan 2021 09:57:27 +0800 Subject: [PATCH 039/135] USB: ehci: fix an interrupt calltrace error The system that use Synopsys USB host controllers goes to suspend when using USB audio player. This causes the USB host controller continuous send interrupt signal to system, When the number of interrupts exceeds 100000, the system will forcibly close the interrupts and output a calltrace error. When the system goes to suspend, the last interrupt is reported to the driver. At this time, the system has set the state to suspend. This causes the last interrupt to not be processed by the system and not clear the interrupt flag. This uncleared interrupt flag constantly triggers new interrupt event. This causing the driver to receive more than 100,000 interrupts, which causes the system to forcibly close the interrupt report and report the calltrace error. so, when the driver goes to sleep and changes the system state to suspend, the interrupt flag needs to be cleared. Signed-off-by: Longfang Liu Acked-by: Alan Stern Link: https://lore.kernel.org/r/1610416647-45774-1-git-send-email-liulongfang@huawei.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ehci-hub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c index 087402aec5cb..9f9ab5ccea88 100644 --- a/drivers/usb/host/ehci-hub.c +++ b/drivers/usb/host/ehci-hub.c @@ -345,6 +345,9 @@ static int ehci_bus_suspend (struct usb_hcd *hcd) unlink_empty_async_suspended(ehci); + /* Some Synopsys controllers mistakenly leave IAA turned on */ + ehci_writel(ehci, STS_IAA, &ehci->regs->status); + /* Any IAA cycle that started before the suspend is now invalid */ end_iaa_cycle(ehci); ehci_handle_start_intr_unlinks(ehci); From 4e0dcf62ab4cf917d0cbe751b8bf229a065248d4 Mon Sep 17 00:00:00 2001 From: Ryan Chen Date: Fri, 8 Jan 2021 16:12:38 +0800 Subject: [PATCH 040/135] usb: gadget: aspeed: fix stop dma register setting. The vhub engine has two dma mode, one is descriptor list, another is single stage DMA. Each mode has different stop register setting. Descriptor list operation (bit2) : 0 disable reset, 1: enable reset Single mode operation (bit0) : 0 : disable, 1: enable Fixes: 7ecca2a4080c ("usb/gadget: Add driver for Aspeed SoC virtual hub") Cc: stable Acked-by: Felipe Balbi Acked-by: Joel Stanley Signed-off-by: Ryan Chen Link: https://lore.kernel.org/r/20210108081238.10199-2-ryan_chen@aspeedtech.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/aspeed-vhub/epn.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/aspeed-vhub/epn.c b/drivers/usb/gadget/udc/aspeed-vhub/epn.c index 0bd6b20435b8..02d8bfae58fb 100644 --- a/drivers/usb/gadget/udc/aspeed-vhub/epn.c +++ b/drivers/usb/gadget/udc/aspeed-vhub/epn.c @@ -420,7 +420,10 @@ static void ast_vhub_stop_active_req(struct ast_vhub_ep *ep, u32 state, reg, loops; /* Stop DMA activity */ - writel(0, ep->epn.regs + AST_VHUB_EP_DMA_CTLSTAT); + if (ep->epn.desc_mode) + writel(VHUB_EP_DMA_CTRL_RESET, ep->epn.regs + AST_VHUB_EP_DMA_CTLSTAT); + else + writel(0, ep->epn.regs + AST_VHUB_EP_DMA_CTLSTAT); /* Wait for it to complete */ for (loops = 0; loops < 1000; loops++) { From 895bee270863588fe3d46dca86cd15d461f47a7a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 12 Jan 2021 19:02:29 +0100 Subject: [PATCH 041/135] Revert "driver core: Reorder devices on successful probe" This reverts commit 5b6164d3465fcc13b5679c860c452963443172a7. Stephan reports problems with this commit, so revert it for now. Fixes: 5b6164d3465f ("driver core: Reorder devices on successful probe") Link: https://lore.kernel.org/r/X/ycQpu7NIGI969v@gerhold.net Reported-by: Stephan Gerhold Cc: Jonathan Hunter Cc: Rafael. J. Wysocki Cc: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 0b76b5423778..9179825ff646 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -370,13 +370,6 @@ static void driver_bound(struct device *dev) device_pm_check_callbacks(dev); - /* - * Reorder successfully probed devices to the end of the device list. - * This ensures that suspend/resume order matches probe order, which - * is usually what drivers rely on. - */ - device_pm_move_to_tail(dev); - /* * Make sure the device is no longer in one of the deferred lists and * kick off retrying all pending devices From 9caa7ff509add50959a793b811cc7c9339e281cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:20 +0100 Subject: [PATCH 042/135] x86/entry: Fix noinstr fail vmlinux.o: warning: objtool: __do_fast_syscall_32()+0x47: call to syscall_enter_from_user_mode_work() leaves .noinstr.text section Fixes: 4facb95b7ada ("x86/entry: Unbreak 32bit fast syscall") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.472696632@infradead.org --- arch/x86/entry/common.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 18d8f17f755c..0904f5676e4d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -73,10 +73,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, unsigned int nr) { if (likely(nr < IA32_NR_syscalls)) { - instrumentation_begin(); nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); - instrumentation_end(); } } @@ -91,8 +89,11 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) * or may not be necessary, but it matches the old asm behavior. */ nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); } @@ -121,11 +122,12 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) res = get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp); } - instrumentation_end(); if (res) { /* User code screwed up. */ regs->ax = -EFAULT; + + instrumentation_end(); syscall_exit_to_user_mode(regs); return false; } @@ -135,6 +137,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); + + instrumentation_end(); syscall_exit_to_user_mode(regs); return true; } From a1d5c98aac33a5a0004ecf88905dcc261c52f988 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:21 +0100 Subject: [PATCH 043/135] x86/sev: Fix nonistr violation When the compiler fails to inline, it violates nonisntr: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0xc7: call to sev_es_wr_ghcb_msr() leaves .noinstr.text section Fixes: 4ca68e023b11 ("x86/sev-es: Handle NMI State") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.532902065@infradead.org --- arch/x86/kernel/sev-es.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index ab31c34ba508..84c1821819af 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -225,7 +225,7 @@ static inline u64 sev_es_rd_ghcb_msr(void) return __rdmsr(MSR_AMD64_SEV_ES_GHCB); } -static inline void sev_es_wr_ghcb_msr(u64 val) +static __always_inline void sev_es_wr_ghcb_msr(u64 val) { u32 low, high; From 0afda3a888dccf12557b41ef42eee942327d122b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:22 +0100 Subject: [PATCH 044/135] locking/lockdep: Cure noinstr fail When the compiler doesn't feel like inlining, it causes a noinstr fail: vmlinux.o: warning: objtool: lock_is_held_type()+0xb: call to lockdep_enabled() leaves .noinstr.text section Fixes: 4d004099a668 ("lockdep: Fix lockdep recursion") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.592595176@infradead.org --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c1418b47f625..02bc5b8f1eb2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -79,7 +79,7 @@ module_param(lock_stat, int, 0644); DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); -static inline bool lockdep_enabled(void) +static __always_inline bool lockdep_enabled(void) { if (!debug_locks) return false; From 77ca93a6b1223e210e58e1000c09d8d420403c94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:23 +0100 Subject: [PATCH 045/135] locking/lockdep: Avoid noinstr warning for DEBUG_LOCKDEP vmlinux.o: warning: objtool: lock_is_held_type()+0x60: call to check_flags.part.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.652218215@infradead.org --- kernel/locking/lockdep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 02bc5b8f1eb2..bdaf4829098c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5271,12 +5271,15 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static noinstr void check_flags(unsigned long flags) { #if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; + /* Get the warning out.. */ + instrumentation_begin(); + if (irqs_disabled_flags(flags)) { if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); @@ -5304,6 +5307,8 @@ static void check_flags(unsigned long flags) if (!debug_locks) print_irqtrace_events(current); + + instrumentation_end(); #endif } From 737495361d4469477ffe45d51e6fc56f44f3cc6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:24 +0100 Subject: [PATCH 046/135] x86/mce: Remove explicit/superfluous tracing There's some explicit tracing left in exc_machine_check_kernel(), remove it, as it's already implied by irqentry_nmi_enter(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.719310466@infradead.org --- arch/x86/kernel/cpu/mce/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 13d3f1cbda17..e133ce1e562b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1992,10 +1992,9 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * that out because it's an indirect call. Annotate it. */ instrumentation_begin(); - trace_hardirqs_off_finish(); + machine_check_vector(regs); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); + instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } @@ -2004,7 +2003,9 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); instrumentation_begin(); + machine_check_vector(regs); + instrumentation_end(); irqentry_exit_to_user_mode(regs); } From 66a425011c61e71560c234492d204e83cfb73d1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jan 2021 11:14:25 +0100 Subject: [PATCH 047/135] x86: __always_inline __{rd,wr}msr() When the compiler choses to not inline the trivial MSR helpers: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0xce: call to __wrmsr.constprop.14() leaves .noinstr.text section Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/X/bf3gV+BW7kGEsB@hirez.programming.kicks-ass.net --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 0b4920a7238e..e16cccdd0420 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -86,7 +86,7 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ -static inline unsigned long long notrace __rdmsr(unsigned int msr) +static __always_inline unsigned long long __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); @@ -98,7 +98,7 @@ static inline unsigned long long notrace __rdmsr(unsigned int msr) return EAX_EDX_VAL(val, low, high); } -static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 high) +static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" From aba428a0c612bb259891307da12e22efd0fab14c Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Tue, 1 Dec 2020 17:52:31 +0800 Subject: [PATCH 048/135] timekeeping: Remove unused get_seconds() The get_seconds() cleanup seems to have been completed, now it is time to delete the legacy interface to avoid misuse later. Signed-off-by: Chunguang Xu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1606816351-26900-1-git-send-email-brookxu@tencent.com --- include/linux/ktime.h | 1 - include/linux/timekeeping32.h | 14 -------------- kernel/time/timekeeping.c | 3 +-- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 include/linux/timekeeping32.h diff --git a/include/linux/ktime.h b/include/linux/ktime.h index a12b5523cc18..73f20deb497d 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -230,6 +230,5 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include -# include #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h deleted file mode 100644 index 266017fc9ee9..000000000000 --- a/include/linux/timekeeping32.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _LINUX_TIMEKEEPING32_H -#define _LINUX_TIMEKEEPING32_H -/* - * These interfaces are all based on the old timespec type - * and should get replaced with the timespec64 based versions - * over time so we can remove the file here. - */ - -static inline unsigned long get_seconds(void) -{ - return ktime_get_real_seconds(); -} - -#endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a45cedda93a7..6aee5768c86f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -991,8 +991,7 @@ EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * - * Returns the wall clock seconds since 1970. This replaces the - * get_seconds() interface which is not y2038 safe on 32bit systems. + * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence From e3fab2f3de081e98c50b7b4ace1b040161d95310 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Jan 2021 11:39:56 +0100 Subject: [PATCH 049/135] ntp: Fix RTC synchronization on 32-bit platforms Due to an integer overflow, RTC synchronization now happens every 2s instead of the intended 11 minutes. Fix this by forcing 64-bit arithmetic for the sync period calculation. Annotate the other place which multiplies seconds for consistency as well. Fixes: c9e6189fb03123a7 ("ntp: Make the RTC synchronization more reliable") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210111103956.290378-1-geert+renesas@glider.be --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7404d3831527..87389b9e21ab 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -498,7 +498,7 @@ int second_overflow(time64_t secs) static void sync_hw_clock(struct work_struct *work); static DECLARE_WORK(sync_work, sync_hw_clock); static struct hrtimer sync_hrtimer; -#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +#define SYNC_PERIOD_NS (11ULL * 60 * NSEC_PER_SEC) static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { @@ -512,7 +512,7 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); if (retry) - exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + exp = ktime_add_ns(exp, 2ULL * NSEC_PER_SEC - offset_nsec); else exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); From ce09ccc50208c04a1b03abfd530b5d6314258fd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2021 15:43:18 +0100 Subject: [PATCH 050/135] genirq: Export irq_check_status_bit() One of the users can be built modular: ERROR: modpost: "irq_check_status_bit" [drivers/perf/arm_spe_pmu.ko] undefined! Fixes: fdd029630434 ("genirq: Move status flag checks to core") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201227192049.GA195845@roeck-us.net --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ab8567f32501..dec3f73e8db9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2859,3 +2859,4 @@ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) rcu_read_unlock(); return res; } +EXPORT_SYMBOL_GPL(irq_check_status_bit); From a2e38dffcd93541914aba52b30c6a52acca35201 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 5 Jan 2021 18:04:20 -0600 Subject: [PATCH 051/135] objtool: Don't add empty symbols to the rbtree Building with the Clang assembler shows the following warning: arch/x86/kernel/ftrace_64.o: warning: objtool: missing symbol for insn at offset 0x16 The Clang assembler strips section symbols. That ends up giving objtool's find_func_containing() much more test coverage than normal. Turns out, find_func_containing() doesn't work so well for overlapping symbols: 2: 000000000000000e 0 NOTYPE LOCAL DEFAULT 2 fgraph_trace 3: 000000000000000f 0 NOTYPE LOCAL DEFAULT 2 trace 4: 0000000000000000 165 FUNC GLOBAL DEFAULT 2 __fentry__ 5: 000000000000000e 0 NOTYPE GLOBAL DEFAULT 2 ftrace_stub The zero-length NOTYPE symbols are inside __fentry__(), confusing the rbtree search for any __fentry__() offset coming after a NOTYPE. Try to avoid this problem by not adding zero-length symbols to the rbtree. They're rare and aren't needed in the rbtree anyway. One caveat, this actually might not end up being the right fix. Non-empty overlapping symbols, if they exist, could have the same problem. But that would need bigger changes, let's see if we can get away with the easy fix for now. Reported-by: Arnd Bergmann Acked-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf --- tools/objtool/elf.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index be89c741ba9a..f9682db33cca 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -448,6 +448,13 @@ static int read_symbols(struct elf *elf) list_add(&sym->list, entry); elf_hash_add(elf->symbol_hash, &sym->hash, sym->idx); elf_hash_add(elf->symbol_name_hash, &sym->name_hash, str_hash(sym->name)); + + /* + * Don't store empty STT_NOTYPE symbols in the rbtree. They + * can exist within a function, confusing the sorting. + */ + if (!sym->len) + rb_erase(&sym->node, &sym->sec->symbol_tree); } if (stats) From 6e6aa61d81194c01283880950df563b1b9abec46 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 13 Jan 2021 14:45:10 -0500 Subject: [PATCH 052/135] USB: gadget: dummy-hcd: Fix errors in port-reset handling Commit c318840fb2a4 ("USB: Gadget: dummy-hcd: Fix shift-out-of-bounds bug") messed up the way dummy-hcd handles requests to turn on the RESET port feature (I didn't notice that the original switch case ended with a fallthrough). The call to set_link_state() was inadvertently removed, as was the code to set the USB_PORT_STAT_RESET flag when the speed is USB2. In addition, the original code never checked whether the port was connected before handling the port-reset request. There was a check for the port being powered, but it was removed by that commit! In practice this doesn't matter much because the kernel doesn't try to reset disconnected ports, but it's still bad form. This patch fixes these problems by changing the fallthrough to break, adding back in the missing set_link_state() call, setting the port-reset status flag, adding a port-is-connected test, and removing a redundant assignment statement. Fixes: c318840fb2a4 ("USB: Gadget: dummy-hcd: Fix shift-out-of-bounds bug") CC: Acked-by: Felipe Balbi Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/20210113194510.GA1290698@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 1a953f44183a..57067763b100 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -2270,17 +2270,20 @@ static int dummy_hub_control( } fallthrough; case USB_PORT_FEAT_RESET: + if (!(dum_hcd->port_status & USB_PORT_STAT_CONNECTION)) + break; /* if it's already enabled, disable */ if (hcd->speed == HCD_USB3) { - dum_hcd->port_status = 0; dum_hcd->port_status = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); - } else + } else { dum_hcd->port_status &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); + dum_hcd->port_status |= USB_PORT_STAT_RESET; + } /* * We want to reset device status. All but the * Self powered feature @@ -2292,7 +2295,8 @@ static int dummy_hub_control( * interval? Is it still 50msec as for HS? */ dum_hcd->re_timeout = jiffies + msecs_to_jiffies(50); - fallthrough; + set_link_state(dum_hcd); + break; case USB_PORT_FEAT_C_CONNECTION: case USB_PORT_FEAT_C_RESET: case USB_PORT_FEAT_C_ENABLE: From 1eb8f690bcb565a6600f8b6dcc78f7b239ceba17 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 14 Jan 2021 10:36:59 +0100 Subject: [PATCH 053/135] x86/topology: Make __max_die_per_package available unconditionally Move it outside of CONFIG_SMP in order to avoid ifdeffery at the usage sites. Fixes: 76e2fc63ca40 ("x86/cpu/amd: Set __max_die_per_package on AMD") Reported-by: Stephen Rothwell Reported-by: kernel test robot Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210114111814.5346-1-bp@alien8.de --- arch/x86/include/asm/topology.h | 4 ++-- arch/x86/kernel/cpu/topology.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 488a8e848754..9239399e5491 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -110,6 +110,8 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +extern unsigned int __max_die_per_package; + #ifdef CONFIG_SMP #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) @@ -118,8 +120,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) -extern unsigned int __max_die_per_package; - static inline int topology_max_die_per_package(void) { return __max_die_per_package; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 1068002c8532..8678864ce712 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -25,10 +25,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -#ifdef CONFIG_SMP unsigned int __max_die_per_package __read_mostly = 1; EXPORT_SYMBOL(__max_die_per_package); +#ifdef CONFIG_SMP /* * Check if given CPUID extended toplogy "leaf" is implemented */ From a06b63a1200bd40fd20fa695739e479e2b2ae948 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Dec 2020 10:03:28 +0300 Subject: [PATCH 054/135] iio: sx9310: Off by one in sx9310_read_thresh() This > should be >= to prevent reading one element beyond the end of the sx9310_pthresh_codes[] array. Fixes: ad2b473e2ba3 ("iio: sx9310: Support setting proximity thresholds") Signed-off-by: Dan Carpenter Reviewed-by: Stephen Boyd Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/X8XqwK0z//8sSWJR@mwanda Signed-off-by: Jonathan Cameron --- drivers/iio/proximity/sx9310.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/proximity/sx9310.c b/drivers/iio/proximity/sx9310.c index a2f820997afc..62eacb22e9bc 100644 --- a/drivers/iio/proximity/sx9310.c +++ b/drivers/iio/proximity/sx9310.c @@ -601,7 +601,7 @@ static int sx9310_read_thresh(struct sx9310_data *data, return ret; regval = FIELD_GET(SX9310_REG_PROX_CTRL8_9_PTHRESH_MASK, regval); - if (regval > ARRAY_SIZE(sx9310_pthresh_codes)) + if (regval >= ARRAY_SIZE(sx9310_pthresh_codes)) return -EINVAL; *val = sx9310_pthresh_codes[regval]; From b6bc1b4ffad4a55c9461707833dc45de2e4367cc Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Wed, 2 Dec 2020 09:35:51 +0100 Subject: [PATCH 055/135] dt-bindings: iio: accel: bma255: Fix bmc150/bmi055 compatible The bmc150-accel-i2c.c driver has an "_accel" suffix for the compatibles of BMC150 and BMI055. This is necessary because BMC150 contains both accelerometer (bosch,bmc150_accel) and magnetometer (bosch,bmc150_magn) and therefore "bosch,bmc150" would be ambiguous. However, the binding documentation suggests using "bosch,bmc150". Add the "_accel" suffix for BMC150 and BMI055 so the binding docs match what is expected by the driver. Fixes: 6259551cf19b ("iio: accel: bmc150-accel: Add DT bindings") Signed-off-by: Stephan Gerhold Reviewed-by: Linus Walleij Reviewed-by: Rob Herring Cc: Linus Walleij Link: https://lore.kernel.org/r/20201202083551.7753-1-stephan@gerhold.net Signed-off-by: Jonathan Cameron --- Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml b/Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml index 6eef3480ea8f..c2efbb813ca2 100644 --- a/Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml +++ b/Documentation/devicetree/bindings/iio/accel/bosch,bma255.yaml @@ -16,8 +16,8 @@ description: properties: compatible: enum: - - bosch,bmc150 - - bosch,bmi055 + - bosch,bmc150_accel + - bosch,bmi055_accel - bosch,bma255 - bosch,bma250e - bosch,bma222 From 7e6d9788aa02333a4353058816d52b9a90aae0d3 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 3 Dec 2020 09:26:50 +0200 Subject: [PATCH 056/135] iio: adc: ti_am335x_adc: remove omitted iio_kfifo_free() When the conversion was done to use devm_iio_kfifo_allocate(), a call to iio_kfifo_free() was omitted (to be removed). This change removes it. Fixes: 3c5308058899 ("iio: adc: ti_am335x_adc: alloc kfifo & IRQ via devm_ functions") Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201203072650.24128-1-alexandru.ardelean@analog.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti_am335x_adc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c index b11c8c47ba2a..e946903b0993 100644 --- a/drivers/iio/adc/ti_am335x_adc.c +++ b/drivers/iio/adc/ti_am335x_adc.c @@ -397,16 +397,12 @@ static int tiadc_iio_buffered_hardware_setup(struct device *dev, ret = devm_request_threaded_irq(dev, irq, pollfunc_th, pollfunc_bh, flags, indio_dev->name, indio_dev); if (ret) - goto error_kfifo_free; + return ret; indio_dev->setup_ops = setup_ops; indio_dev->modes |= INDIO_BUFFER_SOFTWARE; return 0; - -error_kfifo_free: - iio_kfifo_free(indio_dev->buffer); - return ret; } static const char * const chan_name_ain[] = { From cf5b1385d748b2f91b0c05bb301fcaf9bdbad385 Mon Sep 17 00:00:00 2001 From: Slaveyko Slaveykov Date: Wed, 16 Dec 2020 13:57:20 +0200 Subject: [PATCH 057/135] drivers: iio: temperature: Add delay after the addressed reset command in mlx90632.c After an I2C reset command, the mlx90632 needs some time before responding to other I2C commands. Without that delay, there is a chance that the I2C command(s) after the reset will not be accepted. Signed-off-by: Slaveyko Slaveykov Reviewed-by: Andy Shevchenko Reviewed-by: Crt Mori Fixes: e02472f74a81 ("iio:temperature:mlx90632: Adding extended calibration option") Link: https://lore.kernel.org/r/20201216115720.12404-2-sis@melexis.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/temperature/mlx90632.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iio/temperature/mlx90632.c b/drivers/iio/temperature/mlx90632.c index 503fe54a0bb9..608ccb1d8bc8 100644 --- a/drivers/iio/temperature/mlx90632.c +++ b/drivers/iio/temperature/mlx90632.c @@ -248,6 +248,12 @@ static int mlx90632_set_meas_type(struct regmap *regmap, u8 type) if (ret < 0) return ret; + /* + * Give the mlx90632 some time to reset properly before sending a new I2C command + * if this is not done, the following I2C command(s) will not be accepted. + */ + usleep_range(150, 200); + ret = regmap_write_bits(regmap, MLX90632_REG_CONTROL, (MLX90632_CFG_MTYP_MASK | MLX90632_CFG_PWR_MASK), (MLX90632_MTYP_STATUS(type) | MLX90632_PWR_STATUS_HALT)); From 49a9565a7a7ce168e3e6482fb24e62d12f72ab81 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Sun, 13 Dec 2020 18:09:27 -0600 Subject: [PATCH 058/135] counter:ti-eqep: remove floor The hardware doesn't support this. QPOSINIT is an initialization value that is triggered by other things. When the counter overflows, it always wraps around to zero. Fixes: f213729f6796 "counter: new TI eQEP driver" Signed-off-by: David Lechner Acked-by: William Breathitt Gray Link: https://lore.kernel.org/r/20201214000927.1793062-1-david@lechnology.com Cc: Signed-off-by: Jonathan Cameron --- drivers/counter/ti-eqep.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/drivers/counter/ti-eqep.c b/drivers/counter/ti-eqep.c index a60aee1a1a29..65df9ef5b5bc 100644 --- a/drivers/counter/ti-eqep.c +++ b/drivers/counter/ti-eqep.c @@ -235,36 +235,6 @@ static ssize_t ti_eqep_position_ceiling_write(struct counter_device *counter, return len; } -static ssize_t ti_eqep_position_floor_read(struct counter_device *counter, - struct counter_count *count, - void *ext_priv, char *buf) -{ - struct ti_eqep_cnt *priv = counter->priv; - u32 qposinit; - - regmap_read(priv->regmap32, QPOSINIT, &qposinit); - - return sprintf(buf, "%u\n", qposinit); -} - -static ssize_t ti_eqep_position_floor_write(struct counter_device *counter, - struct counter_count *count, - void *ext_priv, const char *buf, - size_t len) -{ - struct ti_eqep_cnt *priv = counter->priv; - int err; - u32 res; - - err = kstrtouint(buf, 0, &res); - if (err < 0) - return err; - - regmap_write(priv->regmap32, QPOSINIT, res); - - return len; -} - static ssize_t ti_eqep_position_enable_read(struct counter_device *counter, struct counter_count *count, void *ext_priv, char *buf) @@ -301,11 +271,6 @@ static struct counter_count_ext ti_eqep_position_ext[] = { .read = ti_eqep_position_ceiling_read, .write = ti_eqep_position_ceiling_write, }, - { - .name = "floor", - .read = ti_eqep_position_floor_read, - .write = ti_eqep_position_floor_write, - }, { .name = "enable", .read = ti_eqep_position_enable_read, From efd597b2839a9895e8a98fcb0b76d2f545802cd4 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 9 Dec 2020 11:46:49 +0100 Subject: [PATCH 059/135] iio: ad5504: Fix setting power-down state The power-down mask of the ad5504 is actually a power-up mask. Meaning if a bit is set the corresponding channel is powered up and if it is not set the channel is powered down. The driver currently has this the wrong way around, resulting in the channel being powered up when requested to be powered down and vice versa. Fixes: 3bbbf150ffde ("staging:iio:dac:ad5504: Use strtobool for boolean values") Signed-off-by: Lars-Peter Clausen Acked-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20201209104649.5794-1-lars@metafoo.de Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/dac/ad5504.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/dac/ad5504.c b/drivers/iio/dac/ad5504.c index 28921b62e642..e9297c25d4ef 100644 --- a/drivers/iio/dac/ad5504.c +++ b/drivers/iio/dac/ad5504.c @@ -187,9 +187,9 @@ static ssize_t ad5504_write_dac_powerdown(struct iio_dev *indio_dev, return ret; if (pwr_down) - st->pwr_down_mask |= (1 << chan->channel); - else st->pwr_down_mask &= ~(1 << chan->channel); + else + st->pwr_down_mask |= (1 << chan->channel); ret = ad5504_spi_write(st, AD5504_ADDR_CTRL, AD5504_DAC_PWRDWN_MODE(st->pwr_down_mode) | From 40c48fb79b9798954691f24b8ece1d3a7eb1b353 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 8 Dec 2020 15:36:40 +0100 Subject: [PATCH 060/135] iio: common: st_sensors: fix possible infinite loop in st_sensors_irq_thread Return a boolean value in st_sensors_new_samples_available routine in order to avoid an infinite loop in st_sensors_irq_thread if stat_drdy.addr is not defined or stat_drdy read fails Fixes: 90efe05562921 ("iio: st_sensors: harden interrupt handling") Reported-by: Jonathan Cameron Signed-off-by: Lorenzo Bianconi Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/c9ec69ed349e7200c779fd7a5bf04c1aaa2817aa.1607438132.git.lorenzo@kernel.org Cc: Signed-off-by: Jonathan Cameron --- .../common/st_sensors/st_sensors_trigger.c | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index 0507283bd4c1..2dbd2646e44e 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -23,35 +23,31 @@ * @sdata: Sensor data. * * returns: - * 0 - no new samples available - * 1 - new samples available - * negative - error or unknown + * false - no new samples available or read error + * true - new samples available */ -static int st_sensors_new_samples_available(struct iio_dev *indio_dev, - struct st_sensor_data *sdata) +static bool st_sensors_new_samples_available(struct iio_dev *indio_dev, + struct st_sensor_data *sdata) { int ret, status; /* How would I know if I can't check it? */ if (!sdata->sensor_settings->drdy_irq.stat_drdy.addr) - return -EINVAL; + return true; /* No scan mask, no interrupt */ if (!indio_dev->active_scan_mask) - return 0; + return false; ret = regmap_read(sdata->regmap, sdata->sensor_settings->drdy_irq.stat_drdy.addr, &status); if (ret < 0) { dev_err(sdata->dev, "error checking samples available\n"); - return ret; + return false; } - if (status & sdata->sensor_settings->drdy_irq.stat_drdy.mask) - return 1; - - return 0; + return !!(status & sdata->sensor_settings->drdy_irq.stat_drdy.mask); } /** @@ -180,9 +176,15 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, /* Tell the interrupt handler that we're dealing with edges */ if (irq_trig == IRQF_TRIGGER_FALLING || - irq_trig == IRQF_TRIGGER_RISING) + irq_trig == IRQF_TRIGGER_RISING) { + if (!sdata->sensor_settings->drdy_irq.stat_drdy.addr) { + dev_err(&indio_dev->dev, + "edge IRQ not supported w/o stat register.\n"); + err = -EOPNOTSUPP; + goto iio_trigger_free; + } sdata->edge_irq = true; - else + } else { /* * If we're not using edges (i.e. level interrupts) we * just mask off the IRQ, handle one interrupt, then @@ -190,6 +192,7 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, * interrupt handler top half again and start over. */ irq_trig |= IRQF_ONESHOT; + } /* * If the interrupt pin is Open Drain, by definition this From b8653aff1c8876142f965fc69e12ba217da13182 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 2 Dec 2020 12:02:52 -0800 Subject: [PATCH 061/135] iio: sx9310: Fix semtech,avg-pos-strength setting when > 16 This DT property can be 0, 16, and then 64, but not 32. The math here doesn't recognize this slight bump in the power of 2 numbers and translates a DT property of 64 into the register value '3' when it really should be '2'. Fix it by subtracting one more if the number being translated is larger than 31. Also use clamp() because we're here. Cc: Daniel Campello Cc: Lars-Peter Clausen Cc: Peter Meerwald-Stadler Cc: Douglas Anderson Cc: Gwendal Grignou Cc: Evan Green Signed-off-by: Stephen Boyd Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20201202200252.986230-1-swboyd@chromium.org Signed-off-by: Jonathan Cameron --- drivers/iio/proximity/sx9310.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iio/proximity/sx9310.c b/drivers/iio/proximity/sx9310.c index 62eacb22e9bc..37fd0b65a014 100644 --- a/drivers/iio/proximity/sx9310.c +++ b/drivers/iio/proximity/sx9310.c @@ -1305,7 +1305,8 @@ sx9310_get_default_reg(struct sx9310_data *data, int i, if (ret) break; - pos = min(max(ilog2(pos), 3), 10) - 3; + /* Powers of 2, except for a gap between 16 and 64 */ + pos = clamp(ilog2(pos), 3, 11) - (pos >= 32 ? 4 : 3); reg_def->def &= ~SX9310_REG_PROX_CTRL7_AVGPOSFILT_MASK; reg_def->def |= FIELD_PREP(SX9310_REG_PROX_CTRL7_AVGPOSFILT_MASK, pos); From c28095bc99073ddda65e4f31f6ae0d908d4d5cd8 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Thu, 14 Jan 2021 00:09:51 -0800 Subject: [PATCH 062/135] usb: udc: core: Use lock when write to soft_connect Use lock to guard against concurrent access for soft-connect/disconnect operations when writing to soft_connect sysfs. Fixes: 2ccea03a8f7e ("usb: gadget: introduce UDC Class") Cc: stable@vger.kernel.org Acked-by: Felipe Balbi Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/338ea01fbd69b1985ef58f0f59af02c805ddf189.1610611437.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 6a62bbd01324..ea114f922ccf 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1529,10 +1529,13 @@ static ssize_t soft_connect_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); + ssize_t ret; + mutex_lock(&udc_lock); if (!udc->driver) { dev_err(dev, "soft-connect without a gadget driver\n"); - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; + goto out; } if (sysfs_streq(buf, "connect")) { @@ -1543,10 +1546,14 @@ static ssize_t soft_connect_store(struct device *dev, usb_gadget_udc_stop(udc); } else { dev_err(dev, "unsupported command '%s'\n", buf); - return -EINVAL; + ret = -EINVAL; + goto out; } - return n; + ret = n; +out: + mutex_unlock(&udc_lock); + return ret; } static DEVICE_ATTR_WO(soft_connect); From 576667bad341516edc4e18eb85acb0a2b4c9c9d9 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 15 Jan 2021 18:19:06 +0200 Subject: [PATCH 063/135] xhci: make sure TRB is fully written before giving it to the controller Once the command ring doorbell is rung the xHC controller will parse all command TRBs on the command ring that have the cycle bit set properly. If the driver just started writing the next command TRB to the ring when hardware finished the previous TRB, then HW might fetch an incomplete TRB as long as its cycle bit set correctly. A command TRB is 16 bytes (128 bits) long. Driver writes the command TRB in four 32 bit chunks, with the chunk containing the cycle bit last. This does however not guarantee that chunks actually get written in that order. This was detected in stress testing when canceling URBs with several connected USB devices. Two consecutive "Set TR Dequeue pointer" commands got queued right after each other, and the second one was only partially written when the controller parsed it, causing the dequeue pointer to be set to bogus values. This was seen as error messages: "Mismatch between completed Set TR Deq Ptr command & xHCI internal state" Solution is to add a write memory barrier before writing the cycle bit. Cc: Tested-by: Ross Zwisler Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210115161907.2875631-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 5677b81c0915..cf0c93a90200 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2931,6 +2931,8 @@ static void queue_trb(struct xhci_hcd *xhci, struct xhci_ring *ring, trb->field[0] = cpu_to_le32(field1); trb->field[1] = cpu_to_le32(field2); trb->field[2] = cpu_to_le32(field3); + /* make sure TRB is fully written before giving it to the controller */ + wmb(); trb->field[3] = cpu_to_le32(field4); trace_xhci_queue_trb(ring, trb); From da7e0c3c2909a3d9bf8acfe1db3cb213bd7febfb Mon Sep 17 00:00:00 2001 From: JC Kuo Date: Fri, 15 Jan 2021 18:19:07 +0200 Subject: [PATCH 064/135] xhci: tegra: Delay for disabling LFPS detector Occasionally, we are seeing some SuperSpeed devices resumes right after being directed to U3. This commits add 500us delay to ensure LFPS detector is disabled before sending ACK to firmware. [ 16.099363] tegra-xusb 70090000.usb: entering ELPG [ 16.104343] tegra-xusb 70090000.usb: 2-1 isn't suspended: 0x0c001203 [ 16.114576] tegra-xusb 70090000.usb: not all ports suspended: -16 [ 16.120789] tegra-xusb 70090000.usb: entering ELPG failed The register write passes through a few flop stages of 32KHz clock domain. NVIDIA ASIC designer reviewed RTL and suggests 500us delay. Cc: stable@vger.kernel.org Signed-off-by: JC Kuo Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20210115161907.2875631-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-tegra.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 934be1686352..50bb91b6a4b8 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -623,6 +623,13 @@ static void tegra_xusb_mbox_handle(struct tegra_xusb *tegra, enable); if (err < 0) break; + + /* + * wait 500us for LFPS detector to be disabled before + * sending ACK + */ + if (!enable) + usleep_range(500, 1000); } if (err < 0) { From c93cc9e16d88e0f5ea95d2d65d58a8a4dab258bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Jan 2021 11:52:11 -0700 Subject: [PATCH 065/135] io_uring: iopoll requests should also wake task ->in_idle state If we're freeing/finishing iopoll requests, ensure we check if the task is in idling in terms of cancelation. Otherwise we could end up waiting forever in __io_uring_task_cancel() if the task has active iopoll requests that need cancelation. Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 985a9e3f976d..5cda878b69cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2270,6 +2270,8 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct io_uring_task *tctx = rb->task->io_uring; percpu_counter_sub(&tctx->inflight, rb->task_refs); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); put_task_struct_many(rb->task, rb->task_refs); rb->task = NULL; } @@ -2288,6 +2290,8 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) struct io_uring_task *tctx = rb->task->io_uring; percpu_counter_sub(&tctx->inflight, rb->task_refs); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); put_task_struct_many(rb->task, rb->task_refs); } rb->task = req->task; From 6b393a1ff1746a1c91bd95cbb2d79b104d8f15ac Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 16 Jan 2021 05:32:29 +0000 Subject: [PATCH 066/135] io_uring: fix false positive sqo warning on flush WARNING: CPU: 1 PID: 9094 at fs/io_uring.c:8884 io_disable_sqo_submit+0x106/0x130 fs/io_uring.c:8884 Call Trace: io_uring_flush+0x28b/0x3a0 fs/io_uring.c:9099 filp_close+0xb4/0x170 fs/open.c:1280 close_fd+0x5c/0x80 fs/file.c:626 __do_sys_close fs/open.c:1299 [inline] __se_sys_close fs/open.c:1297 [inline] __x64_sys_close+0x2f/0xa0 fs/open.c:1297 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 io_uring's final close() may be triggered by any task not only the creator. It's well handled by io_uring_flush() including SQPOLL case, though a warning in io_disable_sqo_submit() will fallaciously fire by moving this warning out to the only call site that matters. Reported-by: syzbot+2f5d1785dc624932da78@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5cda878b69cf..616c5f732a26 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8918,8 +8918,6 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, static void io_disable_sqo_submit(struct io_ring_ctx *ctx) { - WARN_ON_ONCE(ctx->sqo_task != current); - mutex_lock(&ctx->uring_lock); ctx->sqo_dead = 1; mutex_unlock(&ctx->uring_lock); @@ -8941,6 +8939,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { /* for SQPOLL only sqo_task has task notes */ + WARN_ON_ONCE(ctx->sqo_task != current); io_disable_sqo_submit(ctx); task = ctx->sq_data->thread; atomic_inc(&task->io_uring->in_idle); From 4325cb498cb743dacaa3edbec398c5255f476ef6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 16 Jan 2021 05:32:30 +0000 Subject: [PATCH 067/135] io_uring: fix uring_flush in exit_files() warning WARNING: CPU: 1 PID: 11100 at fs/io_uring.c:9096 io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096 RIP: 0010:io_uring_flush+0x326/0x3a0 fs/io_uring.c:9096 Call Trace: filp_close+0xb4/0x170 fs/open.c:1280 close_files fs/file.c:401 [inline] put_files_struct fs/file.c:416 [inline] put_files_struct+0x1cc/0x350 fs/file.c:413 exit_files+0x7e/0xa0 fs/file.c:433 do_exit+0xc22/0x2ae0 kernel/exit.c:820 do_group_exit+0x125/0x310 kernel/exit.c:922 get_signal+0x3e9/0x20a0 kernel/signal.c:2770 arch_do_signal_or_restart+0x2a8/0x1eb0 arch/x86/kernel/signal.c:811 handle_signal_work kernel/entry/common.c:147 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x148/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302 entry_SYSCALL_64_after_hwframe+0x44/0xa9 An SQPOLL ring creator task may have gotten rid of its file note during exit and called io_disable_sqo_submit(), but the io_uring is still left referenced through fdtable, which will be put during close_files() and cause a false positive warning. First split the warning into two for more clarity when is hit, and the add sqo_dead check to handle the described case. Reported-by: syzbot+a32b546d58dde07875a1@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 616c5f732a26..d494c4269fc5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9131,7 +9131,10 @@ static int io_uring_flush(struct file *file, void *data) if (ctx->flags & IORING_SETUP_SQPOLL) { /* there is only one file note, which is owned by sqo_task */ - WARN_ON_ONCE((ctx->sqo_task == current) == + WARN_ON_ONCE(ctx->sqo_task != current && + xa_load(&tctx->xa, (unsigned long)file)); + /* sqo_dead check is for when this happens after cancellation */ + WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead && !xa_load(&tctx->xa, (unsigned long)file)); io_disable_sqo_submit(ctx); From 0b5cd6c32b14413bf87e10ee62be3162588dcbe6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 Jan 2021 02:29:56 +0000 Subject: [PATCH 068/135] io_uring: fix skipping disabling sqo on exec If there are no requests at the time __io_uring_task_cancel() is called, tctx_inflight() returns zero and and it terminates not getting a chance to go through __io_uring_files_cancel() and do io_disable_sqo_submit(). And we absolutely want them disabled by the time cancellation ends. Reported-by: Jens Axboe Signed-off-by: Pavel Begunkov Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index d494c4269fc5..383ff6ed3734 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9085,6 +9085,10 @@ void __io_uring_task_cancel(void) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); + /* trigger io_disable_sqo_submit() */ + if (tctx->sqpoll) + __io_uring_files_cancel(NULL); + do { /* read completions before cancelations */ inflight = tctx_inflight(tctx); From 2be449fcf38ff7e44cf76a2bba1376e923637eb1 Mon Sep 17 00:00:00 2001 From: Jiapeng Zhong Date: Thu, 14 Jan 2021 17:09:20 +0800 Subject: [PATCH 069/135] fs/cifs: Assign boolean values to a bool variable Fix the following coccicheck warnings: ./fs/cifs/connect.c:3386:2-21: WARNING: Assignment of 0/1 to bool variable. Reported-by: Abaci Robot Signed-off-by: Jiapeng Zhong Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5d39129406ea..7c3325c0fadc 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2195,7 +2195,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) tcon->nohandlecache = ctx->nohandlecache; else - tcon->nohandlecache = 1; + tcon->nohandlecache = true; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); From 16a78851e1f52eaed7034b75707d3662b4b13b77 Mon Sep 17 00:00:00 2001 From: Jiapeng Zhong Date: Thu, 14 Jan 2021 18:02:23 +0800 Subject: [PATCH 070/135] fs/cifs: Simplify bool comparison. Fix the follow warnings: ./fs/cifs/connect.c: WARNING: Comparison of 0/1 to bool variable Reported-by: Abaci Robot Signed-off-by: Jiapeng Zhong Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7c3325c0fadc..c8ef24bac94f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2628,7 +2628,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon, } else if (ctx) tcon->unix_ext = 1; /* Unix Extensions supported */ - if (tcon->unix_ext == 0) { + if (!tcon->unix_ext) { cifs_dbg(FYI, "Unix extensions disabled so not set on reconnect\n"); return; } From 4d6b1c95b974761c01cbad92321b82232b66d2a2 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 14 Jan 2021 18:55:07 -0700 Subject: [PATCH 071/135] nvme: check the PRINFO bit before deciding the host buffer length According to NVMe spec v1.4, section 8.3.1, the PRINFO bit and the metadata size play a vital role in deteriming the host buffer size. If PRIFNO bit is set and MS==8, the host doesn't add the metadata buffer, instead the controller adds it. Signed-off-by: Revanth Rajashekar Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 200bdd672c28..8caf9b34734d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1543,8 +1543,21 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) } length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. + */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + } if (ns->features & NVME_NS_EXT_LBAS) { length += meta_len; From 7674073b2ed35ac951a49c425dec6b39d5a57140 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 14 Jan 2021 17:09:25 +0800 Subject: [PATCH 072/135] nvme-rdma: avoid request double completion for concurrent nvme_rdma_timeout A crash happens when inject completing request long time(nearly 30s). Each name space has a request queue, when inject completing request long time, multi request queues may have time out requests at the same time, nvme_rdma_timeout will execute concurrently. Multi requests in different request queues may be queued in the same rdma queue, multi nvme_rdma_timeout may call nvme_rdma_stop_queue at the same time. The first nvme_rdma_timeout will clear NVME_RDMA_Q_LIVE and continue stopping the rdma queue(drain qp), but the others check NVME_RDMA_Q_LIVE is already cleared, and then directly complete the requests, complete request before the qp is fully drained may lead to a use-after-free condition. Add a multex lock to serialize nvme_rdma_stop_queue. Signed-off-by: Chao Leng Tested-by: Israel Rukshin Reviewed-by: Israel Rukshin Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cf6c49d09c82..b7ce4f221d99 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -97,6 +97,7 @@ struct nvme_rdma_queue { struct completion cm_done; bool pi_support; int cq_size; + struct mutex queue_lock; }; struct nvme_rdma_ctrl { @@ -579,6 +580,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, int ret; queue = &ctrl->queues[idx]; + mutex_init(&queue->queue_lock); queue->ctrl = ctrl; if (idx && ctrl->ctrl.max_integrity_segments) queue->pi_support = true; @@ -598,7 +600,8 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, if (IS_ERR(queue->cm_id)) { dev_info(ctrl->ctrl.device, "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id)); - return PTR_ERR(queue->cm_id); + ret = PTR_ERR(queue->cm_id); + goto out_destroy_mutex; } if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) @@ -628,6 +631,8 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, out_destroy_cm_id: rdma_destroy_id(queue->cm_id); nvme_rdma_destroy_queue_ib(queue); +out_destroy_mutex: + mutex_destroy(&queue->queue_lock); return ret; } @@ -639,9 +644,10 @@ static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) { - if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) - return; - __nvme_rdma_stop_queue(queue); + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags)) + __nvme_rdma_stop_queue(queue); + mutex_unlock(&queue->queue_lock); } static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) @@ -651,6 +657,7 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); + mutex_destroy(&queue->queue_lock); } static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) From 9ebbfe495ecd2e51bc92ac21ed5817c3b9e223ce Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Thu, 14 Jan 2021 17:09:26 +0800 Subject: [PATCH 073/135] nvme-tcp: avoid request double completion for concurrent nvme_tcp_timeout Each name space has a request queue, if complete request long time, multi request queues may have time out requests at the same time, nvme_tcp_timeout will execute concurrently. Multi requests in different request queues may be queued in the same tcp queue, multi nvme_tcp_timeout may call nvme_tcp_stop_queue at the same time. The first nvme_tcp_stop_queue will clear NVME_TCP_Q_LIVE and continue stopping the tcp queue(cancel io_work), but the others check NVME_TCP_Q_LIVE is already cleared, and then directly complete the requests, complete request before the io work is completely canceled may lead to a use-after-free condition. Add a multex lock to serialize nvme_tcp_stop_queue. Signed-off-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 216619926563..881d28eb15e9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -76,6 +76,7 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; + struct mutex queue_lock; struct mutex send_mutex; struct llist_head req_list; struct list_head send_list; @@ -1219,6 +1220,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->queue_lock); } static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) @@ -1380,6 +1382,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, struct nvme_tcp_queue *queue = &ctrl->queues[qid]; int ret, rcv_pdu_size; + mutex_init(&queue->queue_lock); queue->ctrl = ctrl; init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); @@ -1398,7 +1401,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, if (ret) { dev_err(nctrl->device, "failed to create socket: %d\n", ret); - return ret; + goto err_destroy_mutex; } /* Single syn retry */ @@ -1507,6 +1510,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, err_sock: sock_release(queue->sock); queue->sock = NULL; +err_destroy_mutex: + mutex_destroy(&queue->queue_lock); return ret; } @@ -1534,9 +1539,10 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) - return; - __nvme_tcp_stop_queue(queue); + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + __nvme_tcp_stop_queue(queue); + mutex_unlock(&queue->queue_lock); } static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) From 20d3bb92e84d417b0494a3b6867f0c86713db257 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 15 Jan 2021 07:30:46 +0100 Subject: [PATCH 074/135] nvme-pci: allow use of cmb on v1.4 controllers Since NVMe v1.4 the Controller Memory Buffer must be explicitly enabled by the host. Signed-off-by: Klaus Jensen [hch: avoid a local variable and add a comment] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 ++++++++++++++ include/linux/nvme.h | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 50d9a20568a2..25456d02eddb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1795,6 +1796,9 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (dev->cmb_size) return; + if (NVME_CAP_CMBS(dev->ctrl.cap)) + writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!dev->cmbsz) return; @@ -1808,6 +1812,16 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (offset > bar_size) return; + /* + * Tell the controller about the host side address mapping the CMB, + * and enable CMB decoding for the NVMe 1.4+ scheme: + */ + if (NVME_CAP_CMBS(dev->ctrl.cap)) { + hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | + (pci_bus_address(pdev, bar) + offset), + dev->bar + NVME_REG_CMBMSC); + } + /* * Controllers may support a CMB size larger than their BAR, * for example, due to being behind a bridge. Reduce the CMB to diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d92535997687..bfed36e342cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -116,6 +116,9 @@ enum { NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer * Location */ + NVME_REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory + * Space Control + */ NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ @@ -135,6 +138,7 @@ enum { #define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) +#define NVME_CAP_CMBS(cap) (((cap) >> 57) & 0x1) #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) @@ -192,6 +196,8 @@ enum { NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, NVME_CSTS_SHST_MASK = 3 << 2, + NVME_CMBMSC_CRE = 1 << 0, + NVME_CMBMSC_CMSE = 1 << 1, }; struct nvme_id_power_state { From bffcd507780ea614b5543c66f2e37ce0d55cd449 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 13 Jan 2021 17:33:51 -0800 Subject: [PATCH 075/135] nvmet: set right status on error in id-ns handler The function nvmet_execute_identify_ns() doesn't set the status if call to nvmet_find_namespace() fails. In that case we set the status of the request to the value return by the nvmet_copy_sgl(). Set the status to NVME_SC_INVALID_NS and adjust the code such that request will have the right status on nvmet_find_namespace() failure. Without this patch :- NVME Identify Namespace 3: nsze : 0 ncap : 0 nuse : 0 nsfeat : 0 nlbaf : 0 flbas : 0 mc : 0 dpc : 0 dps : 0 nmic : 0 rescap : 0 fpi : 0 dlfeat : 0 nawun : 0 nawupf : 0 nacwu : 0 nabsn : 0 nabo : 0 nabspf : 0 noiob : 0 nvmcap : 0 mssrl : 0 mcl : 0 msrc : 0 nsattr : 0 nvmsetid: 0 anagrpid: 0 endgid : 0 nguid : 00000000000000000000000000000000 eui64 : 0000000000000000 lbaf 0 : ms:0 lbads:0 rp:0 (in use) With this patch-series :- feb3b88b501e (HEAD -> nvme-5.11) nvmet: remove extra variable in identify ns 6302aa67210a nvmet: remove extra variable in id-desclist ed57951da453 nvmet: remove extra variable in smart log nsid be384b8c24dc nvmet: set right status on error in id-ns handler NVMe status: INVALID_NS: The namespace or the format of that namespace is invalid(0xb) Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 8d90235e4fcc..dc1ea468b182 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -487,8 +487,10 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) /* return an all zeroed buffer if we can't find an active namespace */ ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid); - if (!ns) + if (!ns) { + status = NVME_SC_INVALID_NS; goto done; + } nvmet_ns_revalidate(ns); @@ -541,7 +543,9 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) id->nsattr |= (1 << 0); nvmet_put_namespace(ns); done: - status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + if (!status) + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + kfree(id); out: nvmet_req_complete(req, status); From 92a5e1fdb286851d5bd0eb966b8d075be27cf5ee Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 18 Jan 2021 15:01:45 +0530 Subject: [PATCH 076/135] selftests/powerpc: Fix exit status of pkey tests Since main() does not return a value explicitly, the return values from FAIL_IF() conditions are ignored and the tests can still pass irrespective of failures. This makes sure that we always explicitly return the correct test exit status. Fixes: 1addb6444791 ("selftests/powerpc: Add test for execute-disabled pkeys") Fixes: c27f2fd1705a ("selftests/powerpc: Add test for pkey siginfo verification") Reported-by: Eirik Fuller Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210118093145.10134-1-sandipan@linux.ibm.com --- tools/testing/selftests/powerpc/mm/pkey_exec_prot.c | 2 +- tools/testing/selftests/powerpc/mm/pkey_siginfo.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c index 9e5c7f3f498a..0af4f02669a1 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -290,5 +290,5 @@ static int test(void) int main(void) { - test_harness(test, "pkey_exec_prot"); + return test_harness(test, "pkey_exec_prot"); } diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c index 4f815d7c1214..2db76e56d4cb 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c +++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c @@ -329,5 +329,5 @@ static int test(void) int main(void) { - test_harness(test, "pkey_siginfo"); + return test_harness(test, "pkey_siginfo"); } From ef02684c4e67d8c35ac83083564135bc7b1d3445 Mon Sep 17 00:00:00 2001 From: Patrik Jakobsson Date: Mon, 18 Jan 2021 21:36:15 +0100 Subject: [PATCH 077/135] usb: bdc: Make bdc pci driver depend on BROKEN The bdc pci driver is going to be removed due to it not existing in the wild. This patch turns off compilation of the driver so that stable kernels can also pick up the change. This helps the out-of-tree facetimehd webcam driver as the pci id conflicts with bdc. Cc: Al Cooper Cc: Acked-by: Felipe Balbi Signed-off-by: Patrik Jakobsson Link: https://lore.kernel.org/r/20210118203615.13995-1-patrik.r.jakobsson@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/bdc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/bdc/Kconfig b/drivers/usb/gadget/udc/bdc/Kconfig index 3e88c7670b2e..fb01ff47b64c 100644 --- a/drivers/usb/gadget/udc/bdc/Kconfig +++ b/drivers/usb/gadget/udc/bdc/Kconfig @@ -17,7 +17,7 @@ if USB_BDC_UDC comment "Platform Support" config USB_BDC_PCI tristate "BDC support for PCIe based platforms" - depends on USB_PCI + depends on USB_PCI && BROKEN default USB_BDC_UDC help Enable support for platforms which have BDC connected through PCIe, such as Lego3 FPGA platform. From 9c7d9017a49fb8516c13b7bff59b7da2abed23e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Jan 2021 19:05:59 +0100 Subject: [PATCH 078/135] x86: PM: Register syscore_ops for scale invariance On x86 scale invariace tends to be disabled during resume from suspend-to-RAM, because the MPERF or APERF MSR values are not as expected then due to updates taking place after the platform firmware has been invoked to complete the suspend transition. That, of course, is not desirable, especially if the schedutil scaling governor is in use, because the lack of scale invariance causes it to be less reliable. To counter that effect, modify init_freq_invariance() to register a syscore_ops object for scale invariance with the ->resume callback pointing to init_counter_refs() which will run on the CPU starting the resume transition (the other CPUs will be taken care of the "online" operations taking place later). Fixes: e2b0d619b400 ("x86, sched: check for counters overflow in frequency invariant accounting") Signed-off-by: Rafael J. Wysocki Signed-off-by: Peter Zijlstra (Intel) Acked-by: Giovanni Gherdovich Link: https://lkml.kernel.org/r/1803209.Mvru99baaF@kreacher --- arch/x86/kernel/smpboot.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ca66af96a54..117e24fbfd8a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -2083,6 +2084,23 @@ static void init_counter_refs(void) this_cpu_write(arch_prev_mperf, mperf); } +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) +{ + /* Bail out if registered already. */ + if (freq_invariance_syscore_ops.node.prev) + return; + + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif + static void init_freq_invariance(bool secondary, bool cppc_ready) { bool ret = false; @@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready) if (ret) { init_counter_refs(); static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } else { pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); From dd3a44c06f7b4f14e90065bf05d62c255b20005f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 19 Jan 2021 15:18:00 +1100 Subject: [PATCH 079/135] selftests/powerpc: Only test lwm/stmw on big endian Newer binutils (>= 2.36) refuse to assemble lmw/stmw when building in little endian mode. That breaks compilation of our alignment handler test: /tmp/cco4l14N.s: Assembler messages: /tmp/cco4l14N.s:1440: Error: `lmw' invalid when little-endian /tmp/cco4l14N.s:1814: Error: `stmw' invalid when little-endian make[2]: *** [../../lib.mk:139: /output/kselftest/powerpc/alignment/alignment_handler] Error 1 These tests do pass on little endian machines, as the kernel will still emulate those instructions even when running little endian (which is arguably a kernel bug). But we don't really need to test that case, so ifdef those instructions out to get the alignment test building again. Reported-by: Libor Pechacek Signed-off-by: Michael Ellerman Tested-by: Libor Pechacek Link: https://lore.kernel.org/r/20210119041800.3093047-1-mpe@ellerman.id.au --- .../testing/selftests/powerpc/alignment/alignment_handler.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c index cb53a8b777e6..c25cf7cd45e9 100644 --- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c +++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c @@ -443,7 +443,6 @@ int test_alignment_handler_integer(void) LOAD_DFORM_TEST(ldu); LOAD_XFORM_TEST(ldx); LOAD_XFORM_TEST(ldux); - LOAD_DFORM_TEST(lmw); STORE_DFORM_TEST(stb); STORE_XFORM_TEST(stbx); STORE_DFORM_TEST(stbu); @@ -462,7 +461,11 @@ int test_alignment_handler_integer(void) STORE_XFORM_TEST(stdx); STORE_DFORM_TEST(stdu); STORE_XFORM_TEST(stdux); + +#ifdef __BIG_ENDIAN__ + LOAD_DFORM_TEST(lmw); STORE_DFORM_TEST(stmw); +#endif return rc; } From 08685be7761d69914f08c3d6211c543a385a5b9c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 11 Jan 2021 16:24:08 +1000 Subject: [PATCH 080/135] powerpc/64s: fix scv entry fallback flush vs interrupt The L1D flush fallback functions are not recoverable vs interrupts, yet the scv entry flush runs with MSR[EE]=1. This can result in a timer (soft-NMI) or MCE or SRESET interrupt hitting here and overwriting the EXRFI save area, which ends up corrupting userspace registers for scv return. Fix this by disabling RI and EE for the scv entry fallback flush. Fixes: f79643787e0a0 ("powerpc/64s: flush L1D on kernel entry") Cc: stable@vger.kernel.org # 5.9+ which also have flush L1D patch backport Reported-by: Tulio Magno Quites Machado Filho Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210111062408.287092-1-npiggin@gmail.com --- arch/powerpc/include/asm/exception-64s.h | 13 ++++++++++++ arch/powerpc/include/asm/feature-fixups.h | 10 ++++++++++ arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/exceptions-64s.S | 19 ++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 7 +++++++ arch/powerpc/lib/feature-fixups.c | 24 ++++++++++++++++++++--- 6 files changed, 71 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 1d32b174ab6a..c1a8aac01cf9 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -63,6 +63,12 @@ nop; \ nop; +#define SCV_ENTRY_FLUSH_SLOT \ + SCV_ENTRY_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop; + /* * r10 must be free to use, r13 must be paca */ @@ -70,6 +76,13 @@ STF_ENTRY_BARRIER_SLOT; \ ENTRY_FLUSH_SLOT +/* + * r10, ctr must be free to use, r13 must be paca + */ +#define SCV_INTERRUPT_TO_KERNEL \ + STF_ENTRY_BARRIER_SLOT; \ + SCV_ENTRY_FLUSH_SLOT + /* * Macros for annotating the expected destination of (h)rfid * diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index f6d2acb57425..ac605fc369c4 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -240,6 +240,14 @@ label##3: \ FTR_ENTRY_OFFSET 957b-958b; \ .popsection; +#define SCV_ENTRY_FLUSH_FIXUP_SECTION \ +957: \ + .pushsection __scv_entry_flush_fixup,"a"; \ + .align 2; \ +958: \ + FTR_ENTRY_OFFSET 957b-958b; \ + .popsection; + #define RFI_FLUSH_FIXUP_SECTION \ 951: \ .pushsection __rfi_flush_fixup,"a"; \ @@ -273,10 +281,12 @@ label##3: \ extern long stf_barrier_fallback; extern long entry_flush_fallback; +extern long scv_entry_flush_fallback; extern long __start___stf_entry_barrier_fixup, __stop___stf_entry_barrier_fixup; extern long __start___stf_exit_barrier_fixup, __stop___stf_exit_barrier_fixup; extern long __start___uaccess_flush_fixup, __stop___uaccess_flush_fixup; extern long __start___entry_flush_fixup, __stop___entry_flush_fixup; +extern long __start___scv_entry_flush_fixup, __stop___scv_entry_flush_fixup; extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; extern long __start___barrier_nospec_fixup, __stop___barrier_nospec_fixup; extern long __start__btb_flush_fixup, __stop__btb_flush_fixup; diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index aa1af139d947..33ddfeef4fe9 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -75,7 +75,7 @@ BEGIN_FTR_SECTION bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif - INTERRUPT_TO_KERNEL + SCV_INTERRUPT_TO_KERNEL mr r10,r1 ld r1,PACAKSAVE(r13) std r10,0(r1) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e02ad6fefa46..6e53f7638737 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2993,6 +2993,25 @@ TRAMP_REAL_BEGIN(entry_flush_fallback) ld r11,PACA_EXRFI+EX_R11(r13) blr +/* + * The SCV entry flush happens with interrupts enabled, so it must disable + * to prevent EXRFI being clobbered by NMIs (e.g., soft_nmi_common). r10 + * (containing LR) does not need to be preserved here because scv entry + * puts 0 in the pt_regs, CTR can be clobbered for the same reason. + */ +TRAMP_REAL_BEGIN(scv_entry_flush_fallback) + li r10,0 + mtmsrd r10,1 + lbz r10,PACAIRQHAPPENED(r13) + ori r10,r10,PACA_IRQ_HARD_DIS + stb r10,PACAIRQHAPPENED(r13) + std r11,PACA_EXRFI+EX_R11(r13) + L1D_DISPLACEMENT_FLUSH + ld r11,PACA_EXRFI+EX_R11(r13) + li r10,MSR_RI + mtmsrd r10,1 + blr + TRAMP_REAL_BEGIN(rfi_flush_fallback) SET_SCRATCH0(r13); GET_PACA(r13); diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 4ab426b8b0e0..72fa3c00229a 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -145,6 +145,13 @@ SECTIONS __stop___entry_flush_fixup = .; } + . = ALIGN(8); + __scv_entry_flush_fixup : AT(ADDR(__scv_entry_flush_fixup) - LOAD_OFFSET) { + __start___scv_entry_flush_fixup = .; + *(__scv_entry_flush_fixup) + __stop___scv_entry_flush_fixup = .; + } + . = ALIGN(8); __stf_exit_barrier_fixup : AT(ADDR(__stf_exit_barrier_fixup) - LOAD_OFFSET) { __start___stf_exit_barrier_fixup = .; diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 47821055b94c..1fd31b4b0e13 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -290,9 +290,6 @@ void do_entry_flush_fixups(enum l1d_flush_type types) long *start, *end; int i; - start = PTRRELOC(&__start___entry_flush_fixup); - end = PTRRELOC(&__stop___entry_flush_fixup); - instrs[0] = 0x60000000; /* nop */ instrs[1] = 0x60000000; /* nop */ instrs[2] = 0x60000000; /* nop */ @@ -312,6 +309,8 @@ void do_entry_flush_fixups(enum l1d_flush_type types) if (types & L1D_FLUSH_MTTRIG) instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + start = PTRRELOC(&__start___entry_flush_fixup); + end = PTRRELOC(&__stop___entry_flush_fixup); for (i = 0; start < end; start++, i++) { dest = (void *)start + *start; @@ -328,6 +327,25 @@ void do_entry_flush_fixups(enum l1d_flush_type types) patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); } + start = PTRRELOC(&__start___scv_entry_flush_fixup); + end = PTRRELOC(&__stop___scv_entry_flush_fixup); + for (; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction((struct ppc_inst *)dest, ppc_inst(instrs[0])); + + if (types == L1D_FLUSH_FALLBACK) + patch_branch((struct ppc_inst *)(dest + 1), (unsigned long)&scv_entry_flush_fallback, + BRANCH_SET_LINK); + else + patch_instruction((struct ppc_inst *)(dest + 1), ppc_inst(instrs[1])); + + patch_instruction((struct ppc_inst *)(dest + 2), ppc_inst(instrs[2])); + } + + printk(KERN_DEBUG "entry-flush: patched %d locations (%s flush)\n", i, (types == L1D_FLUSH_NONE) ? "no" : (types == L1D_FLUSH_FALLBACK) ? "fallback displacement" : From dc5d17a3c39b06aef866afca19245a9cfb533a79 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 10 Dec 2020 14:33:32 +0800 Subject: [PATCH 081/135] md: Set prev_flush_start and flush_bio in an atomic way One customer reports a crash problem which causes by flush request. It triggers a warning before crash. /* new request after previous flush is completed */ if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; bio = NULL; } The WARN_ON is triggered. We use spin lock to protect prev_flush_start and flush_bio in md_flush_request. But there is no lock protection in md_submit_flush_data. It can set flush_bio to NULL first because of compiler reordering write instructions. For example, flush bio1 sets flush bio to NULL first in md_submit_flush_data. An interrupt or vmware causing an extended stall happen between updating flush_bio and prev_flush_start. Because flush_bio is NULL, flush bio2 can get the lock and submit to underlayer disks. Then flush bio1 updates prev_flush_start after the interrupt or extended stall. Then flush bio3 enters in md_flush_request. The start time req_start is behind prev_flush_start. The flush_bio is not NULL(flush bio2 hasn't finished). So it can trigger the WARN_ON now. Then it calls INIT_WORK again. INIT_WORK() will re-initialize the list pointers in the work_struct, which then can result in a corrupted work list and the work_struct queued a second time. With the work list corrupted, it can lead in invalid work items being used and cause a crash in process_one_work. We need to make sure only one flush bio can be handled at one same time. So add spin lock in md_submit_flush_data to protect prev_flush_start and flush_bio in an atomic way. Reviewed-by: David Jeffery Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index ca409428b4fc..04384452a7ab 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -639,8 +639,10 @@ static void md_submit_flush_data(struct work_struct *ws) * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ + spin_lock_irq(&mddev->lock); mddev->prev_flush_start = mddev->start_flush; mddev->flush_bio = NULL; + spin_unlock_irq(&mddev->lock); wake_up(&mddev->sb_wait); if (bio->bi_iter.bi_size == 0) { From 9275c206f88e5c49cb3e71932c81c8561083db9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2021 09:33:52 +0100 Subject: [PATCH 082/135] nvme-pci: refactor nvme_unmap_data Split out three helpers from nvme_unmap_data that will allow finer grained unwinding from nvme_map_data. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Marc Orr --- drivers/nvme/host/pci.c | 77 ++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 25456d02eddb..e29ece9e4d4b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -543,50 +543,71 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) return true; } -static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +static void nvme_free_prps(struct nvme_dev *dev, struct request *req) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; - dma_addr_t dma_addr = iod->first_dma, next_dma_addr; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + dma_addr_t dma_addr = iod->first_dma; int i; - if (iod->dma_len) { - dma_unmap_page(dev->dev, dma_addr, iod->dma_len, - rq_dma_dir(req)); - return; + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = nvme_pci_iod_list(req)[i]; + dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); + + dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); + dma_addr = next_dma_addr; } - WARN_ON_ONCE(!iod->nents); +} + +static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) +{ + const int last_sg = SGES_PER_PAGE - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + dma_addr_t dma_addr = iod->first_dma; + int i; + + for (i = 0; i < iod->npages; i++) { + struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; + dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); + + dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); + dma_addr = next_dma_addr; + } + +} + +static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); if (is_pci_p2pdma_page(sg_page(iod->sg))) pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); else dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req)); +} +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], - dma_addr); - - for (i = 0; i < iod->npages; i++) { - void *addr = nvme_pci_iod_list(req)[i]; - - if (iod->use_sgl) { - struct nvme_sgl_desc *sg_list = addr; - - next_dma_addr = - le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); - } else { - __le64 *prp_list = addr; - - next_dma_addr = le64_to_cpu(prp_list[last_prp]); - } - - dma_pool_free(dev->prp_page_pool, addr, dma_addr); - dma_addr = next_dma_addr; + if (iod->dma_len) { + dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, + rq_dma_dir(req)); + return; } + WARN_ON_ONCE(!iod->nents); + + nvme_unmap_sg(dev, req); + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + iod->first_dma); + else if (iod->use_sgl) + nvme_free_sgls(dev, req); + else + nvme_free_prps(dev, req); mempool_free(iod->sg, dev->iod_mempool); } From fa0732168fa1369dd089e5b06d6158a68229f7b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2021 09:35:01 +0100 Subject: [PATCH 083/135] nvme-pci: fix error unwind in nvme_map_data Properly unwind step by step using refactored helpers from nvme_unmap_data to avoid a potential double dma_unmap on a mapping failure. Fixes: 7fe07d14f71f ("nvme-pci: merge nvme_free_iod into nvme_unmap_data") Reported-by: Marc Orr Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Marc Orr --- drivers/nvme/host/pci.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e29ece9e4d4b..856aa31931c1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -683,7 +683,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) - return BLK_STS_RESOURCE; + goto free_prps; list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -703,14 +703,14 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - done: cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; - - bad_sgl: +free_prps: + nvme_free_prps(dev, req); + return BLK_STS_RESOURCE; +bad_sgl: WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents), "Invalid SGL for payload:%d nents:%d\n", blk_rq_payload_bytes(req), iod->nents); @@ -782,7 +782,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); if (!sg_list) - return BLK_STS_RESOURCE; + goto free_sgls; i = 0; nvme_pci_iod_list(req)[iod->npages++] = sg_list; @@ -795,6 +795,9 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, } while (--entries > 0); return BLK_STS_OK; +free_sgls: + nvme_free_sgls(dev, req); + return BLK_STS_RESOURCE; } static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, @@ -863,7 +866,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(req->q, req, iod->sg); if (!iod->nents) - goto out; + goto out_free_sg; if (is_pci_p2pdma_page(sg_page(iod->sg))) nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg, @@ -872,16 +875,21 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN); if (!nr_mapped) - goto out; + goto out_free_sg; iod->use_sgl = nvme_pci_use_sgls(dev, req); if (iod->use_sgl) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); -out: if (ret != BLK_STS_OK) - nvme_unmap_data(dev, req); + goto out_unmap_sg; + return BLK_STS_OK; + +out_unmap_sg: + nvme_unmap_sg(dev, req); +out_free_sg: + mempool_free(iod->sg, dev->iod_mempool); return ret; } From 9bb48c82aced07698a2d08ee0f1475a6c4f6b266 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Jan 2021 11:41:16 -0800 Subject: [PATCH 084/135] tty: implement write_iter This makes the tty layer use the .write_iter() function instead of the traditional .write() functionality. That allows writev(), but more importantly also makes it possible to enable .splice_write() for ttys, reinstating the "splice to tty" functionality that was lost in commit 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops"). Fixes: 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops") Reported-by: Oliver Giles Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Al Viro Signed-off-by: Linus Torvalds --- drivers/tty/tty_io.c | 48 ++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 56ade99ef99f..338bc4ef5549 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -143,9 +143,8 @@ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ DEFINE_MUTEX(tty_mutex); static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); -static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); -ssize_t redirected_tty_write(struct file *, const char __user *, - size_t, loff_t *); +static ssize_t tty_write(struct kiocb *, struct iov_iter *); +ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *); static __poll_t tty_poll(struct file *, poll_table *); static int tty_open(struct inode *, struct file *); long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -478,7 +477,8 @@ static void tty_show_fdinfo(struct seq_file *m, struct file *file) static const struct file_operations tty_fops = { .llseek = no_llseek, .read = tty_read, - .write = tty_write, + .write_iter = tty_write, + .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, @@ -491,7 +491,8 @@ static const struct file_operations tty_fops = { static const struct file_operations console_fops = { .llseek = no_llseek, .read = tty_read, - .write = redirected_tty_write, + .write_iter = redirected_tty_write, + .splice_write = iter_file_splice_write, .poll = tty_poll, .unlocked_ioctl = tty_ioctl, .compat_ioctl = tty_compat_ioctl, @@ -607,9 +608,9 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(priv, &tty->tty_files, list) { filp = priv->file; - if (filp->f_op->write == redirected_tty_write) + if (filp->f_op->write_iter == redirected_tty_write) cons_filp = filp; - if (filp->f_op->write != tty_write) + if (filp->f_op->write_iter != tty_write) continue; closecount++; __tty_fasync(-1, filp, 0); /* can't block */ @@ -902,9 +903,9 @@ static inline ssize_t do_tty_write( ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), struct tty_struct *tty, struct file *file, - const char __user *buf, - size_t count) + struct iov_iter *from) { + size_t count = iov_iter_count(from); ssize_t ret, written = 0; unsigned int chunk; @@ -956,14 +957,20 @@ static inline ssize_t do_tty_write( size_t size = count; if (size > chunk) size = chunk; + ret = -EFAULT; - if (copy_from_user(tty->write_buf, buf, size)) + if (copy_from_iter(tty->write_buf, size, from) != size) break; + ret = write(tty, file, tty->write_buf, size); if (ret <= 0) break; + + /* FIXME! Have Al check this! */ + if (ret != size) + iov_iter_revert(from, size-ret); + written += ret; - buf += ret; count -= ret; if (!count) break; @@ -1023,9 +1030,9 @@ void tty_write_message(struct tty_struct *tty, char *msg) * write method will not be invoked in parallel for each device. */ -static ssize_t tty_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; ssize_t ret; @@ -1038,18 +1045,15 @@ static ssize_t tty_write(struct file *file, const char __user *buf, if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); - if (!ld) - return hung_up_tty_write(file, buf, count, ppos); - if (!ld->ops->write) + if (!ld || !ld->ops->write) ret = -EIO; else - ret = do_tty_write(ld->ops->write, tty, file, buf, count); + ret = do_tty_write(ld->ops->write, tty, file, from); tty_ldisc_deref(ld); return ret; } -ssize_t redirected_tty_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +ssize_t redirected_tty_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *p = NULL; @@ -1060,11 +1064,11 @@ ssize_t redirected_tty_write(struct file *file, const char __user *buf, if (p) { ssize_t res; - res = vfs_write(p, buf, count, &p->f_pos); + res = vfs_iocb_iter_write(p, iocb, iter); fput(p); return res; } - return tty_write(file, buf, count, ppos); + return tty_write(iocb, iter); } /** @@ -2293,7 +2297,7 @@ static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (file->f_op->write == redirected_tty_write) { + if (file->f_op->write_iter == redirected_tty_write) { struct file *f; spin_lock(&redirect_lock); f = redirect; From e45122893a9870813f9bd7b4add4f613e6f29008 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Jan 2021 21:09:48 -0800 Subject: [PATCH 085/135] x86/fpu: Add kernel_fpu_begin_mask() to selectively initialize state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, requesting kernel FPU access doesn't distinguish which parts of the extended ("FPU") state are needed. This is nice for simplicity, but there are a few cases in which it's suboptimal: - The vast majority of in-kernel FPU users want XMM/YMM/ZMM state but do not use legacy 387 state. These users want MXCSR initialized but don't care about the FPU control word. Skipping FNINIT would save time. (Empirically, FNINIT is several times slower than LDMXCSR.) - Code that wants MMX doesn't want or need MXCSR initialized. _mmx_memcpy(), for example, can run before CR4.OSFXSR gets set, and initializing MXCSR will fail because LDMXCSR generates an #UD when the aforementioned CR4 bit is not set. - Any future in-kernel users of XFD (eXtended Feature Disable)-capable dynamic states will need special handling. Add a more specific API that allows callers to specify exactly what they want. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Tested-by: Krzysztof Piotr Olędzki Link: https://lkml.kernel.org/r/aff1cac8b8fc7ee900cf73e8f2369966621b053f.1611205691.git.luto@kernel.org --- arch/x86/include/asm/fpu/api.h | 15 +++++++++++++-- arch/x86/kernel/fpu/core.c | 9 +++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index a5aba4ab0224..67a4f1cb2aac 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,14 +16,25 @@ * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods * of time. - * If you intend to use the FPU in softirq you need to check first with + * If you intend to use the FPU in irq/softirq you need to check first with * irq_fpu_usable() if it is possible. */ -extern void kernel_fpu_begin(void); + +/* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ +#define KFPU_387 _BITUL(0) /* 387 state will be initialized */ +#define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ + +extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); +/* Code that is unaware of kernel_fpu_begin_mask() can use this */ +static inline void kernel_fpu_begin(void) +{ + kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); +} + /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. * A context switch will (and softirq might) save CPU's FPU registers to diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index eb86a2b831b1..571220ac8bea 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) } EXPORT_SYMBOL(copy_fpregs_to_fpstate); -void kernel_fpu_begin(void) +void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); @@ -141,13 +141,14 @@ void kernel_fpu_begin(void) } __cpu_invalidate_fpregs_state(); - if (boot_cpu_has(X86_FEATURE_XMM)) + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); - if (boot_cpu_has(X86_FEATURE_FPU)) + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } -EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { From 67de8dca50c027ca0fa3b62a488ee5035036a0da Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 20 Jan 2021 21:09:49 -0800 Subject: [PATCH 086/135] x86/mmx: Use KFPU_387 for MMX string operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default kernel_fpu_begin() doesn't work on systems that support XMM but haven't yet enabled CR4.OSFXSR. This causes crashes when _mmx_memcpy() is called too early because LDMXCSR generates #UD when the aforementioned bit is clear. Fix it by using kernel_fpu_begin_mask(KFPU_387) explicitly. Fixes: 7ad816762f9b ("x86/fpu: Reset MXCSR to default in kernel_fpu_begin()") Reported-by: Krzysztof Mazur Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Tested-by: Krzysztof Piotr Olędzki Tested-by: Krzysztof Mazur Cc: Link: https://lkml.kernel.org/r/e7bf21855fe99e5f3baa27446e32623358f69e8d.1611205691.git.luto@kernel.org --- arch/x86/lib/mmx_32.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 4321fa02e18d..419365c48b2a 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -26,6 +26,16 @@ #include #include +/* + * Use KFPU_387. MMX instructions are not affected by MXCSR, + * but both AMD and Intel documentation states that even integer MMX + * operations will result in #MF if an exception is pending in FCW. + * + * EMMS is not needed afterwards because, after calling kernel_fpu_end(), + * any subsequent user of the 387 stack will reinitialize it using + * KFPU_387. + */ + void *_mmx_memcpy(void *to, const void *from, size_t len) { void *p; @@ -37,7 +47,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) p = to; i = len >> 6; /* len/64 */ - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" /* This set is 28 bytes */ @@ -127,7 +137,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -160,7 +170,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); /* * maybe the prefetch stuff can go before the expensive fnsave... @@ -247,7 +257,7 @@ static void fast_clear_page(void *page) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -282,7 +292,7 @@ static void fast_copy_page(void *to, void *from) { int i; - kernel_fpu_begin(); + kernel_fpu_begin_mask(KFPU_387); __asm__ __volatile__ ( "1: prefetch (%0)\n" From 97784481757fba7570121a70dd37ca74a29f50a8 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Wed, 20 Jan 2021 23:22:02 -0800 Subject: [PATCH 087/135] lightnvm: fix memory leak when submit fails The allocated page is not released if error occurs in nvm_submit_io_sync_raw(). __free_page() is moved ealier to avoid possible memory leak issue. Fixes: aff3fb18f957 ("lightnvm: move bad block and chunk state logic to core") Signed-off-by: Pan Bian Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c1bcac71008c..28ddcaa5358b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -844,11 +844,10 @@ static int nvm_bb_chunk_sense(struct nvm_dev *dev, struct ppa_addr ppa) rqd.ppa_addr = generic_to_dev_addr(dev, ppa); ret = nvm_submit_io_sync_raw(dev, &rqd); + __free_page(page); if (ret) return ret; - __free_page(page); - return rqd.error; } From 4eaad21a6ac9865df7f31983232ed5928450458d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2021 21:46:29 +0100 Subject: [PATCH 088/135] kernfs: implement ->read_iter Switch kernfs to implement the read_iter method instead of plain old read to prepare to supporting splice and sendfile again. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210120204631.274206-2-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f277d023ebcd..8276e4c8722d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "kernfs-internal.h" @@ -180,11 +181,10 @@ static const struct seq_operations kernfs_seq_ops = { * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ -static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, - char __user *user_buf, size_t count, - loff_t *ppos) +static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t len = min_t(size_t, count, PAGE_SIZE); + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; @@ -210,7 +210,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) - len = ops->read(of, buf, len, *ppos); + len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; @@ -220,12 +220,12 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, if (len < 0) goto out_free; - if (copy_to_user(user_buf, buf, len)) { + if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } - *ppos += len; + iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) @@ -235,22 +235,11 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, return len; } -/** - * kernfs_fop_read - kernfs vfs read callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - */ -static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct kernfs_open_file *of = kernfs_of(file); - - if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) - return seq_read(file, user_buf, count, ppos); - else - return kernfs_file_direct_read(of, user_buf, count, ppos); + if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read_iter(iocb, iter); + return kernfs_file_read_iter(iocb, iter); } /** @@ -960,7 +949,7 @@ void kernfs_notify(struct kernfs_node *kn) EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { - .read = kernfs_fop_read, + .read_iter = kernfs_fop_read_iter, .write = kernfs_fop_write, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, From cc099e0b399889c6485c88368b19824b087c9f8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2021 21:46:30 +0100 Subject: [PATCH 089/135] kernfs: implement ->write_iter Switch kernfs to implement the write_iter method instead of plain old write to prepare to supporting splice and sendfile again. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210120204631.274206-3-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 8276e4c8722d..b1a5cccf189e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -242,13 +242,7 @@ static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) return kernfs_file_read_iter(iocb, iter); } -/** - * kernfs_fop_write - kernfs vfs write callback - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * +/* * Copy data in from userland and pass it to the matching kernfs write * operation. * @@ -258,20 +252,18 @@ static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) * modify only the the value you're changing, then write entire buffer * back. */ -static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct kernfs_open_file *of = kernfs_of(file); + struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); + ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; - ssize_t len; char *buf; if (of->atomic_write_len) { - len = count; if (len > of->atomic_write_len) return -E2BIG; } else { - len = min_t(size_t, count, PAGE_SIZE); + len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; @@ -282,7 +274,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, if (!buf) return -ENOMEM; - if (copy_from_user(buf, user_buf, len)) { + if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } @@ -301,7 +293,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, ops = kernfs_ops(of->kn); if (ops->write) - len = ops->write(of, buf, len, *ppos); + len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; @@ -309,7 +301,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, mutex_unlock(&of->mutex); if (len > 0) - *ppos += len; + iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) @@ -662,7 +654,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) /* * Write path needs to atomic_write_len outside active reference. - * Cache it in open_file. See kernfs_fop_write() for details. + * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; @@ -950,7 +942,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, - .write = kernfs_fop_write, + .write_iter = kernfs_fop_write_iter, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, From f2d6c2708bd84ca953fa6b6ca5717e79eb0140c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Jan 2021 21:46:31 +0100 Subject: [PATCH 090/135] kernfs: wire up ->splice_read and ->splice_write Wire up the splice_read and splice_write methods to the default helpers using ->read_iter and ->write_iter now that those are implemented for kernfs. This restores support to use splice and sendfile on kernfs files. Fixes: 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops") Reported-by: Siddharth Gupta Tested-by: Siddharth Gupta Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210120204631.274206-4-hch@lst.de Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index b1a5cccf189e..c75719312147 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -949,6 +949,8 @@ const struct file_operations kernfs_file_fops = { .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; /** From 3d1cf435e201d1fd63e4346b141881aed086effd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 15 Jan 2021 19:30:51 +0100 Subject: [PATCH 091/135] driver core: Extend device_is_dependent() If the device passed as the target (second argument) to device_is_dependent() is not completely registered (that is, it has been initialized, but not added yet), but the parent pointer of it is set, it may be missing from the list of the parent's children and device_for_each_child() called by device_is_dependent() cannot be relied on to catch that dependency. For this reason, modify device_is_dependent() to check the ancestors of the target device by following its parent pointer in addition to the device_for_each_child() walk. Fixes: 9ed9895370ae ("driver core: Functional dependencies tracking support") Reported-by: Stephan Gerhold Tested-by: Stephan Gerhold Reviewed-by: Saravana Kannan Signed-off-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/17705994.d592GUb2YH@kreacher Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 25e08e5f40bd..3819fd012e27 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -208,6 +208,16 @@ int device_links_read_lock_held(void) #endif #endif /* !CONFIG_SRCU */ +static bool device_is_ancestor(struct device *dev, struct device *target) +{ + while (target->parent) { + target = target->parent; + if (dev == target) + return true; + } + return false; +} + /** * device_is_dependent - Check if one device depends on another one * @dev: Device to check dependencies for. @@ -221,7 +231,12 @@ int device_is_dependent(struct device *dev, void *target) struct device_link *link; int ret; - if (dev == target) + /* + * The "ancestors" check is needed to catch the case when the target + * device has not been completely initialized yet and it is still + * missing from the list of children of its parent device. + */ + if (dev == target || device_is_ancestor(dev, target)) return 1; ret = device_for_each_child(dev, target, device_is_dependent); From 927633a6d20af319d986f3e42c3ef9f6d7835008 Mon Sep 17 00:00:00 2001 From: Wang Hui Date: Fri, 15 Jan 2021 22:59:16 +0300 Subject: [PATCH 092/135] stm class: Fix module init return on allocation failure In stm_heartbeat_init(): return value gets reset after the first iteration by stm_source_register_device(), so allocation failures after that will, after a clean up, return success. Fix that. Fixes: 119291853038 ("stm class: Add heartbeat stm source device") Reported-by: Hulk Robot Signed-off-by: Wang Hui Signed-off-by: Alexander Shishkin Link: https://lore.kernel.org/r/20210115195917.3184-2-alexander.shishkin@linux.intel.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/stm/heartbeat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/hwtracing/stm/heartbeat.c b/drivers/hwtracing/stm/heartbeat.c index 3e7df1c0477f..81d7b21d31ec 100644 --- a/drivers/hwtracing/stm/heartbeat.c +++ b/drivers/hwtracing/stm/heartbeat.c @@ -64,7 +64,7 @@ static void stm_heartbeat_unlink(struct stm_source_data *data) static int stm_heartbeat_init(void) { - int i, ret = -ENOMEM; + int i, ret; if (nr_devs < 0 || nr_devs > STM_HEARTBEAT_MAX) return -EINVAL; @@ -72,8 +72,10 @@ static int stm_heartbeat_init(void) for (i = 0; i < nr_devs; i++) { stm_heartbeat[i].data.name = kasprintf(GFP_KERNEL, "heartbeat.%d", i); - if (!stm_heartbeat[i].data.name) + if (!stm_heartbeat[i].data.name) { + ret = -ENOMEM; goto fail_unregister; + } stm_heartbeat[i].data.nr_chans = 1; stm_heartbeat[i].data.link = stm_heartbeat_link; From cb5c681ab9037e25fcca20689c82cf034566d610 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 15 Jan 2021 22:59:17 +0300 Subject: [PATCH 093/135] intel_th: pci: Add Alder Lake-P support This adds support for the Trace Hub in Alder Lake-P. Signed-off-by: Alexander Shishkin Link: https://lore.kernel.org/r/20210115195917.3184-3-alexander.shishkin@linux.intel.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 52acd77438ed..251e75c9ba9d 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -268,6 +268,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7aa6), .driver_data = (kernel_ulong_t)&intel_th_2x, }, + { + /* Alder Lake-P */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x51a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, { /* Alder Lake CPU */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f), From 9354f1b421f76f8368be13954f87d07bcbd6fffe Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 17 Jan 2021 09:39:37 +0200 Subject: [PATCH 094/135] habanalabs: zero pci counters packet before submit to FW Driver does not zero some pci counters packets before sending to FW. This causes an out of sync PI/CI between driver and FW. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 20f77f58edef..c9a12980218a 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -402,6 +402,10 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, } counters->rx_throughput = result; + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET << + CPUCP_PKT_CTL_OPCODE_SHIFT); + /* Fetch PCI tx counter */ pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), @@ -414,6 +418,7 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, counters->tx_throughput = result; /* Fetch PCI replay counter */ + memset(&pkt, 0, sizeof(pkt)); pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); From f8abaf379bfe19600f96ae79a6759eb37039ae05 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 18 Jan 2021 13:19:51 +0200 Subject: [PATCH 095/135] habanalabs: fix backward compatibility of idle check Need to take the lower 32 bits of the driver's 64-bit idle mask and put it in the legacy 32-bit variable that the userspace reads to know the idle mask. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 12efbd9d2e3a..d25892d61ec9 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -133,6 +133,8 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args) hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev, &hw_idle.busy_engines_mask_ext, NULL); + hw_idle.busy_engines_mask = + lower_32_bits(hw_idle.busy_engines_mask_ext); return copy_to_user(out, &hw_idle, min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0; From 2dc4a6d79168e7e426e8ddf8e7219c9ffd13b2b1 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 18 Jan 2021 21:39:46 +0200 Subject: [PATCH 096/135] habanalabs: disable FW events on device removal When device is removed, we need to make sure the F/W won't send us any more events because during the remove process we disable the interrupts. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1ea57d86caa3..69d04eca767f 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1487,6 +1487,15 @@ void hl_device_fini(struct hl_device *hdev) } } + /* Disable PCI access from device F/W so it won't send us additional + * interrupts. We disable MSI/MSI-X at the halt_engines function and we + * can't have the F/W sending us interrupts after that. We need to + * disable the access here because if the device is marked disable, the + * message won't be send. Also, in case of heartbeat, the device CPU is + * marked as disable so this message won't be sent + */ + hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS); + /* Mark device as disabled */ hdev->disabled = true; From e020ff611ba9be54e959e6b548038f8a020da1c9 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Sun, 10 Jan 2021 09:54:07 -0800 Subject: [PATCH 097/135] driver core: Fix device link device name collision The device link device's name was of the form: -- This can cause name collision as reported here [1] as device names are not globally unique. Since device names have to be unique within the bus/class, add the bus/class name as a prefix to the device names used to construct the device link device name. So the devuce link device's name will be of the form: :--: [1] - https://lore.kernel.org/lkml/20201229033440.32142-1-michael@walle.cc/ Fixes: 287905e68dd2 ("driver core: Expose device link details in sysfs") Cc: stable@vger.kernel.org Reported-by: Michael Walle Tested-by: Michael Walle Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210110175408.1465657-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-devlink | 4 +-- .../ABI/testing/sysfs-devices-consumer | 5 ++-- .../ABI/testing/sysfs-devices-supplier | 5 ++-- drivers/base/core.c | 27 ++++++++++--------- include/linux/device.h | 12 +++++++++ 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-devlink b/Documentation/ABI/testing/sysfs-class-devlink index b662f747c83e..8a21ce515f61 100644 --- a/Documentation/ABI/testing/sysfs-class-devlink +++ b/Documentation/ABI/testing/sysfs-class-devlink @@ -5,8 +5,8 @@ Description: Provide a place in sysfs for the device link objects in the kernel at any given time. The name of a device link directory, denoted as ... above, is of the form -- - where is the supplier device name and is - the consumer device name. + where is the supplier bus:device name and + is the consumer bus:device name. What: /sys/class/devlink/.../auto_remove_on Date: May 2020 diff --git a/Documentation/ABI/testing/sysfs-devices-consumer b/Documentation/ABI/testing/sysfs-devices-consumer index 1f06d74d1c3c..0809fda092e6 100644 --- a/Documentation/ABI/testing/sysfs-devices-consumer +++ b/Documentation/ABI/testing/sysfs-devices-consumer @@ -4,5 +4,6 @@ Contact: Saravana Kannan Description: The /sys/devices/.../consumer: are symlinks to device links where this device is the supplier. denotes the - name of the consumer in that device link. There can be zero or - more of these symlinks for a given device. + name of the consumer in that device link and is of the form + bus:device name. There can be zero or more of these symlinks + for a given device. diff --git a/Documentation/ABI/testing/sysfs-devices-supplier b/Documentation/ABI/testing/sysfs-devices-supplier index a919e0db5e90..207f5972e98d 100644 --- a/Documentation/ABI/testing/sysfs-devices-supplier +++ b/Documentation/ABI/testing/sysfs-devices-supplier @@ -4,5 +4,6 @@ Contact: Saravana Kannan Description: The /sys/devices/.../supplier: are symlinks to device links where this device is the consumer. denotes the - name of the supplier in that device link. There can be zero or - more of these symlinks for a given device. + name of the supplier in that device link and is of the form + bus:device name. There can be zero or more of these symlinks + for a given device. diff --git a/drivers/base/core.c b/drivers/base/core.c index 3819fd012e27..78f6db169d47 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -471,7 +471,9 @@ static int devlink_add_symlinks(struct device *dev, struct device *con = link->consumer; char *buf; - len = max(strlen(dev_name(sup)), strlen(dev_name(con))); + len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)), + strlen(dev_bus_name(con)) + strlen(dev_name(con))); + len += strlen(":"); len += strlen("supplier:") + 1; buf = kzalloc(len, GFP_KERNEL); if (!buf) @@ -485,12 +487,12 @@ static int devlink_add_symlinks(struct device *dev, if (ret) goto err_con; - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf); if (ret) goto err_con_dev; - snprintf(buf, len, "supplier:%s", dev_name(sup)); + snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf); if (ret) goto err_sup_dev; @@ -498,7 +500,7 @@ static int devlink_add_symlinks(struct device *dev, goto out; err_sup_dev: - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); sysfs_remove_link(&sup->kobj, buf); err_con_dev: sysfs_remove_link(&link->link_dev.kobj, "consumer"); @@ -521,7 +523,9 @@ static void devlink_remove_symlinks(struct device *dev, sysfs_remove_link(&link->link_dev.kobj, "consumer"); sysfs_remove_link(&link->link_dev.kobj, "supplier"); - len = max(strlen(dev_name(sup)), strlen(dev_name(con))); + len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)), + strlen(dev_bus_name(con)) + strlen(dev_name(con))); + len += strlen(":"); len += strlen("supplier:") + 1; buf = kzalloc(len, GFP_KERNEL); if (!buf) { @@ -529,9 +533,9 @@ static void devlink_remove_symlinks(struct device *dev, return; } - snprintf(buf, len, "supplier:%s", dev_name(sup)); + snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); sysfs_remove_link(&con->kobj, buf); - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); sysfs_remove_link(&sup->kobj, buf); kfree(buf); } @@ -752,8 +756,9 @@ struct device_link *device_link_add(struct device *consumer, link->link_dev.class = &devlink_class; device_set_pm_not_required(&link->link_dev); - dev_set_name(&link->link_dev, "%s--%s", - dev_name(supplier), dev_name(consumer)); + dev_set_name(&link->link_dev, "%s:%s--%s:%s", + dev_bus_name(supplier), dev_name(supplier), + dev_bus_name(consumer), dev_name(consumer)); if (device_register(&link->link_dev)) { put_device(consumer); put_device(supplier); @@ -1823,9 +1828,7 @@ const char *dev_driver_string(const struct device *dev) * never change once they are set, so they don't need special care. */ drv = READ_ONCE(dev->driver); - return drv ? drv->name : - (dev->bus ? dev->bus->name : - (dev->class ? dev->class->name : "")); + return drv ? drv->name : dev_bus_name(dev); } EXPORT_SYMBOL(dev_driver_string); diff --git a/include/linux/device.h b/include/linux/device.h index 89bb8b84173e..1779f90eeb4c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -609,6 +609,18 @@ static inline const char *dev_name(const struct device *dev) return kobject_name(&dev->kobj); } +/** + * dev_bus_name - Return a device's bus/class name, if at all possible + * @dev: struct device to get the bus/class name of + * + * Will return the name of the bus/class the device is attached to. If it is + * not attached to a bus/class, an empty string will be returned. + */ +static inline const char *dev_bus_name(const struct device *dev) +{ + return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : ""); +} + __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA From 655cf86548a3938538642a6df27dd359e13c86bd Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 14 Jan 2021 16:32:42 -0600 Subject: [PATCH 098/135] objtool: Don't fail the kernel build on fatal errors This is basically a revert of commit 644592d32837 ("objtool: Fail the kernel build on fatal errors"). That change turned out to be more trouble than it's worth. Failing the build is an extreme measure which sometimes gets too much attention and blocks CI build testing. These fatal-type warnings aren't yet as rare as we'd hope, due to the ever-increasing matrix of supported toolchains/plugins and their fast-changing nature as of late. Also, there are more people (and bots) looking for objtool warnings than ever before, so even non-fatal warnings aren't likely to be ignored for long. Suggested-by: Nick Desaulniers Reviewed-by: Miroslav Benes Reviewed-by: Nick Desaulniers Reviewed-by: Kamalesh Babulal Signed-off-by: Josh Poimboeuf --- tools/objtool/check.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5f8d3eed78a1..4bd30315eb62 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2928,14 +2928,10 @@ int check(struct objtool_file *file) warnings += ret; out: - if (ret < 0) { - /* - * Fatal error. The binary is corrupt or otherwise broken in - * some way, or objtool itself is broken. Fail the kernel - * build. - */ - return ret; - } - + /* + * For now, don't fail the kernel build on fatal warnings. These + * errors are still fairly common due to the growing matrix of + * supported toolchains and their recent pace of change. + */ return 0; } From 1d489151e9f9d1647110277ff77282fe4d96d09b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 14 Jan 2021 16:14:01 -0600 Subject: [PATCH 099/135] objtool: Don't fail on missing symbol table Thanks to a recent binutils change which doesn't generate unused symbols, it's now possible for thunk_64.o be completely empty without CONFIG_PREEMPTION: no text, no data, no symbols. We could edit the Makefile to only build that file when CONFIG_PREEMPTION is enabled, but that will likely create confusion if/when the thunks end up getting used by some other code again. Just ignore it and move on. Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Reviewed-by: Miroslav Benes Tested-by: Nathan Chancellor Link: https://github.com/ClangBuiltLinux/linux/issues/1254 Signed-off-by: Josh Poimboeuf --- tools/objtool/elf.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index f9682db33cca..d8421e1d06be 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -380,8 +380,11 @@ static int read_symbols(struct elf *elf) symtab = find_section_by_name(elf, ".symtab"); if (!symtab) { - WARN("missing symbol table"); - return -1; + /* + * A missing symbol table is actually possible if it's an empty + * .o file. This can happen for thunk_64.o. + */ + return 0; } symtab_shndx = find_section_by_name(elf, ".symtab_shndx"); From 6e1239c13953f3c2a76e70031f74ddca9ae57cd3 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Thu, 21 Jan 2021 13:50:04 -0800 Subject: [PATCH 100/135] x86/cpu: Add another Alder Lake CPU to the Intel family Add Alder Lake mobile CPU model number to Intel family. Signed-off-by: Gayatri Kammela Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210121215004.11618-1-tony.luck@intel.com --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 5e658ba2654a..9abe842dbd84 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -97,6 +97,7 @@ #define INTEL_FAM6_LAKEFIELD 0x8A #define INTEL_FAM6_ALDERLAKE 0x97 +#define INTEL_FAM6_ALDERLAKE_L 0x9A /* "Small Core" Processors (Atom) */ From 17749851eb9ca2298e7c3b81aae4228961b36f28 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 21 Jan 2021 10:04:27 -0800 Subject: [PATCH 101/135] tty: fix up hung_up_tty_write() conversion In commit "tty: implement write_iter", I left the write_iter conversion of the hung up tty case alone, because I incorrectly thought it didn't matter. Jiri showed me the errors of my ways, and pointed out the problems with that incomplete conversion. Fix it all up. Reported-by: Jiri Slaby Signed-off-by: Linus Torvalds Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/r/CAHk-=wh+-rGsa=xruEWdg_fJViFG8rN9bpLrfLz=_yBYh2tBhA@mail.gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 502862626b2b..4a208a95e921 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -437,8 +437,7 @@ static ssize_t hung_up_tty_read(struct file *file, char __user *buf, return 0; } -static ssize_t hung_up_tty_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t hung_up_tty_write(struct kiocb *iocb, struct iov_iter *from) { return -EIO; } @@ -504,7 +503,7 @@ static const struct file_operations console_fops = { static const struct file_operations hung_up_tty_fops = { .llseek = no_llseek, .read = hung_up_tty_read, - .write = hung_up_tty_write, + .write_iter = hung_up_tty_write, .poll = hung_up_tty_poll, .unlocked_ioctl = hung_up_tty_ioctl, .compat_ioctl = hung_up_tty_compat_ioctl, @@ -1044,7 +1043,9 @@ static ssize_t tty_write(struct kiocb *iocb, struct iov_iter *from) if (tty->ops->write_room == NULL) tty_err(tty, "missing write_room method\n"); ld = tty_ldisc_ref_wait(tty); - if (!ld || !ld->ops->write) + if (!ld) + return hung_up_tty_write(iocb, from); + if (!ld->ops->write) ret = -EIO; else ret = do_tty_write(ld->ops->write, tty, file, from); From 31b081066e9c8f4a931a3d20dc0c6ca63c595c44 Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Fri, 22 Jan 2021 16:19:06 +0800 Subject: [PATCH 102/135] misc: rtsx: init value of aspm_enabled make sure ASPM state sync with pcr->aspm_enabled init value pcr->aspm_enabled Cc: stable@vger.kernel.org Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20210122081906.19100-1-ricky_wu@realtek.com Fixes: d928061c3143 ("misc: rtsx: modify en/disable aspm function") Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_pcr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/misc/cardreader/rtsx_pcr.c b/drivers/misc/cardreader/rtsx_pcr.c index 2aa6648fa41f..5a491d2cd1ae 100644 --- a/drivers/misc/cardreader/rtsx_pcr.c +++ b/drivers/misc/cardreader/rtsx_pcr.c @@ -1512,6 +1512,7 @@ static int rtsx_pci_probe(struct pci_dev *pcidev, struct pcr_handle *handle; u32 base, len; int ret, i, bar = 0; + u8 val; dev_dbg(&(pcidev->dev), ": Realtek PCI-E Card Reader found at %s [%04x:%04x] (rev %x)\n", @@ -1577,7 +1578,11 @@ static int rtsx_pci_probe(struct pci_dev *pcidev, pcr->host_cmds_addr = pcr->rtsx_resv_buf_addr; pcr->host_sg_tbl_ptr = pcr->rtsx_resv_buf + HOST_CMDS_BUF_LEN; pcr->host_sg_tbl_addr = pcr->rtsx_resv_buf_addr + HOST_CMDS_BUF_LEN; - + rtsx_pci_read_register(pcr, ASPM_FORCE_CTL, &val); + if (val & FORCE_ASPM_CTL0 && val & FORCE_ASPM_CTL1) + pcr->aspm_enabled = false; + else + pcr->aspm_enabled = true; pcr->card_inserted = 0; pcr->card_removed = 0; INIT_DELAYED_WORK(&pcr->carddet_work, rtsx_pci_card_detect); From 36c6e17bf16922935a5a0dd073d5b032d34aa73d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Jan 2021 18:31:41 +0000 Subject: [PATCH 103/135] sched/core: Print out straggler tasks in sched_cpu_dying() Since commit 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") tasks are expected to move themselves out of a out-going CPU. For most tasks this will be done automagically via BALANCE_PUSH, but percpu kthreads will have to cooperate and move themselves away one way or another. Currently, some percpu kthreads (workqueues being a notable exemple) do not cooperate nicely and can end up on an out-going CPU at the time sched_cpu_dying() is invoked. Print the dying rq's tasks to shed some light on the stragglers. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210113183141.11974-1-valentin.schneider@arm.com --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15d2562118d1..627534fa9196 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7574,6 +7574,25 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void dump_rq_tasks(struct rq *rq, const char *loglvl) +{ + struct task_struct *g, *p; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); + for_each_process_thread(g, p) { + if (task_cpu(p) != cpu) + continue; + + if (!task_on_rq_queued(p)) + continue; + + printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); + } +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7583,7 +7602,10 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { + WARN(true, "Dying CPU not properly vacated!"); + dump_rq_tasks(rq, KERN_WARNING); + } rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); From 547a77d02f8cfb345631ce23b5b548d27afa0fc4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 11 Jan 2021 23:26:33 +0800 Subject: [PATCH 104/135] workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity The scheduler won't break affinity for us any more, and we should "emulate" the same behavior when the scheduler breaks affinity for us. The behavior is "changing the cpumask to cpu_possible_mask". And there might be some other CPUs online later while the worker is still running with the pending work items. The worker should be allowed to use the later online CPUs as before and process the work items ASAP. If we use cpu_active_mask here, we can't achieve this goal but using cpu_possible_mask can. Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug") Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: Paul E. McKenney Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210111152638.2417-4-jiangshanlai@gmail.com --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9880b6c0e272..1646331546eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4920,7 +4920,7 @@ static void unbind_workers(int cpu) raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); mutex_unlock(&wq_pool_attach_mutex); From 22f667c97aadbf481e2cae2d6feabdf431e27b31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2021 18:17:45 +0100 Subject: [PATCH 105/135] sched: Don't run cpu-online with balance_push() enabled We don't need to push away tasks when we come online, mark the push complete right before the CPU dies. XXX hotplug state machine has trouble with rollback here. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.415606087@infradead.org --- kernel/sched/core.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 627534fa9196..8da0fd7f3cca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7320,10 +7320,12 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); - if (on) + if (on) { + WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; - else + } else if (rq->balance_callback == &balance_push_callback) { rq->balance_callback = NULL; + } rq_unlock_irqrestore(rq, &rf); } @@ -7441,6 +7443,10 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + /* + * Make sure that when the hotplug state machine does a roll-back + * we clear balance_push. Ideally that would happen earlier... + */ balance_push_set(cpu, false); #ifdef CONFIG_SCHED_SMT @@ -7608,6 +7614,12 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); + /* + * Now that the CPU is offline, make sure we're welcome + * to new tasks once we come back up. + */ + balance_push_set(cpu, false); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(rq); From ac687e6e8c26181a33270efd1a2e2241377924b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:24:04 +0100 Subject: [PATCH 106/135] kthread: Extract KTHREAD_IS_PER_CPU There is a need to distinguish geniune per-cpu kthreads from kthreads that happen to have a single CPU affinity. Geniune per-cpu kthreads are kthreads that are CPU affine for correctness, these will obviously have PF_KTHREAD set, but must also have PF_NO_SETAFFINITY set, lest userspace modify their affinity and ruins things. However, these two things are not sufficient, PF_NO_SETAFFINITY is also set on other tasks that have their affinities controlled through other means, like for instance workqueues. Therefore another bit is needed; it turns out kthread_create_per_cpu() already has such a bit: KTHREAD_IS_PER_CPU, which is used to make kthread_park()/kthread_unpark() work correctly. Expose this flag and remove the implicit setting of it from kthread_create_on_cpu(); the io_uring usage of it seems dubious at best. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.557620262@infradead.org --- include/linux/kthread.h | 3 +++ kernel/kthread.c | 27 ++++++++++++++++++++++++++- kernel/smpboot.c | 1 + 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 65b81e0c494d..2484ed97e72f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void kthread_set_per_cpu(struct task_struct *k, int cpu); +bool kthread_is_per_cpu(struct task_struct *k); + /** * kthread_run - create and wake a thread. * @threadfn: the function to run until signal_pending(current). diff --git a/kernel/kthread.c b/kernel/kthread.c index a5eceecd4513..e0e4a423f184 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); to_kthread(p)->cpu = cpu; return p; } +void kthread_set_per_cpu(struct task_struct *k, int cpu) +{ + struct kthread *kthread = to_kthread(k); + if (!kthread) + return; + + WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); + + if (cpu < 0) { + clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); + return; + } + + kthread->cpu = cpu; + set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); +} + +bool kthread_is_per_cpu(struct task_struct *k) +{ + struct kthread *kthread = to_kthread(k); + if (!kthread) + return false; + + return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 2efe1e206167..f25208e8df83 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } + kthread_set_per_cpu(tsk, cpu); /* * Park the thread so that it could start right on the CPU * when it is available. From 5c25b5ff89f004c30b04759dc34ace8585a4085f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:26:49 +0100 Subject: [PATCH 107/135] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU Mark the per-cpu workqueue workers as KTHREAD_IS_PER_CPU. Workqueues have unfortunate semantics in that per-cpu workers are not default flushed and parked during hotplug, however a subset does manual flush on hotplug and hard relies on them for correctness. Therefore play silly games.. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.693465814@infradead.org --- kernel/workqueue.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1646331546eb..cce34336fbd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1861,6 +1861,8 @@ static void worker_attach_to_pool(struct worker *worker, */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; + else + kthread_set_per_cpu(worker->task, pool->cpu); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -1883,6 +1885,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); + kthread_set_per_cpu(worker->task, -1); list_del(&worker->node); worker->pool = NULL; @@ -4919,8 +4922,10 @@ static void unbind_workers(int cpu) raw_spin_unlock_irq(&pool->lock); - for_each_pool_worker(worker, pool) + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, -1); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + } mutex_unlock(&wq_pool_attach_mutex); @@ -4972,9 +4977,11 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ - for_each_pool_worker(worker, pool) + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); + } raw_spin_lock_irq(&pool->lock); From 640f17c82460e9724fd256f0a1f5d99e7ff0bda4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2021 19:08:36 +0100 Subject: [PATCH 108/135] workqueue: Restrict affinity change to rescuer create_worker() will already set the right affinity using kthread_bind_mask(), this means only the rescuer will need to change it's affinity. Howveer, while in cpu-hot-unplug a regular task is not allowed to run on online&&!active as it would be pushed away quite agressively. We need KTHREAD_IS_PER_CPU to survive in that environment. Therefore set the affinity after getting that magic flag. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.826629830@infradead.org --- kernel/workqueue.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cce34336fbd7..894bb885b40b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1848,12 +1848,6 @@ static void worker_attach_to_pool(struct worker *worker, { mutex_lock(&wq_pool_attach_mutex); - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains * stable across this function. See the comments above the flag @@ -1864,6 +1858,9 @@ static void worker_attach_to_pool(struct worker *worker, else kthread_set_per_cpu(worker->task, pool->cpu); + if (worker->rescue_wq) + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + list_add_tail(&worker->node, &pool->workers); worker->pool = pool; From 975707f227b07a8212060f94447171d15d7a681b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Jan 2021 15:05:41 +0100 Subject: [PATCH 109/135] sched: Prepare to use balance_push in ttwu() In preparation of using the balance_push state in ttwu() we need it to provide a reliable and consistent state. The immediate problem is that rq->balance_callback gets cleared every schedule() and then re-set in the balance_push_callback() itself. This is not a reliable signal, so add a variable that stays set during the entire time. Also move setting it before the synchronize_rcu() in sched_cpu_deactivate(), such that we get guaranteed visibility to ttwu(), which is a preempt-disable region. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.966069627@infradead.org --- kernel/sched/core.c | 11 ++++++----- kernel/sched/sched.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8da0fd7f3cca..16946b55e8e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7320,6 +7320,7 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); + rq->balance_push = on; if (on) { WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; @@ -7489,17 +7490,17 @@ int sched_cpu_deactivate(unsigned int cpu) int ret; set_cpu_active(cpu, false); + balance_push_set(cpu, true); + /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. + * We've cleared cpu_active_mask / set balance_push, wait for all + * preempt-disabled and RCU users of this state to go away such that + * all new such users will observe it. * * Do sync before park smpboot threads to take care the rcu boost case. */ synchronize_rcu(); - balance_push_set(cpu, true); - rq_lock_irqsave(rq, &rf); if (rq->rd) { update_rq_clock(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 12ada79d40f3..bb09988451a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,6 +975,7 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; + unsigned char balance_push; unsigned char nohz_idle_balance; unsigned char idle_balance; From 5ba2ffba13a1e24e7b153683e97300f9cc6f605a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:28:16 +0100 Subject: [PATCH 110/135] sched: Fix CPU hotplug / tighten is_per_cpu_kthread() Prior to commit 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") we'd leave any task on the dying CPU and break affinity and force them off at the very end. This scheme had to change in order to enable migrate_disable(). One cannot wait for migrate_disable() to complete while stuck in stop_machine(). Furthermore, since we need at the very least: idle, hotplug and stop threads at any point before stop_machine, we can't break affinity and/or push those away. Under the assumption that all per-cpu kthreads are sanely handled by CPU hotplug, the new code no long breaks affinity or migrates any of them (which then includes the critical ones above). However, there's an important difference between per-cpu kthreads and kthreads that happen to have a single CPU affinity which is lost. The latter class very much relies on the forced affinity breaking and migration semantics previously provided. Use the new kthread_is_per_cpu() infrastructure to tighten is_per_cpu_kthread() and fix the hot-unplug problems stemming from the change. Fixes: 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103507.102416009@infradead.org --- kernel/sched/core.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 16946b55e8e7..56b09628692a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { + /* When not in the task's cpumask, no point in looking further. */ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p) || is_migration_disabled(p)) + /* migrate_disabled() must be allowed to finish. */ + if (is_migration_disabled(p)) return cpu_online(cpu); - return cpu_active(cpu); + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) + return cpu_active(cpu); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) + return cpu_online(cpu); + + /* Regular kernel threads don't get to stay during offline. */ + if (cpu_rq(cpu)->balance_push) + return false; + + /* But are allowed during online. */ + return cpu_online(cpu); } /* @@ -3121,6 +3136,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(int cpu, int wake_flags) { + /* + * Do not complicate things with the async wake_list while the CPU is + * in hotplug state. + */ + if (!cpu_active(cpu)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -7276,8 +7298,14 @@ static void balance_push(struct rq *rq) /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. + * + * XXX: the idle task does not match kthread_is_per_cpu() due to + * histerical raisins. */ - if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { + if (rq->idle == push_task || + ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + is_migration_disabled(push_task)) { + /* * If this is the idle task on the outgoing CPU try to wake * up the hotplug control thread which might wait for the @@ -7309,7 +7337,7 @@ static void balance_push(struct rq *rq) /* * At this point need_resched() is true and we'll take the loop in * schedule(). The next pick is obviously going to be the stop task - * which is_per_cpu_kthread() and will push this task away. + * which kthread_is_per_cpu() and will push this task away. */ raw_spin_lock(&rq->lock); } @@ -7497,6 +7525,9 @@ int sched_cpu_deactivate(unsigned int cpu) * preempt-disabled and RCU users of this state to go away such that * all new such users will observe it. * + * Specifically, we rely on ttwu to no longer target this CPU, see + * ttwu_queue_cond() and is_cpu_allowed(). + * * Do sync before park smpboot threads to take care the rcu boost case. */ synchronize_rcu(); From 741ba80f6f9a4702089c122129f22df9774b3e64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 16 Jan 2021 11:56:37 +0100 Subject: [PATCH 111/135] sched: Relax the set_cpus_allowed_ptr() semantics Now that we have KTHREAD_IS_PER_CPU to denote the critical per-cpu tasks to retain during CPU offline, we can relax the warning in set_cpus_allowed_ptr(). Any spurious kthread that wants to get on at the last minute will get pushed off before it can run. While during CPU online there is no harm, and actual benefit, to allowing kthreads back on early, it simplifies hotplug code and fixes a number of outstanding races. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lai jiangshan Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103507.240724591@infradead.org --- kernel/sched/core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56b09628692a..ff74fca39ed2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs. + * Kernel threads are allowed on online && !active CPUs, + * however, during cpu-hot-unplug, even these might get pushed + * away if not KTHREAD_IS_PER_CPU. * * Specifically, migration_disabled() tasks must not fail the * cpumask_any_and_distribute() pick below, esp. so on @@ -2386,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - if (p->flags & PF_KTHREAD) { - /* - * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-CPU threads. - */ - WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && - !cpumask_intersects(new_mask, cpu_active_mask) && - p->nr_cpus_allowed != 1); - } - return affine_move_task(rq, p, &rf, dest_cpu, flags); out: @@ -7518,6 +7510,13 @@ int sched_cpu_deactivate(unsigned int cpu) int ret; set_cpu_active(cpu, false); + + /* + * From this point forward, this CPU will refuse to run any task that + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively + * push those tasks away until this gets cleared, see + * sched_cpu_dying(). + */ balance_push_set(cpu, true); /* From 607ec89ed18f49ca59689572659b9c0076f1991f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Jan 2021 10:10:54 -0700 Subject: [PATCH 112/135] io_uring: fix SQPOLL IORING_OP_CLOSE cancelation state IORING_OP_CLOSE is special in terms of cancelation, since it has an intermediate state where we've removed the file descriptor but hasn't closed the file yet. For that reason, it's currently marked with IO_WQ_WORK_NO_CANCEL to prevent cancelation. This ensures that the op is always run even if canceled, to prevent leaving us with a live file but an fd that is gone. However, with SQPOLL, since a cancel request doesn't carry any resources on behalf of the request being canceled, if we cancel before any of the close op has been run, we can end up with io-wq not having the ->files assigned. This can result in the following oops reported by Joseph: BUG: kernel NULL pointer dereference, address: 00000000000000d8 PGD 800000010b76f067 P4D 800000010b76f067 PUD 10b462067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 1 PID: 1788 Comm: io_uring-sq Not tainted 5.11.0-rc4 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:__lock_acquire+0x19d/0x18c0 Code: 00 00 8b 1d fd 56 dd 08 85 db 0f 85 43 05 00 00 48 c7 c6 98 7b 95 82 48 c7 c7 57 96 93 82 e8 9a bc f5 ff 0f 0b e9 2b 05 00 00 <48> 81 3f c0 ca 67 8a b8 00 00 00 00 41 0f 45 c0 89 04 24 e9 81 fe RSP: 0018:ffffc90001933828 EFLAGS: 00010002 RAX: 0000000000000001 RBX: 0000000000000001 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000000d8 RBP: 0000000000000246 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff888106e8a140 R15: 00000000000000d8 FS: 0000000000000000(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000d8 CR3: 0000000106efa004 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire+0x31a/0x440 ? close_fd_get_file+0x39/0x160 ? __lock_acquire+0x647/0x18c0 _raw_spin_lock+0x2c/0x40 ? close_fd_get_file+0x39/0x160 close_fd_get_file+0x39/0x160 io_issue_sqe+0x1334/0x14e0 ? lock_acquire+0x31a/0x440 ? __io_free_req+0xcf/0x2e0 ? __io_free_req+0x175/0x2e0 ? find_held_lock+0x28/0xb0 ? io_wq_submit_work+0x7f/0x240 io_wq_submit_work+0x7f/0x240 io_wq_cancel_cb+0x161/0x580 ? io_wqe_wake_worker+0x114/0x360 ? io_uring_get_socket+0x40/0x40 io_async_find_and_cancel+0x3b/0x140 io_issue_sqe+0xbe1/0x14e0 ? __lock_acquire+0x647/0x18c0 ? __io_queue_sqe+0x10b/0x5f0 __io_queue_sqe+0x10b/0x5f0 ? io_req_prep+0xdb/0x1150 ? mark_held_locks+0x6d/0xb0 ? mark_held_locks+0x6d/0xb0 ? io_queue_sqe+0x235/0x4b0 io_queue_sqe+0x235/0x4b0 io_submit_sqes+0xd7e/0x12a0 ? _raw_spin_unlock_irq+0x24/0x30 ? io_sq_thread+0x3ae/0x940 io_sq_thread+0x207/0x940 ? do_wait_intr_irq+0xc0/0xc0 ? __ia32_sys_io_uring_enter+0x650/0x650 kthread+0x134/0x180 ? kthread_create_worker_on_cpu+0x90/0x90 ret_from_fork+0x1f/0x30 Fix this by moving the IO_WQ_WORK_NO_CANCEL until _after_ we've modified the fdtable. Canceling before this point is totally fine, and running it in the io-wq context _after_ that point is also fine. For 5.12, we'll handle this internally and get rid of the no-cancel flag, as IORING_OP_CLOSE is the only user of it. Cc: stable@vger.kernel.org Fixes: b5dba59e0cf7 ("io_uring: add support for IORING_OP_CLOSE") Reported-by: "Abaci " Reviewed-and-tested-by: Joseph Qi Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 383ff6ed3734..e4c1cdf0325d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4472,7 +4472,6 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * io_wq_work.flags, so initialize io_wq_work firstly. */ io_req_init_async(req); - req->work.flags |= IO_WQ_WORK_NO_CANCEL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4505,6 +4504,8 @@ static int io_close(struct io_kiocb *req, bool force_nonblock, /* if the file has a flush method, be safe and punt to async */ if (close->put_file->f_op->flush && force_nonblock) { + /* not safe to cancel at this point */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; /* was never set, but play safe */ req->flags &= ~REQ_F_NOWAIT; /* avoid grabbing files - we don't need the files */ From 9a173346bd9e16ab19c7addb8862d95a5cea9feb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 21 Jan 2021 12:01:08 +0000 Subject: [PATCH 113/135] io_uring: fix short read retries for non-reg files Sockets and other non-regular files may actually expect short reads to happen, don't retry reads for them. Because non-reg files don't set FMODE_BUF_RASYNC and so it won't do second/retry do_read, we can filter out those cases after first do_read() attempt with ret>0. Cc: stable@vger.kernel.org # 5.9+ Suggested-by: Jens Axboe Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e4c1cdf0325d..862113a9364f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3552,7 +3552,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, /* read it all, or we did blocking attempt. no retry. */ if (!iov_iter_count(iter) || !force_nonblock || - (req->file->f_flags & O_NONBLOCK)) + (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG)) goto done; io_size -= ret; From 214a5ea081e77346e4963dd6d20c5539ff8b6ae6 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 21 Jan 2021 08:22:48 +1000 Subject: [PATCH 114/135] cifs: do not fail __smb_send_rqst if non-fatal signals are pending RHBZ 1848178 The original intent of returning an error in this function in the patch: "CIFS: Mask off signals when sending SMB packets" was to avoid interrupting packet send in the middle of sending the data (and thus breaking an SMB connection), but we also don't want to fail the request for non-fatal signals even before we have had a chance to try to send it (the reported problem could be reproduced e.g. by exiting a child process when the parent process was in the midst of calling futimens to update a file's timestamps). In addition, since the signal may remain pending when we enter the sending loop, we may end up not sending the whole packet before TCP buffers become full. In this case the code returns -EINTR but what we need here is to return -ERESTARTSYS instead to allow system calls to be restarted. Fixes: b30c74c73c78 ("CIFS: Mask off signals when sending SMB packets") Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e9abb41aa89b..95ef26b555b9 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -338,7 +338,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (ssocket == NULL) return -EAGAIN; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { cifs_dbg(FYI, "signal pending before send request\n"); return -ERESTARTSYS; } @@ -429,7 +429,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, if (signal_pending(current) && (total_len != send_length)) { cifs_dbg(FYI, "signal is pending after attempt to send\n"); - rc = -EINTR; + rc = -ERESTARTSYS; } /* uncork it */ From 9d5c8190683a462dbc787658467a0da17011ea5f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 24 Jan 2021 15:08:14 +0000 Subject: [PATCH 115/135] io_uring: fix sleeping under spin in __io_clean_op [ 27.629441] BUG: sleeping function called from invalid context at fs/file.c:402 [ 27.631317] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1012, name: io_wqe_worker-0 [ 27.633220] 1 lock held by io_wqe_worker-0/1012: [ 27.634286] #0: ffff888105e26c98 (&ctx->completion_lock) {....}-{2:2}, at: __io_req_complete.part.102+0x30/0x70 [ 27.649249] Call Trace: [ 27.649874] dump_stack+0xac/0xe3 [ 27.650666] ___might_sleep+0x284/0x2c0 [ 27.651566] put_files_struct+0xb8/0x120 [ 27.652481] __io_clean_op+0x10c/0x2a0 [ 27.653362] __io_cqring_fill_event+0x2c1/0x350 [ 27.654399] __io_req_complete.part.102+0x41/0x70 [ 27.655464] io_openat2+0x151/0x300 [ 27.656297] io_issue_sqe+0x6c/0x14e0 [ 27.660991] io_wq_submit_work+0x7f/0x240 [ 27.662890] io_worker_handle_work+0x501/0x8a0 [ 27.664836] io_wqe_worker+0x158/0x520 [ 27.667726] kthread+0x134/0x180 [ 27.669641] ret_from_fork+0x1f/0x30 Instead of cleaning files on overflow, return back overflow cancellation into io_uring_cancel_files(). Previously it was racy to clean REQ_F_OVERFLOW flag, but we got rid of it, and can do it through repetitive attempts targeting all matching requests. Reported-by: Abaci Reported-by: Joseph Qi Cc: Xiaoguang Wang Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 862113a9364f..8a98afed50cd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1025,6 +1025,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter, bool force); +static void io_req_drop_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -1048,8 +1049,7 @@ EXPORT_SYMBOL(io_uring_get_socket); static inline void io_clean_op(struct io_kiocb *req) { - if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | - REQ_F_INFLIGHT)) + if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED)) __io_clean_op(req); } @@ -1394,6 +1394,8 @@ static void io_req_clean_work(struct io_kiocb *req) free_fs_struct(fs); req->work.flags &= ~IO_WQ_WORK_FS; } + if (req->flags & REQ_F_INFLIGHT) + io_req_drop_files(req); io_put_identity(req->task->io_uring, req); } @@ -6230,9 +6232,6 @@ static void __io_clean_op(struct io_kiocb *req) } req->flags &= ~REQ_F_NEED_CLEANUP; } - - if (req->flags & REQ_F_INFLIGHT) - io_req_drop_files(req); } static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, @@ -8879,6 +8878,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); io_poll_remove_all(ctx, task, files); io_kill_timeouts(ctx, task, files); + io_cqring_overflow_flush(ctx, true, task, files); /* cancellations _may_ trigger task work */ io_run_task_work(); schedule(); From 02a13674fa0e8dd326de8b9f4514b41b03d99003 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 23 Jan 2021 15:49:31 -0700 Subject: [PATCH 116/135] io_uring: account io_uring internal files as REQ_F_INFLIGHT We need to actively cancel anything that introduces a potential circular loop, where io_uring holds a reference to itself. If the file in question is an io_uring file, then add the request to the inflight list. Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a98afed50cd..c07913ec0cca 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1075,8 +1075,11 @@ static bool io_match_task(struct io_kiocb *head, return true; io_for_each_link(req, head) { - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES) && + if (!(req->flags & REQ_F_WORK_INITIALIZED)) + continue; + if (req->file && req->file->f_op == &io_uring_fops) + return true; + if ((req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files == files) return true; } @@ -1505,11 +1508,14 @@ static bool io_grab_identity(struct io_kiocb *req) return false; atomic_inc(&id->files->count); get_nsproxy(id->nsproxy); - req->flags |= REQ_F_INFLIGHT; - spin_lock_irq(&ctx->inflight_lock); - list_add(&req->inflight_entry, &ctx->inflight_list); - spin_unlock_irq(&ctx->inflight_lock); + if (!(req->flags & REQ_F_INFLIGHT)) { + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } req->work.flags |= IO_WQ_WORK_FILES; } if (!(req->work.flags & IO_WQ_WORK_MM) && @@ -6164,8 +6170,10 @@ static void io_req_drop_files(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); + if (req->work.flags & IO_WQ_WORK_FILES) { + put_files_struct(req->work.identity->files); + put_nsproxy(req->work.identity->nsproxy); + } spin_lock_irqsave(&ctx->inflight_lock, flags); list_del(&req->inflight_entry); spin_unlock_irqrestore(&ctx->inflight_lock, flags); @@ -6450,6 +6458,15 @@ static struct file *io_file_get(struct io_submit_state *state, file = __io_file_get(state, fd); } + if (file && file->f_op == &io_uring_fops) { + io_req_init_async(req); + req->flags |= REQ_F_INFLIGHT; + + spin_lock_irq(&ctx->inflight_lock); + list_add(&req->inflight_entry, &ctx->inflight_list); + spin_unlock_irq(&ctx->inflight_lock); + } + return file; } @@ -8860,8 +8877,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->task != task || - req->work.identity->files != files) + if (!io_match_task(req, task, files)) continue; found = true; break; From bde9cfa3afe4324ec251e4af80ebf9b7afaf7afe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 23 Jan 2021 21:00:57 -0800 Subject: [PATCH 117/135] x86/setup: don't remove E820_TYPE_RAM for pfn 0 Patch series "mm: fix initialization of struct page for holes in memory layout", v3. Commit 73a6e474cb37 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") exposed several issues with the memory map initialization and these patches fix those issues. Initially there were crashes during compaction that Qian Cai reported back in April [1]. It seemed back then that the problem was fixed, but a few weeks ago Andrea Arcangeli hit the same bug [2] and there was an additional discussion at [3]. [1] https://lore.kernel.org/lkml/8C537EB7-85EE-4DCF-943E-3CC0ED0DF56D@lca.pw [2] https://lore.kernel.org/lkml/20201121194506.13464-1-aarcange@redhat.com [3] https://lore.kernel.org/mm-commits/20201206005401.qKuAVgOXr%akpm@linux-foundation.org This patch (of 2): The first 4Kb of memory is a BIOS owned area and to avoid its allocation for the kernel it was not listed in e820 tables as memory. As the result, pfn 0 was never recognised by the generic memory management and it is not a part of neither node 0 nor ZONE_DMA. If set_pfnblock_flags_mask() would be ever called for the pageblock corresponding to the first 2Mbytes of memory, having pfn 0 outside of ZONE_DMA would trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); Along with reserving the first 4Kb in e820 tables, several first pages are reserved with memblock in several places during setup_arch(). These reservations are enough to ensure the kernel does not touch the BIOS area and it is not necessary to remove E820_TYPE_RAM for pfn 0. Remove the update of e820 table that changes the type of pfn 0 and move the comment describing why it was done to trim_low_memory_range() that reserves the beginning of the memory. Link: https://lkml.kernel.org/r/20210111194017.22696-2-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Baoquan He Cc: Borislav Petkov Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mel Gorman Cc: Michal Hocko Cc: Qian Cai Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 740f3bdb3f61..3412c4595efd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -660,17 +660,6 @@ static void __init trim_platform_memory_ranges(void) static void __init trim_bios_range(void) { - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - * - * This typically reserves additional memory (64KiB by default) - * since some BIOSes are known to corrupt low memory. See the - * Kconfig help text for X86_RESERVE_LOW. - */ - e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); - /* * special case: Some BIOSes report the PC BIOS * area (640Kb -> 1Mb) as RAM even though it is not. @@ -728,6 +717,15 @@ early_param("reservelow", parse_reservelow); static void __init trim_low_memory_range(void) { + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); } From d3921cb8be29ce5668c64e23ffdaeec5f8c69399 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 23 Jan 2021 21:01:02 -0800 Subject: [PATCH 118/135] mm: fix initialization of struct page for holes in memory layout There could be struct pages that are not backed by actual physical memory. This can happen when the actual memory bank is not a multiple of SECTION_SIZE or when an architecture does not register memory holes reserved by the firmware as memblock.memory. Such pages are currently initialized using init_unavailable_mem() function that iterates through PFNs in holes in memblock.memory and if there is a struct page corresponding to a PFN, the fields if this page are set to default values and the page is marked as Reserved. init_unavailable_mem() does not take into account zone and node the page belongs to and sets both zone and node links in struct page to zero. On a system that has firmware reserved holes in a zone above ZONE_DMA, for instance in a configuration below: # grep -A1 E820 /proc/iomem 7a17b000-7a216fff : Unknown E820 type 7a217000-7bffffff : System RAM unset zone link in struct page will trigger VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); because there are pages in both ZONE_DMA32 and ZONE_DMA (unset zone link in struct page) in the same pageblock. Update init_unavailable_mem() to use zone constraints defined by an architecture to properly setup the zone link and use node ID of the adjacent range in memblock.memory to set the node link. Link: https://lkml.kernel.org/r/20210111194017.22696-3-rppt@kernel.org Fixes: 73a6e474cb37 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") Signed-off-by: Mike Rapoport Reported-by: Andrea Arcangeli Cc: Andrea Arcangeli Cc: Baoquan He Cc: Borislav Petkov Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mel Gorman Cc: Michal Hocko Cc: Qian Cai Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 84 +++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027f6481ba59..85ecaa6d0d06 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7078,23 +7078,26 @@ void __init free_area_init_memoryless_node(int nid) * Initialize all valid struct pages in the range [spfn, epfn) and mark them * PageReserved(). Return the number of struct pages that were initialized. */ -static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) +static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn, + int zone, int nid) { - unsigned long pfn; + unsigned long pfn, zone_spfn, zone_epfn; u64 pgcnt = 0; + zone_spfn = arch_zone_lowest_possible_pfn[zone]; + zone_epfn = arch_zone_highest_possible_pfn[zone]; + + spfn = clamp(spfn, zone_spfn, zone_epfn); + epfn = clamp(epfn, zone_spfn, zone_epfn); + for (pfn = spfn; pfn < epfn; pfn++) { if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) + pageblock_nr_pages - 1; continue; } - /* - * Use a fake node/zone (0) for now. Some of these pages - * (in memblock.reserved but not in memblock.memory) will - * get re-initialized via reserve_bootmem_region() later. - */ - __init_single_page(pfn_to_page(pfn), pfn, 0, 0); + + __init_single_page(pfn_to_page(pfn), pfn, zone, nid); __SetPageReserved(pfn_to_page(pfn)); pgcnt++; } @@ -7103,51 +7106,64 @@ static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) } /* - * Only struct pages that are backed by physical memory are zeroed and - * initialized by going through __init_single_page(). But, there are some - * struct pages which are reserved in memblock allocator and their fields - * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly initialize those struct pages. + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init(). * - * This function also addresses a similar issue where struct pages are left - * uninitialized because the physical address range is not covered by - * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=, or when the highest physical - * address (max_pfn) does not end on a section boundary. + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - phyiscal memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone link is set accorging to the architecture constrains + * - node is set to node id of the next populated region except for the + * trailing hole where last node id is used */ -static void __init init_unavailable_mem(void) +static void __init init_zone_unavailable_mem(int zone) { - phys_addr_t start, end; - u64 i, pgcnt; - phys_addr_t next = 0; + unsigned long start, end; + int i, nid; + u64 pgcnt; + unsigned long next = 0; /* - * Loop through unavailable ranges not covered by memblock.memory. + * Loop through holes in memblock.memory and initialize struct + * pages corresponding to these holes */ pgcnt = 0; - for_each_mem_range(i, &start, &end) { + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (next < start) - pgcnt += init_unavailable_range(PFN_DOWN(next), - PFN_UP(start)); + pgcnt += init_unavailable_range(next, start, zone, nid); next = end; } /* - * Early sections always have a fully populated memmap for the whole - * section - see pfn_valid(). If the last section has holes at the - * end and that section is marked "online", the memmap will be - * considered initialized. Make sure that memmap has a well defined - * state. + * Last section may surpass the actual end of memory (e.g. we can + * have 1Gb section and 512Mb of RAM pouplated). + * Make sure that memmap has a well defined state in this case. */ - pgcnt += init_unavailable_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); + end = round_up(max_pfn, PAGES_PER_SECTION); + pgcnt += init_unavailable_range(next, end, zone, nid); /* * Struct pages that do not have backing memory. This could be because * firmware is using some of this memory, or for some other reasons. */ if (pgcnt) - pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); + pr_info("Zone %s: zeroed struct page in unavailable ranges: %lld pages", zone_names[zone], pgcnt); +} + +static void __init init_unavailable_mem(void) +{ + int zone; + + for (zone = 0; zone < ZONE_MOVABLE; zone++) + init_zone_unavailable_mem(zone); } #else static inline void __init init_unavailable_mem(void) From 3de7d4f25a7438f09fef4e71ef111f1805cd8e7c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sat, 23 Jan 2021 21:01:07 -0800 Subject: [PATCH 119/135] mm: memcg/slab: optimize objcg stock draining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Imran Khan reported a 16% regression in hackbench results caused by the commit f2fe7b09a52b ("mm: memcg/slab: charge individual slab objects instead of pages"). The regression is noticeable in the case of a consequent allocation of several relatively large slab objects, e.g. skb's. As soon as the amount of stocked bytes exceeds PAGE_SIZE, drain_obj_stock() and __memcg_kmem_uncharge() are called, and it leads to a number of atomic operations in page_counter_uncharge(). The corresponding call graph is below (provided by Imran Khan): |__alloc_skb | | | |__kmalloc_reserve.isra.61 | | | | | |__kmalloc_node_track_caller | | | | | | | |slab_pre_alloc_hook.constprop.88 | | | obj_cgroup_charge | | | | | | | | | |__memcg_kmem_charge | | | | | | | | | | | |page_counter_try_charge | | | | | | | | | |refill_obj_stock | | | | | | | | | | | |drain_obj_stock.isra.68 | | | | | | | | | | | | | |__memcg_kmem_uncharge | | | | | | | | | | | | | | | |page_counter_uncharge | | | | | | | | | | | | | | | | | |page_counter_cancel | | | | | | | | | | | |__slab_alloc | | | | | | | | | |___slab_alloc | | | | | | | | |slab_post_alloc_hook Instead of directly uncharging the accounted kernel memory, it's possible to refill the generic page-sized per-cpu stock instead. It's a much faster operation, especially on a default hierarchy. As a bonus, __memcg_kmem_uncharge_page() will also get faster, so the freeing of page-sized kernel allocations (e.g. large kmallocs) will become faster. A similar change has been done earlier for the socket memory by the commit 475d0487a2ad ("mm: memcontrol: use per-cpu stocks for socket memory uncharging"). Link: https://lkml.kernel.org/r/20210106042239.2860107-1-guro@fb.com Fixes: f2fe7b09a52b ("mm: memcg/slab: charge individual slab objects instead of pages") Signed-off-by: Roman Gushchin Reported-by: Imran Khan Tested-by: Imran Khan Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutn Cc: Michal Koutný Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 605f671203ef..e2de77b5bcc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3115,9 +3115,7 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + refill_stock(memcg, nr_pages); } /** From 8a8792f600abacd7e1b9bb667759dca1c153f64c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 23 Jan 2021 21:01:11 -0800 Subject: [PATCH 120/135] mm: memcg: fix memcg file_dirty numa stat The kernel updates the per-node NR_FILE_DIRTY stats on page migration but not the memcg numa stats. That was not an issue until recently the commit 5f9a4f4a7096 ("mm: memcontrol: add the missing numa_stat interface for cgroup v2") exposed numa stats for the memcg. So fix the file_dirty per-memcg numa stat. Link: https://lkml.kernel.org/r/20210108155813.2914586-1-shakeelb@google.com Fixes: 5f9a4f4a7096 ("mm: memcontrol: add the missing numa_stat interface for cgroup v2") Signed-off-by: Shakeel Butt Reviewed-by: Muchun Song Acked-by: Yang Shi Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ee5e612b4cd8..613794f6a433 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -500,9 +500,9 @@ int migrate_page_move_mapping(struct address_space *mapping, __inc_lruvec_state(new_lruvec, NR_SHMEM); } if (dirty && mapping_can_writeback(mapping)) { - __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY); + __dec_lruvec_state(old_lruvec, NR_FILE_DIRTY); __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); - __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY); + __inc_lruvec_state(new_lruvec, NR_FILE_DIRTY); __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); } } From 5c447d274f3746fbed6e695e7b9a2d7bd8b31b71 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 23 Jan 2021 21:01:15 -0800 Subject: [PATCH 121/135] mm: fix numa stats for thp migration Currently the kernel is not correctly updating the numa stats for NR_FILE_PAGES and NR_SHMEM on THP migration. Fix that. For NR_FILE_DIRTY and NR_ZONE_WRITE_PENDING, although at the moment there is no need to handle THP migration as kernel still does not have write support for file THP but to be more future proof, this patch adds the THP support for those stats as well. Link: https://lkml.kernel.org/r/20210108155813.2914586-2-shakeelb@google.com Fixes: e71769ae52609 ("mm: enable thp migration for shmem thp") Signed-off-by: Shakeel Butt Acked-by: Yang Shi Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 613794f6a433..c0efe921bca5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -402,6 +402,7 @@ int migrate_page_move_mapping(struct address_space *mapping, struct zone *oldzone, *newzone; int dirty; int expected_count = expected_page_refs(mapping, page) + extra_count; + int nr = thp_nr_pages(page); if (!mapping) { /* Anonymous page without mapping */ @@ -437,7 +438,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ newpage->index = page->index; newpage->mapping = page->mapping; - page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */ + page_ref_add(newpage, nr); /* add cache reference */ if (PageSwapBacked(page)) { __SetPageSwapBacked(newpage); if (PageSwapCache(page)) { @@ -459,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (PageTransHuge(page)) { int i; - for (i = 1; i < HPAGE_PMD_NR; i++) { + for (i = 1; i < nr; i++) { xas_next(&xas); xas_store(&xas, newpage); } @@ -470,7 +471,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. */ - page_ref_unfreeze(page, expected_count - thp_nr_pages(page)); + page_ref_unfreeze(page, expected_count - nr); xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ @@ -493,17 +494,17 @@ int migrate_page_move_mapping(struct address_space *mapping, old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); - __dec_lruvec_state(old_lruvec, NR_FILE_PAGES); - __inc_lruvec_state(new_lruvec, NR_FILE_PAGES); + __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); if (PageSwapBacked(page) && !PageSwapCache(page)) { - __dec_lruvec_state(old_lruvec, NR_SHMEM); - __inc_lruvec_state(new_lruvec, NR_SHMEM); + __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); + __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } if (dirty && mapping_can_writeback(mapping)) { - __dec_lruvec_state(old_lruvec, NR_FILE_DIRTY); - __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING); - __inc_lruvec_state(new_lruvec, NR_FILE_DIRTY); - __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING); + __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); + __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); + __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); + __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); } } local_irq_enable(); From a11a496ee6e2ab6ed850233c96b94caf042af0b9 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Sat, 23 Jan 2021 21:01:25 -0800 Subject: [PATCH 122/135] kasan: fix unaligned address is unhandled in kasan_remove_zero_shadow During testing kasan_populate_early_shadow and kasan_remove_zero_shadow, if the shadow start and end address in kasan_remove_zero_shadow() is not aligned to PMD_SIZE, the remain unaligned PTE won't be removed. In the test case for kasan_remove_zero_shadow(): shadow_start: 0xffffffb802000000, shadow end: 0xffffffbfbe000000 3-level page table: PUD_SIZE: 0x40000000 PMD_SIZE: 0x200000 PAGE_SIZE: 4K 0xffffffbf80000000 ~ 0xffffffbfbdf80000 will not be removed because in kasan_remove_pud_table(), kasan_pmd_table(*pud) is true but the next address is 0xffffffbfbdf80000 which is not aligned to PUD_SIZE. In the correct condition, this should fallback to the next level kasan_remove_pmd_table() but the condition flow always continue to skip the unaligned part. Fix by correcting the condition when next and addr are neither aligned. Link: https://lkml.kernel.org/r/20210103135621.83129-1-lecopzer@gmail.com Fixes: 0207df4fa1a86 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN") Signed-off-by: Lecopzer Chen Cc: Andrey Ryabinin Cc: Dan Williams Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: YJ Chiang Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/init.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 7ca0b92d5886..91049c1b8994 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -373,9 +373,10 @@ static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, if (kasan_pte_table(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && - IS_ALIGNED(next, PMD_SIZE)) + IS_ALIGNED(next, PMD_SIZE)) { pmd_clear(pmd); - continue; + continue; + } } pte = pte_offset_kernel(pmd, addr); kasan_remove_pte_table(pte, addr, next); @@ -398,9 +399,10 @@ static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, if (kasan_pmd_table(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && - IS_ALIGNED(next, PUD_SIZE)) + IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); - continue; + continue; + } } pmd = pmd_offset(pud, addr); pmd_base = pmd_offset(pud, 0); @@ -424,9 +426,10 @@ static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, if (kasan_pud_table(*p4d)) { if (IS_ALIGNED(addr, P4D_SIZE) && - IS_ALIGNED(next, P4D_SIZE)) + IS_ALIGNED(next, P4D_SIZE)) { p4d_clear(p4d); - continue; + continue; + } } pud = pud_offset(p4d, addr); kasan_remove_pud_table(pud, addr, next); @@ -457,9 +460,10 @@ void kasan_remove_zero_shadow(void *start, unsigned long size) if (kasan_p4d_table(*pgd)) { if (IS_ALIGNED(addr, PGDIR_SIZE) && - IS_ALIGNED(next, PGDIR_SIZE)) + IS_ALIGNED(next, PGDIR_SIZE)) { pgd_clear(pgd); - continue; + continue; + } } p4d = p4d_offset(pgd, addr); From 5dabd1712cd056814f9ab15f1d68157ceb04e741 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Sat, 23 Jan 2021 21:01:29 -0800 Subject: [PATCH 123/135] kasan: fix incorrect arguments passing in kasan_add_zero_shadow kasan_remove_zero_shadow() shall use original virtual address, start and size, instead of shadow address. Link: https://lkml.kernel.org/r/20210103063847.5963-1-lecopzer@gmail.com Fixes: 0207df4fa1a86 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN") Signed-off-by: Lecopzer Chen Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dan Williams Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 91049c1b8994..c4605ac9837b 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -486,7 +486,6 @@ int kasan_add_zero_shadow(void *start, unsigned long size) ret = kasan_populate_early_shadow(shadow_start, shadow_end); if (ret) - kasan_remove_zero_shadow(shadow_start, - size >> KASAN_SHADOW_SCALE_SHIFT); + kasan_remove_zero_shadow(start, size); return ret; } From 76bc99e81a7cb78a78e058107e4b5b1d8ed3c874 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sat, 23 Jan 2021 21:01:34 -0800 Subject: [PATCH 124/135] kasan: fix HW_TAGS boot parameters The initially proposed KASAN command line parameters are redundant. This change drops the complex "kasan.mode=off/prod/full" parameter and adds a simpler kill switch "kasan=off/on" instead. The new parameter together with the already existing ones provides a cleaner way to express the same set of features. The full set of parameters with this change: kasan=off/on - whether KASAN is enabled kasan.fault=report/panic - whether to only print a report or also panic kasan.stacktrace=off/on - whether to collect alloc/free stack traces Default values: kasan=on kasan.fault=report kasan.stacktrace=on (if CONFIG_DEBUG_KERNEL=y) kasan.stacktrace=off (otherwise) Link: https://linux-review.googlesource.com/id/Ib3694ed90b1e8ccac6cf77dfd301847af4aba7b8 Link: https://lkml.kernel.org/r/4e9c4a4bdcadc168317deb2419144582a9be6e61.1610736745.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Vincenzo Frascino Reviewed-by: Marco Elver Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Catalin Marinas Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 27 +++-------- mm/kasan/hw_tags.c | 77 +++++++++++++------------------ 2 files changed, 38 insertions(+), 66 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 0fc3fb1860c4..1651d961f06a 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -160,29 +160,14 @@ intended for use in production as a security mitigation. Therefore it supports boot parameters that allow to disable KASAN competely or otherwise control particular KASAN features. -The things that can be controlled are: +- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). -1. Whether KASAN is enabled at all. -2. Whether KASAN collects and saves alloc/free stacks. -3. Whether KASAN panics on a detected bug or not. +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on`` for ``CONFIG_DEBUG_KERNEL=y``, otherwise + ``off``). -The ``kasan.mode`` boot parameter allows to choose one of three main modes: - -- ``kasan.mode=off`` - KASAN is disabled, no tag checks are performed -- ``kasan.mode=prod`` - only essential production features are enabled -- ``kasan.mode=full`` - all KASAN features are enabled - -The chosen mode provides default control values for the features mentioned -above. However it's also possible to override the default values by providing: - -- ``kasan.stacktrace=off`` or ``=on`` - enable alloc/free stack collection - (default: ``on`` for ``mode=full``, - otherwise ``off``) -- ``kasan.fault=report`` or ``=panic`` - only print KASAN report or also panic - (default: ``report``) - -If ``kasan.mode`` parameter is not provided, it defaults to ``full`` when -``CONFIG_DEBUG_KERNEL`` is enabled, and to ``prod`` otherwise. +- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN + report or also panic the kernel (default: ``report``). For developers ~~~~~~~~~~~~~~ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 55bd6f09c70f..e529428e7a11 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -19,11 +19,10 @@ #include "kasan.h" -enum kasan_arg_mode { - KASAN_ARG_MODE_DEFAULT, - KASAN_ARG_MODE_OFF, - KASAN_ARG_MODE_PROD, - KASAN_ARG_MODE_FULL, +enum kasan_arg { + KASAN_ARG_DEFAULT, + KASAN_ARG_OFF, + KASAN_ARG_ON, }; enum kasan_arg_stacktrace { @@ -38,7 +37,7 @@ enum kasan_arg_fault { KASAN_ARG_FAULT_PANIC, }; -static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; static enum kasan_arg_fault kasan_arg_fault __ro_after_init; @@ -52,26 +51,24 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); /* Whether panic or disable tag checking on fault. */ bool kasan_flag_panic __ro_after_init; -/* kasan.mode=off/prod/full */ -static int __init early_kasan_mode(char *arg) +/* kasan=off/on */ +static int __init early_kasan_flag(char *arg) { if (!arg) return -EINVAL; if (!strcmp(arg, "off")) - kasan_arg_mode = KASAN_ARG_MODE_OFF; - else if (!strcmp(arg, "prod")) - kasan_arg_mode = KASAN_ARG_MODE_PROD; - else if (!strcmp(arg, "full")) - kasan_arg_mode = KASAN_ARG_MODE_FULL; + kasan_arg = KASAN_ARG_OFF; + else if (!strcmp(arg, "on")) + kasan_arg = KASAN_ARG_ON; else return -EINVAL; return 0; } -early_param("kasan.mode", early_kasan_mode); +early_param("kasan", early_kasan_flag); -/* kasan.stack=off/on */ +/* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { if (!arg) @@ -113,8 +110,8 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled, do nothing. */ - if (kasan_arg_mode == KASAN_ARG_MODE_OFF) + /* If KASAN is disabled via command line, don't initialize it. */ + if (kasan_arg == KASAN_ARG_OFF) return; hw_init_tags(KASAN_TAG_MAX); @@ -124,43 +121,28 @@ void kasan_init_hw_tags_cpu(void) /* kasan_init_hw_tags() is called once on boot CPU. */ void __init kasan_init_hw_tags(void) { - /* If hardware doesn't support MTE, do nothing. */ + /* If hardware doesn't support MTE, don't initialize KASAN. */ if (!system_supports_mte()) return; - /* Choose KASAN mode if kasan boot parameter is not provided. */ - if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) { - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - kasan_arg_mode = KASAN_ARG_MODE_FULL; - else - kasan_arg_mode = KASAN_ARG_MODE_PROD; - } - - /* Preset parameter values based on the mode. */ - switch (kasan_arg_mode) { - case KASAN_ARG_MODE_DEFAULT: - /* Shouldn't happen as per the check above. */ - WARN_ON(1); + /* If KASAN is disabled via command line, don't initialize it. */ + if (kasan_arg == KASAN_ARG_OFF) return; - case KASAN_ARG_MODE_OFF: - /* If KASAN is disabled, do nothing. */ - return; - case KASAN_ARG_MODE_PROD: - static_branch_enable(&kasan_flag_enabled); - break; - case KASAN_ARG_MODE_FULL: - static_branch_enable(&kasan_flag_enabled); - static_branch_enable(&kasan_flag_stacktrace); - break; - } - /* Now, optionally override the presets. */ + /* Enable KASAN. */ + static_branch_enable(&kasan_flag_enabled); switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: + /* + * Default to enabling stack trace collection for + * debug kernels. + */ + if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + static_branch_enable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); + /* Do nothing, kasan_flag_stacktrace keeps its default value. */ break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); @@ -169,11 +151,16 @@ void __init kasan_init_hw_tags(void) switch (kasan_arg_fault) { case KASAN_ARG_FAULT_DEFAULT: + /* + * Default to no panic on report. + * Do nothing, kasan_flag_panic keeps its default value. + */ break; case KASAN_ARG_FAULT_REPORT: - kasan_flag_panic = false; + /* Do nothing, kasan_flag_panic keeps its default value. */ break; case KASAN_ARG_FAULT_PANIC: + /* Enable panic on report. */ kasan_flag_panic = true; break; } From ce5716c618524241a3cea821e18ee1e0d16f6c70 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sat, 23 Jan 2021 21:01:38 -0800 Subject: [PATCH 125/135] kasan, mm: fix conflicts with init_on_alloc/free A few places where SLUB accesses object's data or metadata were missed in a previous patch. This leads to false positives with hardware tag-based KASAN when bulk allocations are used with init_on_alloc/free. Fix the false-positives by resetting pointer tags during these accesses. (The kasan_reset_tag call is removed from slab_alloc_node, as it's added into maybe_wipe_obj_freeptr.) Link: https://linux-review.googlesource.com/id/I50dd32838a666e173fe06c3c5c766f2c36aae901 Link: https://lkml.kernel.org/r/093428b5d2ca8b507f4a79f92f9929b35f7fada7.1610731872.git.andreyknvl@google.com Fixes: aa1ef4d7b3f67 ("kasan, mm: reset tags when accessing metadata") Signed-off-by: Andrey Konovalov Reported-by: Dmitry Vyukov Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Alexander Potapenko Cc: Marco Elver Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d9e4e10683cc..69742ab9a21d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2791,7 +2791,8 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) { if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)obj + s->offset), 0, sizeof(void *)); + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); } /* @@ -2883,7 +2884,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, stat(s, ALLOC_FASTPATH); } - maybe_wipe_obj_freeptr(s, kasan_reset_tag(object)); + maybe_wipe_obj_freeptr(s, object); if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) memset(kasan_reset_tag(object), 0, s->object_size); @@ -3329,7 +3330,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, int j; for (j = 0; j < i; j++) - memset(p[j], 0, s->object_size); + memset(kasan_reset_tag(p[j]), 0, s->object_size); } /* memcg and kmem_cache debug support */ From acb35b177c71d3d39b9a3b9ea213d926235066e3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sat, 23 Jan 2021 21:01:43 -0800 Subject: [PATCH 126/135] kasan, mm: fix resetting page_alloc tags for HW_TAGS A previous commit added resetting KASAN page tags to kernel_init_free_pages() to avoid false-positives due to accesses to metadata with the hardware tag-based mode. That commit did reset page tags before the metadata access, but didn't restore them after. As the result, KASAN fails to detect bad accesses to page_alloc allocations on some configurations. Fix this by recovering the tag after the metadata access. Link: https://lkml.kernel.org/r/02b5bcd692e912c27d484030f666b350ad7e4ae4.1611074450.git.andreyknvl@google.com Fixes: aa1ef4d7b3f6 ("kasan, mm: reset tags when accessing metadata") Signed-off-by: Andrey Konovalov Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Marco Elver Cc: Catalin Marinas Cc: Will Deacon Cc: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 85ecaa6d0d06..783913e41f65 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1207,8 +1207,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { + u8 tag = page_kasan_tag(page + i); page_kasan_tag_reset(page + i); clear_highpage(page + i); + page_kasan_tag_set(page + i, tag); } kasan_enable_current(); } From 251b5497c5c95e4548e3d33cbda3f638fea2c11e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 23 Jan 2021 21:01:48 -0800 Subject: [PATCH 127/135] ubsan: disable unsigned-overflow check for i386 Building ubsan kernels even for compile-testing introduced these warnings in my randconfig environment: crypto/blake2b_generic.c:98:13: error: stack frame size of 9636 bytes in function 'blake2b_compress' [-Werror,-Wframe-larger-than=] static void blake2b_compress(struct blake2b_state *S, crypto/sha512_generic.c:151:13: error: stack frame size of 1292 bytes in function 'sha512_generic_block_fn' [-Werror,-Wframe-larger-than=] static void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src, lib/crypto/curve25519-fiat32.c:312:22: error: stack frame size of 2180 bytes in function 'fe_mul_impl' [-Werror,-Wframe-larger-than=] static noinline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) lib/crypto/curve25519-fiat32.c:444:22: error: stack frame size of 1588 bytes in function 'fe_sqr_impl' [-Werror,-Wframe-larger-than=] static noinline void fe_sqr_impl(u32 out[10], const u32 in1[10]) Further testing showed that this is caused by -fsanitize=unsigned-integer-overflow, but is isolated to the 32-bit x86 architecture. The one in blake2b immediately overflows the 8KB stack area architectures, so better ensure this never happens by disabling the option for 32-bit x86. Link: https://lkml.kernel.org/r/20210112202922.2454435-1-arnd@kernel.org Link: https://lore.kernel.org/lkml/20201230154749.746641-1-arnd@kernel.org/ Fixes: d0a3ac549f38 ("ubsan: enable for all*config builds") Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Cc: Nick Desaulniers Cc: Stephen Rothwell Cc: Marco Elver Cc: George Popescu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 8b635fd75fe4..3a0b1c930733 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -123,6 +123,7 @@ config UBSAN_SIGNED_OVERFLOW config UBSAN_UNSIGNED_OVERFLOW bool "Perform checking for unsigned arithmetic overflow" depends on $(cc-option,-fsanitize=unsigned-integer-overflow) + depends on !X86_32 # avoid excessive stack usage on x86-32/clang help This option enables -fsanitize=unsigned-integer-overflow which checks for overflow of any arithmetic operations with unsigned integers. This From dad4e5b390866ca902653df0daa864ae4b8d4147 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 23 Jan 2021 21:01:52 -0800 Subject: [PATCH 128/135] mm: fix page reference leak in soft_offline_page() The conversion to move pfn_to_online_page() internal to soft_offline_page() missed that the get_user_pages() reference taken by the madvise() path needs to be dropped when pfn_to_online_page() fails. Note the direct sysfs-path to soft_offline_page() does not perform a get_user_pages() lookup. When soft_offline_page() is handed a pfn_valid() && !pfn_to_online_page() pfn the kernel hangs at dax-device shutdown due to a leaked reference. Link: https://lkml.kernel.org/r/161058501210.1840162.8108917599181157327.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: feec24a6139d ("mm, soft-offline: convert parameter to pfn") Signed-off-by: Dan Williams Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Qian Cai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 04d9f154a130..e9481632fcd1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1885,6 +1885,12 @@ static int soft_offline_free_page(struct page *page) return rc; } +static void put_ref_page(struct page *page) +{ + if (page) + put_page(page); +} + /** * soft_offline_page - Soft offline a page. * @pfn: pfn to soft-offline @@ -1910,20 +1916,26 @@ static int soft_offline_free_page(struct page *page) int soft_offline_page(unsigned long pfn, int flags) { int ret; - struct page *page; bool try_again = true; + struct page *page, *ref_page = NULL; + + WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); if (!pfn_valid(pfn)) return -ENXIO; + if (flags & MF_COUNT_INCREASED) + ref_page = pfn_to_page(pfn); + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); - if (!page) + if (!page) { + put_ref_page(ref_page); return -EIO; + } if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); - if (flags & MF_COUNT_INCREASED) - put_page(page); + put_ref_page(ref_page); return 0; } From f99e02372af2e7ee72a6da497712ec9152964347 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jan 2021 21:01:57 -0800 Subject: [PATCH 129/135] sparc/mm/highmem: flush cache and TLB Patch series "mm/highmem: Fix fallout from generic kmap_local conversions". The kmap_local conversion wreckaged sparc, mips and powerpc as it missed some of the details in the original implementation. This patch (of 4): The recent conversion to the generic kmap_local infrastructure failed to assign the proper pre/post map/unmap flush operations for sparc. Sparc requires cache flush before map/unmap and tlb flush afterwards. Link: https://lkml.kernel.org/r/20210112170136.078559026@linutronix.de Link: https://lkml.kernel.org/r/20210112170410.905976187@linutronix.de Fixes: 3293efa97807 ("sparc/mm/highmem: Switch to generic kmap atomic") Signed-off-by: Thomas Gleixner Reported-by: Andreas Larsson Cc: "David S. Miller" Cc: Peter Zijlstra Cc: Paul Cercueil Cc: Thomas Bogendoerfer Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/highmem.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 875116209ec1..c7b2e208328b 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,10 +50,11 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() flush_cache_all() -/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */ -#define arch_kmap_local_post_map(vaddr, pteval) flush_cache_all() -#define arch_kmap_local_post_unmap(vaddr) flush_cache_all() - +/* FIXME: Use __flush_*_one(vaddr) instead of flush_*_all() -- Anton */ +#define arch_kmap_local_pre_map(vaddr, pteval) flush_cache_all() +#define arch_kmap_local_pre_unmap(vaddr) flush_cache_all() +#define arch_kmap_local_post_map(vaddr, pteval) flush_tlb_all() +#define arch_kmap_local_post_unmap(vaddr) flush_tlb_all() #endif /* __KERNEL__ */ From a1dce7fd2ade8e71e5f95e58b99aa512607f52b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jan 2021 21:02:02 -0800 Subject: [PATCH 130/135] mm/highmem: prepare for overriding set_pte_at() The generic kmap_local() map function uses set_pte_at(), but MIPS requires set_pte() and PowerPC wants __set_pte_at(). Provide arch_kmap_local_set_pte() and default it to set_pte_at(). Link: https://lkml.kernel.org/r/20210112170411.056306194@linutronix.de Signed-off-by: Thomas Gleixner Cc: Andreas Larsson Cc: "David S. Miller" Cc: Michael Ellerman Cc: Paul Cercueil Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/highmem.c b/mm/highmem.c index c3a9ea7875ef..874b732b120c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -473,6 +473,11 @@ static inline void *arch_kmap_local_high_get(struct page *page) } #endif +#ifndef arch_kmap_local_set_pte +#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \ + set_pte_at(mm, vaddr, ptep, ptev) +#endif + /* Unmap a local mapping which was obtained by kmap_high_get() */ static inline bool kmap_high_unmap_local(unsigned long vaddr) { @@ -515,7 +520,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte - idx))); pteval = pfn_pte(pfn, prot); - set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); From 8c0d5d78f3596e203e9cd27563a8380649c03ad0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jan 2021 21:02:07 -0800 Subject: [PATCH 131/135] mips/mm/highmem: use set_pte() for kmap_local() set_pte_at() on MIPS invokes update_cache() which might recurse into kmap_local(). Use set_pte() like the original MIPS highmem implementation did. Link: https://lkml.kernel.org/r/20210112170411.187513575@linutronix.de Fixes: a4c33e83bca1 ("mips/mm/highmem: Switch to generic kmap atomic") Signed-off-by: Thomas Gleixner Reported-by: Paul Cercueil Reported-by: Thomas Bogendoerfer Acked-by: Thomas Bogendoerfer Cc: Andreas Larsson Cc: "David S. Miller" Cc: Michael Ellerman Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/highmem.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 19edf8e69971..292d0425717f 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -51,6 +51,7 @@ extern void kmap_flush_tlb(unsigned long addr); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) +#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) set_pte(ptep, ptev) #define arch_kmap_local_post_map(vaddr, pteval) local_flush_tlb_one(vaddr) #define arch_kmap_local_post_unmap(vaddr) local_flush_tlb_one(vaddr) From 785025820a6a565185ce9d47fdd8d23dbf91dee8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Jan 2021 21:02:11 -0800 Subject: [PATCH 132/135] powerpc/mm/highmem: use __set_pte_at() for kmap_local() The original PowerPC highmem mapping function used __set_pte_at() to denote that the mapping is per CPU. This got lost with the conversion to the generic implementation. Override the default map function. Link: https://lkml.kernel.org/r/20210112170411.281464308@linutronix.de Fixes: 47da42b27a56 ("powerpc/mm/highmem: Switch to generic kmap atomic") Signed-off-by: Thomas Gleixner Cc: Michael Ellerman Cc: Andreas Larsson Cc: "David S. Miller" Cc: Paul Cercueil Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/highmem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 80a5ae771c65..c0fcd1bbdba9 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -58,6 +58,8 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() flush_cache_all() +#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \ + __set_pte_at(mm, vaddr, ptep, ptev, 1) #define arch_kmap_local_post_map(vaddr, pteval) \ local_flush_tlb_page(NULL, vaddr) #define arch_kmap_local_post_unmap(vaddr) \ From 697edcb0e4eadc41645fe88c991fe6a206b1a08d Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Sat, 23 Jan 2021 21:02:16 -0800 Subject: [PATCH 133/135] proc_sysctl: fix oops caused by incorrect command parameters The process_sysctl_arg() does not check whether val is empty before invoking strlen(val). If the command line parameter () is incorrectly configured and val is empty, oops is triggered. For example: "hung_task_panic=1" is incorrectly written as "hung_task_panic", oops is triggered. The call stack is as follows: Kernel command line: .... hung_task_panic ...... Call trace: __pi_strlen+0x10/0x98 parse_args+0x278/0x344 do_sysctl_args+0x8c/0xfc kernel_init+0x5c/0xf4 ret_from_fork+0x10/0x30 To fix it, check whether "val" is empty when "phram" is a sysctl field. Error codes are returned in the failure branch, and error logs are generated by parse_args(). Link: https://lkml.kernel.org/r/20210118133029.28580-1-nixiaoming@huawei.com Fixes: 3db978d480e2843 ("kernel/sysctl: support setting sysctl parameters from kernel command line") Signed-off-by: Xiaoming Ni Acked-by: Vlastimil Babka Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Alexey Dobriyan Cc: Michal Hocko Cc: Masami Hiramatsu Cc: Heiner Kallweit Cc: Randy Dunlap Cc: [5.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 317899222d7f..d2018f70d1fa 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1770,6 +1770,12 @@ static int process_sysctl_arg(char *param, char *val, return 0; } + if (!val) + return -EINVAL; + len = strlen(val); + if (len == 0) + return -EINVAL; + /* * To set sysctl options, we use a temporary mount of proc, look up the * respective sys/ file and write to it. To avoid mounting it when no @@ -1811,7 +1817,6 @@ static int process_sysctl_arg(char *param, char *val, file, param, val); goto out; } - len = strlen(val); wret = kernel_write(file, val, len, &pos); if (wret < 0) { err = wret; From e82d891a63afebefde5d26971768f5cb91627f73 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 23 Jan 2021 21:02:21 -0800 Subject: [PATCH 134/135] MAINTAINERS: add a couple more files to the Clang/LLVM section The K: entry should ensure that Nick and I always get CC'd on patches that touch these files but it is better to be explicit rather than implicit. Link: https://lkml.kernel.org/r/20210114004059.2129921-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f79ec98bbb29..91c6dee7850e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4311,7 +4311,9 @@ W: https://clangbuiltlinux.github.io/ B: https://github.com/ClangBuiltLinux/linux/issues C: irc://chat.freenode.net/clangbuiltlinux F: Documentation/kbuild/llvm.rst +F: include/linux/compiler-clang.h F: scripts/clang-tools/ +F: scripts/clang-version.sh F: scripts/lld-version.sh K: \b(?i:clang|llvm)\b From 6ee1d745b7c9fd573fba142a2efdad76a9f1cb04 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Jan 2021 16:47:14 -0800 Subject: [PATCH 135/135] Linux 5.11-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b0e4767735dc..e0af7a4a5598 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Kleptomaniac Octopus # *DOCUMENTATION*