From 00e772c4929257b11b51d47e4645f67826ded0fc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:30:07 +0100
Subject: [PATCH 001/183] irqchip: Remove sigma tango driver

The tango platform is getting removed, so the driver is no
longer needed.

Cc: Marc Gonzalez <marc.w.gonzalez@free.fr>
Cc: Mans Rullgard <mans@mansr.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mans Rullgard <mans@mansr.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210120133008.2421897-2-arnd@kernel.org
---
 .../sigma,smp8642-intc.txt                    |  48 ----
 drivers/irqchip/Kconfig                       |   5 -
 drivers/irqchip/Makefile                      |   1 -
 drivers/irqchip/irq-tango.c                   | 227 ------------------
 4 files changed, 281 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
 delete mode 100644 drivers/irqchip/irq-tango.c

diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
deleted file mode 100644
index 355c18a3a4d3..000000000000
--- a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-Sigma Designs SMP86xx/SMP87xx secondary interrupt controller
-
-Required properties:
-- compatible: should be "sigma,smp8642-intc"
-- reg: physical address of MMIO region
-- ranges: address space mapping of child nodes
-- interrupt-controller: boolean
-- #address-cells: should be <1>
-- #size-cells: should be <1>
-
-One child node per control block with properties:
-- reg: address of registers for this control block
-- interrupt-controller: boolean
-- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt
-- interrupts: interrupt spec of primary interrupt controller
-
-Example:
-
-interrupt-controller@6e000 {
-	compatible = "sigma,smp8642-intc";
-	reg = <0x6e000 0x400>;
-	ranges = <0x0 0x6e000 0x400>;
-	interrupt-parent = <&gic>;
-	interrupt-controller;
-	#address-cells = <1>;
-	#size-cells = <1>;
-
-	irq0: interrupt-controller@0 {
-		reg = <0x000 0x100>;
-		interrupt-controller;
-		#interrupt-cells = <2>;
-		interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
-	};
-
-	irq1: interrupt-controller@100 {
-		reg = <0x100 0x100>;
-		interrupt-controller;
-		#interrupt-cells = <2>;
-		interrupts = <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>;
-	};
-
-	irq2: interrupt-controller@300 {
-		reg = <0x300 0x100>;
-		interrupt-controller;
-		#interrupt-cells = <2>;
-		interrupts = <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>;
-	};
-};
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 94920a51c628..f95d114c63ed 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -260,11 +260,6 @@ config ST_IRQCHIP
 	help
 	  Enables SysCfg Controlled IRQs on STi based platforms.
 
-config TANGO_IRQ
-	bool
-	select IRQ_DOMAIN
-	select GENERIC_IRQ_CHIP
-
 config TB10X_IRQC
 	bool
 	select IRQ_DOMAIN
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 0ac93bfaec61..084e11774071 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -55,7 +55,6 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ)	+= irq-versatile-fpga.o
 obj-$(CONFIG_ARCH_NSPIRE)		+= irq-zevio.o
 obj-$(CONFIG_ARCH_VT8500)		+= irq-vt8500.o
 obj-$(CONFIG_ST_IRQCHIP)		+= irq-st.o
-obj-$(CONFIG_TANGO_IRQ)			+= irq-tango.o
 obj-$(CONFIG_TB10X_IRQC)		+= irq-tb10x.o
 obj-$(CONFIG_TS4800_IRQ)		+= irq-ts4800.o
 obj-$(CONFIG_XTENSA)			+= irq-xtensa-pic.o
diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c
deleted file mode 100644
index 34290f09b853..000000000000
--- a/drivers/irqchip/irq-tango.c
+++ /dev/null
@@ -1,227 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) 2014 Mans Rullgard <mans@mansr.com>
- */
-
-#include <linux/init.h>
-#include <linux/irq.h>
-#include <linux/irqchip.h>
-#include <linux/irqchip/chained_irq.h>
-#include <linux/ioport.h>
-#include <linux/io.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/slab.h>
-
-#define IRQ0_CTL_BASE		0x0000
-#define IRQ1_CTL_BASE		0x0100
-#define EDGE_CTL_BASE		0x0200
-#define IRQ2_CTL_BASE		0x0300
-
-#define IRQ_CTL_HI		0x18
-#define EDGE_CTL_HI		0x20
-
-#define IRQ_STATUS		0x00
-#define IRQ_RAWSTAT		0x04
-#define IRQ_EN_SET		0x08
-#define IRQ_EN_CLR		0x0c
-#define IRQ_SOFT_SET		0x10
-#define IRQ_SOFT_CLR		0x14
-
-#define EDGE_STATUS		0x00
-#define EDGE_RAWSTAT		0x04
-#define EDGE_CFG_RISE		0x08
-#define EDGE_CFG_FALL		0x0c
-#define EDGE_CFG_RISE_SET	0x10
-#define EDGE_CFG_RISE_CLR	0x14
-#define EDGE_CFG_FALL_SET	0x18
-#define EDGE_CFG_FALL_CLR	0x1c
-
-struct tangox_irq_chip {
-	void __iomem *base;
-	unsigned long ctl;
-};
-
-static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg)
-{
-	return readl_relaxed(chip->base + reg);
-}
-
-static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val)
-{
-	writel_relaxed(val, chip->base + reg);
-}
-
-static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status,
-				 int base)
-{
-	unsigned int hwirq;
-	unsigned int virq;
-
-	while (status) {
-		hwirq = __ffs(status);
-		virq = irq_find_mapping(dom, base + hwirq);
-		if (virq)
-			generic_handle_irq(virq);
-		status &= ~BIT(hwirq);
-	}
-}
-
-static void tangox_irq_handler(struct irq_desc *desc)
-{
-	struct irq_domain *dom = irq_desc_get_handler_data(desc);
-	struct irq_chip *host_chip = irq_desc_get_chip(desc);
-	struct tangox_irq_chip *chip = dom->host_data;
-	unsigned int status_lo, status_hi;
-
-	chained_irq_enter(host_chip, desc);
-
-	status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS);
-	status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS);
-
-	tangox_dispatch_irqs(dom, status_lo, 0);
-	tangox_dispatch_irqs(dom, status_hi, 32);
-
-	chained_irq_exit(host_chip, desc);
-}
-
-static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type)
-{
-	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
-	struct tangox_irq_chip *chip = gc->domain->host_data;
-	struct irq_chip_regs *regs = &gc->chip_types[0].regs;
-
-	switch (flow_type & IRQ_TYPE_SENSE_MASK) {
-	case IRQ_TYPE_EDGE_RISING:
-		intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
-		intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
-		break;
-
-	case IRQ_TYPE_EDGE_FALLING:
-		intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
-		intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
-		break;
-
-	case IRQ_TYPE_LEVEL_HIGH:
-		intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask);
-		intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask);
-		break;
-
-	case IRQ_TYPE_LEVEL_LOW:
-		intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask);
-		intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask);
-		break;
-
-	default:
-		pr_err("Invalid trigger mode %x for IRQ %d\n",
-		       flow_type, d->irq);
-		return -EINVAL;
-	}
-
-	return irq_setup_alt_chip(d, flow_type);
-}
-
-static void __init tangox_irq_init_chip(struct irq_chip_generic *gc,
-					unsigned long ctl_offs,
-					unsigned long edge_offs)
-{
-	struct tangox_irq_chip *chip = gc->domain->host_data;
-	struct irq_chip_type *ct = gc->chip_types;
-	unsigned long ctl_base = chip->ctl + ctl_offs;
-	unsigned long edge_base = EDGE_CTL_BASE + edge_offs;
-	int i;
-
-	gc->reg_base = chip->base;
-	gc->unused = 0;
-
-	for (i = 0; i < 2; i++) {
-		ct[i].chip.irq_ack = irq_gc_ack_set_bit;
-		ct[i].chip.irq_mask = irq_gc_mask_disable_reg;
-		ct[i].chip.irq_mask_ack = irq_gc_mask_disable_and_ack_set;
-		ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg;
-		ct[i].chip.irq_set_type = tangox_irq_set_type;
-		ct[i].chip.name = gc->domain->name;
-
-		ct[i].regs.enable = ctl_base + IRQ_EN_SET;
-		ct[i].regs.disable = ctl_base + IRQ_EN_CLR;
-		ct[i].regs.ack = edge_base + EDGE_RAWSTAT;
-		ct[i].regs.type = edge_base;
-	}
-
-	ct[0].type = IRQ_TYPE_LEVEL_MASK;
-	ct[0].handler = handle_level_irq;
-
-	ct[1].type = IRQ_TYPE_EDGE_BOTH;
-	ct[1].handler = handle_edge_irq;
-
-	intc_writel(chip, ct->regs.disable, 0xffffffff);
-	intc_writel(chip, ct->regs.ack, 0xffffffff);
-}
-
-static void __init tangox_irq_domain_init(struct irq_domain *dom)
-{
-	struct irq_chip_generic *gc;
-	int i;
-
-	for (i = 0; i < 2; i++) {
-		gc = irq_get_domain_generic_chip(dom, i * 32);
-		tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI);
-	}
-}
-
-static int __init tangox_irq_init(void __iomem *base, struct resource *baseres,
-				  struct device_node *node)
-{
-	struct tangox_irq_chip *chip;
-	struct irq_domain *dom;
-	struct resource res;
-	int irq;
-	int err;
-
-	irq = irq_of_parse_and_map(node, 0);
-	if (!irq)
-		panic("%pOFn: failed to get IRQ", node);
-
-	err = of_address_to_resource(node, 0, &res);
-	if (err)
-		panic("%pOFn: failed to get address", node);
-
-	chip = kzalloc(sizeof(*chip), GFP_KERNEL);
-	chip->ctl = res.start - baseres->start;
-	chip->base = base;
-
-	dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip);
-	if (!dom)
-		panic("%pOFn: failed to create irqdomain", node);
-
-	err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name,
-					     handle_level_irq, 0, 0, 0);
-	if (err)
-		panic("%pOFn: failed to allocate irqchip", node);
-
-	tangox_irq_domain_init(dom);
-
-	irq_set_chained_handler_and_data(irq, tangox_irq_handler, dom);
-
-	return 0;
-}
-
-static int __init tangox_of_irq_init(struct device_node *node,
-				     struct device_node *parent)
-{
-	struct device_node *c;
-	struct resource res;
-	void __iomem *base;
-
-	base = of_iomap(node, 0);
-	if (!base)
-		panic("%pOFn: of_iomap failed", node);
-
-	of_address_to_resource(node, 0, &res);
-
-	for_each_child_of_node(node, c)
-		tangox_irq_init(base, &res, c);
-
-	return 0;
-}
-IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init);

From 5c1ea0d842b1e73ae04870527ec29d5479c35041 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 20 Jan 2021 14:30:08 +0100
Subject: [PATCH 002/183] irqchip: Remove sirfsoc driver

The CSR SiRF prima2/atlas platforms are getting removed, so this driver
is no longer needed.

Cc: Barry Song <baohua@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Barry Song <baohua@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210120133008.2421897-3-arnd@kernel.org
---
 drivers/irqchip/Makefile      |   1 -
 drivers/irqchip/irq-sirfsoc.c | 134 ----------------------------------
 2 files changed, 135 deletions(-)
 delete mode 100644 drivers/irqchip/irq-sirfsoc.c

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 084e11774071..37e3556df127 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -45,7 +45,6 @@ obj-$(CONFIG_I8259)			+= irq-i8259.o
 obj-$(CONFIG_IMGPDC_IRQ)		+= irq-imgpdc.o
 obj-$(CONFIG_IRQ_MIPS_CPU)		+= irq-mips-cpu.o
 obj-$(CONFIG_IXP4XX_IRQ)		+= irq-ixp4xx.o
-obj-$(CONFIG_SIRF_IRQ)			+= irq-sirfsoc.o
 obj-$(CONFIG_JCORE_AIC)			+= irq-jcore-aic.o
 obj-$(CONFIG_RDA_INTC)			+= irq-rda-intc.o
 obj-$(CONFIG_RENESAS_INTC_IRQPIN)	+= irq-renesas-intc-irqpin.o
diff --git a/drivers/irqchip/irq-sirfsoc.c b/drivers/irqchip/irq-sirfsoc.c
deleted file mode 100644
index c86faaa35ca4..000000000000
--- a/drivers/irqchip/irq-sirfsoc.c
+++ /dev/null
@@ -1,134 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * interrupt controller support for CSR SiRFprimaII
- *
- * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
- */
-
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/irqchip.h>
-#include <linux/irqdomain.h>
-#include <linux/syscore_ops.h>
-#include <asm/mach/irq.h>
-#include <asm/exception.h>
-
-#define SIRFSOC_INT_RISC_MASK0		0x0018
-#define SIRFSOC_INT_RISC_MASK1		0x001C
-#define SIRFSOC_INT_RISC_LEVEL0		0x0020
-#define SIRFSOC_INT_RISC_LEVEL1		0x0024
-#define SIRFSOC_INIT_IRQ_ID		0x0038
-#define SIRFSOC_INT_BASE_OFFSET		0x0004
-
-#define SIRFSOC_NUM_IRQS		64
-#define SIRFSOC_NUM_BANKS		(SIRFSOC_NUM_IRQS / 32)
-
-static struct irq_domain *sirfsoc_irqdomain;
-
-static void __iomem *sirfsoc_irq_get_regbase(void)
-{
-	return (void __iomem __force *)sirfsoc_irqdomain->host_data;
-}
-
-static __init void sirfsoc_alloc_gc(void __iomem *base)
-{
-	unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
-	unsigned int set = IRQ_LEVEL;
-	struct irq_chip_generic *gc;
-	struct irq_chip_type *ct;
-	int i;
-
-	irq_alloc_domain_generic_chips(sirfsoc_irqdomain, 32, 1, "irq_sirfsoc",
-				       handle_level_irq, clr, set,
-				       IRQ_GC_INIT_MASK_CACHE);
-
-	for (i = 0; i < SIRFSOC_NUM_BANKS; i++) {
-		gc = irq_get_domain_generic_chip(sirfsoc_irqdomain, i * 32);
-		gc->reg_base = base + i * SIRFSOC_INT_BASE_OFFSET;
-		ct = gc->chip_types;
-		ct->chip.irq_mask = irq_gc_mask_clr_bit;
-		ct->chip.irq_unmask = irq_gc_mask_set_bit;
-		ct->regs.mask = SIRFSOC_INT_RISC_MASK0;
-	}
-}
-
-static void __exception_irq_entry sirfsoc_handle_irq(struct pt_regs *regs)
-{
-	void __iomem *base = sirfsoc_irq_get_regbase();
-	u32 irqstat;
-
-	irqstat = readl_relaxed(base + SIRFSOC_INIT_IRQ_ID);
-	handle_domain_irq(sirfsoc_irqdomain, irqstat & 0xff, regs);
-}
-
-static int __init sirfsoc_irq_init(struct device_node *np,
-	struct device_node *parent)
-{
-	void __iomem *base = of_iomap(np, 0);
-	if (!base)
-		panic("unable to map intc cpu registers\n");
-
-	sirfsoc_irqdomain = irq_domain_add_linear(np, SIRFSOC_NUM_IRQS,
-						  &irq_generic_chip_ops, base);
-	sirfsoc_alloc_gc(base);
-
-	writel_relaxed(0, base + SIRFSOC_INT_RISC_LEVEL0);
-	writel_relaxed(0, base + SIRFSOC_INT_RISC_LEVEL1);
-
-	writel_relaxed(0, base + SIRFSOC_INT_RISC_MASK0);
-	writel_relaxed(0, base + SIRFSOC_INT_RISC_MASK1);
-
-	set_handle_irq(sirfsoc_handle_irq);
-
-	return 0;
-}
-IRQCHIP_DECLARE(sirfsoc_intc, "sirf,prima2-intc", sirfsoc_irq_init);
-
-struct sirfsoc_irq_status {
-	u32 mask0;
-	u32 mask1;
-	u32 level0;
-	u32 level1;
-};
-
-static struct sirfsoc_irq_status sirfsoc_irq_st;
-
-static int sirfsoc_irq_suspend(void)
-{
-	void __iomem *base = sirfsoc_irq_get_regbase();
-
-	sirfsoc_irq_st.mask0 = readl_relaxed(base + SIRFSOC_INT_RISC_MASK0);
-	sirfsoc_irq_st.mask1 = readl_relaxed(base + SIRFSOC_INT_RISC_MASK1);
-	sirfsoc_irq_st.level0 = readl_relaxed(base + SIRFSOC_INT_RISC_LEVEL0);
-	sirfsoc_irq_st.level1 = readl_relaxed(base + SIRFSOC_INT_RISC_LEVEL1);
-
-	return 0;
-}
-
-static void sirfsoc_irq_resume(void)
-{
-	void __iomem *base = sirfsoc_irq_get_regbase();
-
-	writel_relaxed(sirfsoc_irq_st.mask0, base + SIRFSOC_INT_RISC_MASK0);
-	writel_relaxed(sirfsoc_irq_st.mask1, base + SIRFSOC_INT_RISC_MASK1);
-	writel_relaxed(sirfsoc_irq_st.level0, base + SIRFSOC_INT_RISC_LEVEL0);
-	writel_relaxed(sirfsoc_irq_st.level1, base + SIRFSOC_INT_RISC_LEVEL1);
-}
-
-static struct syscore_ops sirfsoc_irq_syscore_ops = {
-	.suspend	= sirfsoc_irq_suspend,
-	.resume		= sirfsoc_irq_resume,
-};
-
-static int __init sirfsoc_irq_pm_init(void)
-{
-	if (!sirfsoc_irqdomain)
-		return 0;
-
-	register_syscore_ops(&sirfsoc_irq_syscore_ops);
-	return 0;
-}
-device_initcall(sirfsoc_irq_pm_init);

From d40341145a2497cb7a18d72fda53cd2220fe10f3 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Thu, 21 Jan 2021 18:22:52 +0000
Subject: [PATCH 003/183] irqchip/gic-v3: Fix typos in PMR/RPR SCR_EL3.FIQ
 handling explanation

The GICv3 driver explanation related to PMR/RPR and SCR_EL3.FIQ
secure/non-secure priority handling contains a couple of typos.

Fix them.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210121182252.29320-1-lorenzo.pieralisi@arm.com
---
 drivers/irqchip/irq-gic-v3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 3fc65375cbe0..eb0ee356a629 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -75,10 +75,10 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
  * are presented to the GIC CPUIF as follow:
  *     (GIC_(R)DIST_PRI[irq] >> 1) | 0x80;
  *
- * If SCR_EL3.FIQ == 1, the values writen to/read from PMR and RPR at non-secure
+ * If SCR_EL3.FIQ == 1, the values written to/read from PMR and RPR at non-secure
  * EL1 are subject to a similar operation thus matching the priorities presented
  * from the (re)distributor when security is enabled. When SCR_EL3.FIQ == 0,
- * these values are unchanched by the GIC.
+ * these values are unchanged by the GIC.
  *
  * see GICv3/GICv4 Architecture Specification (IHI0069D):
  * - section 4.8.1 Non-secure accesses to register fields for Secure interrupt

From ad6b47cdef760410311f41876b21eb0c6fda4717 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Sun, 17 Jan 2021 23:50:31 -0600
Subject: [PATCH 004/183] dt-bindings: irq: sun6i-r: Split the binding from
 sun7i-nmi

The R_INTC in the A31 and newer sun8i/sun50i SoCs has additional
functionality compared to the sun7i/sun9i NMI controller. Among other
things, it multiplexes access to up to 128 interrupts corresponding to
(and in parallel to) the first 128 GIC SPIs. This means the NMI is no
longer the lowest-numbered hwirq at this irqchip, since it is SPI 32 or
96 (depending on SoC). hwirq 0 now corresponds to SPI 0, usually UART0.

To allow access to all multiplexed IRQs, the R_INTC requires a new
binding where the interrupt number matches the GIC interrupt number.
Otherwise, interrupts with hwirq numbers below the NMI would not be
representable in the device tree.

For simplicity, copy the three-cell GIC binding; this disambiguates
interrupt 0 in the old binding (the NMI) from interrupt 0 in the new
binding (SPI 0) by the number of cells.

Because the H6 R_INTC has a different mapping from multiplexed IRQs to
top-level register bits, it is no longer compatible with the A31 R_INTC.

Acked-by: Maxime Ripard <mripard@kernel.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210118055040.21910-2-samuel@sholland.org
---
 .../allwinner,sun6i-a31-r-intc.yaml           | 66 +++++++++++++++++++
 .../allwinner,sun7i-a20-sc-nmi.yaml           | 10 ---
 2 files changed, 66 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml

diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml
new file mode 100644
index 000000000000..50e607e607c8
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 NMI/Wakeup Interrupt Controller Device Tree Bindings
+
+maintainers:
+  - Chen-Yu Tsai <wens@csie.org>
+  - Maxime Ripard <mripard@kernel.org>
+
+allOf:
+  - $ref: /schemas/interrupt-controller.yaml#
+
+properties:
+  "#interrupt-cells":
+    const: 3
+    description:
+      The first cell is GIC_SPI (0), the second cell is the IRQ number, and
+      the third cell is the trigger type as defined in interrupts.txt in this
+      directory.
+
+  compatible:
+    oneOf:
+      - const: allwinner,sun6i-a31-r-intc
+      - items:
+          - enum:
+              - allwinner,sun8i-a83t-r-intc
+              - allwinner,sun50i-a64-r-intc
+          - const: allwinner,sun6i-a31-r-intc
+      - const: allwinner,sun50i-h6-r-intc
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+    description:
+      The GIC interrupt labeled as "External NMI".
+
+  interrupt-controller: true
+
+required:
+  - "#interrupt-cells"
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-controller
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    r_intc: interrupt-controller@1f00c00 {
+            compatible = "allwinner,sun50i-a64-r-intc",
+                         "allwinner,sun6i-a31-r-intc";
+            interrupt-controller;
+            #interrupt-cells = <3>;
+            reg = <0x01f00c00 0x400>;
+            interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>;
+    };
+
+...
diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml
index 8acca0ae3129..f34ecc8c7093 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun7i-a20-sc-nmi.yaml
@@ -22,23 +22,13 @@ properties:
 
   compatible:
     oneOf:
-      - const: allwinner,sun6i-a31-r-intc
       - const: allwinner,sun6i-a31-sc-nmi
         deprecated: true
       - const: allwinner,sun7i-a20-sc-nmi
-      - items:
-          - const: allwinner,sun8i-a83t-r-intc
-          - const: allwinner,sun6i-a31-r-intc
       - const: allwinner,sun9i-a80-nmi
-      - items:
-          - const: allwinner,sun50i-a64-r-intc
-          - const: allwinner,sun6i-a31-r-intc
       - items:
           - const: allwinner,sun50i-a100-nmi
           - const: allwinner,sun9i-a80-nmi
-      - items:
-          - const: allwinner,sun50i-h6-r-intc
-          - const: allwinner,sun6i-a31-r-intc
 
   reg:
     maxItems: 1

From 6436eb4417094ea3308b33d8392fc02a1068dc78 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Sun, 17 Jan 2021 23:50:32 -0600
Subject: [PATCH 005/183] dt-bindings: irq: sun6i-r: Add a compatible for the
 H3

The Allwinner H3 SoC contains an R_INTC that is, as far as we know,
compatible with the R_INTC present in other sun8i SoCs starting with
the A31. Since the R_INTC hardware is undocumented, introduce a new
compatible for the R_INTC variant in this SoC, in case there turns out
to be some difference.

Acked-by: Maxime Ripard <mripard@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210118055040.21910-3-samuel@sholland.org
---
 .../interrupt-controller/allwinner,sun6i-a31-r-intc.yaml         | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml
index 50e607e607c8..4db24b8a9ffe 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml
+++ b/Documentation/devicetree/bindings/interrupt-controller/allwinner,sun6i-a31-r-intc.yaml
@@ -27,6 +27,7 @@ properties:
       - items:
           - enum:
               - allwinner,sun8i-a83t-r-intc
+              - allwinner,sun8i-h3-r-intc
               - allwinner,sun50i-a64-r-intc
           - const: allwinner,sun6i-a31-r-intc
       - const: allwinner,sun50i-h6-r-intc

From 4e34614636b31747b190488240a95647c227021f Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Sun, 17 Jan 2021 23:50:33 -0600
Subject: [PATCH 006/183] irqchip/sun6i-r: Use a stacked irqchip driver

The R_INTC in the A31 and newer sun8i/sun50i SoCs is more similar to the
original sun4i interrupt controller than the sun7i/sun9i NMI controller.
It is used for two distinct purposes:
 - To control the trigger, latch, and mask for the NMI input pin
 - To provide the interrupt input for the ARISC coprocessor

As this interrupt controller is not documented, information about it
comes from vendor-provided firmware blobs and from experimentation.

Differences from the sun4i interrupt controller appear to be:
 - It only has one or two registers of each kind (max 32 or 64 IRQs)
 - Multiplexing logic is added to support additional inputs
 - There is no FIQ-related logic
 - There is no interrupt priority logic

In order to fulfill its two purposes, this hardware block combines four
types of IRQs. First, the NMI pin is routed to the "IRQ 0" input on this
chip, with a trigger type controlled by the NMI_CTRL_REG. The "IRQ 0
pending" output from this chip, if enabled, is then routed to a SPI IRQ
input on the GIC. In other words, bit 0 of IRQ_ENABLE_REG *does* affect
the NMI IRQ seen at the GIC.

The NMI is followed by a contiguous block of 15 "direct" (my name for
them) IRQ inputs that are connected in parallel to both R_INTC and the
GIC. Or in other words, these bits of IRQ_ENABLE_REG *do not* affect the
IRQs seen at the GIC.

Following the direct IRQs are the ARISC's copy of banked IRQs for shared
peripherals. These are not relevant to Linux. The remaining IRQs are
connected to a multiplexer and provide access to the first (up to) 128
SPIs from the ARISC. This range of SPIs overlaps with the direct IRQs.

Because of the 1:1 correspondence between R_INTC and GIC inputs, this is
a perfect scenario for using a stacked irqchip driver. We want to hook
into setting the NMI trigger type, but not actually handle any IRQ here.

To allow access to all multiplexed IRQs, this driver requires a new
binding where the interrupt number matches the GIC interrupt number.
(This moves the NMI from number 0 to 32 or 96, depending on the SoC.)
For simplicity, copy the three-cell GIC binding; this disambiguates
interrupt 0 in the old binding (the NMI) from interrupt 0 in the new
binding (SPI 0) by the number of cells.

Since R_INTC is in the always-on power domain, and its output is visible
to the power management coprocessor, a stacked irqchip driver provides a
simple way to add wakeup support to any of its IRQs. That is the next
patch; for now, just the NMI is moved over.

This commit mostly reverts commit 173bda53b340 ("irqchip/sunxi-nmi:
Support sun6i-a31-r-intc compatible").

Acked-by: Maxime Ripard <mripard@kernel.org>
Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210118055040.21910-4-samuel@sholland.org
---
 arch/arm/mach-sunxi/Kconfig     |   2 +
 arch/arm64/Kconfig.platforms    |   2 +
 drivers/irqchip/Makefile        |   1 +
 drivers/irqchip/irq-sun6i-r.c   | 284 ++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-sunxi-nmi.c |  26 +--
 5 files changed, 292 insertions(+), 23 deletions(-)
 create mode 100644 drivers/irqchip/irq-sun6i-r.c

diff --git a/arch/arm/mach-sunxi/Kconfig b/arch/arm/mach-sunxi/Kconfig
index eeadb1a4dcfe..e5c2fce281cd 100644
--- a/arch/arm/mach-sunxi/Kconfig
+++ b/arch/arm/mach-sunxi/Kconfig
@@ -6,6 +6,8 @@ menuconfig ARCH_SUNXI
 	select CLKSRC_MMIO
 	select GENERIC_IRQ_CHIP
 	select GPIOLIB
+	select IRQ_DOMAIN_HIERARCHY
+	select IRQ_FASTEOI_HIERARCHY_HANDLERS
 	select PINCTRL
 	select PM_OPP
 	select SUN4I_TIMER
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 6eecdef538bd..f2aa1518c6f4 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -17,6 +17,8 @@ config ARCH_SUNXI
 	bool "Allwinner sunxi 64-bit SoC Family"
 	select ARCH_HAS_RESET_CONTROLLER
 	select GENERIC_IRQ_CHIP
+	select IRQ_DOMAIN_HIERARCHY
+	select IRQ_FASTEOI_HIERARCHY_HANDLERS
 	select PINCTRL
 	select RESET_CONTROLLER
 	help
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 37e3556df127..2a1994d7f99a 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_OR1K_PIC)			+= irq-or1k-pic.o
 obj-$(CONFIG_ORION_IRQCHIP)		+= irq-orion.o
 obj-$(CONFIG_OMAP_IRQCHIP)		+= irq-omap-intc.o
 obj-$(CONFIG_ARCH_SUNXI)		+= irq-sun4i.o
+obj-$(CONFIG_ARCH_SUNXI)		+= irq-sun6i-r.o
 obj-$(CONFIG_ARCH_SUNXI)		+= irq-sunxi-nmi.o
 obj-$(CONFIG_ARCH_SPEAR3XX)		+= spear-shirq.o
 obj-$(CONFIG_ARM_GIC)			+= irq-gic.o irq-gic-common.o
diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c
new file mode 100644
index 000000000000..284b56905eb7
--- /dev/null
+++ b/drivers/irqchip/irq-sun6i-r.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * The R_INTC in Allwinner A31 and newer SoCs manages several types of
+ * interrupts, as shown below:
+ *
+ *             NMI IRQ                DIRECT IRQs           MUXED IRQs
+ *              bit 0                  bits 1-15^           bits 19-31
+ *
+ *   +---------+                      +---------+    +---------+  +---------+
+ *   | NMI Pad |                      |  IRQ d  |    |  IRQ m  |  | IRQ m+7 |
+ *   +---------+                      +---------+    +---------+  +---------+
+ *        |                             |     |         |    |      |    |
+ *        |                             |     |         |    |......|    |
+ * +------V------+ +------------+       |     |         | +--V------V--+ |
+ * |   Invert/   | | Write 1 to |       |     |         | |  AND with  | |
+ * | Edge Detect | | PENDING[0] |       |     |         | |  MUX[m/8]  | |
+ * +-------------+ +------------+       |     |         | +------------+ |
+ *            |       |                 |     |         |       |        |
+ *         +--V-------V--+           +--V--+  |      +--V--+    |     +--V--+
+ *         | Set    Reset|           | GIC |  |      | GIC |    |     | GIC |
+ *         |    Latch    |           | SPI |  |      | SPI |... |  ...| SPI |
+ *         +-------------+           | N+d |  |      |  m  |    |     | m+7 |
+ *             |     |               +-----+  |      +-----+    |     +-----+
+ *             |     |                        |                 |
+ *     +-------V-+ +-V----------+   +---------V--+     +--------V--------+
+ *     | GIC SPI | |  AND with  |   |  AND with  |     |    AND with     |
+ *     | N (=32) | |  ENABLE[0] |   |  ENABLE[d] |     |  ENABLE[19+m/8] |
+ *     +---------+ +------------+   +------------+     +-----------------+
+ *                        |                |                    |
+ *                 +------V-----+   +------V-----+     +--------V--------+
+ *                 |    Read    |   |    Read    |     |     Read        |
+ *                 | PENDING[0] |   | PENDING[d] |     | PENDING[19+m/8] |
+ *                 +------------+   +------------+     +-----------------+
+ *
+ * ^ bits 16-18 are direct IRQs for peripherals with banked interrupts, such as
+ *   the MSGBOX. These IRQs do not map to any GIC SPI.
+ *
+ * The H6 variant adds two more (banked) direct IRQs and implements the full
+ * set of 128 mux bits. This requires a second set of top-level registers.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+#define SUN6I_NMI_CTRL			(0x0c)
+#define SUN6I_IRQ_PENDING(n)		(0x10 + 4 * (n))
+#define SUN6I_IRQ_ENABLE(n)		(0x40 + 4 * (n))
+#define SUN6I_MUX_ENABLE(n)		(0xc0 + 4 * (n))
+
+#define SUN6I_NMI_SRC_TYPE_LEVEL_LOW	0
+#define SUN6I_NMI_SRC_TYPE_EDGE_FALLING	1
+#define SUN6I_NMI_SRC_TYPE_LEVEL_HIGH	2
+#define SUN6I_NMI_SRC_TYPE_EDGE_RISING	3
+
+#define SUN6I_NMI_BIT			BIT(0)
+
+#define SUN6I_NMI_NEEDS_ACK		((void *)1)
+
+#define SUN6I_NR_TOP_LEVEL_IRQS		64
+#define SUN6I_NR_DIRECT_IRQS		16
+#define SUN6I_NR_MUX_BITS		128
+
+static void __iomem *base;
+static irq_hw_number_t nmi_hwirq;
+
+static void sun6i_r_intc_ack_nmi(void)
+{
+	writel_relaxed(SUN6I_NMI_BIT, base + SUN6I_IRQ_PENDING(0));
+}
+
+static void sun6i_r_intc_nmi_ack(struct irq_data *data)
+{
+	if (irqd_get_trigger_type(data) & IRQ_TYPE_EDGE_BOTH)
+		sun6i_r_intc_ack_nmi();
+	else
+		data->chip_data = SUN6I_NMI_NEEDS_ACK;
+}
+
+static void sun6i_r_intc_nmi_eoi(struct irq_data *data)
+{
+	/* For oneshot IRQs, delay the ack until the IRQ is unmasked. */
+	if (data->chip_data == SUN6I_NMI_NEEDS_ACK && !irqd_irq_masked(data)) {
+		data->chip_data = NULL;
+		sun6i_r_intc_ack_nmi();
+	}
+
+	irq_chip_eoi_parent(data);
+}
+
+static void sun6i_r_intc_nmi_unmask(struct irq_data *data)
+{
+	if (data->chip_data == SUN6I_NMI_NEEDS_ACK) {
+		data->chip_data = NULL;
+		sun6i_r_intc_ack_nmi();
+	}
+
+	irq_chip_unmask_parent(data);
+}
+
+static int sun6i_r_intc_nmi_set_type(struct irq_data *data, unsigned int type)
+{
+	u32 nmi_src_type;
+
+	switch (type) {
+	case IRQ_TYPE_EDGE_RISING:
+		nmi_src_type = SUN6I_NMI_SRC_TYPE_EDGE_RISING;
+		break;
+	case IRQ_TYPE_EDGE_FALLING:
+		nmi_src_type = SUN6I_NMI_SRC_TYPE_EDGE_FALLING;
+		break;
+	case IRQ_TYPE_LEVEL_HIGH:
+		nmi_src_type = SUN6I_NMI_SRC_TYPE_LEVEL_HIGH;
+		break;
+	case IRQ_TYPE_LEVEL_LOW:
+		nmi_src_type = SUN6I_NMI_SRC_TYPE_LEVEL_LOW;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	writel_relaxed(nmi_src_type, base + SUN6I_NMI_CTRL);
+
+	/*
+	 * The "External NMI" GIC input connects to a latch inside R_INTC, not
+	 * directly to the pin. So the GIC trigger type does not depend on the
+	 * NMI pin trigger type.
+	 */
+	return irq_chip_set_type_parent(data, IRQ_TYPE_LEVEL_HIGH);
+}
+
+static int sun6i_r_intc_nmi_set_irqchip_state(struct irq_data *data,
+					      enum irqchip_irq_state which,
+					      bool state)
+{
+	if (which == IRQCHIP_STATE_PENDING && !state)
+		sun6i_r_intc_ack_nmi();
+
+	return irq_chip_set_parent_state(data, which, state);
+}
+
+static struct irq_chip sun6i_r_intc_nmi_chip = {
+	.name			= "sun6i-r-intc",
+	.irq_ack		= sun6i_r_intc_nmi_ack,
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= sun6i_r_intc_nmi_unmask,
+	.irq_eoi		= sun6i_r_intc_nmi_eoi,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
+	.irq_set_type		= sun6i_r_intc_nmi_set_type,
+	.irq_set_irqchip_state	= sun6i_r_intc_nmi_set_irqchip_state,
+	.flags			= IRQCHIP_SET_TYPE_MASKED |
+				  IRQCHIP_SKIP_SET_WAKE,
+};
+
+static int sun6i_r_intc_domain_translate(struct irq_domain *domain,
+					 struct irq_fwspec *fwspec,
+					 unsigned long *hwirq,
+					 unsigned int *type)
+{
+	/* Accept the old two-cell binding for the NMI only. */
+	if (fwspec->param_count == 2 && fwspec->param[0] == 0) {
+		*hwirq = nmi_hwirq;
+		*type  = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+		return 0;
+	}
+
+	/* Otherwise this binding should match the GIC SPI binding. */
+	if (fwspec->param_count < 3)
+		return -EINVAL;
+	if (fwspec->param[0] != GIC_SPI)
+		return -EINVAL;
+
+	*hwirq = fwspec->param[1];
+	*type  = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+
+	return 0;
+}
+
+static int sun6i_r_intc_domain_alloc(struct irq_domain *domain,
+				     unsigned int virq,
+				     unsigned int nr_irqs, void *arg)
+{
+	struct irq_fwspec *fwspec = arg;
+	struct irq_fwspec gic_fwspec;
+	unsigned long hwirq;
+	unsigned int type;
+	int i, ret;
+
+	ret = sun6i_r_intc_domain_translate(domain, fwspec, &hwirq, &type);
+	if (ret)
+		return ret;
+	if (hwirq + nr_irqs > SUN6I_NR_MUX_BITS)
+		return -EINVAL;
+
+	/* Construct a GIC-compatible fwspec from this fwspec. */
+	gic_fwspec = (struct irq_fwspec) {
+		.fwnode      = domain->parent->fwnode,
+		.param_count = 3,
+		.param       = { GIC_SPI, hwirq, type },
+	};
+
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, &gic_fwspec);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; ++i, ++hwirq, ++virq) {
+		if (hwirq == nmi_hwirq) {
+			irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+						      &sun6i_r_intc_nmi_chip, 0);
+			irq_set_handler(virq, handle_fasteoi_ack_irq);
+		} else {
+			/* Only the NMI is currently supported. */
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops sun6i_r_intc_domain_ops = {
+	.translate	= sun6i_r_intc_domain_translate,
+	.alloc		= sun6i_r_intc_domain_alloc,
+	.free		= irq_domain_free_irqs_common,
+};
+
+static void sun6i_r_intc_resume(void)
+{
+	int i;
+
+	/* Only the NMI is relevant during normal operation. */
+	writel_relaxed(SUN6I_NMI_BIT, base + SUN6I_IRQ_ENABLE(0));
+	for (i = 1; i < BITS_TO_U32(SUN6I_NR_TOP_LEVEL_IRQS); ++i)
+		writel_relaxed(0, base + SUN6I_IRQ_ENABLE(i));
+}
+
+static int __init sun6i_r_intc_init(struct device_node *node,
+				    struct device_node *parent)
+{
+	struct irq_domain *domain, *parent_domain;
+	struct of_phandle_args nmi_parent;
+	int ret;
+
+	/* Extract the NMI hwirq number from the OF node. */
+	ret = of_irq_parse_one(node, 0, &nmi_parent);
+	if (ret)
+		return ret;
+	if (nmi_parent.args_count < 3 ||
+	    nmi_parent.args[0] != GIC_SPI ||
+	    nmi_parent.args[2] != IRQ_TYPE_LEVEL_HIGH)
+		return -EINVAL;
+	nmi_hwirq = nmi_parent.args[1];
+
+	parent_domain = irq_find_host(parent);
+	if (!parent_domain) {
+		pr_err("%pOF: Failed to obtain parent domain\n", node);
+		return -ENXIO;
+	}
+
+	base = of_io_request_and_map(node, 0, NULL);
+	if (IS_ERR(base)) {
+		pr_err("%pOF: Failed to map MMIO region\n", node);
+		return PTR_ERR(base);
+	}
+
+	domain = irq_domain_add_hierarchy(parent_domain, 0, 0, node,
+					  &sun6i_r_intc_domain_ops, NULL);
+	if (!domain) {
+		pr_err("%pOF: Failed to allocate domain\n", node);
+		iounmap(base);
+		return -ENOMEM;
+	}
+
+	sun6i_r_intc_ack_nmi();
+	sun6i_r_intc_resume();
+
+	return 0;
+}
+IRQCHIP_DECLARE(sun6i_r_intc, "allwinner,sun6i-a31-r-intc", sun6i_r_intc_init);
diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c
index a412b5d5d0fa..9f2bd0c5d289 100644
--- a/drivers/irqchip/irq-sunxi-nmi.c
+++ b/drivers/irqchip/irq-sunxi-nmi.c
@@ -27,18 +27,12 @@
 
 #define SUNXI_NMI_IRQ_BIT	BIT(0)
 
-#define SUN6I_R_INTC_CTRL	0x0c
-#define SUN6I_R_INTC_PENDING	0x10
-#define SUN6I_R_INTC_ENABLE	0x40
-
 /*
  * For deprecated sun6i-a31-sc-nmi compatible.
- * Registers are offset by 0x0c.
  */
-#define SUN6I_R_INTC_NMI_OFFSET	0x0c
-#define SUN6I_NMI_CTRL		(SUN6I_R_INTC_CTRL - SUN6I_R_INTC_NMI_OFFSET)
-#define SUN6I_NMI_PENDING	(SUN6I_R_INTC_PENDING - SUN6I_R_INTC_NMI_OFFSET)
-#define SUN6I_NMI_ENABLE	(SUN6I_R_INTC_ENABLE - SUN6I_R_INTC_NMI_OFFSET)
+#define SUN6I_NMI_CTRL		0x00
+#define SUN6I_NMI_PENDING	0x04
+#define SUN6I_NMI_ENABLE	0x34
 
 #define SUN7I_NMI_CTRL		0x00
 #define SUN7I_NMI_PENDING	0x04
@@ -61,12 +55,6 @@ struct sunxi_sc_nmi_reg_offs {
 	u32 enable;
 };
 
-static const struct sunxi_sc_nmi_reg_offs sun6i_r_intc_reg_offs __initconst = {
-	.ctrl	= SUN6I_R_INTC_CTRL,
-	.pend	= SUN6I_R_INTC_PENDING,
-	.enable	= SUN6I_R_INTC_ENABLE,
-};
-
 static const struct sunxi_sc_nmi_reg_offs sun6i_reg_offs __initconst = {
 	.ctrl	= SUN6I_NMI_CTRL,
 	.pend	= SUN6I_NMI_PENDING,
@@ -232,14 +220,6 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node,
 	return ret;
 }
 
-static int __init sun6i_r_intc_irq_init(struct device_node *node,
-					struct device_node *parent)
-{
-	return sunxi_sc_nmi_irq_init(node, &sun6i_r_intc_reg_offs);
-}
-IRQCHIP_DECLARE(sun6i_r_intc, "allwinner,sun6i-a31-r-intc",
-		sun6i_r_intc_irq_init);
-
 static int __init sun6i_sc_nmi_irq_init(struct device_node *node,
 					struct device_node *parent)
 {

From 7ab365f6cd6de1e2b0cb1e1e3873dbf68e6f1003 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel@sholland.org>
Date: Sun, 17 Jan 2021 23:50:34 -0600
Subject: [PATCH 007/183] irqchip/sun6i-r: Add wakeup support

Maintain bitmaps of wake-enabled IRQs and mux inputs, and program them
to the hardware during the syscore phase of suspend and shutdown. Then
restore the original set of enabled IRQs (only the NMI) during resume.

This serves two purposes. First, it lets power management firmware
running on the ARISC coprocessor know which wakeup sources Linux wants
to have enabled. That way, it can avoid turning them off when it shuts
down the remainder of the clock tree. Second, it preconfigures the
coprocessor's interrupt controller, so the firmware's wakeup logic
is as simple as waiting for an interrupt to arrive.

The suspend/resume logic is not conditional on PM_SLEEP because it is
identical to the init/shutdown logic. Wake IRQs may be enabled during
shutdown to allow powering the board back on. As an example, see
commit a5c5e50cce9d ("Input: gpio-keys - add shutdown callback").

Acked-by: Maxime Ripard <mripard@kernel.org>
Signed-off-by: Samuel Holland <samuel@sholland.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210118055040.21910-5-samuel@sholland.org
---
 drivers/irqchip/irq-sun6i-r.c | 107 ++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 6 deletions(-)

diff --git a/drivers/irqchip/irq-sun6i-r.c b/drivers/irqchip/irq-sun6i-r.c
index 284b56905eb7..4cd3e533740b 100644
--- a/drivers/irqchip/irq-sun6i-r.c
+++ b/drivers/irqchip/irq-sun6i-r.c
@@ -39,6 +39,7 @@
  * set of 128 mux bits. This requires a second set of top-level registers.
  */
 
+#include <linux/bitmap.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
 #include <linux/irqchip.h>
@@ -46,6 +47,7 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
+#include <linux/syscore_ops.h>
 
 #include <dt-bindings/interrupt-controller/arm-gic.h>
 
@@ -67,8 +69,17 @@
 #define SUN6I_NR_DIRECT_IRQS		16
 #define SUN6I_NR_MUX_BITS		128
 
+struct sun6i_r_intc_variant {
+	u32		first_mux_irq;
+	u32		nr_mux_irqs;
+	u32		mux_valid[BITS_TO_U32(SUN6I_NR_MUX_BITS)];
+};
+
 static void __iomem *base;
 static irq_hw_number_t nmi_hwirq;
+static DECLARE_BITMAP(wake_irq_enabled, SUN6I_NR_TOP_LEVEL_IRQS);
+static DECLARE_BITMAP(wake_mux_enabled, SUN6I_NR_MUX_BITS);
+static DECLARE_BITMAP(wake_mux_valid, SUN6I_NR_MUX_BITS);
 
 static void sun6i_r_intc_ack_nmi(void)
 {
@@ -145,6 +156,21 @@ static int sun6i_r_intc_nmi_set_irqchip_state(struct irq_data *data,
 	return irq_chip_set_parent_state(data, which, state);
 }
 
+static int sun6i_r_intc_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+	unsigned long offset_from_nmi = data->hwirq - nmi_hwirq;
+
+	if (offset_from_nmi < SUN6I_NR_DIRECT_IRQS)
+		assign_bit(offset_from_nmi, wake_irq_enabled, on);
+	else if (test_bit(data->hwirq, wake_mux_valid))
+		assign_bit(data->hwirq, wake_mux_enabled, on);
+	else
+		/* Not wakeup capable. */
+		return -EPERM;
+
+	return 0;
+}
+
 static struct irq_chip sun6i_r_intc_nmi_chip = {
 	.name			= "sun6i-r-intc",
 	.irq_ack		= sun6i_r_intc_nmi_ack,
@@ -154,8 +180,19 @@ static struct irq_chip sun6i_r_intc_nmi_chip = {
 	.irq_set_affinity	= irq_chip_set_affinity_parent,
 	.irq_set_type		= sun6i_r_intc_nmi_set_type,
 	.irq_set_irqchip_state	= sun6i_r_intc_nmi_set_irqchip_state,
-	.flags			= IRQCHIP_SET_TYPE_MASKED |
-				  IRQCHIP_SKIP_SET_WAKE,
+	.irq_set_wake		= sun6i_r_intc_irq_set_wake,
+	.flags			= IRQCHIP_SET_TYPE_MASKED,
+};
+
+static struct irq_chip sun6i_r_intc_wakeup_chip = {
+	.name			= "sun6i-r-intc",
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
+	.irq_set_type		= irq_chip_set_type_parent,
+	.irq_set_wake		= sun6i_r_intc_irq_set_wake,
+	.flags			= IRQCHIP_SET_TYPE_MASKED,
 };
 
 static int sun6i_r_intc_domain_translate(struct irq_domain *domain,
@@ -215,8 +252,8 @@ static int sun6i_r_intc_domain_alloc(struct irq_domain *domain,
 						      &sun6i_r_intc_nmi_chip, 0);
 			irq_set_handler(virq, handle_fasteoi_ack_irq);
 		} else {
-			/* Only the NMI is currently supported. */
-			return -EINVAL;
+			irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+						      &sun6i_r_intc_wakeup_chip, 0);
 		}
 	}
 
@@ -229,6 +266,22 @@ static const struct irq_domain_ops sun6i_r_intc_domain_ops = {
 	.free		= irq_domain_free_irqs_common,
 };
 
+static int sun6i_r_intc_suspend(void)
+{
+	u32 buf[BITS_TO_U32(max(SUN6I_NR_TOP_LEVEL_IRQS, SUN6I_NR_MUX_BITS))];
+	int i;
+
+	/* Wake IRQs are enabled during system sleep and shutdown. */
+	bitmap_to_arr32(buf, wake_irq_enabled, SUN6I_NR_TOP_LEVEL_IRQS);
+	for (i = 0; i < BITS_TO_U32(SUN6I_NR_TOP_LEVEL_IRQS); ++i)
+		writel_relaxed(buf[i], base + SUN6I_IRQ_ENABLE(i));
+	bitmap_to_arr32(buf, wake_mux_enabled, SUN6I_NR_MUX_BITS);
+	for (i = 0; i < BITS_TO_U32(SUN6I_NR_MUX_BITS); ++i)
+		writel_relaxed(buf[i], base + SUN6I_MUX_ENABLE(i));
+
+	return 0;
+}
+
 static void sun6i_r_intc_resume(void)
 {
 	int i;
@@ -239,8 +292,20 @@ static void sun6i_r_intc_resume(void)
 		writel_relaxed(0, base + SUN6I_IRQ_ENABLE(i));
 }
 
+static void sun6i_r_intc_shutdown(void)
+{
+	sun6i_r_intc_suspend();
+}
+
+static struct syscore_ops sun6i_r_intc_syscore_ops = {
+	.suspend	= sun6i_r_intc_suspend,
+	.resume		= sun6i_r_intc_resume,
+	.shutdown	= sun6i_r_intc_shutdown,
+};
+
 static int __init sun6i_r_intc_init(struct device_node *node,
-				    struct device_node *parent)
+				    struct device_node *parent,
+				    const struct sun6i_r_intc_variant *v)
 {
 	struct irq_domain *domain, *parent_domain;
 	struct of_phandle_args nmi_parent;
@@ -256,6 +321,9 @@ static int __init sun6i_r_intc_init(struct device_node *node,
 		return -EINVAL;
 	nmi_hwirq = nmi_parent.args[1];
 
+	bitmap_set(wake_irq_enabled, v->first_mux_irq, v->nr_mux_irqs);
+	bitmap_from_arr32(wake_mux_valid, v->mux_valid, SUN6I_NR_MUX_BITS);
+
 	parent_domain = irq_find_host(parent);
 	if (!parent_domain) {
 		pr_err("%pOF: Failed to obtain parent domain\n", node);
@@ -276,9 +344,36 @@ static int __init sun6i_r_intc_init(struct device_node *node,
 		return -ENOMEM;
 	}
 
+	register_syscore_ops(&sun6i_r_intc_syscore_ops);
+
 	sun6i_r_intc_ack_nmi();
 	sun6i_r_intc_resume();
 
 	return 0;
 }
-IRQCHIP_DECLARE(sun6i_r_intc, "allwinner,sun6i-a31-r-intc", sun6i_r_intc_init);
+
+static const struct sun6i_r_intc_variant sun6i_a31_r_intc_variant __initconst = {
+	.first_mux_irq	= 19,
+	.nr_mux_irqs	= 13,
+	.mux_valid	= { 0xffffffff, 0xfff80000, 0xffffffff, 0x0000000f },
+};
+
+static int __init sun6i_a31_r_intc_init(struct device_node *node,
+					struct device_node *parent)
+{
+	return sun6i_r_intc_init(node, parent, &sun6i_a31_r_intc_variant);
+}
+IRQCHIP_DECLARE(sun6i_a31_r_intc, "allwinner,sun6i-a31-r-intc", sun6i_a31_r_intc_init);
+
+static const struct sun6i_r_intc_variant sun50i_h6_r_intc_variant __initconst = {
+	.first_mux_irq	= 21,
+	.nr_mux_irqs	= 16,
+	.mux_valid	= { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff },
+};
+
+static int __init sun50i_h6_r_intc_init(struct device_node *node,
+					struct device_node *parent)
+{
+	return sun6i_r_intc_init(node, parent, &sun50i_h6_r_intc_variant);
+}
+IRQCHIP_DECLARE(sun50i_h6_r_intc, "allwinner,sun50i-h6-r-intc", sun50i_h6_r_intc_init);

From e6f93c0115cb24ae4b473f28a27294e99faf129a Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Fri, 15 Jan 2021 14:39:40 +0530
Subject: [PATCH 008/183] dt-bindings: qcom,pdc: Add compatible for SM8250

Add the compatible string for SM8250 SoC from Qualcomm. This compatible
is already used in DTS files but is not documented yet.

Signed-off-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210115090941.2289416-1-vkoul@kernel.org
---
 .../devicetree/bindings/interrupt-controller/qcom,pdc.txt        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt b/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
index 1df293953327..9c1a046e6fd9 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
@@ -20,6 +20,7 @@ Properties:
 	Definition: Should contain "qcom,<soc>-pdc" and "qcom,pdc"
 		    - "qcom,sc7180-pdc": For SC7180
 		    - "qcom,sdm845-pdc": For SDM845
+		    - "qcom,sm8250-pdc": For SM8250
 
 - reg:
 	Usage: required

From 9eaad15e5a409f59660f9fdf867f7d3e6e3db15a Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Fri, 15 Jan 2021 14:39:41 +0530
Subject: [PATCH 009/183] dt-bindings: qcom,pdc: Add compatible for SM8350

Add the compatible string for SM8350 SoC from Qualcomm.

Signed-off-by: Vinod Koul <vkoul@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210115090941.2289416-2-vkoul@kernel.org
---
 .../devicetree/bindings/interrupt-controller/qcom,pdc.txt        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt b/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
index 9c1a046e6fd9..e9afb48182c7 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/qcom,pdc.txt
@@ -21,6 +21,7 @@ Properties:
 		    - "qcom,sc7180-pdc": For SC7180
 		    - "qcom,sdm845-pdc": For SDM845
 		    - "qcom,sm8250-pdc": For SM8250
+		    - "qcom,sm8350-pdc": For SM8350
 
 - reg:
 	Usage: required

From 6cc8e7430801fa238bd7d3acae1eb406c6e02fe1 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@soleen.com>
Date: Tue, 26 Jan 2021 09:46:30 -0500
Subject: [PATCH 010/183] loop: scale loop device by introducing per device
 lock

Currently, the loop driver has only one global lock: loop_ctl_mutex.

This lock becomes hot in scenarios where many loop devices are used.

Scale it by introducing a per-device lock, lo_mutex, that protects
modifications of all fields in struct loop_device.

Keep loop_ctl_mutex to protect global data: loop_index_idr, loop_lookup,
loop_add.

The new lock ordering requirement is that loop_ctl_mutex must be taken
before lo_mutex.

Signed-off-by: Pavel Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Reviewed-by: Petr Vorel <pvorel@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 93 +++++++++++++++++++++++++-------------------
 drivers/block/loop.h |  1 +
 2 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e5ff328f0917..578fc034db3f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -704,7 +704,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	int		error;
 	bool		partscan;
 
-	error = mutex_lock_killable(&loop_ctl_mutex);
+	error = mutex_lock_killable(&lo->lo_mutex);
 	if (error)
 		return error;
 	error = -ENXIO;
@@ -743,9 +743,9 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	loop_update_dio(lo);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	/*
-	 * We must drop file reference outside of loop_ctl_mutex as dropping
+	 * We must drop file reference outside of lo_mutex as dropping
 	 * the file ref can take bd_mutex which creates circular locking
 	 * dependency.
 	 */
@@ -755,7 +755,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	return 0;
 
 out_err:
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	if (file)
 		fput(file);
 	return error;
@@ -1092,7 +1092,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 			goto out_putf;
 	}
 
-	error = mutex_lock_killable(&loop_ctl_mutex);
+	error = mutex_lock_killable(&lo->lo_mutex);
 	if (error)
 		goto out_bdev;
 
@@ -1171,7 +1171,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	 * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
 	 */
 	bdgrab(bdev);
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	if (partscan)
 		loop_reread_partitions(lo, bdev);
 	if (!(mode & FMODE_EXCL))
@@ -1179,7 +1179,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
 	return 0;
 
 out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 out_bdev:
 	if (!(mode & FMODE_EXCL))
 		bd_abort_claiming(bdev, loop_configure);
@@ -1200,7 +1200,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	bool partscan = false;
 	int lo_number;
 
-	mutex_lock(&loop_ctl_mutex);
+	mutex_lock(&lo->lo_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
 		err = -ENXIO;
 		goto out_unlock;
@@ -1253,7 +1253,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	lo_number = lo->lo_number;
 	loop_unprepare_queue(lo);
 out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	if (partscan) {
 		/*
 		 * bd_mutex has been held already in release path, so don't
@@ -1284,18 +1284,17 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
 	 * protects us from all the other places trying to change the 'lo'
 	 * device.
 	 */
-	mutex_lock(&loop_ctl_mutex);
+	mutex_lock(&lo->lo_mutex);
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 	lo->lo_state = Lo_unbound;
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 
 	/*
-	 * Need not hold loop_ctl_mutex to fput backing file.
-	 * Calling fput holding loop_ctl_mutex triggers a circular
-	 * lock dependency possibility warning as fput can take
-	 * bd_mutex which is usually taken before loop_ctl_mutex.
+	 * Need not hold lo_mutex to fput backing file. Calling fput holding
+	 * lo_mutex triggers a circular lock dependency possibility warning as
+	 * fput can take bd_mutex which is usually taken before lo_mutex.
 	 */
 	if (filp)
 		fput(filp);
@@ -1306,11 +1305,11 @@ static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
 
-	err = mutex_lock_killable(&loop_ctl_mutex);
+	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
 	if (lo->lo_state != Lo_bound) {
-		mutex_unlock(&loop_ctl_mutex);
+		mutex_unlock(&lo->lo_mutex);
 		return -ENXIO;
 	}
 	/*
@@ -1325,11 +1324,11 @@ static int loop_clr_fd(struct loop_device *lo)
 	 */
 	if (atomic_read(&lo->lo_refcnt) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-		mutex_unlock(&loop_ctl_mutex);
+		mutex_unlock(&lo->lo_mutex);
 		return 0;
 	}
 	lo->lo_state = Lo_rundown;
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 
 	return __loop_clr_fd(lo, false);
 }
@@ -1344,7 +1343,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	bool partscan = false;
 	bool size_changed = false;
 
-	err = mutex_lock_killable(&loop_ctl_mutex);
+	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
 	if (lo->lo_encrypt_key_size &&
@@ -1411,7 +1410,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		partscan = true;
 	}
 out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	if (partscan)
 		loop_reread_partitions(lo, bdev);
 
@@ -1425,11 +1424,11 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 	struct kstat stat;
 	int ret;
 
-	ret = mutex_lock_killable(&loop_ctl_mutex);
+	ret = mutex_lock_killable(&lo->lo_mutex);
 	if (ret)
 		return ret;
 	if (lo->lo_state != Lo_bound) {
-		mutex_unlock(&loop_ctl_mutex);
+		mutex_unlock(&lo->lo_mutex);
 		return -ENXIO;
 	}
 
@@ -1448,10 +1447,10 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 		       lo->lo_encrypt_key_size);
 	}
 
-	/* Drop loop_ctl_mutex while we call into the filesystem. */
+	/* Drop lo_mutex while we call into the filesystem. */
 	path = lo->lo_backing_file->f_path;
 	path_get(&path);
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
 	if (!ret) {
 		info->lo_device = huge_encode_dev(stat.dev);
@@ -1637,7 +1636,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
 {
 	int err;
 
-	err = mutex_lock_killable(&loop_ctl_mutex);
+	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
 	switch (cmd) {
@@ -1653,7 +1652,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	return err;
 }
 
@@ -1879,27 +1878,33 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 	struct loop_device *lo;
 	int err;
 
+	/*
+	 * take loop_ctl_mutex to protect lo pointer from race with
+	 * loop_control_ioctl(LOOP_CTL_REMOVE), however, to reduce contention
+	 * release it prior to updating lo->lo_refcnt.
+	 */
 	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
 		return err;
 	lo = bdev->bd_disk->private_data;
 	if (!lo) {
-		err = -ENXIO;
-		goto out;
+		mutex_unlock(&loop_ctl_mutex);
+		return -ENXIO;
 	}
-
-	atomic_inc(&lo->lo_refcnt);
-out:
+	err = mutex_lock_killable(&lo->lo_mutex);
 	mutex_unlock(&loop_ctl_mutex);
-	return err;
+	if (err)
+		return err;
+	atomic_inc(&lo->lo_refcnt);
+	mutex_unlock(&lo->lo_mutex);
+	return 0;
 }
 
 static void lo_release(struct gendisk *disk, fmode_t mode)
 {
-	struct loop_device *lo;
+	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&loop_ctl_mutex);
-	lo = disk->private_data;
+	mutex_lock(&lo->lo_mutex);
 	if (atomic_dec_return(&lo->lo_refcnt))
 		goto out_unlock;
 
@@ -1907,7 +1912,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
 		lo->lo_state = Lo_rundown;
-		mutex_unlock(&loop_ctl_mutex);
+		mutex_unlock(&lo->lo_mutex);
 		/*
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
@@ -1924,7 +1929,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	}
 
 out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 }
 
 static const struct block_device_operations lo_fops = {
@@ -1963,10 +1968,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data)
 	struct loop_device *lo = ptr;
 	struct loop_func_table *xfer = data;
 
-	mutex_lock(&loop_ctl_mutex);
+	mutex_lock(&lo->lo_mutex);
 	if (lo->lo_encryption == xfer)
 		loop_release_xfer(lo);
-	mutex_unlock(&loop_ctl_mutex);
+	mutex_unlock(&lo->lo_mutex);
 	return 0;
 }
 
@@ -2152,6 +2157,7 @@ static int loop_add(struct loop_device **l, int i)
 		disk->flags |= GENHD_FL_NO_PART_SCAN;
 	disk->flags |= GENHD_FL_EXT_DEVT;
 	atomic_set(&lo->lo_refcnt, 0);
+	mutex_init(&lo->lo_mutex);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
 	disk->major		= LOOP_MAJOR;
@@ -2182,6 +2188,7 @@ static void loop_remove(struct loop_device *lo)
 	blk_cleanup_queue(lo->lo_queue);
 	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
+	mutex_destroy(&lo->lo_mutex);
 	kfree(lo);
 }
 
@@ -2261,15 +2268,21 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 		ret = loop_lookup(&lo, parm);
 		if (ret < 0)
 			break;
+		ret = mutex_lock_killable(&lo->lo_mutex);
+		if (ret)
+			break;
 		if (lo->lo_state != Lo_unbound) {
 			ret = -EBUSY;
+			mutex_unlock(&lo->lo_mutex);
 			break;
 		}
 		if (atomic_read(&lo->lo_refcnt) > 0) {
 			ret = -EBUSY;
+			mutex_unlock(&lo->lo_mutex);
 			break;
 		}
 		lo->lo_disk->private_data = NULL;
+		mutex_unlock(&lo->lo_mutex);
 		idr_remove(&loop_index_idr, lo->lo_number);
 		loop_remove(lo);
 		break;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index af75a5ee4094..a3c04f310672 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -62,6 +62,7 @@ struct loop_device {
 	struct request_queue	*lo_queue;
 	struct blk_mq_tag_set	tag_set;
 	struct gendisk		*lo_disk;
+	struct mutex		lo_mutex;
 };
 
 struct loop_cmd {

From 416c05477772c147190d6b2371254510c81a4a04 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 26 Jan 2021 14:04:32 -0600
Subject: [PATCH 011/183] mtip32xx: use PCI #defines instead of numbers

Use PCI #defines for PCIe Device Control register values instead of
hard-coding bit positions.  No functional change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3fd99836bb1c..b58f3a59b5bb 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3932,8 +3932,8 @@ static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 		pci_read_config_word(pdev,
 			pos + PCI_EXP_DEVCTL,
 			&pcie_dev_ctrl);
-		if (pcie_dev_ctrl & (1 << 11) ||
-		    pcie_dev_ctrl & (1 << 4)) {
+		if (pcie_dev_ctrl & PCI_EXP_DEVCTL_NOSNOOP_EN ||
+		    pcie_dev_ctrl & PCI_EXP_DEVCTL_RELAX_EN) {
 			dev_info(&dd->pdev->dev,
 				"Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
 					pdev->vendor, pdev->device);

From 2126979183148a1bbe8aebe67079856c15ae1763 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Tue, 26 Jan 2021 14:04:33 -0600
Subject: [PATCH 012/183] mtip32xx: prefer pcie_capability_read_word()

Replace pci_read_config_word() with pcie_capability_read_word().

pcie_capability_read_word() takes care of a few special cases when reading
the PCIe capability.  See 8c0d3a02c130 ("PCI: Add accessors for PCI Express
Capability").

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index b58f3a59b5bb..3be0dbc674bd 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3924,14 +3924,10 @@ static DEFINE_HANDLER(7);
 
 static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 {
-	int pos;
 	unsigned short pcie_dev_ctrl;
 
-	pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
-	if (pos) {
-		pci_read_config_word(pdev,
-			pos + PCI_EXP_DEVCTL,
-			&pcie_dev_ctrl);
+	if (pci_is_pcie(pdev)) {
+		pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &pcie_dev_ctrl);
 		if (pcie_dev_ctrl & PCI_EXP_DEVCTL_NOSNOOP_EN ||
 		    pcie_dev_ctrl & PCI_EXP_DEVCTL_RELAX_EN) {
 			dev_info(&dd->pdev->dev,
@@ -3939,8 +3935,7 @@ static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
 					pdev->vendor, pdev->device);
 			pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
 						PCI_EXP_DEVCTL_RELAX_EN);
-			pci_write_config_word(pdev,
-				pos + PCI_EXP_DEVCTL,
+			pcie_capability_write_word(pdev, PCI_EXP_DEVCTL,
 				pcie_dev_ctrl);
 		}
 	}

From 370276bac8ec6f74fb52a518ef05aa84d1059067 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Thu, 21 Jan 2021 15:21:50 +0100
Subject: [PATCH 013/183] drbd: remove unused argument from
 drbd_request_prepare and __drbd_make_request

start_jif is not used by drbd_request_prepare(), so remove that argument.
With it gone, __drbd_make_request() no longer needs start_jif either, so
remove it there as well.

Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Lars Ellenberg <lars.ellenberg@linbit.com>
Cc: drbd-dev@lists.linbit.com
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_int.h  |  2 +-
 drivers/block/drbd/drbd_main.c |  3 +--
 drivers/block/drbd/drbd_req.c  | 11 ++++-------
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b2c93a29c251..de59f72d49cc 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1449,7 +1449,7 @@ extern void conn_free_crypto(struct drbd_connection *connection);
 
 /* drbd_req */
 extern void do_submit(struct work_struct *ws);
-extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
+extern void __drbd_make_request(struct drbd_device *, struct bio *);
 extern blk_qc_t drbd_submit_bio(struct bio *bio);
 extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
 extern int is_valid_ar_handle(struct drbd_request *, sector_t);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 1c8c18b2a25f..7e5fcce812e1 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2288,7 +2288,6 @@ static void do_retry(struct work_struct *ws)
 	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
 		struct drbd_device *device = req->device;
 		struct bio *bio = req->master_bio;
-		unsigned long start_jif = req->start_jif;
 		bool expected;
 
 		expected =
@@ -2323,7 +2322,7 @@ static void do_retry(struct work_struct *ws)
 		/* We are not just doing submit_bio_noacct(),
 		 * as we want to keep the start_time information. */
 		inc_ap_bio(device);
-		__drbd_make_request(device, bio, start_jif);
+		__drbd_make_request(device, bio);
 	}
 }
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index ea0f31ab3343..ee785f2bdf79 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1188,7 +1188,7 @@ static void drbd_queue_write(struct drbd_device *device, struct drbd_request *re
  * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
  */
 static struct drbd_request *
-drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+drbd_request_prepare(struct drbd_device *device, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
 	struct drbd_request *req;
@@ -1416,9 +1416,9 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 		complete_master_bio(device, &m);
 }
 
-void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+void __drbd_make_request(struct drbd_device *device, struct bio *bio)
 {
-	struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
+	struct drbd_request *req = drbd_request_prepare(device, bio);
 	if (IS_ERR_OR_NULL(req))
 		return;
 	drbd_send_and_submit(device, req);
@@ -1596,19 +1596,16 @@ void do_submit(struct work_struct *ws)
 blk_qc_t drbd_submit_bio(struct bio *bio)
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
-	unsigned long start_jif;
 
 	blk_queue_split(&bio);
 
-	start_jif = jiffies;
-
 	/*
 	 * what we "blindly" assume:
 	 */
 	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
 
 	inc_ap_bio(device);
-	__drbd_make_request(device, bio, start_jif);
+	__drbd_make_request(device, bio);
 	return BLK_QC_T_NONE;
 }
 

From 294ed6b9f00665acc22253044890257c5d9d18c1 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Mon, 25 Jan 2021 16:13:01 +0800
Subject: [PATCH 014/183] zram: fix NULL check before some freeing functions is
 not needed

Fix the warning below:
/drivers/block/zram/zram_drv.c:534:2-8: WARNING: NULL check
before some freeing functions is not needed.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/zram/zram_drv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d6243dbc53cc..d7018543842e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -530,8 +530,7 @@ static ssize_t backing_dev_store(struct device *dev,
 
 	return len;
 out:
-	if (bitmap)
-		kvfree(bitmap);
+	kvfree(bitmap);
 
 	if (bdev)
 		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);

From 9abe47cc5cbeda75a1ae2ffe6bb8636a0327eddc Mon Sep 17 00:00:00 2001
From: Yang Li <abaci-bugfix@linux.alibaba.com>
Date: Thu, 21 Jan 2021 17:43:22 +0800
Subject: [PATCH 015/183] rsxx: remove redundant NULL check

Fix below warnings reported by coccicheck:
./drivers/block/rsxx/dma.c:948:3-8: WARNING: NULL check
before some freeing functions is not needed.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <abaci-bugfix@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/rsxx/dma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 1914f5488b22..0574f4495755 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -944,8 +944,7 @@ int rsxx_dma_setup(struct rsxx_cardinfo *card)
 			ctrl->done_wq = NULL;
 		}
 
-		if (ctrl->trackers)
-			vfree(ctrl->trackers);
+		vfree(ctrl->trackers);
 
 		if (ctrl->status.buf)
 			dma_free_coherent(&card->dev->dev, STATUS_BUFFER_SIZE8,

From c260954177c4f1926b423823bca5728f19b40d67 Mon Sep 17 00:00:00 2001
From: Emil Renner Berthing <kernel@esmil.dk>
Date: Sat, 23 Jan 2021 19:24:56 +0100
Subject: [PATCH 016/183] genirq: Use new tasklet API for resend_tasklet

This converts the resend_tasklet to use the new API introduced in
commit 12cc923f1ccc ("tasklet: Introduce new initialization API").

The new API changes the argument passed to the callback function, but
fortunately the argument isn't used, so it is straightforward to use
DECLARE_TASKLET() rather than DECLARE_TASKLET_OLD().

Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20210123182456.6521-1-esmil@mailme.dk
---
 kernel/irq/resend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 8ccd32a0cc80..bd1d85c610aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -27,7 +27,7 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
 /*
  * Run software resends of IRQ's
  */
-static void resend_irqs(unsigned long arg)
+static void resend_irqs(struct tasklet_struct *unused)
 {
 	struct irq_desc *desc;
 	int irq;
@@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg)
 }
 
 /* Tasklet to handle resend: */
-static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs);
+static DECLARE_TASKLET(resend_tasklet, resend_irqs);
 
 static int irq_sw_resend(struct irq_desc *desc)
 {

From c60767421e102dfd1f4d99ad0cc7f8ba24461eb8 Mon Sep 17 00:00:00 2001
From: Biwen Li <biwen.li@nxp.com>
Date: Fri, 29 Jan 2021 17:50:34 +0800
Subject: [PATCH 017/183] irqchip/ls-extirq: add IRQCHIP_SKIP_SET_WAKE to the
 irqchip flags

The ls-extirq driver doesn't implement the irq_set_wake()
callback, while being wake-up capable. This results in
ugly behaviours across suspend/resume cycles.

Advertise this by adding IRQCHIP_SKIP_SET_WAKE to
the irqchip flags.

Fixes: b16a1caf4686 ("irqchip/ls-extirq: Add LS1043A, LS1088A external interrupt support")
Signed-off-by: Biwen Li <biwen.li@nxp.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210129095034.33821-1-biwen.li@oss.nxp.com
---
 drivers/irqchip/irq-ls-extirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-ls-extirq.c b/drivers/irqchip/irq-ls-extirq.c
index f94f974a8764..853b3972dbe7 100644
--- a/drivers/irqchip/irq-ls-extirq.c
+++ b/drivers/irqchip/irq-ls-extirq.c
@@ -64,7 +64,7 @@ static struct irq_chip ls_extirq_chip = {
 	.irq_set_type		= ls_extirq_set_type,
 	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_set_affinity	= irq_chip_set_affinity_parent,
-	.flags                  = IRQCHIP_SET_TYPE_MASKED,
+	.flags                  = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE,
 };
 
 static int

From e8628013e5ddc7cf78cc2f738ab760e8c0fa8559 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 24 Aug 2020 21:56:03 -0700
Subject: [PATCH 018/183] drbd: Avoid comma separated statements

Use semicolons and braces.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_receiver.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 09c86ef3f0fd..c3f09a122f20 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -111,8 +111,10 @@ static struct page *page_chain_tail(struct page *page, int *len)
 {
 	struct page *tmp;
 	int i = 1;
-	while ((tmp = page_chain_next(page)))
-		++i, page = tmp;
+	while ((tmp = page_chain_next(page))) {
+		++i;
+		page = tmp;
+	}
 	if (len)
 		*len = i;
 	return page;

From 3a81fd02045c329f25e5900fa61f613c9b317644 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 10 Dec 2020 12:25:36 -0700
Subject: [PATCH 019/183] io_uring: enable LOOKUP_CACHED path resolution for
 filename lookups

Instead of being pessimistic and assuming that path lookup will block, use
LOOKUP_CACHED to attempt just a cached lookup. This ensures that the
fast path is always done inline, and we only punt to async context if
IO is needed to satisfy the lookup.

For forced nonblock open attempts, mark the file O_NONBLOCK over the
actual ->open() call as well. We can safely clear this again before
doing fd_install(), so it'll never be user visible that we fiddled with
it.

This greatly improves the performance of file open where the dentry is
already cached:

Cached		5.10-git	5.10-git+LOOKUP_CACHED	Speedup
---------------------------------------------------------------
33%		1,014,975	900,474			1.1x
89%		 545,466	292,937			1.9x
100%		 435,636	151,475			2.9x

The more cache hot we are, the more the inline LOOKUP_CACHED
optimization helps. This is unsurprising and expected, as a thread
offload becomes a more dominant part of the total overhead. If we look
at io_uring tracing, doing an IORING_OP_OPENAT on a file that isn't in
the dentry cache will yield:

275.550481: io_uring_create: ring 00000000ddda6278, fd 3 sq size 8, cq size 16, flags 0
275.550491: io_uring_submit_sqe: ring 00000000ddda6278, op 18, data 0x0, non block 1, sq_thread 0
275.550498: io_uring_queue_async_work: ring 00000000ddda6278, request 00000000c0267d17, flags 69760, normal queue, work 000000003d683991
275.550502: io_uring_cqring_wait: ring 00000000ddda6278, min_events 1
275.550556: io_uring_complete: ring 00000000ddda6278, user_data 0x0, result 4

which shows a failed nonblock lookup, then punt to worker, and then we
complete with fd == 4. This takes 65 usec in total. Re-running the same
test case again:

281.253956: io_uring_create: ring 0000000008207252, fd 3 sq size 8, cq size 16, flags 0
281.253967: io_uring_submit_sqe: ring 0000000008207252, op 18, data 0x0, non block 1, sq_thread 0
281.253973: io_uring_complete: ring 0000000008207252, user_data 0x0, result 4

shows the same request completing inline, also returning fd == 4. This
takes 6 usec.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 47 +++++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 38c6cbe1ab38..fbc6d2fb7c1d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -489,7 +489,6 @@ struct io_sr_msg {
 struct io_open {
 	struct file			*file;
 	int				dfd;
-	bool				ignore_nonblock;
 	struct filename			*filename;
 	struct open_how			how;
 	unsigned long			nofile;
@@ -4054,7 +4053,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 		return ret;
 	}
 	req->open.nofile = rlimit(RLIMIT_NOFILE);
-	req->open.ignore_nonblock = false;
 	req->flags |= REQ_F_NEED_CLEANUP;
 	return 0;
 }
@@ -4096,39 +4094,48 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 {
 	struct open_flags op;
 	struct file *file;
+	bool nonblock_set;
+	bool resolve_nonblock;
 	int ret;
 
-	if (force_nonblock && !req->open.ignore_nonblock)
-		return -EAGAIN;
-
 	ret = build_open_flags(&req->open.how, &op);
 	if (ret)
 		goto err;
+	nonblock_set = op.open_flag & O_NONBLOCK;
+	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
+	if (force_nonblock) {
+		/*
+		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
+		 * it'll always -EAGAIN
+		 */
+		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
+			return -EAGAIN;
+		op.lookup_flags |= LOOKUP_CACHED;
+		op.open_flag |= O_NONBLOCK;
+	}
 
 	ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
 	if (ret < 0)
 		goto err;
 
 	file = do_filp_open(req->open.dfd, req->open.filename, &op);
+	/* only retry if RESOLVE_CACHED wasn't already set by application */
+	if ((!resolve_nonblock && force_nonblock) && file == ERR_PTR(-EAGAIN)) {
+		/*
+		 * We could hang on to this 'fd', but seems like marginal
+		 * gain for something that is now known to be a slower path.
+		 * So just put it, and we'll get a new one when we retry.
+		 */
+		put_unused_fd(ret);
+		return -EAGAIN;
+	}
+
 	if (IS_ERR(file)) {
 		put_unused_fd(ret);
 		ret = PTR_ERR(file);
-		/*
-		 * A work-around to ensure that /proc/self works that way
-		 * that it should - if we get -EOPNOTSUPP back, then assume
-		 * that proc_self_get_link() failed us because we're in async
-		 * context. We should be safe to retry this from the task
-		 * itself with force_nonblock == false set, as it should not
-		 * block on lookup. Would be nice to know this upfront and
-		 * avoid the async dance, but doesn't seem feasible.
-		 */
-		if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
-			req->open.ignore_nonblock = true;
-			refcount_inc(&req->refs);
-			io_req_task_queue(req);
-			return 0;
-		}
 	} else {
+		if (force_nonblock && !nonblock_set)
+			file->f_flags &= ~O_NONBLOCK;
 		fsnotify_open(file);
 		fd_install(ret, file);
 	}

From 0a96bbe49994a46c1fea34619a501ead46aa7584 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Wed, 6 Jan 2021 12:39:10 -0800
Subject: [PATCH 020/183] io_uring: modularize io_sqe_buffer_register

Split io_sqe_buffer_register into two routines:

- io_sqe_buffer_register() registers a single buffer
- io_sqe_buffers_register() iterates over all user-specified buffers

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 210 +++++++++++++++++++++++++-------------------------
 1 file changed, 107 insertions(+), 103 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fbc6d2fb7c1d..ec70ba064774 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8370,7 +8370,7 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
 	return pages;
 }
 
-static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
+static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	int i, j;
 
@@ -8488,14 +8488,103 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }
 
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
-				  unsigned nr_args)
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+				  struct io_mapped_ubuf *imu,
+				  struct page **last_hpage)
 {
 	struct vm_area_struct **vmas = NULL;
 	struct page **pages = NULL;
+	unsigned long off, start, end, ubuf;
+	size_t size;
+	int ret, pret, nr_pages, i;
+
+	ubuf = (unsigned long) iov->iov_base;
+	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	start = ubuf >> PAGE_SHIFT;
+	nr_pages = end - start;
+
+	ret = -ENOMEM;
+
+	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto done;
+
+	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
+			      GFP_KERNEL);
+	if (!vmas)
+		goto done;
+
+	imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
+				   GFP_KERNEL);
+	if (!imu->bvec)
+		goto done;
+
+	ret = 0;
+	mmap_read_lock(current->mm);
+	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+			      pages, vmas);
+	if (pret == nr_pages) {
+		/* don't support file backed memory */
+		for (i = 0; i < nr_pages; i++) {
+			struct vm_area_struct *vma = vmas[i];
+
+			if (vma->vm_file &&
+			    !is_file_hugepages(vma->vm_file)) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+		}
+	} else {
+		ret = pret < 0 ? pret : -EFAULT;
+	}
+	mmap_read_unlock(current->mm);
+	if (ret) {
+		/*
+		 * if we did partial map, or found file backed vmas,
+		 * release any pages we did get
+		 */
+		if (pret > 0)
+			unpin_user_pages(pages, pret);
+		kvfree(imu->bvec);
+		goto done;
+	}
+
+	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+	if (ret) {
+		unpin_user_pages(pages, pret);
+		kvfree(imu->bvec);
+		goto done;
+	}
+
+	off = ubuf & ~PAGE_MASK;
+	size = iov->iov_len;
+	for (i = 0; i < nr_pages; i++) {
+		size_t vec_len;
+
+		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		imu->bvec[i].bv_page = pages[i];
+		imu->bvec[i].bv_len = vec_len;
+		imu->bvec[i].bv_offset = off;
+		off = 0;
+		size -= vec_len;
+	}
+	/* store original address for later verification */
+	imu->ubuf = ubuf;
+	imu->len = iov->iov_len;
+	imu->nr_bvecs = nr_pages;
+	ret = 0;
+done:
+	kvfree(pages);
+	kvfree(vmas);
+	return ret;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+				   unsigned int nr_args)
+{
+	int i, ret;
+	struct iovec iov;
 	struct page *last_hpage = NULL;
-	int i, j, got_pages = 0;
-	int ret = -EINVAL;
 
 	if (ctx->user_bufs)
 		return -EBUSY;
@@ -8509,14 +8598,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 
 	for (i = 0; i < nr_args; i++) {
 		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
-		unsigned long off, start, end, ubuf;
-		int pret, nr_pages;
-		struct iovec iov;
-		size_t size;
 
 		ret = io_copy_iov(ctx, &iov, arg, i);
 		if (ret)
-			goto err;
+			break;
 
 		/*
 		 * Don't impose further limits on the size and buffer
@@ -8525,103 +8610,22 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 		 */
 		ret = -EFAULT;
 		if (!iov.iov_base || !iov.iov_len)
-			goto err;
+			break;
 
 		/* arbitrary limit, but we need something */
 		if (iov.iov_len > SZ_1G)
-			goto err;
+			break;
 
-		ubuf = (unsigned long) iov.iov_base;
-		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		start = ubuf >> PAGE_SHIFT;
-		nr_pages = end - start;
-
-		ret = 0;
-		if (!pages || nr_pages > got_pages) {
-			kvfree(vmas);
-			kvfree(pages);
-			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
-						GFP_KERNEL);
-			vmas = kvmalloc_array(nr_pages,
-					sizeof(struct vm_area_struct *),
-					GFP_KERNEL);
-			if (!pages || !vmas) {
-				ret = -ENOMEM;
-				goto err;
-			}
-			got_pages = nr_pages;
-		}
-
-		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
-						GFP_KERNEL);
-		ret = -ENOMEM;
-		if (!imu->bvec)
-			goto err;
-
-		ret = 0;
-		mmap_read_lock(current->mm);
-		pret = pin_user_pages(ubuf, nr_pages,
-				      FOLL_WRITE | FOLL_LONGTERM,
-				      pages, vmas);
-		if (pret == nr_pages) {
-			/* don't support file backed memory */
-			for (j = 0; j < nr_pages; j++) {
-				struct vm_area_struct *vma = vmas[j];
-
-				if (vma->vm_file &&
-				    !is_file_hugepages(vma->vm_file)) {
-					ret = -EOPNOTSUPP;
-					break;
-				}
-			}
-		} else {
-			ret = pret < 0 ? pret : -EFAULT;
-		}
-		mmap_read_unlock(current->mm);
-		if (ret) {
-			/*
-			 * if we did partial map, or found file backed vmas,
-			 * release any pages we did get
-			 */
-			if (pret > 0)
-				unpin_user_pages(pages, pret);
-			kvfree(imu->bvec);
-			goto err;
-		}
-
-		ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
-		if (ret) {
-			unpin_user_pages(pages, pret);
-			kvfree(imu->bvec);
-			goto err;
-		}
-
-		off = ubuf & ~PAGE_MASK;
-		size = iov.iov_len;
-		for (j = 0; j < nr_pages; j++) {
-			size_t vec_len;
-
-			vec_len = min_t(size_t, size, PAGE_SIZE - off);
-			imu->bvec[j].bv_page = pages[j];
-			imu->bvec[j].bv_len = vec_len;
-			imu->bvec[j].bv_offset = off;
-			off = 0;
-			size -= vec_len;
-		}
-		/* store original address for later verification */
-		imu->ubuf = ubuf;
-		imu->len = iov.iov_len;
-		imu->nr_bvecs = nr_pages;
+		ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);
+		if (ret)
+			break;
 
 		ctx->nr_user_bufs++;
 	}
-	kvfree(pages);
-	kvfree(vmas);
-	return 0;
-err:
-	kvfree(pages);
-	kvfree(vmas);
-	io_sqe_buffer_unregister(ctx);
+
+	if (ret)
+		io_sqe_buffers_unregister(ctx);
+
 	return ret;
 }
 
@@ -8675,7 +8679,7 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_finish_async(ctx);
-	io_sqe_buffer_unregister(ctx);
+	io_sqe_buffers_unregister(ctx);
 
 	if (ctx->sqo_task) {
 		put_task_struct(ctx->sqo_task);
@@ -10057,13 +10061,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 
 	switch (opcode) {
 	case IORING_REGISTER_BUFFERS:
-		ret = io_sqe_buffer_register(ctx, arg, nr_args);
+		ret = io_sqe_buffers_register(ctx, arg, nr_args);
 		break;
 	case IORING_UNREGISTER_BUFFERS:
 		ret = -EINVAL;
 		if (arg || nr_args)
 			break;
-		ret = io_sqe_buffer_unregister(ctx);
+		ret = io_sqe_buffers_unregister(ctx);
 		break;
 	case IORING_REGISTER_FILES:
 		ret = io_sqe_files_register(ctx, arg, nr_args);

From 2b358604aa6e8c12d7efa14777fcc66c377682b0 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Wed, 6 Jan 2021 12:39:11 -0800
Subject: [PATCH 021/183] io_uring: modularize io_sqe_buffers_register

Move allocation of buffer management structures, and validation of
buffers into separate routines.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ec70ba064774..4cdf0f906f12 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8579,13 +8579,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	return ret;
 }
 
-static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
-				   unsigned int nr_args)
+static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
 {
-	int i, ret;
-	struct iovec iov;
-	struct page *last_hpage = NULL;
-
 	if (ctx->user_bufs)
 		return -EBUSY;
 	if (!nr_args || nr_args > UIO_MAXIOV)
@@ -8596,6 +8591,37 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!ctx->user_bufs)
 		return -ENOMEM;
 
+	return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
+	/*
+	 * Don't impose further limits on the size and buffer
+	 * constraints here, we'll -EINVAL later when IO is
+	 * submitted if they are wrong.
+	 */
+	if (!iov->iov_base || !iov->iov_len)
+		return -EFAULT;
+
+	/* arbitrary limit, but we need something */
+	if (iov->iov_len > SZ_1G)
+		return -EFAULT;
+
+	return 0;
+}
+
+static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
+				   unsigned int nr_args)
+{
+	int i, ret;
+	struct iovec iov;
+	struct page *last_hpage = NULL;
+
+	ret = io_buffers_map_alloc(ctx, nr_args);
+	if (ret)
+		return ret;
+
 	for (i = 0; i < nr_args; i++) {
 		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
 
@@ -8603,17 +8629,8 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		if (ret)
 			break;
 
-		/*
-		 * Don't impose further limits on the size and buffer
-		 * constraints here, we'll -EINVAL later when IO is
-		 * submitted if they are wrong.
-		 */
-		ret = -EFAULT;
-		if (!iov.iov_base || !iov.iov_len)
-			break;
-
-		/* arbitrary limit, but we need something */
-		if (iov.iov_len > SZ_1G)
+		ret = io_buffer_validate(&iov);
+		if (ret)
 			break;
 
 		ret = io_sqe_buffer_register(ctx, &iov, imu, &last_hpage);

From 269bbe5fd4d2fdd3b0d3a82a3c3c1dd1209aa8b8 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:44 +0000
Subject: [PATCH 022/183] io_uring: rename file related variables to rsrc

This is a prep rename patch for subsequent patches to generalize file
registration.

[io_uring_rsrc_update:: rename fds -> data]

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
[leave io_uring_files_update as struct]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 228 +++++++++++++++++-----------------
 include/uapi/linux/io_uring.h |   7 ++
 2 files changed, 124 insertions(+), 111 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4cdf0f906f12..95b3b8747a65 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,24 +195,29 @@ struct io_mapped_ubuf {
 	unsigned long	acct_pages;
 };
 
-struct fixed_file_table {
+struct io_rsrc_put {
+	struct list_head list;
+	struct file *file;
+};
+
+struct fixed_rsrc_table {
 	struct file		**files;
 };
 
-struct fixed_file_ref_node {
+struct fixed_rsrc_ref_node {
 	struct percpu_ref		refs;
 	struct list_head		node;
-	struct list_head		file_list;
-	struct fixed_file_data		*file_data;
+	struct list_head		rsrc_list;
+	struct fixed_rsrc_data		*rsrc_data;
 	struct llist_node		llist;
 	bool				done;
 };
 
-struct fixed_file_data {
-	struct fixed_file_table		*table;
+struct fixed_rsrc_data {
+	struct fixed_rsrc_table		*table;
 	struct io_ring_ctx		*ctx;
 
-	struct fixed_file_ref_node	*node;
+	struct fixed_rsrc_ref_node	*node;
 	struct percpu_ref		refs;
 	struct completion		done;
 	struct list_head		ref_list;
@@ -319,7 +324,7 @@ struct io_ring_ctx {
 	 * readers must ensure that ->refs is alive as long as the file* is
 	 * used. Only updated through io_uring_register(2).
 	 */
-	struct fixed_file_data	*file_data;
+	struct fixed_rsrc_data	*file_data;
 	unsigned		nr_user_files;
 
 	/* if used, fixed mapped user buffers */
@@ -384,8 +389,8 @@ struct io_ring_ctx {
 		struct list_head	inflight_list;
 	} ____cacheline_aligned_in_smp;
 
-	struct delayed_work		file_put_work;
-	struct llist_head		file_put_llist;
+	struct delayed_work		rsrc_put_work;
+	struct llist_head		rsrc_put_llist;
 
 	struct work_struct		exit_work;
 	struct io_restriction		restrictions;
@@ -494,7 +499,7 @@ struct io_open {
 	unsigned long			nofile;
 };
 
-struct io_files_update {
+struct io_rsrc_update {
 	struct file			*file;
 	u64				arg;
 	u32				nr_args;
@@ -688,7 +693,7 @@ struct io_kiocb {
 		struct io_sr_msg	sr_msg;
 		struct io_open		open;
 		struct io_close		close;
-		struct io_files_update	files_update;
+		struct io_rsrc_update	rsrc_update;
 		struct io_fadvise	fadvise;
 		struct io_madvise	madvise;
 		struct io_epoll		epoll;
@@ -718,7 +723,7 @@ struct io_kiocb {
 	u64				user_data;
 
 	struct io_kiocb			*link;
-	struct percpu_ref		*fixed_file_refs;
+	struct percpu_ref		*fixed_rsrc_refs;
 
 	/*
 	 * 1. used with ctx->iopoll_list with reads/writes
@@ -996,8 +1001,8 @@ enum io_mem_account {
 static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 					    struct task_struct *task);
 
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node);
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
+static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
 			struct io_ring_ctx *ctx);
 
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
@@ -1010,13 +1015,13 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void __io_queue_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-				 struct io_uring_files_update *ip,
+				 struct io_uring_rsrc_update *ip,
 				 unsigned nr_args);
 static void __io_clean_op(struct io_kiocb *req);
 static struct file *io_file_get(struct io_submit_state *state,
 				struct io_kiocb *req, int fd, bool fixed);
 static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
-static void io_file_put_work(struct work_struct *work);
+static void io_rsrc_put_work(struct work_struct *work);
 
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
@@ -1057,9 +1062,9 @@ static inline void io_set_resource_node(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (!req->fixed_file_refs) {
-		req->fixed_file_refs = &ctx->file_data->node->refs;
-		percpu_ref_get(req->fixed_file_refs);
+	if (!req->fixed_rsrc_refs) {
+		req->fixed_rsrc_refs = &ctx->file_data->node->refs;
+		percpu_ref_get(req->fixed_rsrc_refs);
 	}
 }
 
@@ -1330,8 +1335,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	spin_lock_init(&ctx->inflight_lock);
 	INIT_LIST_HEAD(&ctx->inflight_list);
-	INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
-	init_llist_head(&ctx->file_put_llist);
+	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+	init_llist_head(&ctx->rsrc_put_llist);
 	return ctx;
 err:
 	if (ctx->fallback_req)
@@ -2011,8 +2016,8 @@ static void io_dismantle_req(struct io_kiocb *req)
 		kfree(req->async_data);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-	if (req->fixed_file_refs)
-		percpu_ref_put(req->fixed_file_refs);
+	if (req->fixed_rsrc_refs)
+		percpu_ref_put(req->fixed_rsrc_refs);
 	io_req_clean_work(req);
 }
 
@@ -5988,7 +5993,7 @@ static int io_async_cancel(struct io_kiocb *req)
 	return 0;
 }
 
-static int io_files_update_prep(struct io_kiocb *req,
+static int io_rsrc_update_prep(struct io_kiocb *req,
 				const struct io_uring_sqe *sqe)
 {
 	if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
@@ -5998,11 +6003,11 @@ static int io_files_update_prep(struct io_kiocb *req,
 	if (sqe->ioprio || sqe->rw_flags)
 		return -EINVAL;
 
-	req->files_update.offset = READ_ONCE(sqe->off);
-	req->files_update.nr_args = READ_ONCE(sqe->len);
-	if (!req->files_update.nr_args)
+	req->rsrc_update.offset = READ_ONCE(sqe->off);
+	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
+	if (!req->rsrc_update.nr_args)
 		return -EINVAL;
-	req->files_update.arg = READ_ONCE(sqe->addr);
+	req->rsrc_update.arg = READ_ONCE(sqe->addr);
 	return 0;
 }
 
@@ -6010,17 +6015,17 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
 			   struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_uring_files_update up;
+	struct io_uring_rsrc_update up;
 	int ret;
 
 	if (force_nonblock)
 		return -EAGAIN;
 
-	up.offset = req->files_update.offset;
-	up.fds = req->files_update.arg;
+	up.offset = req->rsrc_update.offset;
+	up.data = req->rsrc_update.arg;
 
 	mutex_lock(&ctx->uring_lock);
-	ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
+	ret = __io_sqe_files_update(ctx, &up, req->rsrc_update.nr_args);
 	mutex_unlock(&ctx->uring_lock);
 
 	if (ret < 0)
@@ -6075,7 +6080,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	case IORING_OP_CLOSE:
 		return io_close_prep(req, sqe);
 	case IORING_OP_FILES_UPDATE:
-		return io_files_update_prep(req, sqe);
+		return io_rsrc_update_prep(req, sqe);
 	case IORING_OP_STATX:
 		return io_statx_prep(req, sqe);
 	case IORING_OP_FADVISE:
@@ -6444,7 +6449,7 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
 					      int index)
 {
-	struct fixed_file_table *table;
+	struct fixed_rsrc_table *table;
 
 	table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
 	return table->files[index & IORING_FILE_TABLE_MASK];
@@ -6840,7 +6845,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->ctx = ctx;
 	req->flags = 0;
 	req->link = NULL;
-	req->fixed_file_refs = NULL;
+	req->fixed_rsrc_refs = NULL;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
 	req->task = current;
@@ -7328,28 +7333,28 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 #endif
 }
 
-static void io_file_ref_kill(struct percpu_ref *ref)
+static void io_rsrc_ref_kill(struct percpu_ref *ref)
 {
-	struct fixed_file_data *data;
+	struct fixed_rsrc_data *data;
 
-	data = container_of(ref, struct fixed_file_data, refs);
+	data = container_of(ref, struct fixed_rsrc_data, refs);
 	complete(&data->done);
 }
 
-static void io_sqe_files_set_node(struct fixed_file_data *file_data,
-				  struct fixed_file_ref_node *ref_node)
+static void io_sqe_rsrc_set_node(struct fixed_rsrc_data *rsrc_data,
+				 struct fixed_rsrc_ref_node *ref_node)
 {
-	spin_lock_bh(&file_data->lock);
-	file_data->node = ref_node;
-	list_add_tail(&ref_node->node, &file_data->ref_list);
-	spin_unlock_bh(&file_data->lock);
-	percpu_ref_get(&file_data->refs);
+	spin_lock_bh(&rsrc_data->lock);
+	rsrc_data->node = ref_node;
+	list_add_tail(&ref_node->node, &rsrc_data->ref_list);
+	spin_unlock_bh(&rsrc_data->lock);
+	percpu_ref_get(&rsrc_data->refs);
 }
 
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	struct fixed_file_data *data = ctx->file_data;
-	struct fixed_file_ref_node *backup_node, *ref_node = NULL;
+	struct fixed_rsrc_data *data = ctx->file_data;
+	struct fixed_rsrc_ref_node *backup_node, *ref_node = NULL;
 	unsigned nr_tables, i;
 	int ret;
 
@@ -7368,7 +7373,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	percpu_ref_kill(&data->refs);
 
 	/* wait for all refs nodes to complete */
-	flush_delayed_work(&ctx->file_put_work);
+	flush_delayed_work(&ctx->rsrc_put_work);
 	do {
 		ret = wait_for_completion_interruptible(&data->done);
 		if (!ret)
@@ -7377,7 +7382,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 		if (ret < 0) {
 			percpu_ref_resurrect(&data->refs);
 			reinit_completion(&data->done);
-			io_sqe_files_set_node(data, backup_node);
+			io_sqe_rsrc_set_node(data, backup_node);
 			return ret;
 		}
 	} while (1);
@@ -7391,7 +7396,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	kfree(data);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
-	destroy_fixed_file_ref_node(backup_node);
+	destroy_fixed_rsrc_ref_node(backup_node);
 	return 0;
 }
 
@@ -7614,13 +7619,13 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
 }
 #endif
 
-static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
+static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
 				    unsigned nr_tables, unsigned nr_files)
 {
 	int i;
 
 	for (i = 0; i < nr_tables; i++) {
-		struct fixed_file_table *table = &file_data->table[i];
+		struct fixed_rsrc_table *table = &file_data->table[i];
 		unsigned this_files;
 
 		this_files = min(nr_files, IORING_MAX_FILES_TABLE);
@@ -7635,7 +7640,7 @@ static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
 		return 0;
 
 	for (i = 0; i < nr_tables; i++) {
-		struct fixed_file_table *table = &file_data->table[i];
+		struct fixed_rsrc_table *table = &file_data->table[i];
 		kfree(table->files);
 	}
 	return 1;
@@ -7703,56 +7708,51 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
 #endif
 }
 
-struct io_file_put {
-	struct list_head list;
-	struct file *file;
-};
-
-static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
+static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
 {
-	struct fixed_file_data *file_data = ref_node->file_data;
-	struct io_ring_ctx *ctx = file_data->ctx;
-	struct io_file_put *pfile, *tmp;
+	struct fixed_rsrc_data *rsrc_data = ref_node->rsrc_data;
+	struct io_ring_ctx *ctx = rsrc_data->ctx;
+	struct io_rsrc_put *prsrc, *tmp;
 
-	list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
-		list_del(&pfile->list);
-		io_ring_file_put(ctx, pfile->file);
-		kfree(pfile);
+	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
+		list_del(&prsrc->list);
+		io_ring_file_put(ctx, prsrc->file);
+		kfree(prsrc);
 	}
 
 	percpu_ref_exit(&ref_node->refs);
 	kfree(ref_node);
-	percpu_ref_put(&file_data->refs);
+	percpu_ref_put(&rsrc_data->refs);
 }
 
-static void io_file_put_work(struct work_struct *work)
+static void io_rsrc_put_work(struct work_struct *work)
 {
 	struct io_ring_ctx *ctx;
 	struct llist_node *node;
 
-	ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
-	node = llist_del_all(&ctx->file_put_llist);
+	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
+	node = llist_del_all(&ctx->rsrc_put_llist);
 
 	while (node) {
-		struct fixed_file_ref_node *ref_node;
+		struct fixed_rsrc_ref_node *ref_node;
 		struct llist_node *next = node->next;
 
-		ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
-		__io_file_put_work(ref_node);
+		ref_node = llist_entry(node, struct fixed_rsrc_ref_node, llist);
+		__io_rsrc_put_work(ref_node);
 		node = next;
 	}
 }
 
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 {
-	struct fixed_file_ref_node *ref_node;
-	struct fixed_file_data *data;
+	struct fixed_rsrc_ref_node *ref_node;
+	struct fixed_rsrc_data *data;
 	struct io_ring_ctx *ctx;
 	bool first_add = false;
 	int delay = HZ;
 
-	ref_node = container_of(ref, struct fixed_file_ref_node, refs);
-	data = ref_node->file_data;
+	ref_node = container_of(ref, struct fixed_rsrc_ref_node, refs);
+	data = ref_node->rsrc_data;
 	ctx = data->ctx;
 
 	spin_lock_bh(&data->lock);
@@ -7760,12 +7760,12 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
 
 	while (!list_empty(&data->ref_list)) {
 		ref_node = list_first_entry(&data->ref_list,
-					struct fixed_file_ref_node, node);
+					struct fixed_rsrc_ref_node, node);
 		/* recycle ref nodes in order */
 		if (!ref_node->done)
 			break;
 		list_del(&ref_node->node);
-		first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
+		first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
 	}
 	spin_unlock_bh(&data->lock);
 
@@ -7773,33 +7773,33 @@ static void io_file_data_ref_zero(struct percpu_ref *ref)
 		delay = 0;
 
 	if (!delay)
-		mod_delayed_work(system_wq, &ctx->file_put_work, 0);
+		mod_delayed_work(system_wq, &ctx->rsrc_put_work, 0);
 	else if (first_add)
-		queue_delayed_work(system_wq, &ctx->file_put_work, delay);
+		queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 }
 
-static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
 			struct io_ring_ctx *ctx)
 {
-	struct fixed_file_ref_node *ref_node;
+	struct fixed_rsrc_ref_node *ref_node;
 
 	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
 	if (!ref_node)
 		return NULL;
 
-	if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+	if (percpu_ref_init(&ref_node->refs, io_rsrc_data_ref_zero,
 			    0, GFP_KERNEL)) {
 		kfree(ref_node);
 		return NULL;
 	}
 	INIT_LIST_HEAD(&ref_node->node);
-	INIT_LIST_HEAD(&ref_node->file_list);
-	ref_node->file_data = ctx->file_data;
+	INIT_LIST_HEAD(&ref_node->rsrc_list);
+	ref_node->rsrc_data = ctx->file_data;
 	ref_node->done = false;
 	return ref_node;
 }
 
-static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
 {
 	percpu_ref_exit(&ref_node->refs);
 	kfree(ref_node);
@@ -7812,8 +7812,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	unsigned nr_tables, i;
 	struct file *file;
 	int fd, ret = -ENOMEM;
-	struct fixed_file_ref_node *ref_node;
-	struct fixed_file_data *file_data;
+	struct fixed_rsrc_ref_node *ref_node;
+	struct fixed_rsrc_data *file_data;
 
 	if (ctx->file_data)
 		return -EBUSY;
@@ -7836,7 +7836,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!file_data->table)
 		goto out_free;
 
-	if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
+	if (percpu_ref_init(&file_data->refs, io_rsrc_ref_kill,
 				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 		goto out_free;
 
@@ -7845,7 +7845,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	ctx->file_data = file_data;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct fixed_file_table *table;
+		struct fixed_rsrc_table *table;
 		unsigned index;
 
 		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
@@ -7889,7 +7889,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -ENOMEM;
 	}
 
-	io_sqe_files_set_node(file_data, ref_node);
+	io_sqe_rsrc_set_node(file_data, ref_node);
 	return ret;
 out_fput:
 	for (i = 0; i < ctx->nr_user_files; i++) {
@@ -7952,28 +7952,34 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 #endif
 }
 
-static int io_queue_file_removal(struct fixed_file_data *data,
-				 struct file *file)
+static int io_queue_rsrc_removal(struct fixed_rsrc_data *data,
+				 struct file *rsrc)
 {
-	struct io_file_put *pfile;
-	struct fixed_file_ref_node *ref_node = data->node;
+	struct io_rsrc_put *prsrc;
+	struct fixed_rsrc_ref_node *ref_node = data->node;
 
-	pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
-	if (!pfile)
+	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
+	if (!prsrc)
 		return -ENOMEM;
 
-	pfile->file = file;
-	list_add(&pfile->list, &ref_node->file_list);
+	prsrc->file = rsrc;
+	list_add(&prsrc->list, &ref_node->rsrc_list);
 
 	return 0;
 }
 
+static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
+					struct file *file)
+{
+	return io_queue_rsrc_removal(data, file);
+}
+
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
-				 struct io_uring_files_update *up,
+				 struct io_uring_rsrc_update *up,
 				 unsigned nr_args)
 {
-	struct fixed_file_data *data = ctx->file_data;
-	struct fixed_file_ref_node *ref_node;
+	struct fixed_rsrc_data *data = ctx->file_data;
+	struct fixed_rsrc_ref_node *ref_node;
 	struct file *file;
 	__s32 __user *fds;
 	int fd, i, err;
@@ -7990,9 +7996,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		return -ENOMEM;
 
 	done = 0;
-	fds = u64_to_user_ptr(up->fds);
+	fds = u64_to_user_ptr(up->data);
 	while (nr_args) {
-		struct fixed_file_table *table;
+		struct fixed_rsrc_table *table;
 		unsigned index;
 
 		err = 0;
@@ -8045,9 +8051,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	if (needs_switch) {
 		percpu_ref_kill(&data->node->refs);
-		io_sqe_files_set_node(data, ref_node);
+		io_sqe_rsrc_set_node(data, ref_node);
 	} else
-		destroy_fixed_file_ref_node(ref_node);
+		destroy_fixed_rsrc_ref_node(ref_node);
 
 	return done ? done : err;
 }
@@ -8055,7 +8061,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 			       unsigned nr_args)
 {
-	struct io_uring_files_update up;
+	struct io_uring_rsrc_update up;
 
 	if (!ctx->file_data)
 		return -ENXIO;
@@ -9482,7 +9488,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
-		struct fixed_file_table *table;
+		struct fixed_rsrc_table *table;
 		struct file *f;
 
 		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d31a2a1e8ef9..f9f106c54d90 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -285,12 +285,19 @@ enum {
 	IORING_REGISTER_LAST
 };
 
+/* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
 	__u32 offset;
 	__u32 resv;
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+struct io_uring_rsrc_update {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+};
+
 #define IO_URING_OP_SUPPORTED	(1U << 0)
 
 struct io_uring_probe_op {

From 5023853183699dd1e3e47622c03d7ae11343837a Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:45 +0000
Subject: [PATCH 023/183] io_uring: generalize io_queue_rsrc_removal

Generalize io_queue_rsrc_removal to handle both files and buffers.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
[remove io_mapped_ubuf from rsrc tables/etc. for now]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 95b3b8747a65..e52800e19c60 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -195,9 +195,14 @@ struct io_mapped_ubuf {
 	unsigned long	acct_pages;
 };
 
+struct io_ring_ctx;
+
 struct io_rsrc_put {
 	struct list_head list;
-	struct file *file;
+	union {
+		void *rsrc;
+		struct file *file;
+	};
 };
 
 struct fixed_rsrc_table {
@@ -209,6 +214,8 @@ struct fixed_rsrc_ref_node {
 	struct list_head		node;
 	struct list_head		rsrc_list;
 	struct fixed_rsrc_data		*rsrc_data;
+	void				(*rsrc_put)(struct io_ring_ctx *ctx,
+						    struct io_rsrc_put *prsrc);
 	struct llist_node		llist;
 	bool				done;
 };
@@ -7646,8 +7653,9 @@ static int io_sqe_alloc_file_tables(struct fixed_rsrc_data *file_data,
 	return 1;
 }
 
-static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
+static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
 {
+	struct file *file = prsrc->file;
 #if defined(CONFIG_UNIX)
 	struct sock *sock = ctx->ring_sock->sk;
 	struct sk_buff_head list, *head = &sock->sk_receive_queue;
@@ -7716,7 +7724,7 @@ static void __io_rsrc_put_work(struct fixed_rsrc_ref_node *ref_node)
 
 	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
 		list_del(&prsrc->list);
-		io_ring_file_put(ctx, prsrc->file);
+		ref_node->rsrc_put(ctx, prsrc);
 		kfree(prsrc);
 	}
 
@@ -7795,6 +7803,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
 	INIT_LIST_HEAD(&ref_node->node);
 	INIT_LIST_HEAD(&ref_node->rsrc_list);
 	ref_node->rsrc_data = ctx->file_data;
+	ref_node->rsrc_put = io_ring_file_put;
 	ref_node->done = false;
 	return ref_node;
 }
@@ -7952,8 +7961,7 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
 #endif
 }
 
-static int io_queue_rsrc_removal(struct fixed_rsrc_data *data,
-				 struct file *rsrc)
+static int io_queue_rsrc_removal(struct fixed_rsrc_data *data, void *rsrc)
 {
 	struct io_rsrc_put *prsrc;
 	struct fixed_rsrc_ref_node *ref_node = data->node;
@@ -7962,7 +7970,7 @@ static int io_queue_rsrc_removal(struct fixed_rsrc_data *data,
 	if (!prsrc)
 		return -ENOMEM;
 
-	prsrc->file = rsrc;
+	prsrc->rsrc = rsrc;
 	list_add(&prsrc->list, &ref_node->rsrc_list);
 
 	return 0;
@@ -7971,7 +7979,7 @@ static int io_queue_rsrc_removal(struct fixed_rsrc_data *data,
 static inline int io_queue_file_removal(struct fixed_rsrc_data *data,
 					struct file *file)
 {
-	return io_queue_rsrc_removal(data, file);
+	return io_queue_rsrc_removal(data, (void *)file);
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,

From d67d2263fb2350a68074f2cb4dd78549aeebbfae Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:46 +0000
Subject: [PATCH 024/183] io_uring: separate ref_list from fixed_rsrc_data

Uplevel ref_list and make it common to all resources.  This allows
one common ref_list to be used for both files and buffers in
upcoming patches.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e52800e19c60..fcc7a3ed800a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -227,8 +227,6 @@ struct fixed_rsrc_data {
 	struct fixed_rsrc_ref_node	*node;
 	struct percpu_ref		refs;
 	struct completion		done;
-	struct list_head		ref_list;
-	spinlock_t			lock;
 };
 
 struct io_buffer {
@@ -398,6 +396,8 @@ struct io_ring_ctx {
 
 	struct delayed_work		rsrc_put_work;
 	struct llist_head		rsrc_put_llist;
+	struct list_head		rsrc_ref_list;
+	spinlock_t			rsrc_ref_lock;
 
 	struct work_struct		exit_work;
 	struct io_restriction		restrictions;
@@ -1342,6 +1342,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	spin_lock_init(&ctx->inflight_lock);
 	INIT_LIST_HEAD(&ctx->inflight_list);
+	spin_lock_init(&ctx->rsrc_ref_lock);
+	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
 	return ctx;
@@ -7348,13 +7350,14 @@ static void io_rsrc_ref_kill(struct percpu_ref *ref)
 	complete(&data->done);
 }
 
-static void io_sqe_rsrc_set_node(struct fixed_rsrc_data *rsrc_data,
+static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
+				 struct fixed_rsrc_data *rsrc_data,
 				 struct fixed_rsrc_ref_node *ref_node)
 {
-	spin_lock_bh(&rsrc_data->lock);
+	spin_lock_bh(&ctx->rsrc_ref_lock);
 	rsrc_data->node = ref_node;
-	list_add_tail(&ref_node->node, &rsrc_data->ref_list);
-	spin_unlock_bh(&rsrc_data->lock);
+	list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
+	spin_unlock_bh(&ctx->rsrc_ref_lock);
 	percpu_ref_get(&rsrc_data->refs);
 }
 
@@ -7371,9 +7374,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	if (!backup_node)
 		return -ENOMEM;
 
-	spin_lock_bh(&data->lock);
+	spin_lock_bh(&ctx->rsrc_ref_lock);
 	ref_node = data->node;
-	spin_unlock_bh(&data->lock);
+	spin_unlock_bh(&ctx->rsrc_ref_lock);
 	if (ref_node)
 		percpu_ref_kill(&ref_node->refs);
 
@@ -7389,7 +7392,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 		if (ret < 0) {
 			percpu_ref_resurrect(&data->refs);
 			reinit_completion(&data->done);
-			io_sqe_rsrc_set_node(data, backup_node);
+			io_sqe_rsrc_set_node(ctx, data, backup_node);
 			return ret;
 		}
 	} while (1);
@@ -7763,11 +7766,11 @@ static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 	data = ref_node->rsrc_data;
 	ctx = data->ctx;
 
-	spin_lock_bh(&data->lock);
+	spin_lock_bh(&ctx->rsrc_ref_lock);
 	ref_node->done = true;
 
-	while (!list_empty(&data->ref_list)) {
-		ref_node = list_first_entry(&data->ref_list,
+	while (!list_empty(&ctx->rsrc_ref_list)) {
+		ref_node = list_first_entry(&ctx->rsrc_ref_list,
 					struct fixed_rsrc_ref_node, node);
 		/* recycle ref nodes in order */
 		if (!ref_node->done)
@@ -7775,7 +7778,7 @@ static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 		list_del(&ref_node->node);
 		first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
 	}
-	spin_unlock_bh(&data->lock);
+	spin_unlock_bh(&ctx->rsrc_ref_lock);
 
 	if (percpu_ref_is_dying(&data->refs))
 		delay = 0;
@@ -7836,8 +7839,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -ENOMEM;
 	file_data->ctx = ctx;
 	init_completion(&file_data->done);
-	INIT_LIST_HEAD(&file_data->ref_list);
-	spin_lock_init(&file_data->lock);
 
 	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
 	file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
@@ -7898,7 +7899,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -ENOMEM;
 	}
 
-	io_sqe_rsrc_set_node(file_data, ref_node);
+	io_sqe_rsrc_set_node(ctx, file_data, ref_node);
 	return ret;
 out_fput:
 	for (i = 0; i < ctx->nr_user_files; i++) {
@@ -8059,7 +8060,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	if (needs_switch) {
 		percpu_ref_kill(&data->node->refs);
-		io_sqe_rsrc_set_node(data, ref_node);
+		io_sqe_rsrc_set_node(ctx, data, ref_node);
 	} else
 		destroy_fixed_rsrc_ref_node(ref_node);
 

From 2a63b2d9c30b2029892c368d11ede1434de6c565 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:47 +0000
Subject: [PATCH 025/183] io_uring: add rsrc_ref locking routines

Encapsulate resource reference locking into separate routines.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fcc7a3ed800a..a129192c20d3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7350,14 +7350,24 @@ static void io_rsrc_ref_kill(struct percpu_ref *ref)
 	complete(&data->done);
 }
 
+static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx)
+{
+	spin_lock_bh(&ctx->rsrc_ref_lock);
+}
+
+static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx)
+{
+	spin_unlock_bh(&ctx->rsrc_ref_lock);
+}
+
 static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
 				 struct fixed_rsrc_data *rsrc_data,
 				 struct fixed_rsrc_ref_node *ref_node)
 {
-	spin_lock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_lock(ctx);
 	rsrc_data->node = ref_node;
 	list_add_tail(&ref_node->node, &ctx->rsrc_ref_list);
-	spin_unlock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_unlock(ctx);
 	percpu_ref_get(&rsrc_data->refs);
 }
 
@@ -7374,9 +7384,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	if (!backup_node)
 		return -ENOMEM;
 
-	spin_lock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_lock(ctx);
 	ref_node = data->node;
-	spin_unlock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_unlock(ctx);
 	if (ref_node)
 		percpu_ref_kill(&ref_node->refs);
 
@@ -7766,7 +7776,7 @@ static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 	data = ref_node->rsrc_data;
 	ctx = data->ctx;
 
-	spin_lock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_lock(ctx);
 	ref_node->done = true;
 
 	while (!list_empty(&ctx->rsrc_ref_list)) {
@@ -7778,7 +7788,7 @@ static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 		list_del(&ref_node->node);
 		first_add |= llist_add(&ref_node->llist, &ctx->rsrc_put_llist);
 	}
-	spin_unlock_bh(&ctx->rsrc_ref_lock);
+	io_rsrc_ref_unlock(ctx);
 
 	if (percpu_ref_is_dying(&data->refs))
 		delay = 0;

From 6802535df7bf807c94de32a9d0bf0401d3109671 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:48 +0000
Subject: [PATCH 026/183] io_uring: split alloc_fixed_file_ref_node

Split alloc_fixed_file_ref_node into resource generic/specific parts,
to be leveraged for fixed buffers.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a129192c20d3..ab5bf1bf0779 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7799,7 +7799,7 @@ static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 		queue_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 }
 
-static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx)
 {
 	struct fixed_rsrc_ref_node *ref_node;
@@ -7815,9 +7815,21 @@ static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
 	}
 	INIT_LIST_HEAD(&ref_node->node);
 	INIT_LIST_HEAD(&ref_node->rsrc_list);
+	ref_node->done = false;
+	return ref_node;
+}
+
+static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
+			struct io_ring_ctx *ctx)
+{
+	struct fixed_rsrc_ref_node *ref_node;
+
+	ref_node = alloc_fixed_rsrc_ref_node(ctx);
+	if (!ref_node)
+		return NULL;
+
 	ref_node->rsrc_data = ctx->file_data;
 	ref_node->rsrc_put = io_ring_file_put;
-	ref_node->done = false;
 	return ref_node;
 }
 

From bc9744cd162b2f6c38d75dc49c310677dc13afa8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Jan 2021 17:37:49 +0000
Subject: [PATCH 027/183] io_uring: split ref_node alloc and init

A simple prep patch that allows ref_node callbacks to be set after the
node has been allocated. This is needed to 1) keep ourselves out of
high-level functions where it's not pretty and they are not necessary,
and 2) amortise ref_node allocation in the future, e.g. for updates.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ab5bf1bf0779..bb51f2abd009 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1009,8 +1009,10 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 					    struct task_struct *task);
 
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
-static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
+static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx);
+static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
+				     struct fixed_rsrc_ref_node *ref_node);
 
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 			     struct io_comp_state *cs);
@@ -7380,9 +7382,10 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 
 	if (!data)
 		return -ENXIO;
-	backup_node = alloc_fixed_file_ref_node(ctx);
+	backup_node = alloc_fixed_rsrc_ref_node(ctx);
 	if (!backup_node)
 		return -ENOMEM;
+	init_fixed_file_ref_node(ctx, backup_node);
 
 	io_rsrc_ref_lock(ctx);
 	ref_node = data->node;
@@ -7819,18 +7822,11 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 	return ref_node;
 }
 
-static struct fixed_rsrc_ref_node *alloc_fixed_file_ref_node(
-			struct io_ring_ctx *ctx)
+static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
+				     struct fixed_rsrc_ref_node *ref_node)
 {
-	struct fixed_rsrc_ref_node *ref_node;
-
-	ref_node = alloc_fixed_rsrc_ref_node(ctx);
-	if (!ref_node)
-		return NULL;
-
 	ref_node->rsrc_data = ctx->file_data;
 	ref_node->rsrc_put = io_ring_file_put;
-	return ref_node;
 }
 
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
@@ -7915,11 +7911,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return ret;
 	}
 
-	ref_node = alloc_fixed_file_ref_node(ctx);
+	ref_node = alloc_fixed_rsrc_ref_node(ctx);
 	if (!ref_node) {
 		io_sqe_files_unregister(ctx);
 		return -ENOMEM;
 	}
+	init_fixed_file_ref_node(ctx, ref_node);
 
 	io_sqe_rsrc_set_node(ctx, file_data, ref_node);
 	return ret;
@@ -8022,9 +8019,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	if (done > ctx->nr_user_files)
 		return -EINVAL;
 
-	ref_node = alloc_fixed_file_ref_node(ctx);
+	ref_node = alloc_fixed_rsrc_ref_node(ctx);
 	if (!ref_node)
 		return -ENOMEM;
+	init_fixed_file_ref_node(ctx, ref_node);
 
 	done = 0;
 	fds = u64_to_user_ptr(up->data);

From d7954b2ba94639b7f5b08760d36e54c28544730f Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:50 +0000
Subject: [PATCH 028/183] io_uring: create common fixed_rsrc_ref_node handling
 routines

Create common routines to be used for both file and buffer registration.

[remove io_sqe_rsrc_set_node substitution]

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
[merge, quiesce only for files]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bb51f2abd009..727d0d3cdbcc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7373,20 +7373,13 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx,
 	percpu_ref_get(&rsrc_data->refs);
 }
 
-static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
+			       struct io_ring_ctx *ctx,
+			       struct fixed_rsrc_ref_node *backup_node)
 {
-	struct fixed_rsrc_data *data = ctx->file_data;
-	struct fixed_rsrc_ref_node *backup_node, *ref_node = NULL;
-	unsigned nr_tables, i;
+	struct fixed_rsrc_ref_node *ref_node;
 	int ret;
 
-	if (!data)
-		return -ENXIO;
-	backup_node = alloc_fixed_rsrc_ref_node(ctx);
-	if (!backup_node)
-		return -ENOMEM;
-	init_fixed_file_ref_node(ctx, backup_node);
-
 	io_rsrc_ref_lock(ctx);
 	ref_node = data->node;
 	io_rsrc_ref_unlock(ctx);
@@ -7410,6 +7403,28 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 		}
 	} while (1);
 
+	destroy_fixed_rsrc_ref_node(backup_node);
+	return 0;
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+	struct fixed_rsrc_data *data = ctx->file_data;
+	struct fixed_rsrc_ref_node *backup_node;
+	unsigned nr_tables, i;
+	int ret;
+
+	if (!data)
+		return -ENXIO;
+	backup_node = alloc_fixed_rsrc_ref_node(ctx);
+	if (!backup_node)
+		return -ENOMEM;
+	init_fixed_file_ref_node(ctx, backup_node);
+
+	ret = io_rsrc_ref_quiesce(data, ctx, backup_node);
+	if (ret)
+		return ret;
+
 	__io_sqe_files_unregister(ctx);
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
@@ -7419,7 +7434,6 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	kfree(data);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
-	destroy_fixed_rsrc_ref_node(backup_node);
 	return 0;
 }
 

From 1ad555c6ae6e28ec7b1acaa2af72a9904e6ba96a Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:51 +0000
Subject: [PATCH 029/183] io_uring: create common fixed_rsrc_data allocation
 routines

Create common alloc/free fixed_rsrc_data routines for both files and
buffers.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
[remove buffer part]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 44 +++++++++++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 727d0d3cdbcc..8f7d95e0d240 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7407,6 +7407,31 @@ static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data,
 	return 0;
 }
 
+static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
+{
+	struct fixed_rsrc_data *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	if (percpu_ref_init(&data->refs, io_rsrc_ref_kill,
+			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
+		kfree(data);
+		return NULL;
+	}
+	data->ctx = ctx;
+	init_completion(&data->done);
+	return data;
+}
+
+static void free_fixed_rsrc_data(struct fixed_rsrc_data *data)
+{
+	percpu_ref_exit(&data->refs);
+	kfree(data->table);
+	kfree(data);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	struct fixed_rsrc_data *data = ctx->file_data;
@@ -7429,9 +7454,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
 	for (i = 0; i < nr_tables; i++)
 		kfree(data->table[i].files);
-	kfree(data->table);
-	percpu_ref_exit(&data->refs);
-	kfree(data);
+	free_fixed_rsrc_data(data);
 	ctx->file_data = NULL;
 	ctx->nr_user_files = 0;
 	return 0;
@@ -7866,11 +7889,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (nr_args > IORING_MAX_FIXED_FILES)
 		return -EMFILE;
 
-	file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
+	file_data = alloc_fixed_rsrc_data(ctx);
 	if (!file_data)
 		return -ENOMEM;
-	file_data->ctx = ctx;
-	init_completion(&file_data->done);
 
 	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
 	file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
@@ -7878,12 +7899,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	if (!file_data->table)
 		goto out_free;
 
-	if (percpu_ref_init(&file_data->refs, io_rsrc_ref_kill,
-				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
-		goto out_free;
-
 	if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
-		goto out_ref;
+		goto out_free;
 	ctx->file_data = file_data;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
@@ -7943,11 +7960,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	for (i = 0; i < nr_tables; i++)
 		kfree(file_data->table[i].files);
 	ctx->nr_user_files = 0;
-out_ref:
-	percpu_ref_exit(&file_data->refs);
 out_free:
-	kfree(file_data->table);
-	kfree(file_data);
+	free_fixed_rsrc_data(file_data); /* ctx->file_data may not be set yet */
 	ctx->file_data = NULL;
 	return ret;
 }

From 00835dce1406e746fe5ab8c522cceb9594c78acb Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Fri, 15 Jan 2021 17:37:52 +0000
Subject: [PATCH 030/183] io_uring: make percpu_ref_release names consistent

Make the percpu ref release function names consistent between rsrc data
and nodes.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8f7d95e0d240..98789fece715 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7344,7 +7344,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 #endif
 }
 
-static void io_rsrc_ref_kill(struct percpu_ref *ref)
+static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
 {
 	struct fixed_rsrc_data *data;
 
@@ -7415,7 +7415,7 @@ static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx)
 	if (!data)
 		return NULL;
 
-	if (percpu_ref_init(&data->refs, io_rsrc_ref_kill,
+	if (percpu_ref_init(&data->refs, io_rsrc_data_ref_zero,
 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
 		kfree(data);
 		return NULL;
@@ -7804,7 +7804,7 @@ static void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
-static void io_rsrc_data_ref_zero(struct percpu_ref *ref)
+static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
 	struct fixed_rsrc_ref_node *ref_node;
 	struct fixed_rsrc_data *data;
@@ -7848,7 +7848,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 	if (!ref_node)
 		return NULL;
 
-	if (percpu_ref_init(&ref_node->refs, io_rsrc_data_ref_zero,
+	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
 			    0, GFP_KERNEL)) {
 		kfree(ref_node);
 		return NULL;

From bf6182b6d46e28c3e59b9c0d6097b379cae56b94 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:34 +0000
Subject: [PATCH 031/183] io_uring: optimise io_rw_reissue()

The hot path is IO completing on the first try. Reshuffle io_rw_reissue() so
it's checked first.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 98789fece715..4a8900d480c5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2738,12 +2738,13 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 static bool io_rw_reissue(struct io_kiocb *req, long res)
 {
 #ifdef CONFIG_BLOCK
-	umode_t mode = file_inode(req->file)->i_mode;
+	umode_t mode;
 	int ret;
 
-	if (!S_ISBLK(mode) && !S_ISREG(mode))
+	if (res != -EAGAIN && res != -EOPNOTSUPP)
 		return false;
-	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+	mode = file_inode(req->file)->i_mode;
+	if ((!S_ISBLK(mode) && !S_ISREG(mode)) || io_wq_current_is_worker())
 		return false;
 
 	lockdep_assert_held(&req->ctx->uring_lock);

From dc2a6e9aa9c349d76c318d22bbe26006fda1ce97 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:35 +0000
Subject: [PATCH 032/183] io_uring: refactor io_resubmit_prep()

It's awkward to pass a return value into a function for it to return it
back. Check it at the caller site and clean up io_resubmit_prep() a bit.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4a8900d480c5..be2760ae6c23 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2689,17 +2689,16 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 }
 
 #ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req, int error)
+static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	ssize_t ret = -ECANCELED;
 	struct iov_iter iter;
 	int rw;
 
-	if (error) {
-		ret = error;
-		goto end_req;
-	}
+	/* already prepared */
+	if (req->async_data)
+		return true;
 
 	switch (req->opcode) {
 	case IORING_OP_READV:
@@ -2715,22 +2714,16 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 	default:
 		printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
 				req->opcode);
-		goto end_req;
+		return false;
 	}
 
-	if (!req->async_data) {
-		ret = io_import_iovec(rw, req, &iovec, &iter, false);
-		if (ret < 0)
-			goto end_req;
-		ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-		if (!ret)
-			return true;
-		kfree(iovec);
-	} else {
+	ret = io_import_iovec(rw, req, &iovec, &iter, false);
+	if (ret < 0)
+		return false;
+	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+	if (!ret)
 		return true;
-	}
-end_req:
-	req_set_fail_links(req);
+	kfree(iovec);
 	return false;
 }
 #endif
@@ -2751,12 +2744,12 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 
 	ret = io_sq_thread_acquire_mm_files(req->ctx, req);
 
-	if (io_resubmit_prep(req, ret)) {
+	if (!ret && io_resubmit_prep(req)) {
 		refcount_inc(&req->refs);
 		io_queue_async_work(req);
 		return true;
 	}
-
+	req_set_fail_links(req);
 #endif
 	return false;
 }

From 5c766a908d06e96d30e0ec2511a24fa311553d2c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:36 +0000
Subject: [PATCH 033/183] io_uring: cleanup personalities under uring_lock

personality_idr is usually synchronised by uring_lock, the exception
would be removing personalities in io_ring_ctx_wait_and_kill(), which
is legit as refs are killed by that point but still would be more
resilient to do it under the lock.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index be2760ae6c23..5e576878efd9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8867,6 +8867,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	ctx->cq_overflow_flushed = 1;
 	if (ctx->rings)
 		__io_cqring_overflow_flush(ctx, true, NULL, NULL);
+	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 	mutex_unlock(&ctx->uring_lock);
 
 	io_kill_timeouts(ctx, NULL, NULL);
@@ -8877,7 +8878,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 
 	/* if we failed setting up the ctx, we might not have any rings */
 	io_iopoll_try_reap_events(ctx);
-	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 
 	/*
 	 * Do this upfront, so we won't have a grace period where the ring

From 2d7e935809b7f740442ce79fc6f53e94a1f0b874 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:37 +0000
Subject: [PATCH 034/183] io_uring: inline io_async_submit()

The name is confusing and it's used only in one place.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5e576878efd9..6eb4c25fa18b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1278,11 +1278,6 @@ static inline void io_req_init_async(struct io_kiocb *req)
 		refcount_inc(&req->work.identity->count);
 }
 
-static inline bool io_async_submit(struct io_ring_ctx *ctx)
-{
-	return ctx->flags & IORING_SETUP_SQPOLL;
-}
-
 static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 {
 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -6969,7 +6964,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		}
 
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
-						true, io_async_submit(ctx));
+					true, ctx->flags & IORING_SETUP_SQPOLL);
 		err = io_submit_sqe(req, sqe, &link, &state.comp);
 		if (err)
 			goto fail_req;

From ec30e04ba4a5c265f52482092a5f5f5232947c48 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:38 +0000
Subject: [PATCH 035/183] io_uring: inline __io_commit_cqring()

Inline it in its only user, that's cleaner

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6eb4c25fa18b..347bdcd2c0fe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1364,14 +1364,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq)
 	return false;
 }
 
-static void __io_commit_cqring(struct io_ring_ctx *ctx)
-{
-	struct io_rings *rings = ctx->rings;
-
-	/* order cqe stores with ring update */
-	smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
-}
-
 static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
 {
 	if (req->work.identity == &tctx->__identity)
@@ -1693,7 +1685,9 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 static void io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	io_flush_timeouts(ctx);
-	__io_commit_cqring(ctx);
+
+	/* order cqe stores with ring update */
+	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
 
 	if (unlikely(!list_empty(&ctx->defer_list)))
 		__io_queue_deferred(ctx);

From 888aae2eeddfe1d6c9731cf4af1a1b2605af6470 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:39 +0000
Subject: [PATCH 036/183] io_uring: further deduplicate #CQ events calc

Apparently, there is one more place with a hand-coded calculation of the
number of CQ events in the ring. Use the __io_cqring_events() helper in
io_get_cqring() as well. Naturally, assembly stays identical.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 347bdcd2c0fe..0a578c40b854 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1700,21 +1700,25 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
 }
 
+static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
+{
+	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+}
+
 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 	unsigned tail;
 
-	tail = ctx->cached_cq_tail;
 	/*
 	 * writes to the cq entry need to come after reading head; the
 	 * control dependency is enough as we're using WRITE_ONCE to
 	 * fill the cq entry
 	 */
-	if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
+	if (__io_cqring_events(ctx) == rings->cq_ring_entries)
 		return NULL;
 
-	ctx->cached_cq_tail++;
+	tail = ctx->cached_cq_tail++;
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
@@ -1729,11 +1733,6 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 	return io_wq_current_is_worker();
 }
 
-static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx)
-{
-	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
-}
-
 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	/* see waitqueue_active() comment */

From 85bcb6c67ea145b8032089db891218e3339cbdb8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:40 +0000
Subject: [PATCH 037/183] io_uring: simplify io_alloc_req()

Get rid of a label in io_alloc_req(), it's cleaner to return
directly.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0a578c40b854..9ff84ceff4f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1988,7 +1988,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 		if (unlikely(ret <= 0)) {
 			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 			if (!state->reqs[0])
-				goto fallback;
+				return io_get_fallback_req(ctx);
 			ret = 1;
 		}
 		state->free_reqs = ret;
@@ -1996,8 +1996,6 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 
 	state->free_reqs--;
 	return state->reqs[state->free_reqs];
-fallback:
-	return io_get_fallback_req(ctx);
 }
 
 static inline void io_put_file(struct io_kiocb *req, struct file *file,

From 02b23a9af5ba4db0a85ebb81c8b376b2fe860d0f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:41 +0000
Subject: [PATCH 038/183] io_uring: remove __io_state_file_put

The check in io_state_file_put() is optimised pretty well when called
from __io_file_get(). Don't pollute the code with all these variants.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9ff84ceff4f9..c3e0d6246d71 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2815,16 +2815,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async)
 		wake_up(&ctx->sq_data->wait);
 }
 
-static inline void __io_state_file_put(struct io_submit_state *state)
-{
-	fput_many(state->file, state->file_refs);
-	state->file_refs = 0;
-}
-
 static inline void io_state_file_put(struct io_submit_state *state)
 {
-	if (state->file_refs)
-		__io_state_file_put(state);
+	if (state->file_refs) {
+		fput_many(state->file, state->file_refs);
+		state->file_refs = 0;
+	}
 }
 
 /*
@@ -2842,7 +2838,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 			state->file_refs--;
 			return state->file;
 		}
-		__io_state_file_put(state);
+		io_state_file_put(state);
 	}
 	state->file = fget_many(fd, state->ios_left);
 	if (unlikely(!state->file))

From eab30c4d20dc761d463445e5130421863ff81505 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:42 +0000
Subject: [PATCH 039/183] io_uring: deduplicate failing task_work_add

When io_req_task_work_add() fails, the request will be cancelled by
enqueueing via task_works of io-wq. Extract a function for that.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c3e0d6246d71..90c3cad1723b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2171,6 +2171,16 @@ static int io_req_task_work_add(struct io_kiocb *req)
 	return ret;
 }
 
+static void io_req_task_work_add_fallback(struct io_kiocb *req,
+					  void (*cb)(struct callback_head *))
+{
+	struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
+
+	init_task_work(&req->task_work, cb);
+	task_work_add(tsk, &req->task_work, TWA_NONE);
+	wake_up_process(tsk);
+}
+
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -2225,14 +2235,8 @@ static void io_req_task_queue(struct io_kiocb *req)
 	percpu_ref_get(&req->ctx->refs);
 
 	ret = io_req_task_work_add(req);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		init_task_work(&req->task_work, io_req_task_cancel);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
 }
 
 static inline void io_queue_next(struct io_kiocb *req)
@@ -2350,13 +2354,8 @@ static void io_free_req_deferred(struct io_kiocb *req)
 
 	init_task_work(&req->task_work, io_put_req_deferred_cb);
 	ret = io_req_task_work_add(req);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
 }
 
 static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
@@ -3425,15 +3424,8 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
 	ret = io_req_task_work_add(req);
-	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
-		/* queue just for cancelation */
-		init_task_work(&req->task_work, io_req_task_cancel);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
-	}
+	if (unlikely(ret))
+		io_req_task_work_add_fallback(req, io_req_task_cancel);
 	return 1;
 }
 
@@ -5153,12 +5145,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	 */
 	ret = io_req_task_work_add(req);
 	if (unlikely(ret)) {
-		struct task_struct *tsk;
-
 		WRITE_ONCE(poll->canceled, true);
-		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, TWA_NONE);
-		wake_up_process(tsk);
+		io_req_task_work_add_fallback(req, func);
 	}
 	return 1;
 }

From 8662daec09edcdba2659799040aee1ba575c4799 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:44 +0000
Subject: [PATCH 040/183] io_uring: add a helper timeout mode calculation

Deduplicates translation of timeout flags into hrtimer_mode.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 90c3cad1723b..4e167217c898 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5771,6 +5771,12 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
 	return 0;
 }
 
+static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
+{
+	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
+					    : HRTIMER_MODE_REL;
+}
+
 /*
  * Remove or update an existing timeout command
  */
@@ -5781,14 +5787,11 @@ static int io_timeout_remove(struct io_kiocb *req)
 	int ret;
 
 	spin_lock_irq(&ctx->completion_lock);
-	if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) {
-		enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS)
-					? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
-
-		ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
-	} else {
+	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE))
 		ret = io_timeout_cancel(ctx, tr->addr);
-	}
+	else
+		ret = io_timeout_update(ctx, tr->addr, &tr->ts,
+					io_translate_timeout_mode(tr->flags));
 
 	io_cqring_fill_event(req, ret);
 	io_commit_cqring(ctx);
@@ -5828,11 +5831,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;
 
-	if (flags & IORING_TIMEOUT_ABS)
-		data->mode = HRTIMER_MODE_ABS;
-	else
-		data->mode = HRTIMER_MODE_REL;
-
+	data->mode = io_translate_timeout_mode(flags);
 	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
 	return 0;
 }

From a38d68db6742c19a74141c0f56785ef67f51c504 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:45 +0000
Subject: [PATCH 041/183] io_uring: help inlining of io_req_complete()

__io_req_complete() inlining is a bit weird, some compilers don't
optimise out the non-NULL branch of it even when called as
io_req_complete(). Help it a bit by extracting state and stateless
helpers out of __io_req_complete().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4e167217c898..f676b198ee1b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1886,7 +1886,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	__io_cqring_fill_event(req, res, 0);
 }
 
-static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_req_complete_nostate(struct io_kiocb *req, long res,
+				    unsigned int cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
@@ -1897,6 +1898,7 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	io_cqring_ev_posted(ctx);
+	io_put_req(req);
 }
 
 static void io_submit_flush_completions(struct io_comp_state *cs)
@@ -1932,23 +1934,27 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 	cs->nr = 0;
 }
 
-static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
-			      struct io_comp_state *cs)
+static void io_req_complete_state(struct io_kiocb *req, long res,
+				  unsigned int cflags, struct io_comp_state *cs)
 {
-	if (!cs) {
-		io_cqring_add_event(req, res, cflags);
-		io_put_req(req);
-	} else {
-		io_clean_op(req);
-		req->result = res;
-		req->compl.cflags = cflags;
-		list_add_tail(&req->compl.list, &cs->list);
-		if (++cs->nr >= 32)
-			io_submit_flush_completions(cs);
-	}
+	io_clean_op(req);
+	req->result = res;
+	req->compl.cflags = cflags;
+	list_add_tail(&req->compl.list, &cs->list);
+	if (++cs->nr >= 32)
+		io_submit_flush_completions(cs);
 }
 
-static void io_req_complete(struct io_kiocb *req, long res)
+static inline void __io_req_complete(struct io_kiocb *req, long res,
+				     unsigned cflags, struct io_comp_state *cs)
+{
+	if (!cs)
+		io_req_complete_nostate(req, res, cflags);
+	else
+		io_req_complete_state(req, res, cflags, cs);
+}
+
+static inline void io_req_complete(struct io_kiocb *req, long res)
 {
 	__io_req_complete(req, res, 0, NULL);
 }

From 9affd664f0e0512d8997dbdddb1448a4faf9bc82 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:46 +0000
Subject: [PATCH 042/183] io_uring: don't flush CQEs deep down the stack

io_submit_flush_completions() is called down the stack in the _state
version of io_req_complete(), that's ok because it is only called by
io_uring opcode handler functions directly. Move it up to
__io_queue_sqe() as preparation.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f676b198ee1b..935a16a682a2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1941,8 +1941,7 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
 	req->result = res;
 	req->compl.cflags = cflags;
 	list_add_tail(&req->compl.list, &cs->list);
-	if (++cs->nr >= 32)
-		io_submit_flush_completions(cs);
+	cs->nr++;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, long res,
@@ -6577,7 +6576,15 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 			io_queue_linked_timeout(linked_timeout);
 	} else if (likely(!ret)) {
 		/* drop submission reference */
-		req = io_put_req_find_next(req);
+		if (cs) {
+			io_put_req(req);
+			if (cs->nr >= 32)
+				io_submit_flush_completions(cs);
+			req = NULL;
+		} else {
+			req = io_put_req_find_next(req);
+		}
+
 		if (linked_timeout)
 			io_queue_linked_timeout(linked_timeout);
 

From e342c807f556dbcee1370ab78af1d8faf497d771 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 19 Jan 2021 13:32:47 +0000
Subject: [PATCH 043/183] io_uring: save atomic dec for inline executed reqs

When a request is completed with comp_state, its completion reference
put is deferred to io_submit_flush_completions(), but the submission
is put not far from there, so do it together to save one atomic dec per
request. That targets requests that complete inline, e.g. buffered rw,
send/recv.

Proper benchmarking hasn't been conducted but for nops(batch=32) it was
around 7901 vs 8117 KIOPS (~2.7%), or ~4% per perf profiling.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 935a16a682a2..3f6d055eb6d4 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -629,6 +629,7 @@ enum {
 	REQ_F_NO_FILE_TABLE_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
 	REQ_F_LTIMEOUT_ACTIVE_BIT,
+	REQ_F_COMPLETE_INLINE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -672,6 +673,8 @@ enum {
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 	/* linked timeout is active, i.e. prepared by link's head */
 	REQ_F_LTIMEOUT_ACTIVE	= BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
+	/* completion is deferred through io_comp_state */
+	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
 };
 
 struct async_poll {
@@ -1917,14 +1920,15 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 		 * io_free_req() doesn't care about completion_lock unless one
 		 * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
 		 * because of a potential deadlock with req->work.fs->lock
+		 * We defer both, completion and submission refs.
 		 */
 		if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
 				 |REQ_F_WORK_INITIALIZED)) {
 			spin_unlock_irq(&ctx->completion_lock);
-			io_put_req(req);
+			io_double_put_req(req);
 			spin_lock_irq(&ctx->completion_lock);
 		} else {
-			io_put_req(req);
+			io_double_put_req(req);
 		}
 	}
 	io_commit_cqring(ctx);
@@ -1940,8 +1944,7 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
 	io_clean_op(req);
 	req->result = res;
 	req->compl.cflags = cflags;
-	list_add_tail(&req->compl.list, &cs->list);
-	cs->nr++;
+	req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
 static inline void __io_req_complete(struct io_kiocb *req, long res,
@@ -6576,9 +6579,9 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 			io_queue_linked_timeout(linked_timeout);
 	} else if (likely(!ret)) {
 		/* drop submission reference */
-		if (cs) {
-			io_put_req(req);
-			if (cs->nr >= 32)
+		if (req->flags & REQ_F_COMPLETE_INLINE) {
+			list_add_tail(&req->compl.list, &cs->list);
+			if (++cs->nr >= 32)
 				io_submit_flush_completions(cs);
 			req = NULL;
 		} else {

From 53dec2ea74f2ef360e8455439be96a780baa6097 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Jan 2021 15:41:52 -0700
Subject: [PATCH 044/183] fs: provide locked helper variant of
 close_fd_get_file()

Assumes current->files->file_lock is already held on invocation. Helps
the caller check the file before removing the fd, if it needs to.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/file.c     | 48 +++++++++++++++++++++++++++++++-----------------
 fs/internal.h |  1 +
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index dab120b71e44..f3a4bac2cbe9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -22,6 +22,8 @@
 #include <linux/close_range.h>
 #include <net/sock.h>
 
+#include "internal.h"
+
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
 /* our min() is unusable in constant expressions ;-/ */
@@ -731,6 +733,32 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 	return 0;
 }
 
+/*
+ * See close_fd_get_file() below, this variant assumes current->files->file_lock
+ * is held.
+ */
+int __close_fd_get_file(unsigned int fd, struct file **res)
+{
+	struct files_struct *files = current->files;
+	struct file *file;
+	struct fdtable *fdt;
+
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_err;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_err;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__put_unused_fd(files, fd);
+	get_file(file);
+	*res = file;
+	return 0;
+out_err:
+	*res = NULL;
+	return -ENOENT;
+}
+
 /*
  * variant of close_fd that gets a ref on the file for later fput.
  * The caller must ensure that filp_close() called on the file, and then
@@ -739,27 +767,13 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
 int close_fd_get_file(unsigned int fd, struct file **res)
 {
 	struct files_struct *files = current->files;
-	struct file *file;
-	struct fdtable *fdt;
+	int ret;
 
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (fd >= fdt->max_fds)
-		goto out_unlock;
-	file = fdt->fd[fd];
-	if (!file)
-		goto out_unlock;
-	rcu_assign_pointer(fdt->fd[fd], NULL);
-	__put_unused_fd(files, fd);
+	ret = __close_fd_get_file(fd, res);
 	spin_unlock(&files->file_lock);
-	get_file(file);
-	*res = file;
-	return 0;
 
-out_unlock:
-	spin_unlock(&files->file_lock);
-	*res = NULL;
-	return -ENOENT;
+	return ret;
 }
 
 void do_close_on_exec(struct files_struct *files)
diff --git a/fs/internal.h b/fs/internal.h
index 77c50befbfbe..c6c85f6ad598 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -132,6 +132,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
 extern struct open_how build_open_how(int flags, umode_t mode);
 extern int build_open_flags(const struct open_how *how, struct open_flags *op);
+extern int __close_fd_get_file(unsigned int fd, struct file **res);
 
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 int chmod_common(const struct path *path, umode_t mode);

From 9eac1904d3364254d622bf2c771c4f85cd435fc2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Jan 2021 15:50:37 -0700
Subject: [PATCH 045/183] io_uring: get rid of intermediate IORING_OP_CLOSE
 stage

We currently split the close into two, in case we have a ->flush op
that we can't safely handle from non-blocking context. This requires
us to flag the op as uncancelable if we do need to punt it async, and
that means special handling for just this op type.

Use __close_fd_get_file() and grab the files lock so we can get the file
and check if we need to go async in one atomic operation. That gets rid
of the need for splitting this into two steps, and hence the need for
IO_WQ_WORK_NO_CANCEL.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 64 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3f6d055eb6d4..4dd18c81789c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -423,7 +423,6 @@ struct io_poll_remove {
 
 struct io_close {
 	struct file			*file;
-	struct file			*put_file;
 	int				fd;
 };
 
@@ -920,8 +919,6 @@ static const struct io_op_def io_op_defs[] = {
 						IO_WQ_WORK_FS | IO_WQ_WORK_MM,
 	},
 	[IORING_OP_CLOSE] = {
-		.needs_file		= 1,
-		.needs_file_no_error	= 1,
 		.work_flags		= IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
 	},
 	[IORING_OP_FILES_UPDATE] = {
@@ -4475,13 +4472,6 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
 
 static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	/*
-	 * If we queue this for async, it must not be cancellable. That would
-	 * leave the 'file' in an undeterminate state, and here need to modify
-	 * io_wq_work.flags, so initialize io_wq_work firstly.
-	 */
-	io_req_init_async(req);
-
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
@@ -4491,43 +4481,59 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EBADF;
 
 	req->close.fd = READ_ONCE(sqe->fd);
-	if ((req->file && req->file->f_op == &io_uring_fops))
-		return -EBADF;
-
-	req->close.put_file = NULL;
 	return 0;
 }
 
 static int io_close(struct io_kiocb *req, bool force_nonblock,
 		    struct io_comp_state *cs)
 {
+	struct files_struct *files = current->files;
 	struct io_close *close = &req->close;
+	struct fdtable *fdt;
+	struct file *file;
 	int ret;
 
-	/* might be already done during nonblock submission */
-	if (!close->put_file) {
-		ret = close_fd_get_file(close->fd, &close->put_file);
-		if (ret < 0)
-			return (ret == -ENOENT) ? -EBADF : ret;
+	file = NULL;
+	ret = -EBADF;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (close->fd >= fdt->max_fds) {
+		spin_unlock(&files->file_lock);
+		goto err;
+	}
+	file = fdt->fd[close->fd];
+	if (!file) {
+		spin_unlock(&files->file_lock);
+		goto err;
+	}
+
+	if (file->f_op == &io_uring_fops) {
+		spin_unlock(&files->file_lock);
+		file = NULL;
+		goto err;
 	}
 
 	/* if the file has a flush method, be safe and punt to async */
-	if (close->put_file->f_op->flush && force_nonblock) {
-		/* not safe to cancel at this point */
-		req->work.flags |= IO_WQ_WORK_NO_CANCEL;
-		/* was never set, but play safe */
-		req->flags &= ~REQ_F_NOWAIT;
-		/* avoid grabbing files - we don't need the files */
-		req->flags |= REQ_F_NO_FILE_TABLE;
+	if (file->f_op->flush && force_nonblock) {
+		spin_unlock(&files->file_lock);
 		return -EAGAIN;
 	}
 
+	ret = __close_fd_get_file(close->fd, &file);
+	spin_unlock(&files->file_lock);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = -EBADF;
+		goto err;
+	}
+
 	/* No ->flush() or already async, safely close from here */
-	ret = filp_close(close->put_file, req->work.identity->files);
+	ret = filp_close(file, current->files);
+err:
 	if (ret < 0)
 		req_set_fail_links(req);
-	fput(close->put_file);
-	close->put_file = NULL;
+	if (file)
+		fput(file);
 	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }

From 4014d943cb62db892eb023d385a966a3fce5ee4c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Jan 2021 15:53:54 -0700
Subject: [PATCH 046/183] io_uring/io-wq: kill off now unused
 IO_WQ_WORK_NO_CANCEL

It's no longer used, as IORING_OP_CLOSE got rid of the need for flagging
it as uncancelable, so kill it off.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 1 -
 fs/io-wq.h    | 1 -
 fs/io_uring.c | 5 +----
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index a564f36e260c..2e2f14f42bf2 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -944,7 +944,6 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 	 */
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
-	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
 	    match->fn(worker->cur_work, match->data)) {
 		send_sig(SIGINT, worker->task, 1);
 		match->nr_running++;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index b158f8addcf3..e1ffb80a4a1d 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -9,7 +9,6 @@ enum {
 	IO_WQ_WORK_CANCEL	= 1,
 	IO_WQ_WORK_HASHED	= 2,
 	IO_WQ_WORK_UNBOUND	= 4,
-	IO_WQ_WORK_NO_CANCEL	= 8,
 	IO_WQ_WORK_CONCURRENT	= 16,
 
 	IO_WQ_WORK_FILES	= 32,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4dd18c81789c..be73f6ddbd9e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6386,11 +6386,8 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 	if (timeout)
 		io_queue_linked_timeout(timeout);
 
-	/* if NO_CANCEL is set, we must still run the work */
-	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
-				IO_WQ_WORK_CANCEL) {
+	if (work->flags & IO_WQ_WORK_CANCEL)
 		ret = -ECANCELED;
-	}
 
 	if (!ret) {
 		do {

From 0bead8cd39b9c9c7c4e902018ccf129107ac50ef Mon Sep 17 00:00:00 2001
From: Yejune Deng <yejune.deng@gmail.com>
Date: Thu, 24 Dec 2020 11:02:20 +0800
Subject: [PATCH 047/183] io_uring: simplify io_remove_personalities()

The function io_remove_personalities() is very similar to
io_unregister_personality(), so implement io_remove_personalities()
by calling io_unregister_personality().

Signed-off-by: Yejune Deng <yejune.deng@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index be73f6ddbd9e..b05d37431c12 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8805,9 +8805,8 @@ static int io_uring_fasync(int fd, struct file *file, int on)
 	return fasync_helper(fd, file, on, &ctx->cq_fasync);
 }
 
-static int io_remove_personalities(int id, void *p, void *data)
+static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
 {
-	struct io_ring_ctx *ctx = data;
 	struct io_identity *iod;
 
 	iod = idr_remove(&ctx->personality_idr, id);
@@ -8815,7 +8814,17 @@ static int io_remove_personalities(int id, void *p, void *data)
 		put_cred(iod->creds);
 		if (refcount_dec_and_test(&iod->count))
 			kfree(iod);
+		return 0;
 	}
+
+	return -EINVAL;
+}
+
+static int io_remove_personalities(int id, void *p, void *data)
+{
+	struct io_ring_ctx *ctx = data;
+
+	io_unregister_personality(ctx, id);
 	return 0;
 }
 
@@ -9951,21 +9960,6 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	return ret;
 }
 
-static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
-{
-	struct io_identity *iod;
-
-	iod = idr_remove(&ctx->personality_idr, id);
-	if (iod) {
-		put_cred(iod->creds);
-		if (refcount_dec_and_test(&iod->count))
-			kfree(iod);
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
 static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
 				    unsigned int nr_args)
 {

From ecfc8492820732be652146280912554ced62c32b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 25 Jan 2021 11:42:20 +0000
Subject: [PATCH 048/183] io_uring: ensure only sqo_task has file notes

For SQPOLL io_uring we want to have only one file note held by
sqo_task. Add a warning to make sure it holds. It's deep in
io_uring_add_task_file() out of hot path, so shouldn't hurt.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b05d37431c12..68bf2c8c23a9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9099,6 +9099,10 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
 				fput(file);
 				return ret;
 			}
+
+			/* one and only SQPOLL file note, held by sqo_task */
+			WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) &&
+				     current != ctx->sqo_task);
 		}
 		tctx->last = file;
 	}

From 7c6607313f032b73638a6f752cb4adf50ba947cf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 25 Jan 2021 11:42:21 +0000
Subject: [PATCH 049/183] io_uring: consolidate putting reqs task

We grab a task for each request, and while putting it we also have to do
extra work like inflight accounting and waking up that task. This
sequence is duplicated several times, so it's a good time to add a helper.
Moreover, the helper generates better code due to better locality,
which keeps alias analysis from failing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 68bf2c8c23a9..6d45a0975d9c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2023,17 +2023,22 @@ static void io_dismantle_req(struct io_kiocb *req)
 	io_req_clean_work(req);
 }
 
+static inline void io_put_task(struct task_struct *task, int nr)
+{
+	struct io_uring_task *tctx = task->io_uring;
+
+	percpu_counter_sub(&tctx->inflight, nr);
+	if (unlikely(atomic_read(&tctx->in_idle)))
+		wake_up(&tctx->wait);
+	put_task_struct_many(task, nr);
+}
+
 static void __io_free_req(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_dismantle_req(req);
-
-	percpu_counter_dec(&tctx->inflight);
-	if (atomic_read(&tctx->in_idle))
-		wake_up(&tctx->wait);
-	put_task_struct(req->task);
+	io_put_task(req->task, 1);
 
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
@@ -2287,12 +2292,7 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 	if (rb->to_free)
 		__io_req_free_batch_flush(ctx, rb);
 	if (rb->task) {
-		struct io_uring_task *tctx = rb->task->io_uring;
-
-		percpu_counter_sub(&tctx->inflight, rb->task_refs);
-		if (atomic_read(&tctx->in_idle))
-			wake_up(&tctx->wait);
-		put_task_struct_many(rb->task, rb->task_refs);
+		io_put_task(rb->task, rb->task_refs);
 		rb->task = NULL;
 	}
 }
@@ -2306,14 +2306,8 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 	io_queue_next(req);
 
 	if (req->task != rb->task) {
-		if (rb->task) {
-			struct io_uring_task *tctx = rb->task->io_uring;
-
-			percpu_counter_sub(&tctx->inflight, rb->task_refs);
-			if (atomic_read(&tctx->in_idle))
-				wake_up(&tctx->wait);
-			put_task_struct_many(rb->task, rb->task_refs);
-		}
+		if (rb->task)
+			io_put_task(rb->task, rb->task_refs);
 		rb->task = req->task;
 		rb->task_refs = 0;
 	}

From 67973b933e347c38478b591d6c9dc076bea7c9dc Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 26 Jan 2021 13:51:09 +0000
Subject: [PATCH 050/183] io_uring: cleanup files_update looping

Replace the while loop with a simple for loop, which looks far more
natural and enables us to use "continue", as indexes are no longer
updated by hand at the end of the loop.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6d45a0975d9c..0ca99bd5c316 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8028,9 +8028,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		return -ENOMEM;
 	init_fixed_file_ref_node(ctx, ref_node);
 
-	done = 0;
 	fds = u64_to_user_ptr(up->data);
-	while (nr_args) {
+	for (done = 0; done < nr_args; done++) {
 		struct fixed_rsrc_table *table;
 		unsigned index;
 
@@ -8039,7 +8038,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			err = -EFAULT;
 			break;
 		}
-		i = array_index_nospec(up->offset, ctx->nr_user_files);
+		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
 		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
 		index = i & IORING_FILE_TABLE_MASK;
 		if (table->files[index]) {
@@ -8077,9 +8076,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				break;
 			}
 		}
-		nr_args--;
-		done++;
-		up->offset++;
 	}
 
 	if (needs_switch) {

From 4e0377a1c5c633852f443a562ec55f7dfea65350 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Tue, 26 Jan 2021 15:23:28 -0500
Subject: [PATCH 051/183] io_uring: Add skip option for __io_sqe_files_update

This patch adds support for skipping a file descriptor when using
IORING_REGISTER_FILES_UPDATE.  __io_sqe_files_update will skip fds set
to IORING_REGISTER_FILES_SKIP. IORING_REGISTER_FILES_SKIP is in turn
added as a #define in io_uring.h.

Signed-off-by: noah <goldstein.w.n@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 3 +++
 include/uapi/linux/io_uring.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0ca99bd5c316..dd83a64ba709 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8038,6 +8038,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			err = -EFAULT;
 			break;
 		}
+		if (fd == IORING_REGISTER_FILES_SKIP)
+			continue;
+
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
 		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
 		index = i & IORING_FILE_TABLE_MASK;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f9f106c54d90..ac4e1738a9af 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -298,6 +298,9 @@ struct io_uring_rsrc_update {
 	__aligned_u64 data;
 };
 
+/* Skip updating fd indexes set to this value in the fd table */
+#define IORING_REGISTER_FILES_SKIP	(-2)
+
 #define IO_URING_OP_SUPPORTED	(1U << 0)
 
 struct io_uring_probe_op {

From 090da7d52fe2aeabb73bf300154278e411cd069e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 27 Jan 2021 01:25:04 +0000
Subject: [PATCH 052/183] MAINTAINERS: update io_uring section

Add the missing kernel io_uring header, add Pavel as a reviewer, and
exclude io_uring from the FILESYSTEMS section to stop spamming Al
(mainly) with bug reports, patches, etc.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d3e847f7f3dc..363e7aa3b79c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6863,6 +6863,9 @@ F:	include/linux/fs.h
 F:	include/linux/fs_types.h
 F:	include/uapi/linux/fs.h
 F:	include/uapi/linux/openat2.h
+X:	fs/io-wq.c
+X:	fs/io-wq.h
+X:	fs/io_uring.c
 
 FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
 M:	Riku Voipio <riku.voipio@iki.fi>
@@ -9295,6 +9298,7 @@ F:	include/uapi/linux/iommu.h
 
 IO_URING
 M:	Jens Axboe <axboe@kernel.dk>
+R:	Pavel Begunkov <asml.silence@gmail.com>
 L:	io-uring@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.dk/linux-block
@@ -9302,6 +9306,7 @@ T:	git git://git.kernel.dk/liburing
 F:	fs/io-wq.c
 F:	fs/io-wq.h
 F:	fs/io_uring.c
+F:	include/linux/io_uring.h
 F:	include/uapi/linux/io_uring.h
 
 IPMI SUBSYSTEM

From 8b28fdf21193d35d6ec5a8430f0241f5f977c6ac Mon Sep 17 00:00:00 2001
From: Hao Xu <haoxu@linux.alibaba.com>
Date: Sun, 31 Jan 2021 22:39:04 +0800
Subject: [PATCH 053/183] io_uring: check kthread parked flag before sqthread
 goes to sleep

Abaci reported this issue:

#[  605.170872] INFO: task kworker/u4:1:53 blocked for more than 143 seconds.
[  605.172123]       Not tainted 5.10.0+ #1
[  605.172811] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  605.173915] task:kworker/u4:1    state:D stack:    0 pid:   53 ppid:     2 flags:0x00004000
[  605.175130] Workqueue: events_unbound io_ring_exit_work
[  605.175931] Call Trace:
[  605.176334]  __schedule+0xe0e/0x25a0
[  605.176971]  ? firmware_map_remove+0x1a1/0x1a1
[  605.177631]  ? write_comp_data+0x2a/0x80
[  605.178272]  schedule+0xd0/0x270
[  605.178811]  schedule_timeout+0x6b6/0x940
[  605.179415]  ? mark_lock.part.0+0xca/0x1420
[  605.180062]  ? usleep_range+0x170/0x170
[  605.180684]  ? wait_for_completion+0x16d/0x280
[  605.181392]  ? mark_held_locks+0x9e/0xe0
[  605.182079]  ? rwlock_bug.part.0+0x90/0x90
[  605.182853]  ? lockdep_hardirqs_on_prepare+0x286/0x400
[  605.183817]  wait_for_completion+0x175/0x280
[  605.184713]  ? wait_for_completion_interruptible+0x340/0x340
[  605.185611]  ? _raw_spin_unlock_irq+0x24/0x30
[  605.186307]  ? migrate_swap_stop+0x9c0/0x9c0
[  605.187046]  kthread_park+0x127/0x1c0
[  605.187738]  io_sq_thread_stop+0xd5/0x530
[  605.188459]  io_ring_exit_work+0xb1/0x970
[  605.189207]  process_one_work+0x92c/0x1510
[  605.189947]  ? pwq_dec_nr_in_flight+0x360/0x360
[  605.190682]  ? rwlock_bug.part.0+0x90/0x90
[  605.191430]  ? write_comp_data+0x2a/0x80
[  605.192207]  worker_thread+0x9b/0xe20
[  605.192900]  ? process_one_work+0x1510/0x1510
[  605.193599]  kthread+0x353/0x460
[  605.194154]  ? _raw_spin_unlock_irq+0x24/0x30
[  605.194910]  ? kthread_create_on_node+0x100/0x100
[  605.195821]  ret_from_fork+0x1f/0x30
[  605.196605]
[  605.196605] Showing all locks held in the system:
[  605.197598] 1 lock held by khungtaskd/25:
[  605.198301]  #0: ffffffff8b5f76a0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.0+0x0/0x30
[  605.199914] 3 locks held by kworker/u4:1/53:
[  605.200609]  #0: ffff888100109938 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x82a/0x1510
[  605.202108]  #1: ffff888100e47dc0 ((work_completion)(&ctx->exit_work)){+.+.}-{0:0}, at: process_one_work+0x85e/0x1510
[  605.203681]  #2: ffff888116931870 (&sqd->lock){+.+.}-{3:3}, at: io_sq_thread_park.part.0+0x19/0x50
[  605.205183] 3 locks held by systemd-journal/161:
[  605.206037] 1 lock held by syslog-ng/254:
[  605.206674] 2 locks held by agetty/311:
[  605.207292]  #0: ffff888101097098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x27/0x80
[  605.208715]  #1: ffffc900000332e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x222/0x1bb0
[  605.210131] 2 locks held by bash/677:
[  605.210723]  #0: ffff88810419a098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x27/0x80
[  605.212105]  #1: ffffc900000512e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x222/0x1bb0
[  605.213777]
[  605.214151] =============================================

I believe this is caused by the following race:

(ctx_list is empty now)
=> io_put_sq_data               |
==> kthread_park(sqd->thread);  |
====> set KTHREAD_SHOULD_PARK	|
====> wake_up_process(k)        | sq thread is running
				|
				|
				| needs_sched is true since no ctx,
				| so TASK_INTERRUPTIBLE set and schedule
				| out then never wake up again
				|
====> wait_for_completion	|
	(stuck here)

So check whether the sqthread has the park flag set right before
schedule(). Since ctx_list is always empty when this problem happens, I put
kthread_should_park() before setting the wakeup flag (ctx_list is empty
so this for loop is fast), which is close enough to schedule(). The
problem doesn't show up again in my repro testing after this fix.

Reported-by: Abaci <abaci@linux.alibaba.com>
Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dd83a64ba709..a8bf867b6cf2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7117,9 +7117,6 @@ static int io_sq_thread(void *data)
 			continue;
 		}
 
-		if (kthread_should_park())
-			continue;
-
 		needs_sched = true;
 		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -7134,7 +7131,7 @@ static int io_sq_thread(void *data)
 			}
 		}
 
-		if (needs_sched) {
+		if (needs_sched && !kthread_should_park()) {
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 				io_ring_set_wakeup_flag(ctx);
 

From 13770a71ed35512cc73c6b350297a797f0b27880 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 1 Feb 2021 15:23:42 +0300
Subject: [PATCH 054/183] io_uring: Fix NULL dereference in error in
 io_sqe_files_register()

If we hit a "goto out_free;" before the "ctx->file_data" pointer has
been assigned then it leads to a NULL dereference when we call:

	free_fixed_rsrc_data(ctx->file_data);

We can fix this by moving the assignment earlier.

Fixes: 1ad555c6ae6e ("io_uring: create common fixed_rsrc_data allocation routines")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a8bf867b6cf2..6711200ece22 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7865,6 +7865,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	file_data = alloc_fixed_rsrc_data(ctx);
 	if (!file_data)
 		return -ENOMEM;
+	ctx->file_data = file_data;
 
 	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
 	file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
@@ -7874,7 +7875,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 
 	if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
 		goto out_free;
-	ctx->file_data = file_data;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
 		struct fixed_rsrc_table *table;

From 9ae1f8dd372e0e4c020b345cf9e09f519265e981 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:51 +0000
Subject: [PATCH 055/183] io_uring: fix inconsistent lock state

WARNING: inconsistent lock state

inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
syz-executor217/8450 [HC1[1]:SC0[0]:HE0:SE1] takes:
ffff888023d6e620 (&fs->lock){?.+.}-{2:2}, at: spin_lock include/linux/spinlock.h:354 [inline]
ffff888023d6e620 (&fs->lock){?.+.}-{2:2}, at: io_req_clean_work fs/io_uring.c:1398 [inline]
ffff888023d6e620 (&fs->lock){?.+.}-{2:2}, at: io_dismantle_req+0x66f/0xf60 fs/io_uring.c:2029

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&fs->lock);
  <Interrupt>
    lock(&fs->lock);

 *** DEADLOCK ***

1 lock held by syz-executor217/8450:
 #0: ffff88802417c3e8 (&ctx->uring_lock){+.+.}-{3:3}, at: __do_sys_io_uring_enter+0x1071/0x1f30 fs/io_uring.c:9442

stack backtrace:
CPU: 1 PID: 8450 Comm: syz-executor217 Not tainted 5.11.0-rc5-next-20210129-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <IRQ>
[...]
 _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151
 spin_lock include/linux/spinlock.h:354 [inline]
 io_req_clean_work fs/io_uring.c:1398 [inline]
 io_dismantle_req+0x66f/0xf60 fs/io_uring.c:2029
 __io_free_req+0x3d/0x2e0 fs/io_uring.c:2046
 io_free_req fs/io_uring.c:2269 [inline]
 io_double_put_req fs/io_uring.c:2392 [inline]
 io_put_req+0xf9/0x570 fs/io_uring.c:2388
 io_link_timeout_fn+0x30c/0x480 fs/io_uring.c:6497
 __run_hrtimer kernel/time/hrtimer.c:1519 [inline]
 __hrtimer_run_queues+0x609/0xe40 kernel/time/hrtimer.c:1583
 hrtimer_interrupt+0x334/0x940 kernel/time/hrtimer.c:1645
 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1085 [inline]
 __sysvec_apic_timer_interrupt+0x146/0x540 arch/x86/kernel/apic/apic.c:1102
 asm_call_irq_on_stack+0xf/0x20
 </IRQ>
 __run_sysvec_on_irqstack arch/x86/include/asm/irq_stack.h:37 [inline]
 run_sysvec_on_irqstack_cond arch/x86/include/asm/irq_stack.h:89 [inline]
 sysvec_apic_timer_interrupt+0xbd/0x100 arch/x86/kernel/apic/apic.c:1096
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:629
RIP: 0010:__raw_spin_unlock_irq include/linux/spinlock_api_smp.h:169 [inline]
RIP: 0010:_raw_spin_unlock_irq+0x25/0x40 kernel/locking/spinlock.c:199
 spin_unlock_irq include/linux/spinlock.h:404 [inline]
 io_queue_linked_timeout+0x194/0x1f0 fs/io_uring.c:6525
 __io_queue_sqe+0x328/0x1290 fs/io_uring.c:6594
 io_queue_sqe+0x631/0x10d0 fs/io_uring.c:6639
 io_queue_link_head fs/io_uring.c:6650 [inline]
 io_submit_sqe fs/io_uring.c:6697 [inline]
 io_submit_sqes+0x19b5/0x2720 fs/io_uring.c:6960
 __do_sys_io_uring_enter+0x107d/0x1f30 fs/io_uring.c:9443
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Don't free requests from under hrtimer context (softirq) as it may sleep
or take spinlocks improperly (e.g. non-irq versions).

Cc: stable@vger.kernel.org # 5.6+
Reported-by: syzbot+81d17233a2b02eafba33@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6711200ece22..1310c074f4cc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1886,8 +1886,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	__io_cqring_fill_event(req, res, 0);
 }
 
-static void io_req_complete_nostate(struct io_kiocb *req, long res,
-				    unsigned int cflags)
+static void io_req_complete_post(struct io_kiocb *req, long res,
+				 unsigned int cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
@@ -1898,6 +1898,12 @@ static void io_req_complete_nostate(struct io_kiocb *req, long res,
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	io_cqring_ev_posted(ctx);
+}
+
+static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
+					   unsigned int cflags)
+{
+	io_req_complete_post(req, res, cflags);
 	io_put_req(req);
 }
 
@@ -6489,9 +6495,10 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	if (prev) {
 		req_set_fail_links(prev);
 		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
-		io_put_req(prev);
+		io_put_req_deferred(prev, 1);
 	} else {
-		io_req_complete(req, -ETIME);
+		io_req_complete_post(req, -ETIME, 0);
+		io_put_req_deferred(req, 1);
 	}
 	return HRTIMER_NORESTART;
 }

From ba13e23f37c795bdd993523a6749d7afbf5ff7fb Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:52 +0000
Subject: [PATCH 056/183] io_uring: kill not used needs_file_no_error

We have no request types left using needs_file_no_error, remove it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1310c074f4cc..66d9f3f4e43b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -788,8 +788,6 @@ struct io_submit_state {
 struct io_op_def {
 	/* needs req->file assigned */
 	unsigned		needs_file : 1;
-	/* don't fail if file grab fails */
-	unsigned		needs_file_no_error : 1;
 	/* hash wq insertion if file is a regular file */
 	unsigned		hash_reg_file : 1;
 	/* unbound wq insertion if file is a non-regular file */
@@ -6896,8 +6894,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		bool fixed = req->flags & REQ_F_FIXED_FILE;
 
 		req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed);
-		if (unlikely(!req->file &&
-		    !io_op_defs[req->opcode].needs_file_no_error))
+		if (unlikely(!req->file))
 			ret = -EBADF;
 	}
 

From 34e08fed2c1cc67df88d85fedde1d05fec62e5ca Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:53 +0000
Subject: [PATCH 057/183] io_uring: inline io_req_drop_files()

req->files now has the same lifetime as all other iowq-work resources,
so inline io_req_drop_files() for consistency. Moreover, since
REQ_F_INFLIGHT is no longer files-specific, the function name became
very confusing.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 66d9f3f4e43b..9354e61243d9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1036,7 +1036,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
 			     struct iov_iter *iter, bool force);
-static void io_req_drop_files(struct io_kiocb *req);
 static void io_req_task_queue(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -1402,8 +1401,23 @@ static void io_req_clean_work(struct io_kiocb *req)
 			free_fs_struct(fs);
 		req->work.flags &= ~IO_WQ_WORK_FS;
 	}
-	if (req->flags & REQ_F_INFLIGHT)
-		io_req_drop_files(req);
+	if (req->work.flags & IO_WQ_WORK_FILES) {
+		put_files_struct(req->work.identity->files);
+		put_nsproxy(req->work.identity->nsproxy);
+		req->work.flags &= ~IO_WQ_WORK_FILES;
+	}
+	if (req->flags & REQ_F_INFLIGHT) {
+		struct io_ring_ctx *ctx = req->ctx;
+		struct io_uring_task *tctx = req->task->io_uring;
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->inflight_lock, flags);
+		list_del(&req->inflight_entry);
+		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+		req->flags &= ~REQ_F_INFLIGHT;
+		if (atomic_read(&tctx->in_idle))
+			wake_up(&tctx->wait);
+	}
 
 	io_put_identity(req->task->io_uring, req);
 }
@@ -6164,25 +6178,6 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EIOCBQUEUED;
 }
 
-static void io_req_drop_files(struct io_kiocb *req)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-	struct io_uring_task *tctx = req->task->io_uring;
-	unsigned long flags;
-
-	if (req->work.flags & IO_WQ_WORK_FILES) {
-		put_files_struct(req->work.identity->files);
-		put_nsproxy(req->work.identity->nsproxy);
-	}
-	spin_lock_irqsave(&ctx->inflight_lock, flags);
-	list_del(&req->inflight_entry);
-	spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-	req->flags &= ~REQ_F_INFLIGHT;
-	req->work.flags &= ~IO_WQ_WORK_FILES;
-	if (atomic_read(&tctx->in_idle))
-		wake_up(&tctx->wait);
-}
-
 static void __io_clean_op(struct io_kiocb *req)
 {
 	if (req->flags & REQ_F_BUFFER_SELECTED) {

From e86d004729ae9ce7d16ff3fad3708e1601eec0d2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:54 +0000
Subject: [PATCH 058/183] io_uring: remove work flags after cleanup

This shouldn't be a problem now, but it's better to clear
REQ_F_WORK_INITIALIZED and work->flags only after the relevant resources
are killed, so that cancellation can still see them.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9354e61243d9..9b1f919b05c9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1374,22 +1374,14 @@ static void io_req_clean_work(struct io_kiocb *req)
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;
 
-	req->flags &= ~REQ_F_WORK_INITIALIZED;
-
-	if (req->work.flags & IO_WQ_WORK_MM) {
+	if (req->work.flags & IO_WQ_WORK_MM)
 		mmdrop(req->work.identity->mm);
-		req->work.flags &= ~IO_WQ_WORK_MM;
-	}
 #ifdef CONFIG_BLK_CGROUP
-	if (req->work.flags & IO_WQ_WORK_BLKCG) {
+	if (req->work.flags & IO_WQ_WORK_BLKCG)
 		css_put(req->work.identity->blkcg_css);
-		req->work.flags &= ~IO_WQ_WORK_BLKCG;
-	}
 #endif
-	if (req->work.flags & IO_WQ_WORK_CREDS) {
+	if (req->work.flags & IO_WQ_WORK_CREDS)
 		put_cred(req->work.identity->creds);
-		req->work.flags &= ~IO_WQ_WORK_CREDS;
-	}
 	if (req->work.flags & IO_WQ_WORK_FS) {
 		struct fs_struct *fs = req->work.identity->fs;
 
@@ -1399,12 +1391,10 @@ static void io_req_clean_work(struct io_kiocb *req)
 		spin_unlock(&req->work.identity->fs->lock);
 		if (fs)
 			free_fs_struct(fs);
-		req->work.flags &= ~IO_WQ_WORK_FS;
 	}
 	if (req->work.flags & IO_WQ_WORK_FILES) {
 		put_files_struct(req->work.identity->files);
 		put_nsproxy(req->work.identity->nsproxy);
-		req->work.flags &= ~IO_WQ_WORK_FILES;
 	}
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -1419,6 +1409,9 @@ static void io_req_clean_work(struct io_kiocb *req)
 			wake_up(&tctx->wait);
 	}
 
+	req->flags &= ~REQ_F_WORK_INITIALIZED;
+	req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS |
+			     IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES);
 	io_put_identity(req->task->io_uring, req);
 }
 

From ce3d5aae331fa0eb1e88199e0380f517ed0c58f6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:55 +0000
Subject: [PATCH 059/183] io_uring: deduplicate adding to REQ_F_INFLIGHT

We don't know how long REQ_F_INFLIGHT is going to stay around, so it is
cleaner to extract a helper for marking requests as inflight.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9b1f919b05c9..77878274fcb1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1460,11 +1460,24 @@ static bool io_identity_cow(struct io_kiocb *req)
 	return true;
 }
 
+static void io_req_track_inflight(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(req->flags & REQ_F_INFLIGHT)) {
+		io_req_init_async(req);
+		req->flags |= REQ_F_INFLIGHT;
+
+		spin_lock_irq(&ctx->inflight_lock);
+		list_add(&req->inflight_entry, &ctx->inflight_list);
+		spin_unlock_irq(&ctx->inflight_lock);
+	}
+}
+
 static bool io_grab_identity(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 	struct io_identity *id = req->work.identity;
-	struct io_ring_ctx *ctx = req->ctx;
 
 	if (def->work_flags & IO_WQ_WORK_FSIZE) {
 		if (id->fsize != rlimit(RLIMIT_FSIZE))
@@ -1520,15 +1533,8 @@ static bool io_grab_identity(struct io_kiocb *req)
 			return false;
 		atomic_inc(&id->files->count);
 		get_nsproxy(id->nsproxy);
-
-		if (!(req->flags & REQ_F_INFLIGHT)) {
-			req->flags |= REQ_F_INFLIGHT;
-
-			spin_lock_irq(&ctx->inflight_lock);
-			list_add(&req->inflight_entry, &ctx->inflight_list);
-			spin_unlock_irq(&ctx->inflight_lock);
-		}
 		req->work.flags |= IO_WQ_WORK_FILES;
+		io_req_track_inflight(req);
 	}
 	if (!(req->work.flags & IO_WQ_WORK_MM) &&
 	    (def->work_flags & IO_WQ_WORK_MM)) {
@@ -6443,16 +6449,8 @@ static struct file *io_file_get(struct io_submit_state *state,
 		file = __io_file_get(state, fd);
 	}
 
-	if (file && file->f_op == &io_uring_fops &&
-	    !(req->flags & REQ_F_INFLIGHT)) {
-		io_req_init_async(req);
-		req->flags |= REQ_F_INFLIGHT;
-
-		spin_lock_irq(&ctx->inflight_lock);
-		list_add(&req->inflight_entry, &ctx->inflight_list);
-		spin_unlock_irq(&ctx->inflight_lock);
-	}
-
+	if (file && unlikely(file->f_op == &io_uring_fops))
+		io_req_track_inflight(req);
 	return file;
 }
 

From 57cd657b8272a66277c139e7bbdc8b86057cb415 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 1 Feb 2021 18:59:56 +0000
Subject: [PATCH 060/183] io_uring: simplify do_read return parsing

do_read() returning 0 bytes read (not -EAGAIN/etc.) is not an important
enough case to prioritise. Fold it into the ret < 0 check, so we get
rid of an extra if and make the code a bit more readable.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 77878274fcb1..24ad36d71289 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3526,7 +3526,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	else
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
-
 	/* If the file doesn't support async, just async punt */
 	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
 	if (no_async)
@@ -3538,9 +3537,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 
 	ret = io_iter_do_read(req, iter);
 
-	if (!ret) {
-		goto done;
-	} else if (ret == -EIOCBQUEUED) {
+	if (ret == -EIOCBQUEUED) {
 		ret = 0;
 		goto out_free;
 	} else if (ret == -EAGAIN) {
@@ -3554,7 +3551,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
 		goto copy_iov;
-	} else if (ret < 0) {
+	} else if (ret <= 0) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
 	}

From cc3456226176385aed8aa6ebb021ebb1380a0183 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@nvidia.com>
Date: Thu, 7 Jan 2021 17:34:13 +0200
Subject: [PATCH 061/183] nvmet: Use nvmet_is_port_enabled helper for pi_enable

Remove code duplication.

Signed-off-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/configfs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index c61ffd767062..b2021bf6cee5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -266,10 +266,8 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item,
 	if (strtobool(page, &val))
 		return -EINVAL;
 
-	if (port->enabled) {
-		pr_err("Disable port before setting pi_enable value.\n");
+	if (nvmet_is_port_enabled(port, __func__))
 		return -EACCES;
-	}
 
 	port->pi_enable = val;
 	return count;

From 36ca03c830e41769c62d2ca15be8351059f86c45 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@nvidia.com>
Date: Thu, 7 Jan 2021 17:34:14 +0200
Subject: [PATCH 062/183] nvmet: Fix nvmet_is_port_enabled indentation

Remove extra tab.

Signed-off-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/configfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index b2021bf6cee5..635a7cb45d0b 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -45,7 +45,7 @@ static bool nvmet_is_port_enabled(struct nvmet_port *p, const char *caller)
 {
 	if (p->enabled)
 		pr_err("Disable port '%u' before changing attribute in %s\n",
-				le16_to_cpu(p->disc_addr.portid), caller);
+		       le16_to_cpu(p->disc_addr.portid), caller);
 	return p->enabled;
 }
 

From 4e2f02bf77dac7b8c841f93ae5a71556d733cb04 Mon Sep 17 00:00:00 2001
From: Leonid Ravich <Leonid.Ravich@emc.com>
Date: Sun, 3 Jan 2021 20:12:54 +0200
Subject: [PATCH 063/183] nvmet-fc: use RCU protection for assoc_list

Searching assoc_list is protected by rcu_read_lock() where the list is
not changed inline, in accordance with the RCU list rules.

The queue array embedded in nvmet_fc_tgt_assoc is protected by
rcu_read_lock(), following the RCU dereference/assign rules.

Queue and assoc objects are freed after an RCU grace period (kfree_rcu()).

The tgtport lock is taken when changing assoc_list.

Reviewed-by: Eldad Zinger <Eldad.Zinger@dell.com>
Reviewed-by: Elad Grupi <Elad.Grupi@dell.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Leonid Ravich <Leonid.Ravich@emc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fc.c | 81 +++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 43 deletions(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index cd4e73aa9807..c14c60bfdf85 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -145,6 +145,7 @@ struct nvmet_fc_tgt_queue {
 	struct list_head		avail_defer_list;
 	struct workqueue_struct		*work_q;
 	struct kref			ref;
+	struct rcu_head			rcu;
 	struct nvmet_fc_fcp_iod		fod[];		/* array of fcp_iods */
 } __aligned(sizeof(unsigned long long));
 
@@ -167,6 +168,7 @@ struct nvmet_fc_tgt_assoc {
 	struct nvmet_fc_tgt_queue	*queues[NVMET_NR_QUEUES + 1];
 	struct kref			ref;
 	struct work_struct		del_work;
+	struct rcu_head			rcu;
 };
 
 
@@ -790,7 +792,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
 			u16 qid, u16 sqsize)
 {
 	struct nvmet_fc_tgt_queue *queue;
-	unsigned long flags;
 	int ret;
 
 	if (qid > NVMET_NR_QUEUES)
@@ -829,9 +830,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
 		goto out_fail_iodlist;
 
 	WARN_ON(assoc->queues[qid]);
-	spin_lock_irqsave(&assoc->tgtport->lock, flags);
-	assoc->queues[qid] = queue;
-	spin_unlock_irqrestore(&assoc->tgtport->lock, flags);
+	rcu_assign_pointer(assoc->queues[qid], queue);
 
 	return queue;
 
@@ -851,11 +850,8 @@ nvmet_fc_tgt_queue_free(struct kref *ref)
 {
 	struct nvmet_fc_tgt_queue *queue =
 		container_of(ref, struct nvmet_fc_tgt_queue, ref);
-	unsigned long flags;
 
-	spin_lock_irqsave(&queue->assoc->tgtport->lock, flags);
-	queue->assoc->queues[queue->qid] = NULL;
-	spin_unlock_irqrestore(&queue->assoc->tgtport->lock, flags);
+	rcu_assign_pointer(queue->assoc->queues[queue->qid], NULL);
 
 	nvmet_fc_destroy_fcp_iodlist(queue->assoc->tgtport, queue);
 
@@ -863,7 +859,7 @@ nvmet_fc_tgt_queue_free(struct kref *ref)
 
 	destroy_workqueue(queue->work_q);
 
-	kfree(queue);
+	kfree_rcu(queue, rcu);
 }
 
 static void
@@ -965,24 +961,23 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport,
 	struct nvmet_fc_tgt_queue *queue;
 	u64 association_id = nvmet_fc_getassociationid(connection_id);
 	u16 qid = nvmet_fc_getqueueid(connection_id);
-	unsigned long flags;
 
 	if (qid > NVMET_NR_QUEUES)
 		return NULL;
 
-	spin_lock_irqsave(&tgtport->lock, flags);
-	list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) {
 		if (association_id == assoc->association_id) {
-			queue = assoc->queues[qid];
+			queue = rcu_dereference(assoc->queues[qid]);
 			if (queue &&
 			    (!atomic_read(&queue->connected) ||
 			     !nvmet_fc_tgt_q_get(queue)))
 				queue = NULL;
-			spin_unlock_irqrestore(&tgtport->lock, flags);
+			rcu_read_unlock();
 			return queue;
 		}
 	}
-	spin_unlock_irqrestore(&tgtport->lock, flags);
+	rcu_read_unlock();
 	return NULL;
 }
 
@@ -1137,7 +1132,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle)
 		}
 		if (!needrandom) {
 			assoc->association_id = ran;
-			list_add_tail(&assoc->a_list, &tgtport->assoc_list);
+			list_add_tail_rcu(&assoc->a_list, &tgtport->assoc_list);
 		}
 		spin_unlock_irqrestore(&tgtport->lock, flags);
 	}
@@ -1167,7 +1162,7 @@ nvmet_fc_target_assoc_free(struct kref *ref)
 
 	nvmet_fc_free_hostport(assoc->hostport);
 	spin_lock_irqsave(&tgtport->lock, flags);
-	list_del(&assoc->a_list);
+	list_del_rcu(&assoc->a_list);
 	oldls = assoc->rcv_disconn;
 	spin_unlock_irqrestore(&tgtport->lock, flags);
 	/* if pending Rcv Disconnect Association LS, send rsp now */
@@ -1177,7 +1172,7 @@ nvmet_fc_target_assoc_free(struct kref *ref)
 	dev_info(tgtport->dev,
 		"{%d:%d} Association freed\n",
 		tgtport->fc_target_port.port_num, assoc->a_id);
-	kfree(assoc);
+	kfree_rcu(assoc, rcu);
 	nvmet_fc_tgtport_put(tgtport);
 }
 
@@ -1198,7 +1193,6 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc)
 {
 	struct nvmet_fc_tgtport *tgtport = assoc->tgtport;
 	struct nvmet_fc_tgt_queue *queue;
-	unsigned long flags;
 	int i, terminating;
 
 	terminating = atomic_xchg(&assoc->terminating, 1);
@@ -1207,19 +1201,23 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc)
 	if (terminating)
 		return;
 
-	spin_lock_irqsave(&tgtport->lock, flags);
+
 	for (i = NVMET_NR_QUEUES; i >= 0; i--) {
-		queue = assoc->queues[i];
-		if (queue) {
-			if (!nvmet_fc_tgt_q_get(queue))
-				continue;
-			spin_unlock_irqrestore(&tgtport->lock, flags);
-			nvmet_fc_delete_target_queue(queue);
-			nvmet_fc_tgt_q_put(queue);
-			spin_lock_irqsave(&tgtport->lock, flags);
+		rcu_read_lock();
+		queue = rcu_dereference(assoc->queues[i]);
+		if (!queue) {
+			rcu_read_unlock();
+			continue;
 		}
+
+		if (!nvmet_fc_tgt_q_get(queue)) {
+			rcu_read_unlock();
+			continue;
+		}
+		rcu_read_unlock();
+		nvmet_fc_delete_target_queue(queue);
+		nvmet_fc_tgt_q_put(queue);
 	}
-	spin_unlock_irqrestore(&tgtport->lock, flags);
 
 	dev_info(tgtport->dev,
 		"{%d:%d} Association deleted\n",
@@ -1234,10 +1232,9 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
 {
 	struct nvmet_fc_tgt_assoc *assoc;
 	struct nvmet_fc_tgt_assoc *ret = NULL;
-	unsigned long flags;
 
-	spin_lock_irqsave(&tgtport->lock, flags);
-	list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) {
 		if (association_id == assoc->association_id) {
 			ret = assoc;
 			if (!nvmet_fc_tgt_a_get(assoc))
@@ -1245,7 +1242,7 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
 			break;
 		}
 	}
-	spin_unlock_irqrestore(&tgtport->lock, flags);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -1473,19 +1470,17 @@ nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport)
 static void
 __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
 {
-	struct nvmet_fc_tgt_assoc *assoc, *next;
-	unsigned long flags;
+	struct nvmet_fc_tgt_assoc *assoc;
 
-	spin_lock_irqsave(&tgtport->lock, flags);
-	list_for_each_entry_safe(assoc, next,
-				&tgtport->assoc_list, a_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) {
 		if (!nvmet_fc_tgt_a_get(assoc))
 			continue;
 		if (!schedule_work(&assoc->del_work))
 			/* already deleting - release local reference */
 			nvmet_fc_tgt_a_put(assoc);
 	}
-	spin_unlock_irqrestore(&tgtport->lock, flags);
+	rcu_read_unlock();
 }
 
 /**
@@ -1568,16 +1563,16 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
 			continue;
 		spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
 
-		spin_lock_irqsave(&tgtport->lock, flags);
-		list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
-			queue = assoc->queues[0];
+		rcu_read_lock();
+		list_for_each_entry_rcu(assoc, &tgtport->assoc_list, a_list) {
+			queue = rcu_dereference(assoc->queues[0]);
 			if (queue && queue->nvme_sq.ctrl == ctrl) {
 				if (nvmet_fc_tgt_a_get(assoc))
 					found_ctrl = true;
 				break;
 			}
 		}
-		spin_unlock_irqrestore(&tgtport->lock, flags);
+		rcu_read_unlock();
 
 		nvmet_fc_tgtport_put(tgtport);
 

From 60b152a50820a125336ecae26da489059fc61ce1 Mon Sep 17 00:00:00 2001
From: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Date: Sat, 9 Jan 2021 00:41:47 +0100
Subject: [PATCH 064/183] nvme: constify static attribute_group structs

The only usage of these is to put their addresses in arrays of pointers
to const attribute_groups. Make them const to allow the compiler to put
them in read-only memory.

Signed-off-by: Rikard Falkeborn <rikard.falkeborn@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c     | 4 ++--
 drivers/nvme/host/fc.c       | 2 +-
 drivers/nvme/target/fcloop.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ba5df80881ea..ff0f42652abb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2859,7 +2859,7 @@ static struct attribute *nvme_subsys_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group nvme_subsys_attrs_group = {
+static const struct attribute_group nvme_subsys_attrs_group = {
 	.attrs = nvme_subsys_attrs,
 };
 
@@ -3694,7 +3694,7 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 	return a->mode;
 }
 
-static struct attribute_group nvme_dev_attrs_group = {
+static const struct attribute_group nvme_dev_attrs_group = {
 	.attrs		= nvme_dev_attrs,
 	.is_visible	= nvme_dev_attrs_are_visible,
 };
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5f36cfa8136c..20dadd86e981 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3789,7 +3789,7 @@ static struct attribute *nvme_fc_attrs[] = {
 	NULL
 };
 
-static struct attribute_group nvme_fc_attr_group = {
+static const struct attribute_group nvme_fc_attr_group = {
 	.attrs = nvme_fc_attrs,
 };
 
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 68213f0a052b..54606f1872b4 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1545,7 +1545,7 @@ static struct attribute *fcloop_dev_attrs[] = {
 	NULL
 };
 
-static struct attribute_group fclopp_dev_attrs_group = {
+static const struct attribute_group fclopp_dev_attrs_group = {
 	.attrs		= fcloop_dev_attrs,
 };
 

From f9063a53274d25a878310db3fb645bfa9e49c917 Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Fri, 8 Jan 2021 23:46:57 +0900
Subject: [PATCH 065/183] nvme: support command retry delay for admin command

The controller can request a delay retrying a failed command by setting
the Command Retry Delay (CRD) field in the Completion Queue Entry.

Currently this feature is only applied to commands on the I/O queue, but
not to commands on the admin queue.  Retrieve the nvme_ctrl from the
request so that no namespace is required and apply the feature to all
commands.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ff0f42652abb..636a88c93194 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -279,14 +279,13 @@ static blk_status_t nvme_error_status(u16 status)
 
 static void nvme_retry_req(struct request *req)
 {
-	struct nvme_ns *ns = req->q->queuedata;
 	unsigned long delay = 0;
 	u16 crd;
 
 	/* The mask and shift result must be <= 3 */
 	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
-	if (ns && crd)
-		delay = ns->ctrl->crdt[crd - 1] * 100;
+	if (crd)
+		delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
 
 	nvme_req(req)->retries++;
 	blk_mq_requeue_request(req, false);

From cb9b870fba3eba57cf3bcd7c6c4d4aa88bc5fe70 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 14 Jan 2021 13:15:24 -0800
Subject: [PATCH 066/183] nvme-tcp: fix wrong setting of request iov_iter

We might set the iov_iter direction wrong, which is harmless for this
use-case, but let's get it right. This also makes the code slightly cleaner.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 881d28eb15e9..4367923d03e4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -983,7 +983,6 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 			req->state = NVME_TCP_SEND_DATA;
 			if (queue->data_digest)
 				crypto_ahash_init(queue->snd_hash);
-			nvme_tcp_init_iter(req, WRITE);
 		} else {
 			nvme_tcp_done_send_req(queue);
 		}
@@ -1016,8 +1015,6 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 		req->state = NVME_TCP_SEND_DATA;
 		if (queue->data_digest)
 			crypto_ahash_init(queue->snd_hash);
-		if (!req->data_sent)
-			nvme_tcp_init_iter(req, WRITE);
 		return 1;
 	}
 	req->offset += ret;
@@ -2268,12 +2265,12 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	req->data_len = blk_rq_nr_phys_segments(rq) ?
 				blk_rq_payload_bytes(rq) : 0;
 	req->curr_bio = rq->bio;
+	if (req->curr_bio)
+		nvme_tcp_init_iter(req, rq_data_dir(rq));
 
 	if (rq_data_dir(rq) == WRITE &&
 	    req->data_len <= nvme_tcp_inline_data_size(queue))
 		req->pdu_len = req->data_len;
-	else if (req->curr_bio)
-		nvme_tcp_init_iter(req, READ);
 
 	pdu->hdr.type = nvme_tcp_cmd;
 	pdu->hdr.flags = 0;

From 60141aa08c08a43f3d22626b3a2532106a90a191 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 14 Jan 2021 13:15:25 -0800
Subject: [PATCH 067/183] nvme-tcp: get rid of unused helper function

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4367923d03e4..f2f3471faed3 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -206,11 +206,6 @@ static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 			req->pdu_len - req->pdu_sent);
 }
 
-static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
-{
-	return req->iter.iov_offset;
-}
-
 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 {
 	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?

From 0dc9edaf80ea3c48231d94cd482355699d453888 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 14 Jan 2021 13:15:26 -0800
Subject: [PATCH 068/183] nvme-tcp: pass multipage bvec to request iov_iter

iov_iter uses the right helpers so we should be able
to pass in a multipage bvec. Right now the iov_iter is
initialized with more segments than it needs, which doesn't
fail because the iov_iter is capped by byte count, but it
is better to use a full multipage bvec iter.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index f2f3471faed3..4c13c7110dbe 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -224,24 +224,29 @@ static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 	struct request *rq = blk_mq_rq_from_pdu(req);
 	struct bio_vec *vec;
 	unsigned int size;
-	int nsegs;
+	int nr_bvec;
 	size_t offset;
 
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 		vec = &rq->special_vec;
-		nsegs = 1;
+		nr_bvec = 1;
 		size = blk_rq_payload_bytes(rq);
 		offset = 0;
 	} else {
 		struct bio *bio = req->curr_bio;
+		struct bvec_iter bi;
+		struct bio_vec bv;
 
 		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-		nsegs = bio_segments(bio);
+		nr_bvec = 0;
+		bio_for_each_bvec(bv, bio, bi) {
+			nr_bvec++;
+		}
 		size = bio->bi_iter.bi_size;
 		offset = bio->bi_iter.bi_bvec_done;
 	}
 
-	iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
+	iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
 	req->iter.iov_offset = offset;
 }
 

From fc97e942d90c2103755f2fcd9a068a4ee7dfc1bf Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Wed, 13 Jan 2021 23:36:27 +0900
Subject: [PATCH 069/183] nvme: refactor ns->ctrl by request

The current code in nvme_cleanup_cmd() does not need the namespace
instance; it only needs the controller instance.

The controller instance can be retrieved via the namespace instance, but
it can also be accessed directly through the request's nvme_request:

	ctrl = nvme_req(req)->ctrl;

There is no need to take the detour from the request through the gendisk
to the namespace instance.

Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 636a88c93194..009830d247f8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -841,11 +841,11 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 void nvme_cleanup_cmd(struct request *req)
 {
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		struct nvme_ns *ns = req->rq_disk->private_data;
+		struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
 		struct page *page = req->special_vec.bv_page;
 
-		if (page == ns->ctrl->discard_page)
-			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		if (page == ctrl->discard_page)
+			clear_bit_unlock(0, &ctrl->discard_page_busy);
 		else
 			kfree(page_address(page) + req->special_vec.bv_offset);
 	}

From 624e67fdf9a657fe437d84dd9f28b35e594183dd Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 13 Jan 2021 17:33:52 -0800
Subject: [PATCH 070/183] nvmet: remove extra variable in smart log nsid

Remove the extra local variable struct nvmet_ns in
nvmet_get_smart_log_nsid() since req already has an ns member that can
be reused. This also eliminates the explicit call to
nvmet_put_namespace(), which is already present in the request
completion path.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index dc1ea468b182..de804d9762dd 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -74,11 +74,11 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
 static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		struct nvme_smart_log *slog)
 {
-	struct nvmet_ns *ns;
 	u64 host_reads, host_writes, data_units_read, data_units_written;
 
-	ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
-	if (!ns) {
+	req->ns = nvmet_find_namespace(req->sq->ctrl,
+				       req->cmd->get_log_page.nsid);
+	if (!req->ns) {
 		pr_err("Could not find namespace id : %d\n",
 				le32_to_cpu(req->cmd->get_log_page.nsid));
 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
@@ -86,22 +86,20 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 	}
 
 	/* we don't have the right data for file backed ns */
-	if (!ns->bdev)
-		goto out;
+	if (!req->ns->bdev)
+		return NVME_SC_SUCCESS;
 
-	host_reads = part_stat_read(ns->bdev, ios[READ]);
+	host_reads = part_stat_read(req->ns->bdev, ios[READ]);
 	data_units_read =
-		DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[READ]), 1000);
-	host_writes = part_stat_read(ns->bdev, ios[WRITE]);
+		DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000);
+	host_writes = part_stat_read(req->ns->bdev, ios[WRITE]);
 	data_units_written =
-		DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[WRITE]), 1000);
+		DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000);
 
 	put_unaligned_le64(host_reads, &slog->host_reads[0]);
 	put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
 	put_unaligned_le64(host_writes, &slog->host_writes[0]);
 	put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
-out:
-	nvmet_put_namespace(ns);
 
 	return NVME_SC_SUCCESS;
 }

From 3631c7f4a24165b9431942b85b502454edb0c33b Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 13 Jan 2021 17:33:53 -0800
Subject: [PATCH 071/183] nvmet: remove extra variable in id-desclist

Remove the extra local variable struct nvmet_ns in
nvmet_execute_identify_desclist() since req already has an ns member
that can be reused. This also eliminates the explicit call to
nvmet_put_namespace(), which is already present in the request
completion path.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index de804d9762dd..1cc61ca42a7d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -605,37 +605,35 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
 
 static void nvmet_execute_identify_desclist(struct nvmet_req *req)
 {
-	struct nvmet_ns *ns;
 	u16 status = 0;
 	off_t off = 0;
 
-	ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
-	if (!ns) {
+	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
+	if (!req->ns) {
 		req->error_loc = offsetof(struct nvme_identify, nsid);
 		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
 		goto out;
 	}
 
-	if (memchr_inv(&ns->uuid, 0, sizeof(ns->uuid))) {
+	if (memchr_inv(&req->ns->uuid, 0, sizeof(req->ns->uuid))) {
 		status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID,
 						  NVME_NIDT_UUID_LEN,
-						  &ns->uuid, &off);
+						  &req->ns->uuid, &off);
 		if (status)
-			goto out_put_ns;
+			goto out;
 	}
-	if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) {
+	if (memchr_inv(req->ns->nguid, 0, sizeof(req->ns->nguid))) {
 		status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID,
 						  NVME_NIDT_NGUID_LEN,
-						  &ns->nguid, &off);
+						  &req->ns->nguid, &off);
 		if (status)
-			goto out_put_ns;
+			goto out;
 	}
 
 	if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off,
 			off) != NVME_IDENTIFY_DATA_SIZE - off)
 		status = NVME_SC_INTERNAL | NVME_SC_DNR;
-out_put_ns:
-	nvmet_put_namespace(ns);
+
 out:
 	nvmet_req_complete(req, status);
 }

From 3c7b224f1956ed232b24ed2eb2c54e4476c6acb2 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 13 Jan 2021 17:33:54 -0800
Subject: [PATCH 072/183] nvmet: remove extra variable in identify ns

Remove the extra local variable struct nvmet_ns in
nvmet_execute_identify_ns(), since req already has an ns member that can
be reused. This also eliminates the explicit call to
nvmet_put_namespace(), which is already present in the request
completion path.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 1cc61ca42a7d..613a4d8feac1 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -467,7 +467,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 static void nvmet_execute_identify_ns(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
-	struct nvmet_ns *ns;
 	struct nvme_id_ns *id;
 	u16 status = 0;
 
@@ -484,20 +483,21 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	}
 
 	/* return an all zeroed buffer if we can't find an active namespace */
-	ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid);
-	if (!ns) {
+	req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid);
+	if (!req->ns) {
 		status = NVME_SC_INVALID_NS;
 		goto done;
 	}
 
-	nvmet_ns_revalidate(ns);
+	nvmet_ns_revalidate(req->ns);
 
 	/*
 	 * nuse = ncap = nsze isn't always true, but we have no way to find
 	 * that out from the underlying device.
 	 */
-	id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift);
-	switch (req->port->ana_state[ns->anagrpid]) {
+	id->ncap = id->nsze =
+		cpu_to_le64(req->ns->size >> req->ns->blksize_shift);
+	switch (req->port->ana_state[req->ns->anagrpid]) {
 	case NVME_ANA_INACCESSIBLE:
 	case NVME_ANA_PERSISTENT_LOSS:
 		break;
@@ -506,8 +506,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 		break;
         }
 
-	if (ns->bdev)
-		nvmet_bdev_set_limits(ns->bdev, id);
+	if (req->ns->bdev)
+		nvmet_bdev_set_limits(req->ns->bdev, id);
 
 	/*
 	 * We just provide a single LBA format that matches what the
@@ -521,25 +521,24 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	 * controllers, but also with any other user of the block device.
 	 */
 	id->nmic = (1 << 0);
-	id->anagrpid = cpu_to_le32(ns->anagrpid);
+	id->anagrpid = cpu_to_le32(req->ns->anagrpid);
 
-	memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));
+	memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
 
-	id->lbaf[0].ds = ns->blksize_shift;
+	id->lbaf[0].ds = req->ns->blksize_shift;
 
-	if (ctrl->pi_support && nvmet_ns_has_pi(ns)) {
+	if (ctrl->pi_support && nvmet_ns_has_pi(req->ns)) {
 		id->dpc = NVME_NS_DPC_PI_FIRST | NVME_NS_DPC_PI_LAST |
 			  NVME_NS_DPC_PI_TYPE1 | NVME_NS_DPC_PI_TYPE2 |
 			  NVME_NS_DPC_PI_TYPE3;
 		id->mc = NVME_MC_EXTENDED_LBA;
-		id->dps = ns->pi_type;
+		id->dps = req->ns->pi_type;
 		id->flbas = NVME_NS_FLBAS_META_EXT;
-		id->lbaf[0].ms = cpu_to_le16(ns->metadata_size);
+		id->lbaf[0].ms = cpu_to_le16(req->ns->metadata_size);
 	}
 
-	if (ns->readonly)
+	if (req->ns->readonly)
 		id->nsattr |= (1 << 0);
-	nvmet_put_namespace(ns);
 done:
 	if (!status)
 		status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));

From 193fcf371f9e3705c14a0bf1d4bfc44af0f7c124 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Mon, 11 Jan 2021 20:26:16 -0800
Subject: [PATCH 073/183] nvmet: add lba to sect conversion helpers

In this preparation patch, we add helpers to convert lbas to sectors &
sectors to lba. This is needed to eliminate code duplication in the ZBD
backend.

Use these helpers in the block device backend.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/io-cmd-bdev.c |  8 +++-----
 drivers/nvme/target/nvmet.h       | 10 ++++++++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 125dde3f410e..23095bdfce06 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -256,8 +256,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 	if (is_pci_p2pdma_page(sg_page(req->sg)))
 		op |= REQ_NOMERGE;
 
-	sector = le64_to_cpu(req->cmd->rw.slba);
-	sector <<= (req->ns->blksize_shift - 9);
+	sector = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
 
 	if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
 		bio = &req->b.inline_bio;
@@ -345,7 +344,7 @@ static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
 	int ret;
 
 	ret = __blkdev_issue_discard(ns->bdev,
-			le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
+			nvmet_lba_to_sect(ns, range->slba),
 			le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
 			GFP_KERNEL, 0, bio);
 	if (ret && ret != -EOPNOTSUPP) {
@@ -414,8 +413,7 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
 	if (!nvmet_check_transfer_len(req, 0))
 		return;
 
-	sector = le64_to_cpu(write_zeroes->slba) <<
-		(req->ns->blksize_shift - 9);
+	sector = nvmet_lba_to_sect(req->ns, write_zeroes->slba);
 	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
 		(req->ns->blksize_shift - 9));
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 592763732065..8776dd1a0490 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -603,4 +603,14 @@ static inline bool nvmet_ns_has_pi(struct nvmet_ns *ns)
 	return ns->pi_type && ns->metadata_size == sizeof(struct t10_pi_tuple);
 }
 
+static inline __le64 nvmet_sect_to_lba(struct nvmet_ns *ns, sector_t sect)
+{
+	return cpu_to_le64(sect >> (ns->blksize_shift - SECTOR_SHIFT));
+}
+
+static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba)
+{
+	return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT);
+}
+
 #endif /* _NVMET_H */

From 3254899e0b52f10b9a3e7db4d10f081f60705ba9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <mgurtovoy@nvidia.com>
Date: Thu, 21 Jan 2021 09:09:47 +0000
Subject: [PATCH 074/183] nvme: update enumerations for status codes

All the updates are mentioned in the ratified NVMe 1.4 spec.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index bfed36e342cc..458719544253 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1473,20 +1473,29 @@ enum {
 	NVME_SC_SGL_INVALID_DATA	= 0xf,
 	NVME_SC_SGL_INVALID_METADATA	= 0x10,
 	NVME_SC_SGL_INVALID_TYPE	= 0x11,
-
+	NVME_SC_CMB_INVALID_USE		= 0x12,
+	NVME_SC_PRP_INVALID_OFFSET	= 0x13,
+	NVME_SC_ATOMIC_WU_EXCEEDED	= 0x14,
+	NVME_SC_OP_DENIED		= 0x15,
 	NVME_SC_SGL_INVALID_OFFSET	= 0x16,
-	NVME_SC_SGL_INVALID_SUBTYPE	= 0x17,
-
+	NVME_SC_RESERVED		= 0x17,
+	NVME_SC_HOST_ID_INCONSIST	= 0x18,
+	NVME_SC_KA_TIMEOUT_EXPIRED	= 0x19,
+	NVME_SC_KA_TIMEOUT_INVALID	= 0x1A,
+	NVME_SC_ABORTED_PREEMPT_ABORT	= 0x1B,
 	NVME_SC_SANITIZE_FAILED		= 0x1C,
 	NVME_SC_SANITIZE_IN_PROGRESS	= 0x1D,
-
+	NVME_SC_SGL_INVALID_GRANULARITY	= 0x1E,
+	NVME_SC_CMD_NOT_SUP_CMB_QUEUE	= 0x1F,
 	NVME_SC_NS_WRITE_PROTECTED	= 0x20,
 	NVME_SC_CMD_INTERRUPTED		= 0x21,
+	NVME_SC_TRANSIENT_TR_ERR	= 0x22,
 
 	NVME_SC_LBA_RANGE		= 0x80,
 	NVME_SC_CAP_EXCEEDED		= 0x81,
 	NVME_SC_NS_NOT_READY		= 0x82,
 	NVME_SC_RESERVATION_CONFLICT	= 0x83,
+	NVME_SC_FORMAT_IN_PROGRESS	= 0x84,
 
 	/*
 	 * Command Specific Status:
@@ -1519,8 +1528,15 @@ enum {
 	NVME_SC_NS_NOT_ATTACHED		= 0x11a,
 	NVME_SC_THIN_PROV_NOT_SUPP	= 0x11b,
 	NVME_SC_CTRL_LIST_INVALID	= 0x11c,
+	NVME_SC_SELT_TEST_IN_PROGRESS	= 0x11d,
 	NVME_SC_BP_WRITE_PROHIBITED	= 0x11e,
+	NVME_SC_CTRL_ID_INVALID		= 0x11f,
+	NVME_SC_SEC_CTRL_STATE_INVALID	= 0x120,
+	NVME_SC_CTRL_RES_NUM_INVALID	= 0x121,
+	NVME_SC_RES_ID_INVALID		= 0x122,
 	NVME_SC_PMR_SAN_PROHIBITED	= 0x123,
+	NVME_SC_ANA_GROUP_ID_INVALID	= 0x124,
+	NVME_SC_ANA_ATTACH_FAILED	= 0x125,
 
 	/*
 	 * I/O Command Set Specific - NVM commands:

From 3a98c51a24825173455c479822aa2f89fecbe6af Mon Sep 17 00:00:00 2001
From: Michal Krakowiak <michal.krakowiak@linux.intel.com>
Date: Mon, 4 Jan 2021 16:53:43 +0100
Subject: [PATCH 075/183] nvme: parse format nvm command details when tracing

Add detailed parsing of format nvm admin command to make the
trace log more consistent and human-readable.

Signed-off-by: Michal Krakowiak <michal.krakowiak@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 5c3cb6928f3c..e0400de713b5 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -102,6 +102,23 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p,
 	return ret;
 }
 
+static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u8 lbaf = cdw10[0] & 0xF;
+	u8 mset = (cdw10[0] >> 4) & 0x1;
+	u8 pi = (cdw10[0] >> 5) & 0x7;
+	u8 pil = cdw10[1] & 0x1;
+	u8 ses = (cdw10[1] >> 1) & 0x7;
+
+	trace_seq_printf(p, "lbaf=%u, mset=%u, pi=%u, pil=%u, ses=%u",
+			lbaf, mset, pi, pil, ses);
+
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -159,6 +176,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
 		return nvme_trace_admin_get_features(p, cdw10);
 	case nvme_admin_get_lba_status:
 		return nvme_trace_get_lba_status(p, cdw10);
+	case nvme_admin_format_nvm:
+		return nvme_trace_admin_format_nvm(p, cdw10);
 	default:
 		return nvme_trace_common(p, cdw10);
 	}

From 4a407d5ebc7ac1ea8c6e2692bd79320459dc60f6 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 27 Jan 2021 02:50:00 +0900
Subject: [PATCH 076/183] nvme: add tracing of zns commands

When support for the NVMe ZNS commands was merged, tracing of these
commands was omitted.

Add nvme_cmd_zone_mgmt_send, nvme_cmd_zone_mgmt_recv as well as
nvme_cmd_zone_append to the nvme driver's tracing facility.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/trace.c | 34 ++++++++++++++++++++++++++++++++++
 include/linux/nvme.h      |  6 +++++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index e0400de713b5..6543015b6121 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -148,6 +148,35 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
 	return ret;
 }
 
+static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u8 zsa = cdw10[12];
+	u8 all = cdw10[13];
+
+	trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
+static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+	u64 slba = get_unaligned_le64(cdw10);
+	u32 numd = get_unaligned_le32(cdw10 + 8);
+	u8 zra = cdw10[12];
+	u8 zrasf = cdw10[13];
+	u8 pr = cdw10[14];
+
+	trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u",
+			 slba, numd, zra, zrasf, pr);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
+
 static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
@@ -190,9 +219,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
 	case nvme_cmd_read:
 	case nvme_cmd_write:
 	case nvme_cmd_write_zeroes:
+	case nvme_cmd_zone_append:
 		return nvme_trace_read_write(p, cdw10);
 	case nvme_cmd_dsm:
 		return nvme_trace_dsm(p, cdw10);
+	case nvme_cmd_zone_mgmt_send:
+		return nvme_trace_zone_mgmt_send(p, cdw10);
+	case nvme_cmd_zone_mgmt_recv:
+		return nvme_trace_zone_mgmt_recv(p, cdw10);
 	default:
 		return nvme_trace_common(p, cdw10);
 	}
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 458719544253..b08787cd0881 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -697,7 +697,11 @@ enum nvme_opcode {
 		nvme_opcode_name(nvme_cmd_resv_register),	\
 		nvme_opcode_name(nvme_cmd_resv_report),		\
 		nvme_opcode_name(nvme_cmd_resv_acquire),	\
-		nvme_opcode_name(nvme_cmd_resv_release))
+		nvme_opcode_name(nvme_cmd_resv_release),	\
+		nvme_opcode_name(nvme_cmd_zone_mgmt_send),	\
+		nvme_opcode_name(nvme_cmd_zone_mgmt_recv),	\
+		nvme_opcode_name(nvme_cmd_zone_append))
+
 
 
 /*

From 8f8ea928fd77db60dc22276e3acdb9ca41cbf8dd Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 26 Jan 2021 11:47:52 -0800
Subject: [PATCH 077/183] nvme-core: get rid of the extra space

Remove the extra space between xa_for_each and its opening parenthesis
in nvme_free_cels(). Such a space is not common practice
(except in drivers/infiniband/core/, not sure why).

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 009830d247f8..168601d96f48 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4448,7 +4448,7 @@ static void nvme_free_cels(struct nvme_ctrl *ctrl)
 	struct nvme_effects_log	*cel;
 	unsigned long i;
 
-	xa_for_each (&ctrl->cels, i, cel) {
+	xa_for_each(&ctrl->cels, i, cel) {
 		xa_erase(&ctrl->cels, i);
 		kfree(cel);
 	}

From 2547906982e2e6a0d42f8957f55af5bb51a7e55f Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 21 Jan 2021 11:32:36 +0800
Subject: [PATCH 078/183] nvme-core: add cancel tagset helpers

Add nvme_cancel_tagset and nvme_cancel_admin_tagset for tear down and
reconnection error handling.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 20 ++++++++++++++++++++
 drivers/nvme/host/nvme.h |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 168601d96f48..4e8e310033c9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -370,6 +370,26 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->tagset) {
+		blk_mq_tagset_busy_iter(ctrl->tagset,
+				nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->tagset);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
+
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->admin_tagset) {
+		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
+				nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
+
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 88a6b97247f5..a72f07181091 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -576,6 +576,8 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 
 void nvme_complete_rq(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state);
 bool nvme_wait_reset(struct nvme_ctrl *ctrl);

From 958dc1d32c80566f58d18f05ef1f05bd32d172c1 Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 21 Jan 2021 11:32:37 +0800
Subject: [PATCH 079/183] nvme-rdma: add clean action for failed reconnection

A crash happens when a reconnection failure is injected.
If reconnection fails after the I/O queues have been started, the queues
are unquiesced and new requests continue to be delivered. The
reconnection error handling then frees the queues directly without
cancelling the suspended requests. A suspended request later times out,
and a crash follows due to use of the queue after it has been freed.

Sync the queues and cancel the suspended requests in the reconnection
error handling.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f5ef3edeb2fd..d92132cbcbbe 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -919,12 +919,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 
 	error = nvme_init_identify(&ctrl->ctrl);
 	if (error)
-		goto out_stop_queue;
+		goto out_quiesce_queue;
 
 	return 0;
 
+out_quiesce_queue:
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	blk_sync_queue(ctrl->ctrl.admin_q);
 out_stop_queue:
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
+	nvme_cancel_admin_tagset(&ctrl->ctrl);
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->ctrl.admin_q);
@@ -1001,8 +1005,10 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 
 out_wait_freeze_timed_out:
 	nvme_stop_queues(&ctrl->ctrl);
+	nvme_sync_io_queues(&ctrl->ctrl);
 	nvme_rdma_stop_io_queues(ctrl);
 out_cleanup_connect_q:
+	nvme_cancel_tagset(&ctrl->ctrl);
 	if (new)
 		blk_cleanup_queue(ctrl->ctrl.connect_q);
 out_free_tag_set:
@@ -1144,10 +1150,18 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 	return 0;
 
 destroy_io:
-	if (ctrl->ctrl.queue_count > 1)
+	if (ctrl->ctrl.queue_count > 1) {
+		nvme_stop_queues(&ctrl->ctrl);
+		nvme_sync_io_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
+		nvme_cancel_tagset(&ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, new);
+	}
 destroy_admin:
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	blk_sync_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
+	nvme_cancel_admin_tagset(&ctrl->ctrl);
 	nvme_rdma_destroy_admin_queue(ctrl, new);
 	return ret;
 }

From 70a99574a79f1cd4dc7ad56ea37be40844bfb97b Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 21 Jan 2021 11:32:38 +0800
Subject: [PATCH 080/183] nvme-tcp: add clean action for failed reconnection

If reconnection fails after the I/O queues have been started, the queues
are unquiesced and new requests continue to be delivered. The
reconnection error handling then frees the queues directly without
cancelling the suspended requests. A suspended request later times out,
and a crash follows due to use of the queue after it has been freed.

Sync the queues and cancel the suspended requests in the reconnection
error handling.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 4c13c7110dbe..8c256adb8c41 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1812,8 +1812,10 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
 
 out_wait_freeze_timed_out:
 	nvme_stop_queues(ctrl);
+	nvme_sync_io_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
 out_cleanup_connect_q:
+	nvme_cancel_tagset(ctrl);
 	if (new)
 		blk_cleanup_queue(ctrl->connect_q);
 out_free_tag_set:
@@ -1875,12 +1877,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 
 	error = nvme_init_identify(ctrl);
 	if (error)
-		goto out_stop_queue;
+		goto out_quiesce_queue;
 
 	return 0;
 
+out_quiesce_queue:
+	blk_mq_quiesce_queue(ctrl->admin_q);
+	blk_sync_queue(ctrl->admin_q);
 out_stop_queue:
 	nvme_tcp_stop_queue(ctrl, 0);
+	nvme_cancel_admin_tagset(ctrl);
 out_cleanup_queue:
 	if (new)
 		blk_cleanup_queue(ctrl->admin_q);
@@ -2000,10 +2006,18 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 	return 0;
 
 destroy_io:
-	if (ctrl->queue_count > 1)
+	if (ctrl->queue_count > 1) {
+		nvme_stop_queues(ctrl);
+		nvme_sync_io_queues(ctrl);
+		nvme_tcp_stop_io_queues(ctrl);
+		nvme_cancel_tagset(ctrl);
 		nvme_tcp_destroy_io_queues(ctrl, new);
+	}
 destroy_admin:
+	blk_mq_quiesce_queue(ctrl->admin_q);
+	blk_sync_queue(ctrl->admin_q);
 	nvme_tcp_stop_queue(ctrl, 0);
+	nvme_cancel_admin_tagset(ctrl);
 	nvme_tcp_destroy_admin_queue(ctrl, new);
 	return ret;
 }

From c4189d680e12f0a41eea94a1f466142b2bf02c3d Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 21 Jan 2021 11:32:39 +0800
Subject: [PATCH 081/183] nvme-rdma: use cancel tagset helper for tear down

Use nvme_cancel_tagset and nvme_cancel_admin_tagset to clean code for
tear down process.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d92132cbcbbe..6700d8bab68a 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1025,11 +1025,7 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 	blk_sync_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	if (ctrl->ctrl.admin_tagset) {
-		blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
-			nvme_cancel_request, &ctrl->ctrl);
-		blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
-	}
+	nvme_cancel_admin_tagset(&ctrl->ctrl);
 	if (remove)
 		blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_rdma_destroy_admin_queue(ctrl, remove);
@@ -1043,11 +1039,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 		nvme_stop_queues(&ctrl->ctrl);
 		nvme_sync_io_queues(&ctrl->ctrl);
 		nvme_rdma_stop_io_queues(ctrl);
-		if (ctrl->ctrl.tagset) {
-			blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
-				nvme_cancel_request, &ctrl->ctrl);
-			blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
-		}
+		nvme_cancel_tagset(&ctrl->ctrl);
 		if (remove)
 			nvme_start_queues(&ctrl->ctrl);
 		nvme_rdma_destroy_io_queues(ctrl, remove);

From 563c81586d0ab2841487a61fb34d6e9cd5efded7 Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 21 Jan 2021 11:32:40 +0800
Subject: [PATCH 082/183] nvme-tcp: use cancel tagset helper for tear down

Use nvme_cancel_tagset and nvme_cancel_admin_tagset to clean code for
tear down process.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8c256adb8c41..619b0d8f6e38 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1907,11 +1907,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
 	blk_mq_quiesce_queue(ctrl->admin_q);
 	blk_sync_queue(ctrl->admin_q);
 	nvme_tcp_stop_queue(ctrl, 0);
-	if (ctrl->admin_tagset) {
-		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
-			nvme_cancel_request, ctrl);
-		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
-	}
+	nvme_cancel_admin_tagset(ctrl);
 	if (remove)
 		blk_mq_unquiesce_queue(ctrl->admin_q);
 	nvme_tcp_destroy_admin_queue(ctrl, remove);
@@ -1927,11 +1923,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
 	nvme_stop_queues(ctrl);
 	nvme_sync_io_queues(ctrl);
 	nvme_tcp_stop_io_queues(ctrl);
-	if (ctrl->tagset) {
-		blk_mq_tagset_busy_iter(ctrl->tagset,
-			nvme_cancel_request, ctrl);
-		blk_mq_tagset_wait_completed_request(ctrl->tagset);
-	}
+	nvme_cancel_tagset(ctrl);
 	if (remove)
 		nvme_start_queues(ctrl);
 	nvme_tcp_destroy_io_queues(ctrl, remove);

From c5eec74f252dfba25269cd68f9a3407aedefd330 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Wed, 16 Dec 2020 02:26:22 +0100
Subject: [PATCH 083/183] md/raid5: cast chunk_sectors to sector_t value

Currently, raid5 calculates dev_sectors from chunk_sectors without
proper cast, which is problematic.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f411b9e5c332..b71f50132495 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7661,7 +7661,7 @@ static int raid5_run(struct mddev *mddev)
 	}
 
 	/* device size must be a multiple of chunk size */
-	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
+	mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
 	if (mddev->degraded > dirty_parity_disks &&

From 8a0c014cd20516ade9654fc13b51345ec58e7be8 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Fri, 22 Jan 2021 12:13:20 +0100
Subject: [PATCH 084/183] floppy: reintroduce O_NDELAY fix

This issue was originally fixed in 09954bad4 ("floppy: refactor open()
flags handling").

As a side effect, however, the fix introduced an issue for
open(O_ACCMODE), which is used for ioctl-only opens. I wrote a fix for
that, but instead of it being merged, a full revert of 09954bad4 was
performed, re-introducing the O_NDELAY / O_NONBLOCK issue, and it
strikes again.

This is a forward-port of the original fix to current codebase; the
original submission had the changelog below:

====
Commit 09954bad4 ("floppy: refactor open() flags handling"), as a
side-effect, causes open(/dev/fdX, O_ACCMODE) to fail. It turns out that
this is being used by the setfdprm userspace tool for ioctl-only open().

Reintroduce the original behavior wrt !(FMODE_READ|FMODE_WRITE)
modes, while still keeping the original O_NDELAY bug fixed.

Link: https://lore.kernel.org/r/nycvar.YFH.7.76.2101221209060.5622@cbobk.fhfr.pm
Cc: stable@vger.kernel.org
Reported-by: Wim Osterholt <wim@djo.tudelft.nl>
Tested-by: Wim Osterholt <wim@djo.tudelft.nl>
Reported-and-tested-by: Kurt Garloff <kurt@garloff.de>
Fixes: 09954bad4 ("floppy: refactor open() flags handling")
Fixes: f2791e7ead ("Revert "floppy: refactor open() flags handling"")
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Denis Efremov <efremov@linux.com>
---
 drivers/block/floppy.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index dfe1dfc901cc..0b71292d9d5a 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4121,23 +4121,23 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	if (fdc_state[FDC(drive)].rawcmd == 1)
 		fdc_state[FDC(drive)].rawcmd = 2;
 
-	if (!(mode & FMODE_NDELAY)) {
-		if (mode & (FMODE_READ|FMODE_WRITE)) {
-			drive_state[drive].last_checked = 0;
-			clear_bit(FD_OPEN_SHOULD_FAIL_BIT,
-				  &drive_state[drive].flags);
-			if (bdev_check_media_change(bdev))
-				floppy_revalidate(bdev->bd_disk);
-			if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
-				goto out;
-			if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
-				goto out;
-		}
-		res = -EROFS;
-		if ((mode & FMODE_WRITE) &&
-		    !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags))
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		drive_state[drive].last_checked = 0;
+		clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags);
+		if (bdev_check_media_change(bdev))
+			floppy_revalidate(bdev->bd_disk);
+		if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
+			goto out;
+		if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
 			goto out;
 	}
+
+	res = -EROFS;
+
+	if ((mode & FMODE_WRITE) &&
+			!test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags))
+		goto out;
+
 	mutex_unlock(&open_lock);
 	mutex_unlock(&floppy_mutex);
 	return 0;

From 4a2b92a5d3519fc2c1edda4d4aa0e05bff41e8de Mon Sep 17 00:00:00 2001
From: Bert Vermeulen <bert@biot.com>
Date: Fri, 22 Jan 2021 21:42:23 +0100
Subject: [PATCH 085/183] dt-bindings: interrupt-controller: Add Realtek
 RTL838x/RTL839x support

Document the binding for the Realtek RTL838x/RTL839x interrupt controller.

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Bert Vermeulen <bert@biot.com>
[maz: Add a commit message, as the author couldn't be bothered...]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210122204224.509124-2-bert@biot.com
---
 .../realtek,rtl-intc.yaml                     | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml

diff --git a/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml b/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml
new file mode 100644
index 000000000000..9e76fff20323
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/realtek,rtl-intc.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interrupt-controller/realtek,rtl-intc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Realtek RTL SoC interrupt controller devicetree bindings
+
+maintainers:
+  - Birger Koblitz <mail@birger-koblitz.de>
+  - Bert Vermeulen <bert@biot.com>
+  - John Crispin <john@phrozen.org>
+
+properties:
+  compatible:
+    const: realtek,rtl-intc
+
+  "#interrupt-cells":
+    const: 1
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  interrupt-controller: true
+
+  "#address-cells":
+    const: 0
+
+  interrupt-map:
+    description: Describes mapping from SoC interrupts to CPU interrupts
+
+required:
+  - compatible
+  - reg
+  - "#interrupt-cells"
+  - interrupt-controller
+  - "#address-cells"
+  - interrupt-map
+
+additionalProperties: false
+
+examples:
+  - |
+    intc: interrupt-controller@3000 {
+      compatible = "realtek,rtl-intc";
+      #interrupt-cells = <1>;
+      interrupt-controller;
+      reg = <0x3000 0x20>;
+      #address-cells = <0>;
+      interrupt-map =
+              <31 &cpuintc 2>,
+              <30 &cpuintc 1>,
+              <29 &cpuintc 5>;
+    };

From 9f3a0f34b84ad1b9a8f2bdae44b66f16685b2143 Mon Sep 17 00:00:00 2001
From: Bert Vermeulen <bert@biot.com>
Date: Fri, 22 Jan 2021 21:42:24 +0100
Subject: [PATCH 086/183] irqchip: Add support for Realtek RTL838x/RTL839x
 interrupt controller

This is a standard IRQ driver with only status and mask registers.

The mapping from SoC interrupts (18-31) to MIPS core interrupts is
done via an interrupt-map in device tree.

Signed-off-by: Bert Vermeulen <bert@biot.com>
Signed-off-by: Birger Koblitz <mail@birger-koblitz.de>
Acked-by: John Crispin <john@phrozen.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210122204224.509124-3-bert@biot.com
---
 drivers/irqchip/Makefile          |   1 +
 drivers/irqchip/irq-realtek-rtl.c | 180 ++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100644 drivers/irqchip/irq-realtek-rtl.c

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 2a1994d7f99a..c59b95a0532c 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -112,3 +112,4 @@ obj-$(CONFIG_LOONGSON_PCH_PIC)		+= irq-loongson-pch-pic.o
 obj-$(CONFIG_LOONGSON_PCH_MSI)		+= irq-loongson-pch-msi.o
 obj-$(CONFIG_MST_IRQ)			+= irq-mst-intc.o
 obj-$(CONFIG_SL28CPLD_INTC)		+= irq-sl28cpld.o
+obj-$(CONFIG_MACH_REALTEK_RTL)		+= irq-realtek-rtl.o
diff --git a/drivers/irqchip/irq-realtek-rtl.c b/drivers/irqchip/irq-realtek-rtl.c
new file mode 100644
index 000000000000..b57c67dfab5b
--- /dev/null
+++ b/drivers/irqchip/irq-realtek-rtl.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Birger Koblitz <mail@birger-koblitz.de>
+ * Copyright (C) 2020 Bert Vermeulen <bert@biot.com>
+ * Copyright (C) 2020 John Crispin <john@phrozen.org>
+ */
+
+#include <linux/of_irq.h>
+#include <linux/irqchip.h>
+#include <linux/spinlock.h>
+#include <linux/of_address.h>
+#include <linux/irqchip/chained_irq.h>
+
+/* Global Interrupt Mask Register */
+#define RTL_ICTL_GIMR		0x00
+/* Global Interrupt Status Register */
+#define RTL_ICTL_GISR		0x04
+/* Interrupt Routing Registers */
+#define RTL_ICTL_IRR0		0x08
+#define RTL_ICTL_IRR1		0x0c
+#define RTL_ICTL_IRR2		0x10
+#define RTL_ICTL_IRR3		0x14
+
+#define REG(x)		(realtek_ictl_base + (x))	/* MMIO address of register offset x */
+
+static DEFINE_RAW_SPINLOCK(irq_lock);	/* held around GIMR read-modify-write */
+static void __iomem *realtek_ictl_base;	/* set from of_iomap() in realtek_rtl_of_init() */
+
+/* Unmask one SoC interrupt by setting its hwirq bit in GIMR. */
+static void realtek_ictl_unmask_irq(struct irq_data *i)
+{
+	void __iomem *gimr = REG(RTL_ICTL_GIMR);
+	unsigned long irqflags;
+
+	/* Hold irq_lock so the read-modify-write of GIMR is not interleaved. */
+	raw_spin_lock_irqsave(&irq_lock, irqflags);
+
+	writel(readl(gimr) | BIT(i->hwirq), gimr);
+
+	raw_spin_unlock_irqrestore(&irq_lock, irqflags);
+}
+
+/* Mask one SoC interrupt by clearing its hwirq bit in GIMR. */
+static void realtek_ictl_mask_irq(struct irq_data *i)
+{
+	void __iomem *gimr = REG(RTL_ICTL_GIMR);
+	unsigned long irqflags;
+
+	/* Hold irq_lock so the read-modify-write of GIMR is not interleaved. */
+	raw_spin_lock_irqsave(&irq_lock, irqflags);
+
+	writel(readl(gimr) & ~BIT(i->hwirq), gimr);
+
+	raw_spin_unlock_irqrestore(&irq_lock, irqflags);
+}
+
+static struct irq_chip realtek_ictl_irq = {
+	.name = "realtek-rtl-intc",
+	.irq_mask = realtek_ictl_mask_irq,	/* clears the hwirq bit in GIMR */
+	.irq_unmask = realtek_ictl_unmask_irq,	/* sets the hwirq bit in GIMR */
+};
+
+static int intc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw)
+{
+	/* Chip and flow handler are attached to the Linux virq, not the hwirq. */
+	irq_set_chip_and_handler(irq, &realtek_ictl_irq, handle_level_irq);
+	return 0;
+}
+
+static const struct irq_domain_ops irq_domain_ops = {
+	.map = intc_map,			/* installs chip + level flow handler */
+	.xlate = irq_domain_xlate_onecell,	/* binding has #interrupt-cells = <1> */
+};
+
+/* Chained handler: forward the lowest-numbered unmasked and pending SoC IRQ. */
+static void realtek_irq_dispatch(struct irq_desc *desc)
+{
+	struct irq_chip *parent_chip = irq_desc_get_chip(desc);
+	unsigned int active;
+
+	chained_irq_enter(parent_chip, desc);
+	/* A source counts only if set in both GIMR (mask) and GISR (status). */
+	active = readl(REG(RTL_ICTL_GIMR)) & readl(REG(RTL_ICTL_GISR));
+	if (likely(active)) {
+		struct irq_domain *domain = irq_desc_get_handler_data(desc);
+
+		generic_handle_irq(irq_find_mapping(domain, __ffs(active)));
+	} else {
+		spurious_interrupt();
+	}
+	chained_irq_exit(parent_chip, desc);
+}
+
+/*
+ * Route SoC interrupts to MIPS CPU interrupts as described by the node's
+ * interrupt-map (3 cells per entry: SoC irq, parent phandle, CPU irq). Each
+ * SoC interrupt gets 4 bits for its CPU interrupt in an Interrupt Routing
+ * Register, so max 32 SoC interrupts go into 4 IRRs. Returns 0 or -EINVAL.
+ */
+static int __init map_interrupts(struct device_node *node, struct irq_domain *domain)
+{
+	struct device_node *cpu_ictl;
+	const __be32 *imap;
+	u32 soc_int, cpu_int, tmp, regs[4];
+	int ret, i, imaplen, irr_regs[] = {
+		RTL_ICTL_IRR3,
+		RTL_ICTL_IRR2,
+		RTL_ICTL_IRR1,
+		RTL_ICTL_IRR0,
+	};
+	u8 mips_irqs_set;
+
+	ret = of_property_read_u32(node, "#address-cells", &tmp);
+	if (ret || tmp)
+		return -EINVAL;
+
+	imap = of_get_property(node, "interrupt-map", &imaplen);
+	if (!imap || imaplen % (3 * sizeof(u32)))	/* imaplen is in bytes */
+		return -EINVAL;
+
+	mips_irqs_set = 0;
+	memset(regs, 0, sizeof(regs));
+	for (i = 0; i < imaplen; i += 3 * sizeof(u32)) {
+		soc_int = be32_to_cpup(imap);
+		if (soc_int > 31)
+			return -EINVAL;
+
+		cpu_ictl = of_find_node_by_phandle(be32_to_cpup(imap + 1));
+		if (!cpu_ictl)
+			return -EINVAL;
+		ret = of_property_read_u32(cpu_ictl, "#interrupt-cells", &tmp);
+		of_node_put(cpu_ictl);	/* drop the ref before any early return */
+		if (ret || tmp != 1)
+			return -EINVAL;
+
+		cpu_int = be32_to_cpup(imap + 2);
+		if (cpu_int > 7)
+			return -EINVAL;
+
+		if (!(mips_irqs_set & BIT(cpu_int))) {
+			irq_set_chained_handler_and_data(cpu_int, realtek_irq_dispatch,
+							 domain);
+			mips_irqs_set |= BIT(cpu_int);
+		}
+
+		regs[(soc_int * 4) / 32] |= cpu_int << (soc_int * 4) % 32;
+		imap += 3;
+	}
+
+	for (i = 0; i < 4; i++)
+		writel(regs[i], REG(irr_regs[i]));
+
+	return 0;
+}
+
+/* Probe: map registers, mask all inputs, create the domain, program routing. */
+static int __init realtek_rtl_of_init(struct device_node *node, struct device_node *parent)
+{
+	struct irq_domain *domain;
+	int ret;
+
+	realtek_ictl_base = of_iomap(node, 0);
+	if (!realtek_ictl_base)
+		return -ENXIO;
+
+	/* Disable all cascaded interrupts */
+	writel(0, REG(RTL_ICTL_GIMR));
+
+	/* map_interrupts() hands the domain to the chained handlers. */
+	domain = irq_domain_add_simple(node, 32, 0, &irq_domain_ops, NULL);
+	if (!domain)
+		return -ENOMEM;
+
+	ret = map_interrupts(node, domain);
+	if (ret)
+		pr_err("invalid interrupt map\n");
+	return ret;
+}
+
+IRQCHIP_DECLARE(realtek_rtl_intc, "realtek,rtl-intc", realtek_rtl_of_init);

From be1abc5ba4d2082df6749ab95ec6f87c4d3dbb23 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Thu, 4 Feb 2021 15:46:08 +0800
Subject: [PATCH 087/183] irqchip/csky-mpintc: Prevent selection on unsupported
 platforms

The irq-csky-mpintc driver is only supported on CPU_CK860 and
it will generate a compilation error when selected with CPU_CK610.

As it is already selected directly in the architecture Kconfig,
drop the option to select it manually.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
[maz: rewrote commit message]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210204074609.3553018-1-guoren@kernel.org
---
 drivers/irqchip/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index f95d114c63ed..030895cc6f13 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -427,7 +427,7 @@ config QCOM_PDC
 	  IRQs for Qualcomm Technologies Inc (QTI) mobile chips.
 
 config CSKY_MPINTC
-	bool "C-SKY Multi Processor Interrupt Controller"
+	bool
 	depends on CSKY
 	help
 	  Say yes here to enable C-SKY SMP interrupt controller driver used

From ee8f353b1591cef4a29cddeb379c1503559f474e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 4 Feb 2021 17:43:42 +0900
Subject: [PATCH 088/183] block: remove skd driver

The STEC S1120 PCIe SSD cards have been EOL since 2014 and are not supported by
the vendor anymore. As the skd driver for this SSD is starting to cause
problems with improvements to the block layer, stop supporting it in
newer kernel versions.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS               |    6 -
 drivers/block/Kconfig     |   10 -
 drivers/block/Makefile    |    2 -
 drivers/block/skd_main.c  | 3670 -------------------------------------
 drivers/block/skd_s1120.h |  322 ----
 5 files changed, 4010 deletions(-)
 delete mode 100644 drivers/block/skd_main.c
 delete mode 100644 drivers/block/skd_s1120.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 992fe3b0900a..f4766335189a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16938,12 +16938,6 @@ F:	include/linux/static_call*.h
 F:	kernel/jump_label.c
 F:	kernel/static_call.c
 
-STEC S1220 SKD DRIVER
-M:	Damien Le Moal <Damien.LeMoal@wdc.com>
-L:	linux-block@vger.kernel.org
-S:	Maintained
-F:	drivers/block/skd*[ch]
-
 STI AUDIO (ASoC) DRIVERS
 M:	Arnaud Pouliquen <arnaud.pouliquen@st.com>
 L:	alsa-devel@alsa-project.org (moderated for non-subscribers)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 583b671b1d2d..2779e85795a7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -267,16 +267,6 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
-config BLK_DEV_SKD
-	tristate "STEC S1120 Block Driver"
-	depends on PCI
-	depends on 64BIT
-	help
-	Saying Y or M here will enable support for the
-	STEC, Inc. S1120 PCIe SSD.
-
-	Use device /dev/skd$N amd /dev/skd$Np$M.
-
 config BLK_DEV_SX8
 	tristate "Promise SATA SX8 support"
 	depends on PCI
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index a3170859e01d..b501b8728fb9 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
-obj-$(CONFIG_BLK_DEV_SKD)	+= skd.o
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
@@ -43,5 +42,4 @@ obj-$(CONFIG_BLK_DEV_RNBD)	+= rnbd/
 
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk/
 
-skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
deleted file mode 100644
index a962b4551bed..000000000000
--- a/drivers/block/skd_main.c
+++ /dev/null
@@ -1,3670 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Driver for sTec s1120 PCIe SSDs. sTec was acquired in 2013 by HGST and HGST
- * was acquired by Western Digital in 2012.
- *
- * Copyright 2012 sTec, Inc.
- * Copyright (c) 2017 Western Digital Corporation or its affiliates.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <linux/compiler.h>
-#include <linux/workqueue.h>
-#include <linux/delay.h>
-#include <linux/time.h>
-#include <linux/hdreg.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/scatterlist.h>
-#include <linux/err.h>
-#include <linux/aer.h>
-#include <linux/wait.h>
-#include <linux/stringify.h>
-#include <scsi/scsi.h>
-#include <scsi/sg.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-#include <asm/unaligned.h>
-
-#include "skd_s1120.h"
-
-static int skd_dbg_level;
-static int skd_isr_comp_limit = 4;
-
-#define SKD_ASSERT(expr) \
-	do { \
-		if (unlikely(!(expr))) { \
-			pr_err("Assertion failed! %s,%s,%s,line=%d\n",	\
-			       # expr, __FILE__, __func__, __LINE__); \
-		} \
-	} while (0)
-
-#define DRV_NAME "skd"
-#define PFX DRV_NAME ": "
-
-MODULE_LICENSE("GPL");
-
-MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver");
-
-#define PCI_VENDOR_ID_STEC      0x1B39
-#define PCI_DEVICE_ID_S1120     0x0001
-
-#define SKD_FUA_NV		(1 << 1)
-#define SKD_MINORS_PER_DEVICE   16
-
-#define SKD_MAX_QUEUE_DEPTH     200u
-
-#define SKD_PAUSE_TIMEOUT       (5 * 1000)
-
-#define SKD_N_FITMSG_BYTES      (512u)
-#define SKD_MAX_REQ_PER_MSG	14
-
-#define SKD_N_SPECIAL_FITMSG_BYTES      (128u)
-
-/* SG elements are 32 bytes, so we can make this 4096 and still be under the
- * 128KB limit.  That allows 4096*4K = 16M xfer size
- */
-#define SKD_N_SG_PER_REQ_DEFAULT 256u
-
-#define SKD_N_COMPLETION_ENTRY  256u
-#define SKD_N_READ_CAP_BYTES    (8u)
-
-#define SKD_N_INTERNAL_BYTES    (512u)
-
-#define SKD_SKCOMP_SIZE							\
-	((sizeof(struct fit_completion_entry_v1) +			\
-	  sizeof(struct fit_comp_error_info)) * SKD_N_COMPLETION_ENTRY)
-
-/* 5 bits of uniqifier, 0xF800 */
-#define SKD_ID_TABLE_MASK       (3u << 8u)
-#define  SKD_ID_RW_REQUEST      (0u << 8u)
-#define  SKD_ID_INTERNAL        (1u << 8u)
-#define  SKD_ID_FIT_MSG         (3u << 8u)
-#define SKD_ID_SLOT_MASK        0x00FFu
-#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
-
-#define SKD_N_MAX_SECTORS 2048u
-
-#define SKD_MAX_RETRIES 2u
-
-#define SKD_TIMER_SECONDS(seconds) (seconds)
-#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
-
-#define INQ_STD_NBYTES 36
-
-enum skd_drvr_state {
-	SKD_DRVR_STATE_LOAD,
-	SKD_DRVR_STATE_IDLE,
-	SKD_DRVR_STATE_BUSY,
-	SKD_DRVR_STATE_STARTING,
-	SKD_DRVR_STATE_ONLINE,
-	SKD_DRVR_STATE_PAUSING,
-	SKD_DRVR_STATE_PAUSED,
-	SKD_DRVR_STATE_RESTARTING,
-	SKD_DRVR_STATE_RESUMING,
-	SKD_DRVR_STATE_STOPPING,
-	SKD_DRVR_STATE_FAULT,
-	SKD_DRVR_STATE_DISAPPEARED,
-	SKD_DRVR_STATE_PROTOCOL_MISMATCH,
-	SKD_DRVR_STATE_BUSY_ERASE,
-	SKD_DRVR_STATE_BUSY_SANITIZE,
-	SKD_DRVR_STATE_BUSY_IMMINENT,
-	SKD_DRVR_STATE_WAIT_BOOT,
-	SKD_DRVR_STATE_SYNCING,
-};
-
-#define SKD_WAIT_BOOT_TIMO      SKD_TIMER_SECONDS(90u)
-#define SKD_STARTING_TIMO       SKD_TIMER_SECONDS(8u)
-#define SKD_RESTARTING_TIMO     SKD_TIMER_MINUTES(4u)
-#define SKD_BUSY_TIMO           SKD_TIMER_MINUTES(20u)
-#define SKD_STARTED_BUSY_TIMO   SKD_TIMER_SECONDS(60u)
-#define SKD_START_WAIT_SECONDS  90u
-
-enum skd_req_state {
-	SKD_REQ_STATE_IDLE,
-	SKD_REQ_STATE_SETUP,
-	SKD_REQ_STATE_BUSY,
-	SKD_REQ_STATE_COMPLETED,
-	SKD_REQ_STATE_TIMEOUT,
-};
-
-enum skd_check_status_action {
-	SKD_CHECK_STATUS_REPORT_GOOD,
-	SKD_CHECK_STATUS_REPORT_SMART_ALERT,
-	SKD_CHECK_STATUS_REQUEUE_REQUEST,
-	SKD_CHECK_STATUS_REPORT_ERROR,
-	SKD_CHECK_STATUS_BUSY_IMMINENT,
-};
-
-struct skd_msg_buf {
-	struct fit_msg_hdr	fmh;
-	struct skd_scsi_request	scsi[SKD_MAX_REQ_PER_MSG];
-};
-
-struct skd_fitmsg_context {
-	u32 id;
-
-	u32 length;
-
-	struct skd_msg_buf *msg_buf;
-	dma_addr_t mb_dma_address;
-};
-
-struct skd_request_context {
-	enum skd_req_state state;
-
-	u16 id;
-	u32 fitmsg_id;
-
-	u8 flush_cmd;
-
-	enum dma_data_direction data_dir;
-	struct scatterlist *sg;
-	u32 n_sg;
-	u32 sg_byte_count;
-
-	struct fit_sg_descriptor *sksg_list;
-	dma_addr_t sksg_dma_address;
-
-	struct fit_completion_entry_v1 completion;
-
-	struct fit_comp_error_info err_info;
-	int retries;
-
-	blk_status_t status;
-};
-
-struct skd_special_context {
-	struct skd_request_context req;
-
-	void *data_buf;
-	dma_addr_t db_dma_address;
-
-	struct skd_msg_buf *msg_buf;
-	dma_addr_t mb_dma_address;
-};
-
-typedef enum skd_irq_type {
-	SKD_IRQ_LEGACY,
-	SKD_IRQ_MSI,
-	SKD_IRQ_MSIX
-} skd_irq_type_t;
-
-#define SKD_MAX_BARS                    2
-
-struct skd_device {
-	void __iomem *mem_map[SKD_MAX_BARS];
-	resource_size_t mem_phys[SKD_MAX_BARS];
-	u32 mem_size[SKD_MAX_BARS];
-
-	struct skd_msix_entry *msix_entries;
-
-	struct pci_dev *pdev;
-	int pcie_error_reporting_is_enabled;
-
-	spinlock_t lock;
-	struct gendisk *disk;
-	struct blk_mq_tag_set tag_set;
-	struct request_queue *queue;
-	struct skd_fitmsg_context *skmsg;
-	struct device *class_dev;
-	int gendisk_on;
-	int sync_done;
-
-	u32 devno;
-	u32 major;
-	char isr_name[30];
-
-	enum skd_drvr_state state;
-	u32 drive_state;
-
-	u32 cur_max_queue_depth;
-	u32 queue_low_water_mark;
-	u32 dev_max_queue_depth;
-
-	u32 num_fitmsg_context;
-	u32 num_req_context;
-
-	struct skd_fitmsg_context *skmsg_table;
-
-	struct skd_special_context internal_skspcl;
-	u32 read_cap_blocksize;
-	u32 read_cap_last_lba;
-	int read_cap_is_valid;
-	int inquiry_is_valid;
-	u8 inq_serial_num[13];  /*12 chars plus null term */
-
-	u8 skcomp_cycle;
-	u32 skcomp_ix;
-	struct kmem_cache *msgbuf_cache;
-	struct kmem_cache *sglist_cache;
-	struct kmem_cache *databuf_cache;
-	struct fit_completion_entry_v1 *skcomp_table;
-	struct fit_comp_error_info *skerr_table;
-	dma_addr_t cq_dma_address;
-
-	wait_queue_head_t waitq;
-
-	struct timer_list timer;
-	u32 timer_countdown;
-	u32 timer_substate;
-
-	int sgs_per_request;
-	u32 last_mtd;
-
-	u32 proto_ver;
-
-	int dbg_level;
-	u32 connect_time_stamp;
-	int connect_retries;
-#define SKD_MAX_CONNECT_RETRIES 16
-	u32 drive_jiffies;
-
-	u32 timo_slot;
-
-	struct work_struct start_queue;
-	struct work_struct completion_worker;
-};
-
-#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
-#define SKD_READL(DEV, OFF)      skd_reg_read32(DEV, OFF)
-#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
-
-static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
-{
-	u32 val = readl(skdev->mem_map[1] + offset);
-
-	if (unlikely(skdev->dbg_level >= 2))
-		dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val);
-	return val;
-}
-
-static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
-				   u32 offset)
-{
-	writel(val, skdev->mem_map[1] + offset);
-	if (unlikely(skdev->dbg_level >= 2))
-		dev_dbg(&skdev->pdev->dev, "offset %x = %x\n", offset, val);
-}
-
-static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
-				   u32 offset)
-{
-	writeq(val, skdev->mem_map[1] + offset);
-	if (unlikely(skdev->dbg_level >= 2))
-		dev_dbg(&skdev->pdev->dev, "offset %x = %016llx\n", offset,
-			val);
-}
-
-
-#define SKD_IRQ_DEFAULT SKD_IRQ_MSIX
-static int skd_isr_type = SKD_IRQ_DEFAULT;
-
-module_param(skd_isr_type, int, 0444);
-MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability."
-		 " (0==legacy, 1==MSI, 2==MSI-X, default==1)");
-
-#define SKD_MAX_REQ_PER_MSG_DEFAULT 1
-static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
-
-module_param(skd_max_req_per_msg, int, 0444);
-MODULE_PARM_DESC(skd_max_req_per_msg,
-		 "Maximum SCSI requests packed in a single message."
-		 " (1-" __stringify(SKD_MAX_REQ_PER_MSG) ", default==1)");
-
-#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64
-#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64"
-static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
-
-module_param(skd_max_queue_depth, int, 0444);
-MODULE_PARM_DESC(skd_max_queue_depth,
-		 "Maximum SCSI requests issued to s1120."
-		 " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")");
-
-static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
-module_param(skd_sgs_per_request, int, 0444);
-MODULE_PARM_DESC(skd_sgs_per_request,
-		 "Maximum SG elements per block request."
-		 " (1-4096, default==256)");
-
-static int skd_max_pass_thru = 1;
-module_param(skd_max_pass_thru, int, 0444);
-MODULE_PARM_DESC(skd_max_pass_thru,
-		 "Maximum SCSI pass-thru at a time. IGNORED");
-
-module_param(skd_dbg_level, int, 0444);
-MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
-
-module_param(skd_isr_comp_limit, int, 0444);
-MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
-
-/* Major device number dynamically assigned. */
-static u32 skd_major;
-
-static void skd_destruct(struct skd_device *skdev);
-static const struct block_device_operations skd_blockdev_ops;
-static void skd_send_fitmsg(struct skd_device *skdev,
-			    struct skd_fitmsg_context *skmsg);
-static void skd_send_special_fitmsg(struct skd_device *skdev,
-				    struct skd_special_context *skspcl);
-static bool skd_preop_sg_list(struct skd_device *skdev,
-			     struct skd_request_context *skreq);
-static void skd_postop_sg_list(struct skd_device *skdev,
-			       struct skd_request_context *skreq);
-
-static void skd_restart_device(struct skd_device *skdev);
-static int skd_quiesce_dev(struct skd_device *skdev);
-static int skd_unquiesce_dev(struct skd_device *skdev);
-static void skd_disable_interrupts(struct skd_device *skdev);
-static void skd_isr_fwstate(struct skd_device *skdev);
-static void skd_recover_requests(struct skd_device *skdev);
-static void skd_soft_reset(struct skd_device *skdev);
-
-const char *skd_drive_state_to_str(int state);
-const char *skd_skdev_state_to_str(enum skd_drvr_state state);
-static void skd_log_skdev(struct skd_device *skdev, const char *event);
-static void skd_log_skreq(struct skd_device *skdev,
-			  struct skd_request_context *skreq, const char *event);
-
-/*
- *****************************************************************************
- * READ/WRITE REQUESTS
- *****************************************************************************
- */
-static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved)
-{
-	int *count = data;
-
-	count++;
-	return true;
-}
-
-static int skd_in_flight(struct skd_device *skdev)
-{
-	int count = 0;
-
-	blk_mq_tagset_busy_iter(&skdev->tag_set, skd_inc_in_flight, &count);
-
-	return count;
-}
-
-static void
-skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
-		int data_dir, unsigned lba,
-		unsigned count)
-{
-	if (data_dir == READ)
-		scsi_req->cdb[0] = READ_10;
-	else
-		scsi_req->cdb[0] = WRITE_10;
-
-	scsi_req->cdb[1] = 0;
-	scsi_req->cdb[2] = (lba & 0xff000000) >> 24;
-	scsi_req->cdb[3] = (lba & 0xff0000) >> 16;
-	scsi_req->cdb[4] = (lba & 0xff00) >> 8;
-	scsi_req->cdb[5] = (lba & 0xff);
-	scsi_req->cdb[6] = 0;
-	scsi_req->cdb[7] = (count & 0xff00) >> 8;
-	scsi_req->cdb[8] = count & 0xff;
-	scsi_req->cdb[9] = 0;
-}
-
-static void
-skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
-			    struct skd_request_context *skreq)
-{
-	skreq->flush_cmd = 1;
-
-	scsi_req->cdb[0] = SYNCHRONIZE_CACHE;
-	scsi_req->cdb[1] = 0;
-	scsi_req->cdb[2] = 0;
-	scsi_req->cdb[3] = 0;
-	scsi_req->cdb[4] = 0;
-	scsi_req->cdb[5] = 0;
-	scsi_req->cdb[6] = 0;
-	scsi_req->cdb[7] = 0;
-	scsi_req->cdb[8] = 0;
-	scsi_req->cdb[9] = 0;
-}
-
-/*
- * Return true if and only if all pending requests should be failed.
- */
-static bool skd_fail_all(struct request_queue *q)
-{
-	struct skd_device *skdev = q->queuedata;
-
-	SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE);
-
-	skd_log_skdev(skdev, "req_not_online");
-	switch (skdev->state) {
-	case SKD_DRVR_STATE_PAUSING:
-	case SKD_DRVR_STATE_PAUSED:
-	case SKD_DRVR_STATE_STARTING:
-	case SKD_DRVR_STATE_RESTARTING:
-	case SKD_DRVR_STATE_WAIT_BOOT:
-	/* In case of starting, we haven't started the queue,
-	 * so we can't get here... but requests are
-	 * possibly hanging out waiting for us because we
-	 * reported the dev/skd0 already.  They'll wait
-	 * forever if connect doesn't complete.
-	 * What to do??? delay dev/skd0 ??
-	 */
-	case SKD_DRVR_STATE_BUSY:
-	case SKD_DRVR_STATE_BUSY_IMMINENT:
-	case SKD_DRVR_STATE_BUSY_ERASE:
-		return false;
-
-	case SKD_DRVR_STATE_BUSY_SANITIZE:
-	case SKD_DRVR_STATE_STOPPING:
-	case SKD_DRVR_STATE_SYNCING:
-	case SKD_DRVR_STATE_FAULT:
-	case SKD_DRVR_STATE_DISAPPEARED:
-	default:
-		return true;
-	}
-}
-
-static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
-				    const struct blk_mq_queue_data *mqd)
-{
-	struct request *const req = mqd->rq;
-	struct request_queue *const q = req->q;
-	struct skd_device *skdev = q->queuedata;
-	struct skd_fitmsg_context *skmsg;
-	struct fit_msg_hdr *fmh;
-	const u32 tag = blk_mq_unique_tag(req);
-	struct skd_request_context *const skreq = blk_mq_rq_to_pdu(req);
-	struct skd_scsi_request *scsi_req;
-	unsigned long flags = 0;
-	const u32 lba = blk_rq_pos(req);
-	const u32 count = blk_rq_sectors(req);
-	const int data_dir = rq_data_dir(req);
-
-	if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE))
-		return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE;
-
-	if (!(req->rq_flags & RQF_DONTPREP)) {
-		skreq->retries = 0;
-		req->rq_flags |= RQF_DONTPREP;
-	}
-
-	blk_mq_start_request(req);
-
-	WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n",
-		  tag, skd_max_queue_depth, q->nr_requests);
-
-	SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
-
-	dev_dbg(&skdev->pdev->dev,
-		"new req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba,
-		lba, count, count, data_dir);
-
-	skreq->id = tag + SKD_ID_RW_REQUEST;
-	skreq->flush_cmd = 0;
-	skreq->n_sg = 0;
-	skreq->sg_byte_count = 0;
-
-	skreq->fitmsg_id = 0;
-
-	skreq->data_dir = data_dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-
-	if (req->bio && !skd_preop_sg_list(skdev, skreq)) {
-		dev_dbg(&skdev->pdev->dev, "error Out\n");
-		skreq->status = BLK_STS_RESOURCE;
-		blk_mq_complete_request(req);
-		return BLK_STS_OK;
-	}
-
-	dma_sync_single_for_device(&skdev->pdev->dev, skreq->sksg_dma_address,
-				   skreq->n_sg *
-				   sizeof(struct fit_sg_descriptor),
-				   DMA_TO_DEVICE);
-
-	/* Either a FIT msg is in progress or we have to start one. */
-	if (skd_max_req_per_msg == 1) {
-		skmsg = NULL;
-	} else {
-		spin_lock_irqsave(&skdev->lock, flags);
-		skmsg = skdev->skmsg;
-	}
-	if (!skmsg) {
-		skmsg = &skdev->skmsg_table[tag];
-		skdev->skmsg = skmsg;
-
-		/* Initialize the FIT msg header */
-		fmh = &skmsg->msg_buf->fmh;
-		memset(fmh, 0, sizeof(*fmh));
-		fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
-		skmsg->length = sizeof(*fmh);
-	} else {
-		fmh = &skmsg->msg_buf->fmh;
-	}
-
-	skreq->fitmsg_id = skmsg->id;
-
-	scsi_req = &skmsg->msg_buf->scsi[fmh->num_protocol_cmds_coalesced];
-	memset(scsi_req, 0, sizeof(*scsi_req));
-
-	scsi_req->hdr.tag = skreq->id;
-	scsi_req->hdr.sg_list_dma_address =
-		cpu_to_be64(skreq->sksg_dma_address);
-
-	if (req_op(req) == REQ_OP_FLUSH) {
-		skd_prep_zerosize_flush_cdb(scsi_req, skreq);
-		SKD_ASSERT(skreq->flush_cmd == 1);
-	} else {
-		skd_prep_rw_cdb(scsi_req, data_dir, lba, count);
-	}
-
-	if (req->cmd_flags & REQ_FUA)
-		scsi_req->cdb[1] |= SKD_FUA_NV;
-
-	scsi_req->hdr.sg_list_len_bytes = cpu_to_be32(skreq->sg_byte_count);
-
-	/* Complete resource allocations. */
-	skreq->state = SKD_REQ_STATE_BUSY;
-
-	skmsg->length += sizeof(struct skd_scsi_request);
-	fmh->num_protocol_cmds_coalesced++;
-
-	dev_dbg(&skdev->pdev->dev, "req=0x%x busy=%d\n", skreq->id,
-		skd_in_flight(skdev));
-
-	/*
-	 * If the FIT msg buffer is full send it.
-	 */
-	if (skd_max_req_per_msg == 1) {
-		skd_send_fitmsg(skdev, skmsg);
-	} else {
-		if (mqd->last ||
-		    fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) {
-			skd_send_fitmsg(skdev, skmsg);
-			skdev->skmsg = NULL;
-		}
-		spin_unlock_irqrestore(&skdev->lock, flags);
-	}
-
-	return BLK_STS_OK;
-}
-
-static enum blk_eh_timer_return skd_timed_out(struct request *req,
-					      bool reserved)
-{
-	struct skd_device *skdev = req->q->queuedata;
-
-	dev_err(&skdev->pdev->dev, "request with tag %#x timed out\n",
-		blk_mq_unique_tag(req));
-
-	return BLK_EH_RESET_TIMER;
-}
-
-static void skd_complete_rq(struct request *req)
-{
-	struct skd_request_context *skreq = blk_mq_rq_to_pdu(req);
-
-	blk_mq_end_request(req, skreq->status);
-}
-
-static bool skd_preop_sg_list(struct skd_device *skdev,
-			     struct skd_request_context *skreq)
-{
-	struct request *req = blk_mq_rq_from_pdu(skreq);
-	struct scatterlist *sgl = &skreq->sg[0], *sg;
-	int n_sg;
-	int i;
-
-	skreq->sg_byte_count = 0;
-
-	WARN_ON_ONCE(skreq->data_dir != DMA_TO_DEVICE &&
-		     skreq->data_dir != DMA_FROM_DEVICE);
-
-	n_sg = blk_rq_map_sg(skdev->queue, req, sgl);
-	if (n_sg <= 0)
-		return false;
-
-	/*
-	 * Map scatterlist to PCI bus addresses.
-	 * Note PCI might change the number of entries.
-	 */
-	n_sg = dma_map_sg(&skdev->pdev->dev, sgl, n_sg, skreq->data_dir);
-	if (n_sg <= 0)
-		return false;
-
-	SKD_ASSERT(n_sg <= skdev->sgs_per_request);
-
-	skreq->n_sg = n_sg;
-
-	for_each_sg(sgl, sg, n_sg, i) {
-		struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
-		u32 cnt = sg_dma_len(sg);
-		uint64_t dma_addr = sg_dma_address(sg);
-
-		sgd->control = FIT_SGD_CONTROL_NOT_LAST;
-		sgd->byte_count = cnt;
-		skreq->sg_byte_count += cnt;
-		sgd->host_side_addr = dma_addr;
-		sgd->dev_side_addr = 0;
-	}
-
-	skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
-	skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
-
-	if (unlikely(skdev->dbg_level > 1)) {
-		dev_dbg(&skdev->pdev->dev,
-			"skreq=%x sksg_list=%p sksg_dma=%pad\n",
-			skreq->id, skreq->sksg_list, &skreq->sksg_dma_address);
-		for (i = 0; i < n_sg; i++) {
-			struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
-
-			dev_dbg(&skdev->pdev->dev,
-				"  sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
-		}
-	}
-
-	return true;
-}
-
-static void skd_postop_sg_list(struct skd_device *skdev,
-			       struct skd_request_context *skreq)
-{
-	/*
-	 * restore the next ptr for next IO request so we
-	 * don't have to set it every time.
-	 */
-	skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
-		skreq->sksg_dma_address +
-		((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
-	dma_unmap_sg(&skdev->pdev->dev, &skreq->sg[0], skreq->n_sg,
-		     skreq->data_dir);
-}
-
-/*
- *****************************************************************************
- * TIMER
- *****************************************************************************
- */
-
-static void skd_timer_tick_not_online(struct skd_device *skdev);
-
-static void skd_start_queue(struct work_struct *work)
-{
-	struct skd_device *skdev = container_of(work, typeof(*skdev),
-						start_queue);
-
-	/*
-	 * Although it is safe to call blk_start_queue() from interrupt
-	 * context, blk_mq_start_hw_queues() must not be called from
-	 * interrupt context.
-	 */
-	blk_mq_start_hw_queues(skdev->queue);
-}
-
-static void skd_timer_tick(struct timer_list *t)
-{
-	struct skd_device *skdev = from_timer(skdev, t, timer);
-	unsigned long reqflags;
-	u32 state;
-
-	if (skdev->state == SKD_DRVR_STATE_FAULT)
-		/* The driver has declared fault, and we want it to
-		 * stay that way until driver is reloaded.
-		 */
-		return;
-
-	spin_lock_irqsave(&skdev->lock, reqflags);
-
-	state = SKD_READL(skdev, FIT_STATUS);
-	state &= FIT_SR_DRIVE_STATE_MASK;
-	if (state != skdev->drive_state)
-		skd_isr_fwstate(skdev);
-
-	if (skdev->state != SKD_DRVR_STATE_ONLINE)
-		skd_timer_tick_not_online(skdev);
-
-	mod_timer(&skdev->timer, (jiffies + HZ));
-
-	spin_unlock_irqrestore(&skdev->lock, reqflags);
-}
-
-static void skd_timer_tick_not_online(struct skd_device *skdev)
-{
-	switch (skdev->state) {
-	case SKD_DRVR_STATE_IDLE:
-	case SKD_DRVR_STATE_LOAD:
-		break;
-	case SKD_DRVR_STATE_BUSY_SANITIZE:
-		dev_dbg(&skdev->pdev->dev,
-			"drive busy sanitize[%x], driver[%x]\n",
-			skdev->drive_state, skdev->state);
-		/* If we've been in sanitize for 3 seconds, we figure we're not
-		 * going to get anymore completions, so recover requests now
-		 */
-		if (skdev->timer_countdown > 0) {
-			skdev->timer_countdown--;
-			return;
-		}
-		skd_recover_requests(skdev);
-		break;
-
-	case SKD_DRVR_STATE_BUSY:
-	case SKD_DRVR_STATE_BUSY_IMMINENT:
-	case SKD_DRVR_STATE_BUSY_ERASE:
-		dev_dbg(&skdev->pdev->dev, "busy[%x], countdown=%d\n",
-			skdev->state, skdev->timer_countdown);
-		if (skdev->timer_countdown > 0) {
-			skdev->timer_countdown--;
-			return;
-		}
-		dev_dbg(&skdev->pdev->dev,
-			"busy[%x], timedout=%d, restarting device.",
-			skdev->state, skdev->timer_countdown);
-		skd_restart_device(skdev);
-		break;
-
-	case SKD_DRVR_STATE_WAIT_BOOT:
-	case SKD_DRVR_STATE_STARTING:
-		if (skdev->timer_countdown > 0) {
-			skdev->timer_countdown--;
-			return;
-		}
-		/* For now, we fault the drive.  Could attempt resets to
-		 * revcover at some point. */
-		skdev->state = SKD_DRVR_STATE_FAULT;
-
-		dev_err(&skdev->pdev->dev, "DriveFault Connect Timeout (%x)\n",
-			skdev->drive_state);
-
-		/*start the queue so we can respond with error to requests */
-		/* wakeup anyone waiting for startup complete */
-		schedule_work(&skdev->start_queue);
-		skdev->gendisk_on = -1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	case SKD_DRVR_STATE_ONLINE:
-		/* shouldn't get here. */
-		break;
-
-	case SKD_DRVR_STATE_PAUSING:
-	case SKD_DRVR_STATE_PAUSED:
-		break;
-
-	case SKD_DRVR_STATE_RESTARTING:
-		if (skdev->timer_countdown > 0) {
-			skdev->timer_countdown--;
-			return;
-		}
-		/* For now, we fault the drive. Could attempt resets to
-		 * revcover at some point. */
-		skdev->state = SKD_DRVR_STATE_FAULT;
-		dev_err(&skdev->pdev->dev,
-			"DriveFault Reconnect Timeout (%x)\n",
-			skdev->drive_state);
-
-		/*
-		 * Recovering does two things:
-		 * 1. completes IO with error
-		 * 2. reclaims dma resources
-		 * When is it safe to recover requests?
-		 * - if the drive state is faulted
-		 * - if the state is still soft reset after out timeout
-		 * - if the drive registers are dead (state = FF)
-		 * If it is "unsafe", we still need to recover, so we will
-		 * disable pci bus mastering and disable our interrupts.
-		 */
-
-		if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) ||
-		    (skdev->drive_state == FIT_SR_DRIVE_FAULT) ||
-		    (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK))
-			/* It never came out of soft reset. Try to
-			 * recover the requests and then let them
-			 * fail. This is to mitigate hung processes. */
-			skd_recover_requests(skdev);
-		else {
-			dev_err(&skdev->pdev->dev, "Disable BusMaster (%x)\n",
-				skdev->drive_state);
-			pci_disable_device(skdev->pdev);
-			skd_disable_interrupts(skdev);
-			skd_recover_requests(skdev);
-		}
-
-		/*start the queue so we can respond with error to requests */
-		/* wakeup anyone waiting for startup complete */
-		schedule_work(&skdev->start_queue);
-		skdev->gendisk_on = -1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	case SKD_DRVR_STATE_RESUMING:
-	case SKD_DRVR_STATE_STOPPING:
-	case SKD_DRVR_STATE_SYNCING:
-	case SKD_DRVR_STATE_FAULT:
-	case SKD_DRVR_STATE_DISAPPEARED:
-	default:
-		break;
-	}
-}
-
-static int skd_start_timer(struct skd_device *skdev)
-{
-	int rc;
-
-	timer_setup(&skdev->timer, skd_timer_tick, 0);
-
-	rc = mod_timer(&skdev->timer, (jiffies + HZ));
-	if (rc)
-		dev_err(&skdev->pdev->dev, "failed to start timer %d\n", rc);
-	return rc;
-}
-
-static void skd_kill_timer(struct skd_device *skdev)
-{
-	del_timer_sync(&skdev->timer);
-}
-
-/*
- *****************************************************************************
- * INTERNAL REQUESTS -- generated by driver itself
- *****************************************************************************
- */
-
-static int skd_format_internal_skspcl(struct skd_device *skdev)
-{
-	struct skd_special_context *skspcl = &skdev->internal_skspcl;
-	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
-	struct fit_msg_hdr *fmh;
-	uint64_t dma_address;
-	struct skd_scsi_request *scsi;
-
-	fmh = &skspcl->msg_buf->fmh;
-	fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
-	fmh->num_protocol_cmds_coalesced = 1;
-
-	scsi = &skspcl->msg_buf->scsi[0];
-	memset(scsi, 0, sizeof(*scsi));
-	dma_address = skspcl->req.sksg_dma_address;
-	scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address);
-	skspcl->req.n_sg = 1;
-	sgd->control = FIT_SGD_CONTROL_LAST;
-	sgd->byte_count = 0;
-	sgd->host_side_addr = skspcl->db_dma_address;
-	sgd->dev_side_addr = 0;
-	sgd->next_desc_ptr = 0LL;
-
-	return 1;
-}
-
-#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES
-
-static void skd_send_internal_skspcl(struct skd_device *skdev,
-				     struct skd_special_context *skspcl,
-				     u8 opcode)
-{
-	struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
-	struct skd_scsi_request *scsi;
-	unsigned char *buf = skspcl->data_buf;
-	int i;
-
-	if (skspcl->req.state != SKD_REQ_STATE_IDLE)
-		/*
-		 * A refresh is already in progress.
-		 * Just wait for it to finish.
-		 */
-		return;
-
-	skspcl->req.state = SKD_REQ_STATE_BUSY;
-
-	scsi = &skspcl->msg_buf->scsi[0];
-	scsi->hdr.tag = skspcl->req.id;
-
-	memset(scsi->cdb, 0, sizeof(scsi->cdb));
-
-	switch (opcode) {
-	case TEST_UNIT_READY:
-		scsi->cdb[0] = TEST_UNIT_READY;
-		sgd->byte_count = 0;
-		scsi->hdr.sg_list_len_bytes = 0;
-		break;
-
-	case READ_CAPACITY:
-		scsi->cdb[0] = READ_CAPACITY;
-		sgd->byte_count = SKD_N_READ_CAP_BYTES;
-		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
-		break;
-
-	case INQUIRY:
-		scsi->cdb[0] = INQUIRY;
-		scsi->cdb[1] = 0x01;    /* evpd */
-		scsi->cdb[2] = 0x80;    /* serial number page */
-		scsi->cdb[4] = 0x10;
-		sgd->byte_count = 16;
-		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
-		break;
-
-	case SYNCHRONIZE_CACHE:
-		scsi->cdb[0] = SYNCHRONIZE_CACHE;
-		sgd->byte_count = 0;
-		scsi->hdr.sg_list_len_bytes = 0;
-		break;
-
-	case WRITE_BUFFER:
-		scsi->cdb[0] = WRITE_BUFFER;
-		scsi->cdb[1] = 0x02;
-		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
-		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
-		sgd->byte_count = WR_BUF_SIZE;
-		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
-		/* fill incrementing byte pattern */
-		for (i = 0; i < sgd->byte_count; i++)
-			buf[i] = i & 0xFF;
-		break;
-
-	case READ_BUFFER:
-		scsi->cdb[0] = READ_BUFFER;
-		scsi->cdb[1] = 0x02;
-		scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
-		scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
-		sgd->byte_count = WR_BUF_SIZE;
-		scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
-		memset(skspcl->data_buf, 0, sgd->byte_count);
-		break;
-
-	default:
-		SKD_ASSERT("Don't know what to send");
-		return;
-
-	}
-	skd_send_special_fitmsg(skdev, skspcl);
-}
-
-static void skd_refresh_device_data(struct skd_device *skdev)
-{
-	struct skd_special_context *skspcl = &skdev->internal_skspcl;
-
-	skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY);
-}
-
-static int skd_chk_read_buf(struct skd_device *skdev,
-			    struct skd_special_context *skspcl)
-{
-	unsigned char *buf = skspcl->data_buf;
-	int i;
-
-	/* check for incrementing byte pattern */
-	for (i = 0; i < WR_BUF_SIZE; i++)
-		if (buf[i] != (i & 0xFF))
-			return 1;
-
-	return 0;
-}
-
-static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key,
-				 u8 code, u8 qual, u8 fruc)
-{
-	/* If the check condition is of special interest, log a message */
-	if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02)
-	    && (code == 0x04) && (qual == 0x06)) {
-		dev_err(&skdev->pdev->dev,
-			"*** LOST_WRITE_DATA ERROR *** key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
-			key, code, qual, fruc);
-	}
-}
-
-static void skd_complete_internal(struct skd_device *skdev,
-				  struct fit_completion_entry_v1 *skcomp,
-				  struct fit_comp_error_info *skerr,
-				  struct skd_special_context *skspcl)
-{
-	u8 *buf = skspcl->data_buf;
-	u8 status;
-	int i;
-	struct skd_scsi_request *scsi = &skspcl->msg_buf->scsi[0];
-
-	lockdep_assert_held(&skdev->lock);
-
-	SKD_ASSERT(skspcl == &skdev->internal_skspcl);
-
-	dev_dbg(&skdev->pdev->dev, "complete internal %x\n", scsi->cdb[0]);
-
-	dma_sync_single_for_cpu(&skdev->pdev->dev,
-				skspcl->db_dma_address,
-				skspcl->req.sksg_list[0].byte_count,
-				DMA_BIDIRECTIONAL);
-
-	skspcl->req.completion = *skcomp;
-	skspcl->req.state = SKD_REQ_STATE_IDLE;
-
-	status = skspcl->req.completion.status;
-
-	skd_log_check_status(skdev, status, skerr->key, skerr->code,
-			     skerr->qual, skerr->fruc);
-
-	switch (scsi->cdb[0]) {
-	case TEST_UNIT_READY:
-		if (status == SAM_STAT_GOOD)
-			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
-		else if ((status == SAM_STAT_CHECK_CONDITION) &&
-			 (skerr->key == MEDIUM_ERROR))
-			skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
-		else {
-			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				dev_dbg(&skdev->pdev->dev,
-					"TUR failed, don't send anymore state 0x%x\n",
-					skdev->state);
-				return;
-			}
-			dev_dbg(&skdev->pdev->dev,
-				"**** TUR failed, retry skerr\n");
-			skd_send_internal_skspcl(skdev, skspcl,
-						 TEST_UNIT_READY);
-		}
-		break;
-
-	case WRITE_BUFFER:
-		if (status == SAM_STAT_GOOD)
-			skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
-		else {
-			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				dev_dbg(&skdev->pdev->dev,
-					"write buffer failed, don't send anymore state 0x%x\n",
-					skdev->state);
-				return;
-			}
-			dev_dbg(&skdev->pdev->dev,
-				"**** write buffer failed, retry skerr\n");
-			skd_send_internal_skspcl(skdev, skspcl,
-						 TEST_UNIT_READY);
-		}
-		break;
-
-	case READ_BUFFER:
-		if (status == SAM_STAT_GOOD) {
-			if (skd_chk_read_buf(skdev, skspcl) == 0)
-				skd_send_internal_skspcl(skdev, skspcl,
-							 READ_CAPACITY);
-			else {
-				dev_err(&skdev->pdev->dev,
-					"*** W/R Buffer mismatch %d ***\n",
-					skdev->connect_retries);
-				if (skdev->connect_retries <
-				    SKD_MAX_CONNECT_RETRIES) {
-					skdev->connect_retries++;
-					skd_soft_reset(skdev);
-				} else {
-					dev_err(&skdev->pdev->dev,
-						"W/R Buffer Connect Error\n");
-					return;
-				}
-			}
-
-		} else {
-			if (skdev->state == SKD_DRVR_STATE_STOPPING) {
-				dev_dbg(&skdev->pdev->dev,
-					"read buffer failed, don't send anymore state 0x%x\n",
-					skdev->state);
-				return;
-			}
-			dev_dbg(&skdev->pdev->dev,
-				"**** read buffer failed, retry skerr\n");
-			skd_send_internal_skspcl(skdev, skspcl,
-						 TEST_UNIT_READY);
-		}
-		break;
-
-	case READ_CAPACITY:
-		skdev->read_cap_is_valid = 0;
-		if (status == SAM_STAT_GOOD) {
-			skdev->read_cap_last_lba =
-				(buf[0] << 24) | (buf[1] << 16) |
-				(buf[2] << 8) | buf[3];
-			skdev->read_cap_blocksize =
-				(buf[4] << 24) | (buf[5] << 16) |
-				(buf[6] << 8) | buf[7];
-
-			dev_dbg(&skdev->pdev->dev, "last lba %d, bs %d\n",
-				skdev->read_cap_last_lba,
-				skdev->read_cap_blocksize);
-
-			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
-
-			skdev->read_cap_is_valid = 1;
-
-			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
-		} else if ((status == SAM_STAT_CHECK_CONDITION) &&
-			   (skerr->key == MEDIUM_ERROR)) {
-			skdev->read_cap_last_lba = ~0;
-			set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
-			dev_dbg(&skdev->pdev->dev, "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n");
-			skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
-		} else {
-			dev_dbg(&skdev->pdev->dev, "**** READCAP failed, retry TUR\n");
-			skd_send_internal_skspcl(skdev, skspcl,
-						 TEST_UNIT_READY);
-		}
-		break;
-
-	case INQUIRY:
-		skdev->inquiry_is_valid = 0;
-		if (status == SAM_STAT_GOOD) {
-			skdev->inquiry_is_valid = 1;
-
-			for (i = 0; i < 12; i++)
-				skdev->inq_serial_num[i] = buf[i + 4];
-			skdev->inq_serial_num[12] = 0;
-		}
-
-		if (skd_unquiesce_dev(skdev) < 0)
-			dev_dbg(&skdev->pdev->dev, "**** failed, to ONLINE device\n");
-		 /* connection is complete */
-		skdev->connect_retries = 0;
-		break;
-
-	case SYNCHRONIZE_CACHE:
-		if (status == SAM_STAT_GOOD)
-			skdev->sync_done = 1;
-		else
-			skdev->sync_done = -1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	default:
-		SKD_ASSERT("we didn't send this");
-	}
-}
-
-/*
- *****************************************************************************
- * FIT MESSAGES
- *****************************************************************************
- */
-
-static void skd_send_fitmsg(struct skd_device *skdev,
-			    struct skd_fitmsg_context *skmsg)
-{
-	u64 qcmd;
-
-	dev_dbg(&skdev->pdev->dev, "dma address %pad, busy=%d\n",
-		&skmsg->mb_dma_address, skd_in_flight(skdev));
-	dev_dbg(&skdev->pdev->dev, "msg_buf %p\n", skmsg->msg_buf);
-
-	qcmd = skmsg->mb_dma_address;
-	qcmd |= FIT_QCMD_QID_NORMAL;
-
-	if (unlikely(skdev->dbg_level > 1)) {
-		u8 *bp = (u8 *)skmsg->msg_buf;
-		int i;
-		for (i = 0; i < skmsg->length; i += 8) {
-			dev_dbg(&skdev->pdev->dev, "msg[%2d] %8ph\n", i,
-				&bp[i]);
-			if (i == 0)
-				i = 64 - 8;
-		}
-	}
-
-	if (skmsg->length > 256)
-		qcmd |= FIT_QCMD_MSGSIZE_512;
-	else if (skmsg->length > 128)
-		qcmd |= FIT_QCMD_MSGSIZE_256;
-	else if (skmsg->length > 64)
-		qcmd |= FIT_QCMD_MSGSIZE_128;
-	else
-		/*
-		 * This makes no sense because the FIT msg header is
-		 * 64 bytes. If the msg is only 64 bytes long it has
-		 * no payload.
-		 */
-		qcmd |= FIT_QCMD_MSGSIZE_64;
-
-	dma_sync_single_for_device(&skdev->pdev->dev, skmsg->mb_dma_address,
-				   skmsg->length, DMA_TO_DEVICE);
-
-	/* Make sure skd_msg_buf is written before the doorbell is triggered. */
-	smp_wmb();
-
-	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
-}
-
-static void skd_send_special_fitmsg(struct skd_device *skdev,
-				    struct skd_special_context *skspcl)
-{
-	u64 qcmd;
-
-	WARN_ON_ONCE(skspcl->req.n_sg != 1);
-
-	if (unlikely(skdev->dbg_level > 1)) {
-		u8 *bp = (u8 *)skspcl->msg_buf;
-		int i;
-
-		for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
-			dev_dbg(&skdev->pdev->dev, " spcl[%2d] %8ph\n", i,
-				&bp[i]);
-			if (i == 0)
-				i = 64 - 8;
-		}
-
-		dev_dbg(&skdev->pdev->dev,
-			"skspcl=%p id=%04x sksg_list=%p sksg_dma=%pad\n",
-			skspcl, skspcl->req.id, skspcl->req.sksg_list,
-			&skspcl->req.sksg_dma_address);
-		for (i = 0; i < skspcl->req.n_sg; i++) {
-			struct fit_sg_descriptor *sgd =
-				&skspcl->req.sksg_list[i];
-
-			dev_dbg(&skdev->pdev->dev,
-				"  sg[%d] count=%u ctrl=0x%x addr=0x%llx next=0x%llx\n",
-				i, sgd->byte_count, sgd->control,
-				sgd->host_side_addr, sgd->next_desc_ptr);
-		}
-	}
-
-	/*
-	 * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr
-	 * and one 64-byte SSDI command.
-	 */
-	qcmd = skspcl->mb_dma_address;
-	qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128;
-
-	dma_sync_single_for_device(&skdev->pdev->dev, skspcl->mb_dma_address,
-				   SKD_N_SPECIAL_FITMSG_BYTES, DMA_TO_DEVICE);
-	dma_sync_single_for_device(&skdev->pdev->dev,
-				   skspcl->req.sksg_dma_address,
-				   1 * sizeof(struct fit_sg_descriptor),
-				   DMA_TO_DEVICE);
-	dma_sync_single_for_device(&skdev->pdev->dev,
-				   skspcl->db_dma_address,
-				   skspcl->req.sksg_list[0].byte_count,
-				   DMA_BIDIRECTIONAL);
-
-	/* Make sure skd_msg_buf is written before the doorbell is triggered. */
-	smp_wmb();
-
-	SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
-}
-
-/*
- *****************************************************************************
- * COMPLETION QUEUE
- *****************************************************************************
- */
-
-static void skd_complete_other(struct skd_device *skdev,
-			       struct fit_completion_entry_v1 *skcomp,
-			       struct fit_comp_error_info *skerr);
-
-struct sns_info {
-	u8 type;
-	u8 stat;
-	u8 key;
-	u8 asc;
-	u8 ascq;
-	u8 mask;
-	enum skd_check_status_action action;
-};
-
-static struct sns_info skd_chkstat_table[] = {
-	/* Good */
-	{ 0x70, 0x02, RECOVERED_ERROR, 0,    0,	   0x1c,
-	  SKD_CHECK_STATUS_REPORT_GOOD },
-
-	/* Smart alerts */
-	{ 0x70, 0x02, NO_SENSE,	       0x0B, 0x00, 0x1E,	/* warnings */
-	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
-	{ 0x70, 0x02, NO_SENSE,	       0x5D, 0x00, 0x1E,	/* thresholds */
-	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
-	{ 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F,        /* temperature over trigger */
-	  SKD_CHECK_STATUS_REPORT_SMART_ALERT },
-
-	/* Retry (with limits) */
-	{ 0x70, 0x02, 0x0B,	       0,    0,	   0x1C,        /* This one is for DMA ERROR */
-	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
-	{ 0x70, 0x02, 0x06,	       0x0B, 0x00, 0x1E,        /* warnings */
-	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
-	{ 0x70, 0x02, 0x06,	       0x5D, 0x00, 0x1E,        /* thresholds */
-	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
-	{ 0x70, 0x02, 0x06,	       0x80, 0x30, 0x1F,        /* backup power */
-	  SKD_CHECK_STATUS_REQUEUE_REQUEST },
-
-	/* Busy (or about to be) */
-	{ 0x70, 0x02, 0x06,	       0x3f, 0x01, 0x1F, /* fw changed */
-	  SKD_CHECK_STATUS_BUSY_IMMINENT },
-};
-
-/*
- * Look up status and sense data to decide how to handle the error
- * from the device.
- * mask says which fields must match e.g., mask=0x18 means check
- * type and stat, ignore key, asc, ascq.
- */
-
-static enum skd_check_status_action
-skd_check_status(struct skd_device *skdev,
-		 u8 cmp_status, struct fit_comp_error_info *skerr)
-{
-	int i;
-
-	dev_err(&skdev->pdev->dev, "key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
-		skerr->key, skerr->code, skerr->qual, skerr->fruc);
-
-	dev_dbg(&skdev->pdev->dev,
-		"stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n",
-		skerr->type, cmp_status, skerr->key, skerr->code, skerr->qual,
-		skerr->fruc);
-
-	/* Does the info match an entry in the good category? */
-	for (i = 0; i < ARRAY_SIZE(skd_chkstat_table); i++) {
-		struct sns_info *sns = &skd_chkstat_table[i];
-
-		if (sns->mask & 0x10)
-			if (skerr->type != sns->type)
-				continue;
-
-		if (sns->mask & 0x08)
-			if (cmp_status != sns->stat)
-				continue;
-
-		if (sns->mask & 0x04)
-			if (skerr->key != sns->key)
-				continue;
-
-		if (sns->mask & 0x02)
-			if (skerr->code != sns->asc)
-				continue;
-
-		if (sns->mask & 0x01)
-			if (skerr->qual != sns->ascq)
-				continue;
-
-		if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) {
-			dev_err(&skdev->pdev->dev,
-				"SMART Alert: sense key/asc/ascq %02x/%02x/%02x\n",
-				skerr->key, skerr->code, skerr->qual);
-		}
-		return sns->action;
-	}
-
-	/* No other match, so nonzero status means error,
-	 * zero status means good
-	 */
-	if (cmp_status) {
-		dev_dbg(&skdev->pdev->dev, "status check: error\n");
-		return SKD_CHECK_STATUS_REPORT_ERROR;
-	}
-
-	dev_dbg(&skdev->pdev->dev, "status check good default\n");
-	return SKD_CHECK_STATUS_REPORT_GOOD;
-}
-
-static void skd_resolve_req_exception(struct skd_device *skdev,
-				      struct skd_request_context *skreq,
-				      struct request *req)
-{
-	u8 cmp_status = skreq->completion.status;
-
-	switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) {
-	case SKD_CHECK_STATUS_REPORT_GOOD:
-	case SKD_CHECK_STATUS_REPORT_SMART_ALERT:
-		skreq->status = BLK_STS_OK;
-		if (likely(!blk_should_fake_timeout(req->q)))
-			blk_mq_complete_request(req);
-		break;
-
-	case SKD_CHECK_STATUS_BUSY_IMMINENT:
-		skd_log_skreq(skdev, skreq, "retry(busy)");
-		blk_mq_requeue_request(req, true);
-		dev_info(&skdev->pdev->dev, "drive BUSY imminent\n");
-		skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
-		skdev->timer_countdown = SKD_TIMER_MINUTES(20);
-		skd_quiesce_dev(skdev);
-		break;
-
-	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
-		if (++skreq->retries < SKD_MAX_RETRIES) {
-			skd_log_skreq(skdev, skreq, "retry");
-			blk_mq_requeue_request(req, true);
-			break;
-		}
-		fallthrough;
-
-	case SKD_CHECK_STATUS_REPORT_ERROR:
-	default:
-		skreq->status = BLK_STS_IOERR;
-		if (likely(!blk_should_fake_timeout(req->q)))
-			blk_mq_complete_request(req);
-		break;
-	}
-}
-
-static void skd_release_skreq(struct skd_device *skdev,
-			      struct skd_request_context *skreq)
-{
-	/*
-	 * Reclaim the skd_request_context
-	 */
-	skreq->state = SKD_REQ_STATE_IDLE;
-}
-
-static int skd_isr_completion_posted(struct skd_device *skdev,
-					int limit, int *enqueued)
-{
-	struct fit_completion_entry_v1 *skcmp;
-	struct fit_comp_error_info *skerr;
-	u16 req_id;
-	u32 tag;
-	u16 hwq = 0;
-	struct request *rq;
-	struct skd_request_context *skreq;
-	u16 cmp_cntxt;
-	u8 cmp_status;
-	u8 cmp_cycle;
-	u32 cmp_bytes;
-	int rc = 0;
-	int processed = 0;
-
-	lockdep_assert_held(&skdev->lock);
-
-	for (;; ) {
-		SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
-
-		skcmp = &skdev->skcomp_table[skdev->skcomp_ix];
-		cmp_cycle = skcmp->cycle;
-		cmp_cntxt = skcmp->tag;
-		cmp_status = skcmp->status;
-		cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes);
-
-		skerr = &skdev->skerr_table[skdev->skcomp_ix];
-
-		dev_dbg(&skdev->pdev->dev,
-			"cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d busy=%d rbytes=0x%x proto=%d\n",
-			skdev->skcomp_cycle, skdev->skcomp_ix, cmp_cycle,
-			cmp_cntxt, cmp_status, skd_in_flight(skdev),
-			cmp_bytes, skdev->proto_ver);
-
-		if (cmp_cycle != skdev->skcomp_cycle) {
-			dev_dbg(&skdev->pdev->dev, "end of completions\n");
-			break;
-		}
-		/*
-		 * Update the completion queue head index and possibly
-		 * the completion cycle count. 8-bit wrap-around.
-		 */
-		skdev->skcomp_ix++;
-		if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) {
-			skdev->skcomp_ix = 0;
-			skdev->skcomp_cycle++;
-		}
-
-		/*
-		 * The command context is a unique 32-bit ID. The low order
-		 * bits help locate the request. The request is usually a
-		 * r/w request (see skd_start() above) or a special request.
-		 */
-		req_id = cmp_cntxt;
-		tag = req_id & SKD_ID_SLOT_AND_TABLE_MASK;
-
-		/* Is this other than a r/w request? */
-		if (tag >= skdev->num_req_context) {
-			/*
-			 * This is not a completion for a r/w request.
-			 */
-			WARN_ON_ONCE(blk_mq_tag_to_rq(skdev->tag_set.tags[hwq],
-						      tag));
-			skd_complete_other(skdev, skcmp, skerr);
-			continue;
-		}
-
-		rq = blk_mq_tag_to_rq(skdev->tag_set.tags[hwq], tag);
-		if (WARN(!rq, "No request for tag %#x -> %#x\n", cmp_cntxt,
-			 tag))
-			continue;
-		skreq = blk_mq_rq_to_pdu(rq);
-
-		/*
-		 * Make sure the request ID for the slot matches.
-		 */
-		if (skreq->id != req_id) {
-			dev_err(&skdev->pdev->dev,
-				"Completion mismatch comp_id=0x%04x skreq=0x%04x new=0x%04x\n",
-				req_id, skreq->id, cmp_cntxt);
-
-			continue;
-		}
-
-		SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
-
-		skreq->completion = *skcmp;
-		if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) {
-			skreq->err_info = *skerr;
-			skd_log_check_status(skdev, cmp_status, skerr->key,
-					     skerr->code, skerr->qual,
-					     skerr->fruc);
-		}
-		/* Release DMA resources for the request. */
-		if (skreq->n_sg > 0)
-			skd_postop_sg_list(skdev, skreq);
-
-		skd_release_skreq(skdev, skreq);
-
-		/*
-		 * Capture the outcome and post it back to the native request.
-		 */
-		if (likely(cmp_status == SAM_STAT_GOOD)) {
-			skreq->status = BLK_STS_OK;
-			if (likely(!blk_should_fake_timeout(rq->q)))
-				blk_mq_complete_request(rq);
-		} else {
-			skd_resolve_req_exception(skdev, skreq, rq);
-		}
-
-		/* skd_isr_comp_limit equal zero means no limit */
-		if (limit) {
-			if (++processed >= limit) {
-				rc = 1;
-				break;
-			}
-		}
-	}
-
-	if (skdev->state == SKD_DRVR_STATE_PAUSING &&
-	    skd_in_flight(skdev) == 0) {
-		skdev->state = SKD_DRVR_STATE_PAUSED;
-		wake_up_interruptible(&skdev->waitq);
-	}
-
-	return rc;
-}
-
-static void skd_complete_other(struct skd_device *skdev,
-			       struct fit_completion_entry_v1 *skcomp,
-			       struct fit_comp_error_info *skerr)
-{
-	u32 req_id = 0;
-	u32 req_table;
-	u32 req_slot;
-	struct skd_special_context *skspcl;
-
-	lockdep_assert_held(&skdev->lock);
-
-	req_id = skcomp->tag;
-	req_table = req_id & SKD_ID_TABLE_MASK;
-	req_slot = req_id & SKD_ID_SLOT_MASK;
-
-	dev_dbg(&skdev->pdev->dev, "table=0x%x id=0x%x slot=%d\n", req_table,
-		req_id, req_slot);
-
-	/*
-	 * Based on the request id, determine how to dispatch this completion.
-	 * This swich/case is finding the good cases and forwarding the
-	 * completion entry. Errors are reported below the switch.
-	 */
-	switch (req_table) {
-	case SKD_ID_RW_REQUEST:
-		/*
-		 * The caller, skd_isr_completion_posted() above,
-		 * handles r/w requests. The only way we get here
-		 * is if the req_slot is out of bounds.
-		 */
-		break;
-
-	case SKD_ID_INTERNAL:
-		if (req_slot == 0) {
-			skspcl = &skdev->internal_skspcl;
-			if (skspcl->req.id == req_id &&
-			    skspcl->req.state == SKD_REQ_STATE_BUSY) {
-				skd_complete_internal(skdev,
-						      skcomp, skerr, skspcl);
-				return;
-			}
-		}
-		break;
-
-	case SKD_ID_FIT_MSG:
-		/*
-		 * These id's should never appear in a completion record.
-		 */
-		break;
-
-	default:
-		/*
-		 * These id's should never appear anywhere;
-		 */
-		break;
-	}
-
-	/*
-	 * If we get here it is a bad or stale id.
-	 */
-}
-
-static void skd_reset_skcomp(struct skd_device *skdev)
-{
-	memset(skdev->skcomp_table, 0, SKD_SKCOMP_SIZE);
-
-	skdev->skcomp_ix = 0;
-	skdev->skcomp_cycle = 1;
-}
-
-/*
- *****************************************************************************
- * INTERRUPTS
- *****************************************************************************
- */
-static void skd_completion_worker(struct work_struct *work)
-{
-	struct skd_device *skdev =
-		container_of(work, struct skd_device, completion_worker);
-	unsigned long flags;
-	int flush_enqueued = 0;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-
-	/*
-	 * pass in limit=0, which means no limit..
-	 * process everything in compq
-	 */
-	skd_isr_completion_posted(skdev, 0, &flush_enqueued);
-	schedule_work(&skdev->start_queue);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-}
-
-static void skd_isr_msg_from_dev(struct skd_device *skdev);
-
-static irqreturn_t
-skd_isr(int irq, void *ptr)
-{
-	struct skd_device *skdev = ptr;
-	u32 intstat;
-	u32 ack;
-	int rc = 0;
-	int deferred = 0;
-	int flush_enqueued = 0;
-
-	spin_lock(&skdev->lock);
-
-	for (;; ) {
-		intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST);
-
-		ack = FIT_INT_DEF_MASK;
-		ack &= intstat;
-
-		dev_dbg(&skdev->pdev->dev, "intstat=0x%x ack=0x%x\n", intstat,
-			ack);
-
-		/* As long as there is an int pending on device, keep
-		 * running loop.  When none, get out, but if we've never
-		 * done any processing, call completion handler?
-		 */
-		if (ack == 0) {
-			/* No interrupts on device, but run the completion
-			 * processor anyway?
-			 */
-			if (rc == 0)
-				if (likely (skdev->state
-					== SKD_DRVR_STATE_ONLINE))
-					deferred = 1;
-			break;
-		}
-
-		rc = IRQ_HANDLED;
-
-		SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST);
-
-		if (likely((skdev->state != SKD_DRVR_STATE_LOAD) &&
-			   (skdev->state != SKD_DRVR_STATE_STOPPING))) {
-			if (intstat & FIT_ISH_COMPLETION_POSTED) {
-				/*
-				 * If we have already deferred completion
-				 * processing, don't bother running it again
-				 */
-				if (deferred == 0)
-					deferred =
-						skd_isr_completion_posted(skdev,
-						skd_isr_comp_limit, &flush_enqueued);
-			}
-
-			if (intstat & FIT_ISH_FW_STATE_CHANGE) {
-				skd_isr_fwstate(skdev);
-				if (skdev->state == SKD_DRVR_STATE_FAULT ||
-				    skdev->state ==
-				    SKD_DRVR_STATE_DISAPPEARED) {
-					spin_unlock(&skdev->lock);
-					return rc;
-				}
-			}
-
-			if (intstat & FIT_ISH_MSG_FROM_DEV)
-				skd_isr_msg_from_dev(skdev);
-		}
-	}
-
-	if (unlikely(flush_enqueued))
-		schedule_work(&skdev->start_queue);
-
-	if (deferred)
-		schedule_work(&skdev->completion_worker);
-	else if (!flush_enqueued)
-		schedule_work(&skdev->start_queue);
-
-	spin_unlock(&skdev->lock);
-
-	return rc;
-}
-
-static void skd_drive_fault(struct skd_device *skdev)
-{
-	skdev->state = SKD_DRVR_STATE_FAULT;
-	dev_err(&skdev->pdev->dev, "Drive FAULT\n");
-}
-
-static void skd_drive_disappeared(struct skd_device *skdev)
-{
-	skdev->state = SKD_DRVR_STATE_DISAPPEARED;
-	dev_err(&skdev->pdev->dev, "Drive DISAPPEARED\n");
-}
-
-static void skd_isr_fwstate(struct skd_device *skdev)
-{
-	u32 sense;
-	u32 state;
-	u32 mtd;
-	int prev_driver_state = skdev->state;
-
-	sense = SKD_READL(skdev, FIT_STATUS);
-	state = sense & FIT_SR_DRIVE_STATE_MASK;
-
-	dev_err(&skdev->pdev->dev, "s1120 state %s(%d)=>%s(%d)\n",
-		skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
-		skd_drive_state_to_str(state), state);
-
-	skdev->drive_state = state;
-
-	switch (skdev->drive_state) {
-	case FIT_SR_DRIVE_INIT:
-		if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) {
-			skd_disable_interrupts(skdev);
-			break;
-		}
-		if (skdev->state == SKD_DRVR_STATE_RESTARTING)
-			skd_recover_requests(skdev);
-		if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) {
-			skdev->timer_countdown = SKD_STARTING_TIMO;
-			skdev->state = SKD_DRVR_STATE_STARTING;
-			skd_soft_reset(skdev);
-			break;
-		}
-		mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_SR_DRIVE_ONLINE:
-		skdev->cur_max_queue_depth = skd_max_queue_depth;
-		if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth)
-			skdev->cur_max_queue_depth = skdev->dev_max_queue_depth;
-
-		skdev->queue_low_water_mark =
-			skdev->cur_max_queue_depth * 2 / 3 + 1;
-		if (skdev->queue_low_water_mark < 1)
-			skdev->queue_low_water_mark = 1;
-		dev_info(&skdev->pdev->dev,
-			 "Queue depth limit=%d dev=%d lowat=%d\n",
-			 skdev->cur_max_queue_depth,
-			 skdev->dev_max_queue_depth,
-			 skdev->queue_low_water_mark);
-
-		skd_refresh_device_data(skdev);
-		break;
-
-	case FIT_SR_DRIVE_BUSY:
-		skdev->state = SKD_DRVR_STATE_BUSY;
-		skdev->timer_countdown = SKD_BUSY_TIMO;
-		skd_quiesce_dev(skdev);
-		break;
-	case FIT_SR_DRIVE_BUSY_SANITIZE:
-		/* set timer for 3 seconds, we'll abort any unfinished
-		 * commands after that expires
-		 */
-		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
-		skdev->timer_countdown = SKD_TIMER_SECONDS(3);
-		schedule_work(&skdev->start_queue);
-		break;
-	case FIT_SR_DRIVE_BUSY_ERASE:
-		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
-		skdev->timer_countdown = SKD_BUSY_TIMO;
-		break;
-	case FIT_SR_DRIVE_OFFLINE:
-		skdev->state = SKD_DRVR_STATE_IDLE;
-		break;
-	case FIT_SR_DRIVE_SOFT_RESET:
-		switch (skdev->state) {
-		case SKD_DRVR_STATE_STARTING:
-		case SKD_DRVR_STATE_RESTARTING:
-			/* Expected by a caller of skd_soft_reset() */
-			break;
-		default:
-			skdev->state = SKD_DRVR_STATE_RESTARTING;
-			break;
-		}
-		break;
-	case FIT_SR_DRIVE_FW_BOOTING:
-		dev_dbg(&skdev->pdev->dev, "ISR FIT_SR_DRIVE_FW_BOOTING\n");
-		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
-		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
-		break;
-
-	case FIT_SR_DRIVE_DEGRADED:
-	case FIT_SR_PCIE_LINK_DOWN:
-	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
-		break;
-
-	case FIT_SR_DRIVE_FAULT:
-		skd_drive_fault(skdev);
-		skd_recover_requests(skdev);
-		schedule_work(&skdev->start_queue);
-		break;
-
-	/* PCIe bus returned all Fs? */
-	case 0xFF:
-		dev_info(&skdev->pdev->dev, "state=0x%x sense=0x%x\n", state,
-			 sense);
-		skd_drive_disappeared(skdev);
-		skd_recover_requests(skdev);
-		schedule_work(&skdev->start_queue);
-		break;
-	default:
-		/*
-		 * Uknown FW State. Wait for a state we recognize.
-		 */
-		break;
-	}
-	dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n",
-		skd_skdev_state_to_str(prev_driver_state), prev_driver_state,
-		skd_skdev_state_to_str(skdev->state), skdev->state);
-}
-
-static bool skd_recover_request(struct request *req, void *data, bool reserved)
-{
-	struct skd_device *const skdev = data;
-	struct skd_request_context *skreq = blk_mq_rq_to_pdu(req);
-
-	if (skreq->state != SKD_REQ_STATE_BUSY)
-		return true;
-
-	skd_log_skreq(skdev, skreq, "recover");
-
-	/* Release DMA resources for the request. */
-	if (skreq->n_sg > 0)
-		skd_postop_sg_list(skdev, skreq);
-
-	skreq->state = SKD_REQ_STATE_IDLE;
-	skreq->status = BLK_STS_IOERR;
-	blk_mq_complete_request(req);
-	return true;
-}
-
-static void skd_recover_requests(struct skd_device *skdev)
-{
-	blk_mq_tagset_busy_iter(&skdev->tag_set, skd_recover_request, skdev);
-}
-
-static void skd_isr_msg_from_dev(struct skd_device *skdev)
-{
-	u32 mfd;
-	u32 mtd;
-	u32 data;
-
-	mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
-
-	dev_dbg(&skdev->pdev->dev, "mfd=0x%x last_mtd=0x%x\n", mfd,
-		skdev->last_mtd);
-
-	/* ignore any mtd that is an ack for something we didn't send */
-	if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
-		return;
-
-	switch (FIT_MXD_TYPE(mfd)) {
-	case FIT_MTD_FITFW_INIT:
-		skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd);
-
-		if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) {
-			dev_err(&skdev->pdev->dev, "protocol mismatch\n");
-			dev_err(&skdev->pdev->dev, "  got=%d support=%d\n",
-				skdev->proto_ver, FIT_PROTOCOL_VERSION_1);
-			dev_err(&skdev->pdev->dev, "  please upgrade driver\n");
-			skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH;
-			skd_soft_reset(skdev);
-			break;
-		}
-		mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_GET_CMDQ_DEPTH:
-		skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd);
-		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0,
-				   SKD_N_COMPLETION_ENTRY);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_SET_COMPQ_DEPTH:
-		SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG);
-		mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_SET_COMPQ_ADDR:
-		skd_reset_skcomp(skdev);
-		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_CMD_LOG_HOST_ID:
-		/* hardware interface overflows in y2106 */
-		skdev->connect_time_stamp = (u32)ktime_get_real_seconds();
-		data = skdev->connect_time_stamp & 0xFFFF;
-		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_CMD_LOG_TIME_STAMP_LO:
-		skdev->drive_jiffies = FIT_MXD_DATA(mfd);
-		data = (skdev->connect_time_stamp >> 16) & 0xFFFF;
-		mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-		break;
-
-	case FIT_MTD_CMD_LOG_TIME_STAMP_HI:
-		skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16);
-		mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0);
-		SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
-		skdev->last_mtd = mtd;
-
-		dev_err(&skdev->pdev->dev, "Time sync driver=0x%x device=0x%x\n",
-			skdev->connect_time_stamp, skdev->drive_jiffies);
-		break;
-
-	case FIT_MTD_ARM_QUEUE:
-		skdev->last_mtd = 0;
-		/*
-		 * State should be, or soon will be, FIT_SR_DRIVE_ONLINE.
-		 */
-		break;
-
-	default:
-		break;
-	}
-}
-
-static void skd_disable_interrupts(struct skd_device *skdev)
-{
-	u32 sense;
-
-	sense = SKD_READL(skdev, FIT_CONTROL);
-	sense &= ~FIT_CR_ENABLE_INTERRUPTS;
-	SKD_WRITEL(skdev, sense, FIT_CONTROL);
-	dev_dbg(&skdev->pdev->dev, "sense 0x%x\n", sense);
-
-	/* Note that the 1s is written. A 1-bit means
-	 * disable, a 0 means enable.
-	 */
-	SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST);
-}
-
-static void skd_enable_interrupts(struct skd_device *skdev)
-{
-	u32 val;
-
-	/* unmask interrupts first */
-	val = FIT_ISH_FW_STATE_CHANGE +
-	      FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV;
-
-	/* Note that the compliment of mask is written. A 1-bit means
-	 * disable, a 0 means enable. */
-	SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
-	dev_dbg(&skdev->pdev->dev, "interrupt mask=0x%x\n", ~val);
-
-	val = SKD_READL(skdev, FIT_CONTROL);
-	val |= FIT_CR_ENABLE_INTERRUPTS;
-	dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val);
-	SKD_WRITEL(skdev, val, FIT_CONTROL);
-}
-
-/*
- *****************************************************************************
- * START, STOP, RESTART, QUIESCE, UNQUIESCE
- *****************************************************************************
- */
-
-static void skd_soft_reset(struct skd_device *skdev)
-{
-	u32 val;
-
-	val = SKD_READL(skdev, FIT_CONTROL);
-	val |= (FIT_CR_SOFT_RESET);
-	dev_dbg(&skdev->pdev->dev, "control=0x%x\n", val);
-	SKD_WRITEL(skdev, val, FIT_CONTROL);
-}
-
-static void skd_start_device(struct skd_device *skdev)
-{
-	unsigned long flags;
-	u32 sense;
-	u32 state;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-
-	/* ack all ghost interrupts */
-	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
-
-	sense = SKD_READL(skdev, FIT_STATUS);
-
-	dev_dbg(&skdev->pdev->dev, "initial status=0x%x\n", sense);
-
-	state = sense & FIT_SR_DRIVE_STATE_MASK;
-	skdev->drive_state = state;
-	skdev->last_mtd = 0;
-
-	skdev->state = SKD_DRVR_STATE_STARTING;
-	skdev->timer_countdown = SKD_STARTING_TIMO;
-
-	skd_enable_interrupts(skdev);
-
-	switch (skdev->drive_state) {
-	case FIT_SR_DRIVE_OFFLINE:
-		dev_err(&skdev->pdev->dev, "Drive offline...\n");
-		break;
-
-	case FIT_SR_DRIVE_FW_BOOTING:
-		dev_dbg(&skdev->pdev->dev, "FIT_SR_DRIVE_FW_BOOTING\n");
-		skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
-		skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
-		break;
-
-	case FIT_SR_DRIVE_BUSY_SANITIZE:
-		dev_info(&skdev->pdev->dev, "Start: BUSY_SANITIZE\n");
-		skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
-		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
-		break;
-
-	case FIT_SR_DRIVE_BUSY_ERASE:
-		dev_info(&skdev->pdev->dev, "Start: BUSY_ERASE\n");
-		skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
-		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
-		break;
-
-	case FIT_SR_DRIVE_INIT:
-	case FIT_SR_DRIVE_ONLINE:
-		skd_soft_reset(skdev);
-		break;
-
-	case FIT_SR_DRIVE_BUSY:
-		dev_err(&skdev->pdev->dev, "Drive Busy...\n");
-		skdev->state = SKD_DRVR_STATE_BUSY;
-		skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
-		break;
-
-	case FIT_SR_DRIVE_SOFT_RESET:
-		dev_err(&skdev->pdev->dev, "drive soft reset in prog\n");
-		break;
-
-	case FIT_SR_DRIVE_FAULT:
-		/* Fault state is bad...soft reset won't do it...
-		 * Hard reset, maybe, but does it work on device?
-		 * For now, just fault so the system doesn't hang.
-		 */
-		skd_drive_fault(skdev);
-		/*start the queue so we can respond with error to requests */
-		dev_dbg(&skdev->pdev->dev, "starting queue\n");
-		schedule_work(&skdev->start_queue);
-		skdev->gendisk_on = -1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	case 0xFF:
-		/* Most likely the device isn't there or isn't responding
-		 * to the BAR1 addresses. */
-		skd_drive_disappeared(skdev);
-		/*start the queue so we can respond with error to requests */
-		dev_dbg(&skdev->pdev->dev,
-			"starting queue to error-out reqs\n");
-		schedule_work(&skdev->start_queue);
-		skdev->gendisk_on = -1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	default:
-		dev_err(&skdev->pdev->dev, "Start: unknown state %x\n",
-			skdev->drive_state);
-		break;
-	}
-
-	state = SKD_READL(skdev, FIT_CONTROL);
-	dev_dbg(&skdev->pdev->dev, "FIT Control Status=0x%x\n", state);
-
-	state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
-	dev_dbg(&skdev->pdev->dev, "Intr Status=0x%x\n", state);
-
-	state = SKD_READL(skdev, FIT_INT_MASK_HOST);
-	dev_dbg(&skdev->pdev->dev, "Intr Mask=0x%x\n", state);
-
-	state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
-	dev_dbg(&skdev->pdev->dev, "Msg from Dev=0x%x\n", state);
-
-	state = SKD_READL(skdev, FIT_HW_VERSION);
-	dev_dbg(&skdev->pdev->dev, "HW version=0x%x\n", state);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-}
-
-static void skd_stop_device(struct skd_device *skdev)
-{
-	unsigned long flags;
-	struct skd_special_context *skspcl = &skdev->internal_skspcl;
-	u32 dev_state;
-	int i;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-
-	if (skdev->state != SKD_DRVR_STATE_ONLINE) {
-		dev_err(&skdev->pdev->dev, "%s not online no sync\n", __func__);
-		goto stop_out;
-	}
-
-	if (skspcl->req.state != SKD_REQ_STATE_IDLE) {
-		dev_err(&skdev->pdev->dev, "%s no special\n", __func__);
-		goto stop_out;
-	}
-
-	skdev->state = SKD_DRVR_STATE_SYNCING;
-	skdev->sync_done = 0;
-
-	skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-
-	wait_event_interruptible_timeout(skdev->waitq,
-					 (skdev->sync_done), (10 * HZ));
-
-	spin_lock_irqsave(&skdev->lock, flags);
-
-	switch (skdev->sync_done) {
-	case 0:
-		dev_err(&skdev->pdev->dev, "%s no sync\n", __func__);
-		break;
-	case 1:
-		dev_err(&skdev->pdev->dev, "%s sync done\n", __func__);
-		break;
-	default:
-		dev_err(&skdev->pdev->dev, "%s sync error\n", __func__);
-	}
-
-stop_out:
-	skdev->state = SKD_DRVR_STATE_STOPPING;
-	spin_unlock_irqrestore(&skdev->lock, flags);
-
-	skd_kill_timer(skdev);
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	skd_disable_interrupts(skdev);
-
-	/* ensure all ints on device are cleared */
-	/* soft reset the device to unload with a clean slate */
-	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
-	SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-
-	/* poll every 100ms, 1 second timeout */
-	for (i = 0; i < 10; i++) {
-		dev_state =
-			SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK;
-		if (dev_state == FIT_SR_DRIVE_INIT)
-			break;
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(msecs_to_jiffies(100));
-	}
-
-	if (dev_state != FIT_SR_DRIVE_INIT)
-		dev_err(&skdev->pdev->dev, "%s state error 0x%02x\n", __func__,
-			dev_state);
-}
-
-/* assume spinlock is held */
-static void skd_restart_device(struct skd_device *skdev)
-{
-	u32 state;
-
-	/* ack all ghost interrupts */
-	SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
-
-	state = SKD_READL(skdev, FIT_STATUS);
-
-	dev_dbg(&skdev->pdev->dev, "drive status=0x%x\n", state);
-
-	state &= FIT_SR_DRIVE_STATE_MASK;
-	skdev->drive_state = state;
-	skdev->last_mtd = 0;
-
-	skdev->state = SKD_DRVR_STATE_RESTARTING;
-	skdev->timer_countdown = SKD_RESTARTING_TIMO;
-
-	skd_soft_reset(skdev);
-}
-
-/* assume spinlock is held */
-static int skd_quiesce_dev(struct skd_device *skdev)
-{
-	int rc = 0;
-
-	switch (skdev->state) {
-	case SKD_DRVR_STATE_BUSY:
-	case SKD_DRVR_STATE_BUSY_IMMINENT:
-		dev_dbg(&skdev->pdev->dev, "stopping queue\n");
-		blk_mq_stop_hw_queues(skdev->queue);
-		break;
-	case SKD_DRVR_STATE_ONLINE:
-	case SKD_DRVR_STATE_STOPPING:
-	case SKD_DRVR_STATE_SYNCING:
-	case SKD_DRVR_STATE_PAUSING:
-	case SKD_DRVR_STATE_PAUSED:
-	case SKD_DRVR_STATE_STARTING:
-	case SKD_DRVR_STATE_RESTARTING:
-	case SKD_DRVR_STATE_RESUMING:
-	default:
-		rc = -EINVAL;
-		dev_dbg(&skdev->pdev->dev, "state [%d] not implemented\n",
-			skdev->state);
-	}
-	return rc;
-}
-
-/* assume spinlock is held */
-static int skd_unquiesce_dev(struct skd_device *skdev)
-{
-	int prev_driver_state = skdev->state;
-
-	skd_log_skdev(skdev, "unquiesce");
-	if (skdev->state == SKD_DRVR_STATE_ONLINE) {
-		dev_dbg(&skdev->pdev->dev, "**** device already ONLINE\n");
-		return 0;
-	}
-	if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
-		/*
-		 * If there has been an state change to other than
-		 * ONLINE, we will rely on controller state change
-		 * to come back online and restart the queue.
-		 * The BUSY state means that driver is ready to
-		 * continue normal processing but waiting for controller
-		 * to become available.
-		 */
-		skdev->state = SKD_DRVR_STATE_BUSY;
-		dev_dbg(&skdev->pdev->dev, "drive BUSY state\n");
-		return 0;
-	}
-
-	/*
-	 * Drive has just come online, driver is either in startup,
-	 * paused performing a task, or bust waiting for hardware.
-	 */
-	switch (skdev->state) {
-	case SKD_DRVR_STATE_PAUSED:
-	case SKD_DRVR_STATE_BUSY:
-	case SKD_DRVR_STATE_BUSY_IMMINENT:
-	case SKD_DRVR_STATE_BUSY_ERASE:
-	case SKD_DRVR_STATE_STARTING:
-	case SKD_DRVR_STATE_RESTARTING:
-	case SKD_DRVR_STATE_FAULT:
-	case SKD_DRVR_STATE_IDLE:
-	case SKD_DRVR_STATE_LOAD:
-		skdev->state = SKD_DRVR_STATE_ONLINE;
-		dev_err(&skdev->pdev->dev, "Driver state %s(%d)=>%s(%d)\n",
-			skd_skdev_state_to_str(prev_driver_state),
-			prev_driver_state, skd_skdev_state_to_str(skdev->state),
-			skdev->state);
-		dev_dbg(&skdev->pdev->dev,
-			"**** device ONLINE...starting block queue\n");
-		dev_dbg(&skdev->pdev->dev, "starting queue\n");
-		dev_info(&skdev->pdev->dev, "STEC s1120 ONLINE\n");
-		schedule_work(&skdev->start_queue);
-		skdev->gendisk_on = 1;
-		wake_up_interruptible(&skdev->waitq);
-		break;
-
-	case SKD_DRVR_STATE_DISAPPEARED:
-	default:
-		dev_dbg(&skdev->pdev->dev,
-			"**** driver state %d, not implemented\n",
-			skdev->state);
-		return -EBUSY;
-	}
-	return 0;
-}
-
-/*
- *****************************************************************************
- * PCIe MSI/MSI-X INTERRUPT HANDLERS
- *****************************************************************************
- */
-
-static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
-{
-	struct skd_device *skdev = skd_host_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n",
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	dev_err(&skdev->pdev->dev, "MSIX reserved irq %d = 0x%x\n", irq,
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
-	spin_unlock_irqrestore(&skdev->lock, flags);
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
-{
-	struct skd_device *skdev = skd_host_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n",
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
-	skd_isr_fwstate(skdev);
-	spin_unlock_irqrestore(&skdev->lock, flags);
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
-{
-	struct skd_device *skdev = skd_host_data;
-	unsigned long flags;
-	int flush_enqueued = 0;
-	int deferred;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n",
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
-	deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
-						&flush_enqueued);
-	if (flush_enqueued)
-		schedule_work(&skdev->start_queue);
-
-	if (deferred)
-		schedule_work(&skdev->completion_worker);
-	else if (!flush_enqueued)
-		schedule_work(&skdev->start_queue);
-
-	spin_unlock_irqrestore(&skdev->lock, flags);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
-{
-	struct skd_device *skdev = skd_host_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n",
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
-	skd_isr_msg_from_dev(skdev);
-	spin_unlock_irqrestore(&skdev->lock, flags);
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
-{
-	struct skd_device *skdev = skd_host_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "MSIX = 0x%x\n",
-		SKD_READL(skdev, FIT_INT_STATUS_HOST));
-	SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
-	spin_unlock_irqrestore(&skdev->lock, flags);
-	return IRQ_HANDLED;
-}
-
-/*
- *****************************************************************************
- * PCIe MSI/MSI-X SETUP
- *****************************************************************************
- */
-
-struct skd_msix_entry {
-	char isr_name[30];
-};
-
-struct skd_init_msix_entry {
-	const char *name;
-	irq_handler_t handler;
-};
-
-#define SKD_MAX_MSIX_COUNT              13
-#define SKD_MIN_MSIX_COUNT              7
-#define SKD_BASE_MSIX_IRQ               4
-
-static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
-	{ "(DMA 0)",	    skd_reserved_isr },
-	{ "(DMA 1)",	    skd_reserved_isr },
-	{ "(DMA 2)",	    skd_reserved_isr },
-	{ "(DMA 3)",	    skd_reserved_isr },
-	{ "(State Change)", skd_statec_isr   },
-	{ "(COMPL_Q)",	    skd_comp_q	     },
-	{ "(MSG)",	    skd_msg_isr	     },
-	{ "(Reserved)",	    skd_reserved_isr },
-	{ "(Reserved)",	    skd_reserved_isr },
-	{ "(Queue Full 0)", skd_qfull_isr    },
-	{ "(Queue Full 1)", skd_qfull_isr    },
-	{ "(Queue Full 2)", skd_qfull_isr    },
-	{ "(Queue Full 3)", skd_qfull_isr    },
-};
-
-static int skd_acquire_msix(struct skd_device *skdev)
-{
-	int i, rc;
-	struct pci_dev *pdev = skdev->pdev;
-
-	rc = pci_alloc_irq_vectors(pdev, SKD_MAX_MSIX_COUNT, SKD_MAX_MSIX_COUNT,
-			PCI_IRQ_MSIX);
-	if (rc < 0) {
-		dev_err(&skdev->pdev->dev, "failed to enable MSI-X %d\n", rc);
-		goto out;
-	}
-
-	skdev->msix_entries = kcalloc(SKD_MAX_MSIX_COUNT,
-			sizeof(struct skd_msix_entry), GFP_KERNEL);
-	if (!skdev->msix_entries) {
-		rc = -ENOMEM;
-		dev_err(&skdev->pdev->dev, "msix table allocation error\n");
-		goto out;
-	}
-
-	/* Enable MSI-X vectors for the base queue */
-	for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
-		struct skd_msix_entry *qentry = &skdev->msix_entries[i];
-
-		snprintf(qentry->isr_name, sizeof(qentry->isr_name),
-			 "%s%d-msix %s", DRV_NAME, skdev->devno,
-			 msix_entries[i].name);
-
-		rc = devm_request_irq(&skdev->pdev->dev,
-				pci_irq_vector(skdev->pdev, i),
-				msix_entries[i].handler, 0,
-				qentry->isr_name, skdev);
-		if (rc) {
-			dev_err(&skdev->pdev->dev,
-				"Unable to register(%d) MSI-X handler %d: %s\n",
-				rc, i, qentry->isr_name);
-			goto msix_out;
-		}
-	}
-
-	dev_dbg(&skdev->pdev->dev, "%d msix irq(s) enabled\n",
-		SKD_MAX_MSIX_COUNT);
-	return 0;
-
-msix_out:
-	while (--i >= 0)
-		devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i), skdev);
-out:
-	kfree(skdev->msix_entries);
-	skdev->msix_entries = NULL;
-	return rc;
-}
-
-static int skd_acquire_irq(struct skd_device *skdev)
-{
-	struct pci_dev *pdev = skdev->pdev;
-	unsigned int irq_flag = PCI_IRQ_LEGACY;
-	int rc;
-
-	if (skd_isr_type == SKD_IRQ_MSIX) {
-		rc = skd_acquire_msix(skdev);
-		if (!rc)
-			return 0;
-
-		dev_err(&skdev->pdev->dev,
-			"failed to enable MSI-X, re-trying with MSI %d\n", rc);
-	}
-
-	snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d", DRV_NAME,
-			skdev->devno);
-
-	if (skd_isr_type != SKD_IRQ_LEGACY)
-		irq_flag |= PCI_IRQ_MSI;
-	rc = pci_alloc_irq_vectors(pdev, 1, 1, irq_flag);
-	if (rc < 0) {
-		dev_err(&skdev->pdev->dev,
-			"failed to allocate the MSI interrupt %d\n", rc);
-		return rc;
-	}
-
-	rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
-			pdev->msi_enabled ? 0 : IRQF_SHARED,
-			skdev->isr_name, skdev);
-	if (rc) {
-		pci_free_irq_vectors(pdev);
-		dev_err(&skdev->pdev->dev, "failed to allocate interrupt %d\n",
-			rc);
-		return rc;
-	}
-
-	return 0;
-}
-
-static void skd_release_irq(struct skd_device *skdev)
-{
-	struct pci_dev *pdev = skdev->pdev;
-
-	if (skdev->msix_entries) {
-		int i;
-
-		for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) {
-			devm_free_irq(&pdev->dev, pci_irq_vector(pdev, i),
-					skdev);
-		}
-
-		kfree(skdev->msix_entries);
-		skdev->msix_entries = NULL;
-	} else {
-		devm_free_irq(&pdev->dev, pdev->irq, skdev);
-	}
-
-	pci_free_irq_vectors(pdev);
-}
-
-/*
- *****************************************************************************
- * CONSTRUCT
- *****************************************************************************
- */
-
-static void *skd_alloc_dma(struct skd_device *skdev, struct kmem_cache *s,
-			   dma_addr_t *dma_handle, gfp_t gfp,
-			   enum dma_data_direction dir)
-{
-	struct device *dev = &skdev->pdev->dev;
-	void *buf;
-
-	buf = kmem_cache_alloc(s, gfp);
-	if (!buf)
-		return NULL;
-	*dma_handle = dma_map_single(dev, buf,
-				     kmem_cache_size(s), dir);
-	if (dma_mapping_error(dev, *dma_handle)) {
-		kmem_cache_free(s, buf);
-		buf = NULL;
-	}
-	return buf;
-}
-
-static void skd_free_dma(struct skd_device *skdev, struct kmem_cache *s,
-			 void *vaddr, dma_addr_t dma_handle,
-			 enum dma_data_direction dir)
-{
-	if (!vaddr)
-		return;
-
-	dma_unmap_single(&skdev->pdev->dev, dma_handle,
-			 kmem_cache_size(s), dir);
-	kmem_cache_free(s, vaddr);
-}
-
-static int skd_cons_skcomp(struct skd_device *skdev)
-{
-	int rc = 0;
-	struct fit_completion_entry_v1 *skcomp;
-
-	dev_dbg(&skdev->pdev->dev,
-		"comp pci_alloc, total bytes %zd entries %d\n",
-		SKD_SKCOMP_SIZE, SKD_N_COMPLETION_ENTRY);
-
-	skcomp = dma_alloc_coherent(&skdev->pdev->dev, SKD_SKCOMP_SIZE,
-				    &skdev->cq_dma_address, GFP_KERNEL);
-
-	if (skcomp == NULL) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	skdev->skcomp_table = skcomp;
-	skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp +
-							   sizeof(*skcomp) *
-							   SKD_N_COMPLETION_ENTRY);
-
-err_out:
-	return rc;
-}
-
-static int skd_cons_skmsg(struct skd_device *skdev)
-{
-	int rc = 0;
-	u32 i;
-
-	dev_dbg(&skdev->pdev->dev,
-		"skmsg_table kcalloc, struct %lu, count %u total %lu\n",
-		sizeof(struct skd_fitmsg_context), skdev->num_fitmsg_context,
-		sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
-
-	skdev->skmsg_table = kcalloc(skdev->num_fitmsg_context,
-				     sizeof(struct skd_fitmsg_context),
-				     GFP_KERNEL);
-	if (skdev->skmsg_table == NULL) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	for (i = 0; i < skdev->num_fitmsg_context; i++) {
-		struct skd_fitmsg_context *skmsg;
-
-		skmsg = &skdev->skmsg_table[i];
-
-		skmsg->id = i + SKD_ID_FIT_MSG;
-
-		skmsg->msg_buf = dma_alloc_coherent(&skdev->pdev->dev,
-						    SKD_N_FITMSG_BYTES,
-						    &skmsg->mb_dma_address,
-						    GFP_KERNEL);
-		if (skmsg->msg_buf == NULL) {
-			rc = -ENOMEM;
-			goto err_out;
-		}
-
-		WARN(((uintptr_t)skmsg->msg_buf | skmsg->mb_dma_address) &
-		     (FIT_QCMD_ALIGN - 1),
-		     "not aligned: msg_buf %p mb_dma_address %pad\n",
-		     skmsg->msg_buf, &skmsg->mb_dma_address);
-	}
-
-err_out:
-	return rc;
-}
-
-static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
-						  u32 n_sg,
-						  dma_addr_t *ret_dma_addr)
-{
-	struct fit_sg_descriptor *sg_list;
-
-	sg_list = skd_alloc_dma(skdev, skdev->sglist_cache, ret_dma_addr,
-				GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE);
-
-	if (sg_list != NULL) {
-		uint64_t dma_address = *ret_dma_addr;
-		u32 i;
-
-		for (i = 0; i < n_sg - 1; i++) {
-			uint64_t ndp_off;
-			ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
-
-			sg_list[i].next_desc_ptr = dma_address + ndp_off;
-		}
-		sg_list[i].next_desc_ptr = 0LL;
-	}
-
-	return sg_list;
-}
-
-static void skd_free_sg_list(struct skd_device *skdev,
-			     struct fit_sg_descriptor *sg_list,
-			     dma_addr_t dma_addr)
-{
-	if (WARN_ON_ONCE(!sg_list))
-		return;
-
-	skd_free_dma(skdev, skdev->sglist_cache, sg_list, dma_addr,
-		     DMA_TO_DEVICE);
-}
-
-static int skd_init_request(struct blk_mq_tag_set *set, struct request *rq,
-			    unsigned int hctx_idx, unsigned int numa_node)
-{
-	struct skd_device *skdev = set->driver_data;
-	struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq);
-
-	skreq->state = SKD_REQ_STATE_IDLE;
-	skreq->sg = (void *)(skreq + 1);
-	sg_init_table(skreq->sg, skd_sgs_per_request);
-	skreq->sksg_list = skd_cons_sg_list(skdev, skd_sgs_per_request,
-					    &skreq->sksg_dma_address);
-
-	return skreq->sksg_list ? 0 : -ENOMEM;
-}
-
-static void skd_exit_request(struct blk_mq_tag_set *set, struct request *rq,
-			     unsigned int hctx_idx)
-{
-	struct skd_device *skdev = set->driver_data;
-	struct skd_request_context *skreq = blk_mq_rq_to_pdu(rq);
-
-	skd_free_sg_list(skdev, skreq->sksg_list, skreq->sksg_dma_address);
-}
-
-static int skd_cons_sksb(struct skd_device *skdev)
-{
-	int rc = 0;
-	struct skd_special_context *skspcl;
-
-	skspcl = &skdev->internal_skspcl;
-
-	skspcl->req.id = 0 + SKD_ID_INTERNAL;
-	skspcl->req.state = SKD_REQ_STATE_IDLE;
-
-	skspcl->data_buf = skd_alloc_dma(skdev, skdev->databuf_cache,
-					 &skspcl->db_dma_address,
-					 GFP_DMA | __GFP_ZERO,
-					 DMA_BIDIRECTIONAL);
-	if (skspcl->data_buf == NULL) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	skspcl->msg_buf = skd_alloc_dma(skdev, skdev->msgbuf_cache,
-					&skspcl->mb_dma_address,
-					GFP_DMA | __GFP_ZERO, DMA_TO_DEVICE);
-	if (skspcl->msg_buf == NULL) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1,
-						 &skspcl->req.sksg_dma_address);
-	if (skspcl->req.sksg_list == NULL) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	if (!skd_format_internal_skspcl(skdev)) {
-		rc = -EINVAL;
-		goto err_out;
-	}
-
-err_out:
-	return rc;
-}
-
-static const struct blk_mq_ops skd_mq_ops = {
-	.queue_rq	= skd_mq_queue_rq,
-	.complete	= skd_complete_rq,
-	.timeout	= skd_timed_out,
-	.init_request	= skd_init_request,
-	.exit_request	= skd_exit_request,
-};
-
-static int skd_cons_disk(struct skd_device *skdev)
-{
-	int rc = 0;
-	struct gendisk *disk;
-	struct request_queue *q;
-	unsigned long flags;
-
-	disk = alloc_disk(SKD_MINORS_PER_DEVICE);
-	if (!disk) {
-		rc = -ENOMEM;
-		goto err_out;
-	}
-
-	skdev->disk = disk;
-	sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno);
-
-	disk->major = skdev->major;
-	disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE;
-	disk->fops = &skd_blockdev_ops;
-	disk->private_data = skdev;
-
-	memset(&skdev->tag_set, 0, sizeof(skdev->tag_set));
-	skdev->tag_set.ops = &skd_mq_ops;
-	skdev->tag_set.nr_hw_queues = 1;
-	skdev->tag_set.queue_depth = skd_max_queue_depth;
-	skdev->tag_set.cmd_size = sizeof(struct skd_request_context) +
-		skdev->sgs_per_request * sizeof(struct scatterlist);
-	skdev->tag_set.numa_node = NUMA_NO_NODE;
-	skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-		BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
-	skdev->tag_set.driver_data = skdev;
-	rc = blk_mq_alloc_tag_set(&skdev->tag_set);
-	if (rc)
-		goto err_out;
-	q = blk_mq_init_queue(&skdev->tag_set);
-	if (IS_ERR(q)) {
-		blk_mq_free_tag_set(&skdev->tag_set);
-		rc = PTR_ERR(q);
-		goto err_out;
-	}
-	q->queuedata = skdev;
-
-	skdev->queue = q;
-	disk->queue = q;
-
-	blk_queue_write_cache(q, true, true);
-	blk_queue_max_segments(q, skdev->sgs_per_request);
-	blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
-
-	/* set optimal I/O size to 8KB */
-	blk_queue_io_opt(q, 8192);
-
-	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
-
-	blk_queue_rq_timeout(q, 8 * HZ);
-
-	spin_lock_irqsave(&skdev->lock, flags);
-	dev_dbg(&skdev->pdev->dev, "stopping queue\n");
-	blk_mq_stop_hw_queues(skdev->queue);
-	spin_unlock_irqrestore(&skdev->lock, flags);
-
-err_out:
-	return rc;
-}
-
-#define SKD_N_DEV_TABLE         16u
-static u32 skd_next_devno;
-
-static struct skd_device *skd_construct(struct pci_dev *pdev)
-{
-	struct skd_device *skdev;
-	int blk_major = skd_major;
-	size_t size;
-	int rc;
-
-	skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
-
-	if (!skdev) {
-		dev_err(&pdev->dev, "memory alloc failure\n");
-		return NULL;
-	}
-
-	skdev->state = SKD_DRVR_STATE_LOAD;
-	skdev->pdev = pdev;
-	skdev->devno = skd_next_devno++;
-	skdev->major = blk_major;
-	skdev->dev_max_queue_depth = 0;
-
-	skdev->num_req_context = skd_max_queue_depth;
-	skdev->num_fitmsg_context = skd_max_queue_depth;
-	skdev->cur_max_queue_depth = 1;
-	skdev->queue_low_water_mark = 1;
-	skdev->proto_ver = 99;
-	skdev->sgs_per_request = skd_sgs_per_request;
-	skdev->dbg_level = skd_dbg_level;
-
-	spin_lock_init(&skdev->lock);
-
-	INIT_WORK(&skdev->start_queue, skd_start_queue);
-	INIT_WORK(&skdev->completion_worker, skd_completion_worker);
-
-	size = max(SKD_N_FITMSG_BYTES, SKD_N_SPECIAL_FITMSG_BYTES);
-	skdev->msgbuf_cache = kmem_cache_create("skd-msgbuf", size, 0,
-						SLAB_HWCACHE_ALIGN, NULL);
-	if (!skdev->msgbuf_cache)
-		goto err_out;
-	WARN_ONCE(kmem_cache_size(skdev->msgbuf_cache) < size,
-		  "skd-msgbuf: %d < %zd\n",
-		  kmem_cache_size(skdev->msgbuf_cache), size);
-	size = skd_sgs_per_request * sizeof(struct fit_sg_descriptor);
-	skdev->sglist_cache = kmem_cache_create("skd-sglist", size, 0,
-						SLAB_HWCACHE_ALIGN, NULL);
-	if (!skdev->sglist_cache)
-		goto err_out;
-	WARN_ONCE(kmem_cache_size(skdev->sglist_cache) < size,
-		  "skd-sglist: %d < %zd\n",
-		  kmem_cache_size(skdev->sglist_cache), size);
-	size = SKD_N_INTERNAL_BYTES;
-	skdev->databuf_cache = kmem_cache_create("skd-databuf", size, 0,
-						 SLAB_HWCACHE_ALIGN, NULL);
-	if (!skdev->databuf_cache)
-		goto err_out;
-	WARN_ONCE(kmem_cache_size(skdev->databuf_cache) < size,
-		  "skd-databuf: %d < %zd\n",
-		  kmem_cache_size(skdev->databuf_cache), size);
-
-	dev_dbg(&skdev->pdev->dev, "skcomp\n");
-	rc = skd_cons_skcomp(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	dev_dbg(&skdev->pdev->dev, "skmsg\n");
-	rc = skd_cons_skmsg(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	dev_dbg(&skdev->pdev->dev, "sksb\n");
-	rc = skd_cons_sksb(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	dev_dbg(&skdev->pdev->dev, "disk\n");
-	rc = skd_cons_disk(skdev);
-	if (rc < 0)
-		goto err_out;
-
-	dev_dbg(&skdev->pdev->dev, "VICTORY\n");
-	return skdev;
-
-err_out:
-	dev_dbg(&skdev->pdev->dev, "construct failed\n");
-	skd_destruct(skdev);
-	return NULL;
-}
-
-/*
- *****************************************************************************
- * DESTRUCT (FREE)
- *****************************************************************************
- */
-
-static void skd_free_skcomp(struct skd_device *skdev)
-{
-	if (skdev->skcomp_table)
-		dma_free_coherent(&skdev->pdev->dev, SKD_SKCOMP_SIZE,
-				  skdev->skcomp_table, skdev->cq_dma_address);
-
-	skdev->skcomp_table = NULL;
-	skdev->cq_dma_address = 0;
-}
-
-static void skd_free_skmsg(struct skd_device *skdev)
-{
-	u32 i;
-
-	if (skdev->skmsg_table == NULL)
-		return;
-
-	for (i = 0; i < skdev->num_fitmsg_context; i++) {
-		struct skd_fitmsg_context *skmsg;
-
-		skmsg = &skdev->skmsg_table[i];
-
-		if (skmsg->msg_buf != NULL) {
-			dma_free_coherent(&skdev->pdev->dev, SKD_N_FITMSG_BYTES,
-					  skmsg->msg_buf,
-					    skmsg->mb_dma_address);
-		}
-		skmsg->msg_buf = NULL;
-		skmsg->mb_dma_address = 0;
-	}
-
-	kfree(skdev->skmsg_table);
-	skdev->skmsg_table = NULL;
-}
-
-static void skd_free_sksb(struct skd_device *skdev)
-{
-	struct skd_special_context *skspcl = &skdev->internal_skspcl;
-
-	skd_free_dma(skdev, skdev->databuf_cache, skspcl->data_buf,
-		     skspcl->db_dma_address, DMA_BIDIRECTIONAL);
-
-	skspcl->data_buf = NULL;
-	skspcl->db_dma_address = 0;
-
-	skd_free_dma(skdev, skdev->msgbuf_cache, skspcl->msg_buf,
-		     skspcl->mb_dma_address, DMA_TO_DEVICE);
-
-	skspcl->msg_buf = NULL;
-	skspcl->mb_dma_address = 0;
-
-	skd_free_sg_list(skdev, skspcl->req.sksg_list,
-			 skspcl->req.sksg_dma_address);
-
-	skspcl->req.sksg_list = NULL;
-	skspcl->req.sksg_dma_address = 0;
-}
-
-static void skd_free_disk(struct skd_device *skdev)
-{
-	struct gendisk *disk = skdev->disk;
-
-	if (disk && (disk->flags & GENHD_FL_UP))
-		del_gendisk(disk);
-
-	if (skdev->queue) {
-		blk_cleanup_queue(skdev->queue);
-		skdev->queue = NULL;
-		if (disk)
-			disk->queue = NULL;
-	}
-
-	if (skdev->tag_set.tags)
-		blk_mq_free_tag_set(&skdev->tag_set);
-
-	put_disk(disk);
-	skdev->disk = NULL;
-}
-
-static void skd_destruct(struct skd_device *skdev)
-{
-	if (skdev == NULL)
-		return;
-
-	cancel_work_sync(&skdev->start_queue);
-
-	dev_dbg(&skdev->pdev->dev, "disk\n");
-	skd_free_disk(skdev);
-
-	dev_dbg(&skdev->pdev->dev, "sksb\n");
-	skd_free_sksb(skdev);
-
-	dev_dbg(&skdev->pdev->dev, "skmsg\n");
-	skd_free_skmsg(skdev);
-
-	dev_dbg(&skdev->pdev->dev, "skcomp\n");
-	skd_free_skcomp(skdev);
-
-	kmem_cache_destroy(skdev->databuf_cache);
-	kmem_cache_destroy(skdev->sglist_cache);
-	kmem_cache_destroy(skdev->msgbuf_cache);
-
-	dev_dbg(&skdev->pdev->dev, "skdev\n");
-	kfree(skdev);
-}
-
-/*
- *****************************************************************************
- * BLOCK DEVICE (BDEV) GLUE
- *****************************************************************************
- */
-
-static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct skd_device *skdev;
-	u64 capacity;
-
-	skdev = bdev->bd_disk->private_data;
-
-	dev_dbg(&skdev->pdev->dev, "%s: CMD[%s] getgeo device\n",
-		bdev->bd_disk->disk_name, current->comm);
-
-	if (skdev->read_cap_is_valid) {
-		capacity = get_capacity(skdev->disk);
-		geo->heads = 64;
-		geo->sectors = 255;
-		geo->cylinders = (capacity) / (255 * 64);
-
-		return 0;
-	}
-	return -EIO;
-}
-
-static int skd_bdev_attach(struct device *parent, struct skd_device *skdev)
-{
-	dev_dbg(&skdev->pdev->dev, "add_disk\n");
-	device_add_disk(parent, skdev->disk, NULL);
-	return 0;
-}
-
-static const struct block_device_operations skd_blockdev_ops = {
-	.owner		= THIS_MODULE,
-	.getgeo		= skd_bdev_getgeo,
-};
-
-/*
- *****************************************************************************
- * PCIe DRIVER GLUE
- *****************************************************************************
- */
-
-static const struct pci_device_id skd_pci_tbl[] = {
-	{ PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120,
-	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
-	{ 0 }                     /* terminate list */
-};
-
-MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
-
-static char *skd_pci_info(struct skd_device *skdev, char *str)
-{
-	int pcie_reg;
-
-	strcpy(str, "PCIe (");
-	pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
-
-	if (pcie_reg) {
-
-		char lwstr[6];
-		uint16_t pcie_lstat, lspeed, lwidth;
-
-		pcie_reg += 0x12;
-		pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat);
-		lspeed = pcie_lstat & (0xF);
-		lwidth = (pcie_lstat & 0x3F0) >> 4;
-
-		if (lspeed == 1)
-			strcat(str, "2.5GT/s ");
-		else if (lspeed == 2)
-			strcat(str, "5.0GT/s ");
-		else
-			strcat(str, "<unknown> ");
-		snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth);
-		strcat(str, lwstr);
-	}
-	return str;
-}
-
-static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
-{
-	int i;
-	int rc = 0;
-	char pci_str[32];
-	struct skd_device *skdev;
-
-	dev_dbg(&pdev->dev, "vendor=%04X device=%04x\n", pdev->vendor,
-		pdev->device);
-
-	rc = pci_enable_device(pdev);
-	if (rc)
-		return rc;
-	rc = pci_request_regions(pdev, DRV_NAME);
-	if (rc)
-		goto err_out;
-	rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-	if (rc)
-		rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-	if (rc) {
-		dev_err(&pdev->dev, "DMA mask error %d\n", rc);
-		goto err_out_regions;
-	}
-
-	if (!skd_major) {
-		rc = register_blkdev(0, DRV_NAME);
-		if (rc < 0)
-			goto err_out_regions;
-		BUG_ON(!rc);
-		skd_major = rc;
-	}
-
-	skdev = skd_construct(pdev);
-	if (skdev == NULL) {
-		rc = -ENOMEM;
-		goto err_out_regions;
-	}
-
-	skd_pci_info(skdev, pci_str);
-	dev_info(&pdev->dev, "%s 64bit\n", pci_str);
-
-	pci_set_master(pdev);
-	rc = pci_enable_pcie_error_reporting(pdev);
-	if (rc) {
-		dev_err(&pdev->dev,
-			"bad enable of PCIe error reporting rc=%d\n", rc);
-		skdev->pcie_error_reporting_is_enabled = 0;
-	} else
-		skdev->pcie_error_reporting_is_enabled = 1;
-
-	pci_set_drvdata(pdev, skdev);
-
-	for (i = 0; i < SKD_MAX_BARS; i++) {
-		skdev->mem_phys[i] = pci_resource_start(pdev, i);
-		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
-		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
-					    skdev->mem_size[i]);
-		if (!skdev->mem_map[i]) {
-			dev_err(&pdev->dev,
-				"Unable to map adapter memory!\n");
-			rc = -ENODEV;
-			goto err_out_iounmap;
-		}
-		dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n",
-			skdev->mem_map[i], (uint64_t)skdev->mem_phys[i],
-			skdev->mem_size[i]);
-	}
-
-	rc = skd_acquire_irq(skdev);
-	if (rc) {
-		dev_err(&pdev->dev, "interrupt resource error %d\n", rc);
-		goto err_out_iounmap;
-	}
-
-	rc = skd_start_timer(skdev);
-	if (rc)
-		goto err_out_timer;
-
-	init_waitqueue_head(&skdev->waitq);
-
-	skd_start_device(skdev);
-
-	rc = wait_event_interruptible_timeout(skdev->waitq,
-					      (skdev->gendisk_on),
-					      (SKD_START_WAIT_SECONDS * HZ));
-	if (skdev->gendisk_on > 0) {
-		/* device came on-line after reset */
-		skd_bdev_attach(&pdev->dev, skdev);
-		rc = 0;
-	} else {
-		/* we timed out, something is wrong with the device,
-		   don't add the disk structure */
-		dev_err(&pdev->dev, "error: waiting for s1120 timed out %d!\n",
-			rc);
-		/* in case of no error; we timeout with ENXIO */
-		if (!rc)
-			rc = -ENXIO;
-		goto err_out_timer;
-	}
-
-	return rc;
-
-err_out_timer:
-	skd_stop_device(skdev);
-	skd_release_irq(skdev);
-
-err_out_iounmap:
-	for (i = 0; i < SKD_MAX_BARS; i++)
-		if (skdev->mem_map[i])
-			iounmap(skdev->mem_map[i]);
-
-	if (skdev->pcie_error_reporting_is_enabled)
-		pci_disable_pcie_error_reporting(pdev);
-
-	skd_destruct(skdev);
-
-err_out_regions:
-	pci_release_regions(pdev);
-
-err_out:
-	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
-	return rc;
-}
-
-static void skd_pci_remove(struct pci_dev *pdev)
-{
-	int i;
-	struct skd_device *skdev;
-
-	skdev = pci_get_drvdata(pdev);
-	if (!skdev) {
-		dev_err(&pdev->dev, "no device data for PCI\n");
-		return;
-	}
-	skd_stop_device(skdev);
-	skd_release_irq(skdev);
-
-	for (i = 0; i < SKD_MAX_BARS; i++)
-		if (skdev->mem_map[i])
-			iounmap(skdev->mem_map[i]);
-
-	if (skdev->pcie_error_reporting_is_enabled)
-		pci_disable_pcie_error_reporting(pdev);
-
-	skd_destruct(skdev);
-
-	pci_release_regions(pdev);
-	pci_disable_device(pdev);
-	pci_set_drvdata(pdev, NULL);
-
-	return;
-}
-
-static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state)
-{
-	int i;
-	struct skd_device *skdev;
-
-	skdev = pci_get_drvdata(pdev);
-	if (!skdev) {
-		dev_err(&pdev->dev, "no device data for PCI\n");
-		return -EIO;
-	}
-
-	skd_stop_device(skdev);
-
-	skd_release_irq(skdev);
-
-	for (i = 0; i < SKD_MAX_BARS; i++)
-		if (skdev->mem_map[i])
-			iounmap(skdev->mem_map[i]);
-
-	if (skdev->pcie_error_reporting_is_enabled)
-		pci_disable_pcie_error_reporting(pdev);
-
-	pci_release_regions(pdev);
-	pci_save_state(pdev);
-	pci_disable_device(pdev);
-	pci_set_power_state(pdev, pci_choose_state(pdev, state));
-	return 0;
-}
-
-static int skd_pci_resume(struct pci_dev *pdev)
-{
-	int i;
-	int rc = 0;
-	struct skd_device *skdev;
-
-	skdev = pci_get_drvdata(pdev);
-	if (!skdev) {
-		dev_err(&pdev->dev, "no device data for PCI\n");
-		return -1;
-	}
-
-	pci_set_power_state(pdev, PCI_D0);
-	pci_enable_wake(pdev, PCI_D0, 0);
-	pci_restore_state(pdev);
-
-	rc = pci_enable_device(pdev);
-	if (rc)
-		return rc;
-	rc = pci_request_regions(pdev, DRV_NAME);
-	if (rc)
-		goto err_out;
-	rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
-	if (rc)
-		rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
-	if (rc) {
-		dev_err(&pdev->dev, "DMA mask error %d\n", rc);
-		goto err_out_regions;
-	}
-
-	pci_set_master(pdev);
-	rc = pci_enable_pcie_error_reporting(pdev);
-	if (rc) {
-		dev_err(&pdev->dev,
-			"bad enable of PCIe error reporting rc=%d\n", rc);
-		skdev->pcie_error_reporting_is_enabled = 0;
-	} else
-		skdev->pcie_error_reporting_is_enabled = 1;
-
-	for (i = 0; i < SKD_MAX_BARS; i++) {
-
-		skdev->mem_phys[i] = pci_resource_start(pdev, i);
-		skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
-		skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
-					    skdev->mem_size[i]);
-		if (!skdev->mem_map[i]) {
-			dev_err(&pdev->dev, "Unable to map adapter memory!\n");
-			rc = -ENODEV;
-			goto err_out_iounmap;
-		}
-		dev_dbg(&pdev->dev, "mem_map=%p, phyd=%016llx, size=%d\n",
-			skdev->mem_map[i], (uint64_t)skdev->mem_phys[i],
-			skdev->mem_size[i]);
-	}
-	rc = skd_acquire_irq(skdev);
-	if (rc) {
-		dev_err(&pdev->dev, "interrupt resource error %d\n", rc);
-		goto err_out_iounmap;
-	}
-
-	rc = skd_start_timer(skdev);
-	if (rc)
-		goto err_out_timer;
-
-	init_waitqueue_head(&skdev->waitq);
-
-	skd_start_device(skdev);
-
-	return rc;
-
-err_out_timer:
-	skd_stop_device(skdev);
-	skd_release_irq(skdev);
-
-err_out_iounmap:
-	for (i = 0; i < SKD_MAX_BARS; i++)
-		if (skdev->mem_map[i])
-			iounmap(skdev->mem_map[i]);
-
-	if (skdev->pcie_error_reporting_is_enabled)
-		pci_disable_pcie_error_reporting(pdev);
-
-err_out_regions:
-	pci_release_regions(pdev);
-
-err_out:
-	pci_disable_device(pdev);
-	return rc;
-}
-
-static void skd_pci_shutdown(struct pci_dev *pdev)
-{
-	struct skd_device *skdev;
-
-	dev_err(&pdev->dev, "%s called\n", __func__);
-
-	skdev = pci_get_drvdata(pdev);
-	if (!skdev) {
-		dev_err(&pdev->dev, "no device data for PCI\n");
-		return;
-	}
-
-	dev_err(&pdev->dev, "calling stop\n");
-	skd_stop_device(skdev);
-}
-
-static struct pci_driver skd_driver = {
-	.name		= DRV_NAME,
-	.id_table	= skd_pci_tbl,
-	.probe		= skd_pci_probe,
-	.remove		= skd_pci_remove,
-	.suspend	= skd_pci_suspend,
-	.resume		= skd_pci_resume,
-	.shutdown	= skd_pci_shutdown,
-};
-
-/*
- *****************************************************************************
- * LOGGING SUPPORT
- *****************************************************************************
- */
-
-const char *skd_drive_state_to_str(int state)
-{
-	switch (state) {
-	case FIT_SR_DRIVE_OFFLINE:
-		return "OFFLINE";
-	case FIT_SR_DRIVE_INIT:
-		return "INIT";
-	case FIT_SR_DRIVE_ONLINE:
-		return "ONLINE";
-	case FIT_SR_DRIVE_BUSY:
-		return "BUSY";
-	case FIT_SR_DRIVE_FAULT:
-		return "FAULT";
-	case FIT_SR_DRIVE_DEGRADED:
-		return "DEGRADED";
-	case FIT_SR_PCIE_LINK_DOWN:
-		return "INK_DOWN";
-	case FIT_SR_DRIVE_SOFT_RESET:
-		return "SOFT_RESET";
-	case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
-		return "NEED_FW";
-	case FIT_SR_DRIVE_INIT_FAULT:
-		return "INIT_FAULT";
-	case FIT_SR_DRIVE_BUSY_SANITIZE:
-		return "BUSY_SANITIZE";
-	case FIT_SR_DRIVE_BUSY_ERASE:
-		return "BUSY_ERASE";
-	case FIT_SR_DRIVE_FW_BOOTING:
-		return "FW_BOOTING";
-	default:
-		return "???";
-	}
-}
-
-const char *skd_skdev_state_to_str(enum skd_drvr_state state)
-{
-	switch (state) {
-	case SKD_DRVR_STATE_LOAD:
-		return "LOAD";
-	case SKD_DRVR_STATE_IDLE:
-		return "IDLE";
-	case SKD_DRVR_STATE_BUSY:
-		return "BUSY";
-	case SKD_DRVR_STATE_STARTING:
-		return "STARTING";
-	case SKD_DRVR_STATE_ONLINE:
-		return "ONLINE";
-	case SKD_DRVR_STATE_PAUSING:
-		return "PAUSING";
-	case SKD_DRVR_STATE_PAUSED:
-		return "PAUSED";
-	case SKD_DRVR_STATE_RESTARTING:
-		return "RESTARTING";
-	case SKD_DRVR_STATE_RESUMING:
-		return "RESUMING";
-	case SKD_DRVR_STATE_STOPPING:
-		return "STOPPING";
-	case SKD_DRVR_STATE_SYNCING:
-		return "SYNCING";
-	case SKD_DRVR_STATE_FAULT:
-		return "FAULT";
-	case SKD_DRVR_STATE_DISAPPEARED:
-		return "DISAPPEARED";
-	case SKD_DRVR_STATE_BUSY_ERASE:
-		return "BUSY_ERASE";
-	case SKD_DRVR_STATE_BUSY_SANITIZE:
-		return "BUSY_SANITIZE";
-	case SKD_DRVR_STATE_BUSY_IMMINENT:
-		return "BUSY_IMMINENT";
-	case SKD_DRVR_STATE_WAIT_BOOT:
-		return "WAIT_BOOT";
-
-	default:
-		return "???";
-	}
-}
-
-static const char *skd_skreq_state_to_str(enum skd_req_state state)
-{
-	switch (state) {
-	case SKD_REQ_STATE_IDLE:
-		return "IDLE";
-	case SKD_REQ_STATE_SETUP:
-		return "SETUP";
-	case SKD_REQ_STATE_BUSY:
-		return "BUSY";
-	case SKD_REQ_STATE_COMPLETED:
-		return "COMPLETED";
-	case SKD_REQ_STATE_TIMEOUT:
-		return "TIMEOUT";
-	default:
-		return "???";
-	}
-}
-
-static void skd_log_skdev(struct skd_device *skdev, const char *event)
-{
-	dev_dbg(&skdev->pdev->dev, "skdev=%p event='%s'\n", skdev, event);
-	dev_dbg(&skdev->pdev->dev, "  drive_state=%s(%d) driver_state=%s(%d)\n",
-		skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
-		skd_skdev_state_to_str(skdev->state), skdev->state);
-	dev_dbg(&skdev->pdev->dev, "  busy=%d limit=%d dev=%d lowat=%d\n",
-		skd_in_flight(skdev), skdev->cur_max_queue_depth,
-		skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
-	dev_dbg(&skdev->pdev->dev, "  cycle=%d cycle_ix=%d\n",
-		skdev->skcomp_cycle, skdev->skcomp_ix);
-}
-
-static void skd_log_skreq(struct skd_device *skdev,
-			  struct skd_request_context *skreq, const char *event)
-{
-	struct request *req = blk_mq_rq_from_pdu(skreq);
-	u32 lba = blk_rq_pos(req);
-	u32 count = blk_rq_sectors(req);
-
-	dev_dbg(&skdev->pdev->dev, "skreq=%p event='%s'\n", skreq, event);
-	dev_dbg(&skdev->pdev->dev, "  state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
-		skd_skreq_state_to_str(skreq->state), skreq->state, skreq->id,
-		skreq->fitmsg_id);
-	dev_dbg(&skdev->pdev->dev, "  sg_dir=%d n_sg=%d\n",
-		skreq->data_dir, skreq->n_sg);
-
-	dev_dbg(&skdev->pdev->dev,
-		"req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", req, lba, lba,
-		count, count, (int)rq_data_dir(req));
-}
-
-/*
- *****************************************************************************
- * MODULE GLUE
- *****************************************************************************
- */
-
-static int __init skd_init(void)
-{
-	BUILD_BUG_ON(sizeof(struct fit_completion_entry_v1) != 8);
-	BUILD_BUG_ON(sizeof(struct fit_comp_error_info) != 32);
-	BUILD_BUG_ON(sizeof(struct skd_command_header) != 16);
-	BUILD_BUG_ON(sizeof(struct skd_scsi_request) != 32);
-	BUILD_BUG_ON(sizeof(struct driver_inquiry_data) != 44);
-	BUILD_BUG_ON(offsetof(struct skd_msg_buf, fmh) != 0);
-	BUILD_BUG_ON(offsetof(struct skd_msg_buf, scsi) != 64);
-	BUILD_BUG_ON(sizeof(struct skd_msg_buf) != SKD_N_FITMSG_BYTES);
-
-	switch (skd_isr_type) {
-	case SKD_IRQ_LEGACY:
-	case SKD_IRQ_MSI:
-	case SKD_IRQ_MSIX:
-		break;
-	default:
-		pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n",
-		       skd_isr_type, SKD_IRQ_DEFAULT);
-		skd_isr_type = SKD_IRQ_DEFAULT;
-	}
-
-	if (skd_max_queue_depth < 1 ||
-	    skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
-		pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n",
-		       skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
-		skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
-	}
-
-	if (skd_max_req_per_msg < 1 ||
-	    skd_max_req_per_msg > SKD_MAX_REQ_PER_MSG) {
-		pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n",
-		       skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
-		skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
-	}
-
-	if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
-		pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n",
-		       skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
-		skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
-	}
-
-	if (skd_dbg_level < 0 || skd_dbg_level > 2) {
-		pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n",
-		       skd_dbg_level, 0);
-		skd_dbg_level = 0;
-	}
-
-	if (skd_isr_comp_limit < 0) {
-		pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n",
-		       skd_isr_comp_limit, 0);
-		skd_isr_comp_limit = 0;
-	}
-
-	return pci_register_driver(&skd_driver);
-}
-
-static void __exit skd_exit(void)
-{
-	pci_unregister_driver(&skd_driver);
-
-	if (skd_major)
-		unregister_blkdev(skd_major, DRV_NAME);
-}
-
-module_init(skd_init);
-module_exit(skd_exit);
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
deleted file mode 100644
index c30bb98c7cd2..000000000000
--- a/drivers/block/skd_s1120.h
+++ /dev/null
@@ -1,322 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2012 STEC, Inc.
- * Copyright (c) 2017 Western Digital Corporation or its affiliates.
- */
-
-
-#ifndef SKD_S1120_H
-#define SKD_S1120_H
-
-/*
- * Q-channel, 64-bit r/w
- */
-#define FIT_Q_COMMAND			0x400u
-#define FIT_QCMD_QID_MASK		(0x3 << 1)
-#define  FIT_QCMD_QID0			(0x0 << 1)
-#define  FIT_QCMD_QID_NORMAL		FIT_QCMD_QID0
-#define  FIT_QCMD_QID1			(0x1 << 1)
-#define  FIT_QCMD_QID2			(0x2 << 1)
-#define  FIT_QCMD_QID3			(0x3 << 1)
-#define  FIT_QCMD_FLUSH_QUEUE		(0ull)	/* add QID */
-#define  FIT_QCMD_MSGSIZE_MASK		(0x3 << 4)
-#define  FIT_QCMD_MSGSIZE_64		(0x0 << 4)
-#define  FIT_QCMD_MSGSIZE_128		(0x1 << 4)
-#define  FIT_QCMD_MSGSIZE_256		(0x2 << 4)
-#define  FIT_QCMD_MSGSIZE_512		(0x3 << 4)
-#define  FIT_QCMD_ALIGN			L1_CACHE_BYTES
-
-/*
- * Control, 32-bit r/w
- */
-#define FIT_CONTROL			0x500u
-#define  FIT_CR_HARD_RESET		(1u << 0u)
-#define  FIT_CR_SOFT_RESET		(1u << 1u)
-#define  FIT_CR_DIS_TIMESTAMPS		(1u << 6u)
-#define  FIT_CR_ENABLE_INTERRUPTS	(1u << 7u)
-
-/*
- * Status, 32-bit, r/o
- */
-#define FIT_STATUS			0x510u
-#define FIT_SR_DRIVE_STATE_MASK		0x000000FFu
-#define	FIT_SR_SIGNATURE		(0xFF << 8)
-#define	FIT_SR_PIO_DMA			(1 << 16)
-#define FIT_SR_DRIVE_OFFLINE		0x00
-#define FIT_SR_DRIVE_INIT		0x01
-/* #define FIT_SR_DRIVE_READY		0x02 */
-#define FIT_SR_DRIVE_ONLINE		0x03
-#define FIT_SR_DRIVE_BUSY		0x04
-#define FIT_SR_DRIVE_FAULT		0x05
-#define FIT_SR_DRIVE_DEGRADED		0x06
-#define FIT_SR_PCIE_LINK_DOWN		0x07
-#define FIT_SR_DRIVE_SOFT_RESET		0x08
-#define FIT_SR_DRIVE_INIT_FAULT		0x09
-#define FIT_SR_DRIVE_BUSY_SANITIZE	0x0A
-#define FIT_SR_DRIVE_BUSY_ERASE		0x0B
-#define FIT_SR_DRIVE_FW_BOOTING		0x0C
-#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD	0xFE
-#define FIT_SR_DEVICE_MISSING		0xFF
-#define FIT_SR__RESERVED		0xFFFFFF00u
-
-/*
- * FIT_STATUS - Status register data definition
- */
-#define FIT_SR_STATE_MASK		(0xFF << 0)
-#define FIT_SR_SIGNATURE		(0xFF << 8)
-#define FIT_SR_PIO_DMA			(1 << 16)
-
-/*
- * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
- */
-#define FIT_INT_STATUS_HOST		0x520u
-#define  FIT_ISH_FW_STATE_CHANGE	(1u << 0u)
-#define  FIT_ISH_COMPLETION_POSTED	(1u << 1u)
-#define  FIT_ISH_MSG_FROM_DEV		(1u << 2u)
-#define  FIT_ISH_UNDEFINED_3		(1u << 3u)
-#define  FIT_ISH_UNDEFINED_4		(1u << 4u)
-#define  FIT_ISH_Q0_FULL		(1u << 5u)
-#define  FIT_ISH_Q1_FULL		(1u << 6u)
-#define  FIT_ISH_Q2_FULL		(1u << 7u)
-#define  FIT_ISH_Q3_FULL		(1u << 8u)
-#define  FIT_ISH_QCMD_FIFO_OVERRUN	(1u << 9u)
-#define  FIT_ISH_BAD_EXP_ROM_READ	(1u << 10u)
-
-#define FIT_INT_DEF_MASK \
-	(FIT_ISH_FW_STATE_CHANGE | \
-	 FIT_ISH_COMPLETION_POSTED | \
-	 FIT_ISH_MSG_FROM_DEV | \
-	 FIT_ISH_Q0_FULL | \
-	 FIT_ISH_Q1_FULL | \
-	 FIT_ISH_Q2_FULL | \
-	 FIT_ISH_Q3_FULL | \
-	 FIT_ISH_QCMD_FIFO_OVERRUN | \
-	 FIT_ISH_BAD_EXP_ROM_READ)
-
-#define FIT_INT_QUEUE_FULL \
-	(FIT_ISH_Q0_FULL | \
-	 FIT_ISH_Q1_FULL | \
-	 FIT_ISH_Q2_FULL | \
-	 FIT_ISH_Q3_FULL)
-
-#define MSI_MSG_NWL_ERROR_0		0x00000000
-#define MSI_MSG_NWL_ERROR_1		0x00000001
-#define MSI_MSG_NWL_ERROR_2		0x00000002
-#define MSI_MSG_NWL_ERROR_3		0x00000003
-#define MSI_MSG_STATE_CHANGE		0x00000004
-#define MSI_MSG_COMPLETION_POSTED	0x00000005
-#define MSI_MSG_MSG_FROM_DEV		0x00000006
-#define MSI_MSG_RESERVED_0		0x00000007
-#define MSI_MSG_RESERVED_1		0x00000008
-#define MSI_MSG_QUEUE_0_FULL		0x00000009
-#define MSI_MSG_QUEUE_1_FULL		0x0000000A
-#define MSI_MSG_QUEUE_2_FULL		0x0000000B
-#define MSI_MSG_QUEUE_3_FULL		0x0000000C
-
-#define FIT_INT_RESERVED_MASK \
-	(FIT_ISH_UNDEFINED_3 | \
-	 FIT_ISH_UNDEFINED_4)
-
-/*
- * Interrupt mask, 32-bit r/w
- * Bit definitions are the same as FIT_INT_STATUS_HOST
- */
-#define FIT_INT_MASK_HOST		0x528u
-
-/*
- * Message to device, 32-bit r/w
- */
-#define FIT_MSG_TO_DEVICE		0x540u
-
-/*
- * Message from device, 32-bit, r/o
- */
-#define FIT_MSG_FROM_DEVICE		0x548u
-
-/*
- * 32-bit messages to/from device, composition/extraction macros
- */
-#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
-	((((TYPE)  & 0xFFu) << 24u) | \
-	(((PARAM) & 0xFFu) << 16u) | \
-	(((DATA)  & 0xFFFFu) << 0u))
-#define FIT_MXD_TYPE(MXD)		(((MXD) >> 24u) & 0xFFu)
-#define FIT_MXD_PARAM(MXD)		(((MXD) >> 16u) & 0xFFu)
-#define FIT_MXD_DATA(MXD)		(((MXD) >> 0u) & 0xFFFFu)
-
-/*
- * Types of messages to/from device
- */
-#define FIT_MTD_FITFW_INIT		0x01u
-#define FIT_MTD_GET_CMDQ_DEPTH		0x02u
-#define FIT_MTD_SET_COMPQ_DEPTH		0x03u
-#define FIT_MTD_SET_COMPQ_ADDR		0x04u
-#define FIT_MTD_ARM_QUEUE		0x05u
-#define FIT_MTD_CMD_LOG_HOST_ID		0x07u
-#define FIT_MTD_CMD_LOG_TIME_STAMP_LO	0x08u
-#define FIT_MTD_CMD_LOG_TIME_STAMP_HI	0x09u
-#define FIT_MFD_SMART_EXCEEDED		0x10u
-#define FIT_MFD_POWER_DOWN		0x11u
-#define FIT_MFD_OFFLINE			0x12u
-#define FIT_MFD_ONLINE			0x13u
-#define FIT_MFD_FW_RESTARTING		0x14u
-#define FIT_MFD_PM_ACTIVE		0x15u
-#define FIT_MFD_PM_STANDBY		0x16u
-#define FIT_MFD_PM_SLEEP		0x17u
-#define FIT_MFD_CMD_PROGRESS		0x18u
-
-#define FIT_MTD_DEBUG			0xFEu
-#define FIT_MFD_DEBUG			0xFFu
-
-#define FIT_MFD_MASK			(0xFFu)
-#define FIT_MFD_DATA_MASK		(0xFFu)
-#define FIT_MFD_MSG(x)			(((x) >> 24) & FIT_MFD_MASK)
-#define FIT_MFD_DATA(x)			((x) & FIT_MFD_MASK)
-
-/*
- * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
- * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
- * (was Response buffer in docs)
- */
-#define FIT_MSG_TO_DEVICE_ARG		0x580u
-
-/*
- * Hardware (ASIC) version, 32-bit r/o
- */
-#define FIT_HW_VERSION			0x588u
-
-/*
- * Scatter/gather list descriptor.
- * 32-bytes and must be aligned on a 32-byte boundary.
- * All fields are in little endian order.
- */
-struct fit_sg_descriptor {
-	uint32_t control;
-	uint32_t byte_count;
-	uint64_t host_side_addr;
-	uint64_t dev_side_addr;
-	uint64_t next_desc_ptr;
-};
-
-#define FIT_SGD_CONTROL_NOT_LAST	0x000u
-#define FIT_SGD_CONTROL_LAST		0x40Eu
-
-/*
- * Header at the beginning of a FIT message. The header
- * is followed by SSDI requests each 64 bytes.
- * A FIT message can be up to 512 bytes long and must start
- * on a 64-byte boundary.
- */
-struct fit_msg_hdr {
-	uint8_t protocol_id;
-	uint8_t num_protocol_cmds_coalesced;
-	uint8_t _reserved[62];
-};
-
-#define FIT_PROTOCOL_ID_FIT	1
-#define FIT_PROTOCOL_ID_SSDI	2
-#define FIT_PROTOCOL_ID_SOFIT	3
-
-
-#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
-#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF)
-
-/*
- * Format of a completion entry. The completion queue is circular
- * and must have at least as many entries as the maximum number
- * of commands that may be issued to the device.
- *
- * There are no head/tail pointers. The cycle value is used to
- * infer the presence of new completion records.
- * Initially the cycle in all entries is 0, the index is 0, and
- * the cycle value to expect is 1. When completions are added
- * their cycle values are set to 1. When the index wraps the
- * cycle value to expect is incremented.
- *
- * Command_context is opaque and taken verbatim from the SSDI command.
- * All other fields are big endian.
- */
-#define FIT_PROTOCOL_VERSION_0		0
-
-/*
- *  Protocol major version 1 completion entry.
- *  The major protocol version is found in bits
- *  20-23 of the FIT_MTD_FITFW_INIT response.
- */
-struct fit_completion_entry_v1 {
-	__be32		num_returned_bytes;
-	uint16_t	tag;
-	uint8_t		status;  /* SCSI status */
-	uint8_t		cycle;
-};
-#define FIT_PROTOCOL_VERSION_1		1
-#define FIT_PROTOCOL_VERSION_CURRENT	FIT_PROTOCOL_VERSION_1
-
-struct fit_comp_error_info {
-	uint8_t		type:7; /* 00: Bits0-6 indicates the type of sense data. */
-	uint8_t		valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
-	uint8_t		reserved0; /* 01: Obsolete field */
-	uint8_t		key:4; /* 02: Bits0-3 indicate the sense key. */
-	uint8_t		reserved2:1; /* 02: Reserved bit. */
-	uint8_t		bad_length:1; /* 02: Incorrect Length Indicator */
-	uint8_t		end_medium:1; /* 02: End of Medium */
-	uint8_t		file_mark:1; /* 02: Filemark */
-	uint8_t		info[4]; /* 03: */
-	uint8_t		reserved1; /* 07: Additional Sense Length */
-	uint8_t		cmd_spec[4]; /* 08: Command Specific Information */
-	uint8_t		code; /* 0C: Additional Sense Code */
-	uint8_t		qual; /* 0D: Additional Sense Code Qualifier */
-	uint8_t		fruc; /* 0E: Field Replaceable Unit Code */
-	uint8_t		sks_high:7; /* 0F: Sense Key Specific (MSB) */
-	uint8_t		sks_valid:1; /* 0F: Sense Key Specific Valid */
-	uint16_t	sks_low; /* 10: Sense Key Specific (LSW) */
-	uint16_t	reserved3; /* 12: Part of additional sense bytes (unused) */
-	uint16_t	uec; /* 14: Additional Sense Bytes */
-	uint64_t	per __packed; /* 16: Additional Sense Bytes */
-	uint8_t		reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
-};
-
-
-/* Task management constants */
-#define SOFT_TASK_SIMPLE		0x00
-#define SOFT_TASK_HEAD_OF_QUEUE		0x01
-#define SOFT_TASK_ORDERED		0x02
-
-/* Version zero has the last 32 bits reserved,
- * Version one has the last 32 bits sg_list_len_bytes;
- */
-struct skd_command_header {
-	__be64		sg_list_dma_address;
-	uint16_t	tag;
-	uint8_t		attribute;
-	uint8_t		add_cdb_len;     /* In 32 bit words */
-	__be32		sg_list_len_bytes;
-};
-
-struct skd_scsi_request {
-	struct		skd_command_header hdr;
-	unsigned char	cdb[16];
-/*	unsigned char _reserved[16]; */
-};
-
-struct driver_inquiry_data {
-	uint8_t		peripheral_device_type:5;
-	uint8_t		qualifier:3;
-	uint8_t		page_code;
-	__be16		page_length;
-	__be16		pcie_bus_number;
-	uint8_t		pcie_device_number;
-	uint8_t		pcie_function_number;
-	uint8_t		pcie_link_speed;
-	uint8_t		pcie_link_lanes;
-	__be16		pcie_vendor_id;
-	__be16		pcie_device_id;
-	__be16		pcie_subsystem_vendor_id;
-	__be16		pcie_subsystem_device_id;
-	uint8_t		reserved1[2];
-	uint8_t		reserved2[3];
-	uint8_t		driver_version_length;
-	uint8_t		driver_version[0x14];
-};
-
-#endif /* SKD_S1120_H */

From 9936c7c2bc76a0b2276f6d19de6d1d92f03deeab Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:51:56 +0000
Subject: [PATCH 089/183] io_uring: deduplicate core cancellations sequence

Files and task cancellations go over same steps trying to cancel
requests in io-wq, poll, etc. Deduplicate it with a helper.

note: new io_uring_try_cancel_requests() is the former
__io_uring_cancel_task_requests() with files passed as an argument and
flushing overflowed requests.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 85 ++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 45 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24ad36d71289..a750c504366d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1003,9 +1003,9 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task);
-
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files);
 static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node);
 static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 			struct io_ring_ctx *ctx);
@@ -8817,7 +8817,7 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * as nobody else will be looking for them.
 	 */
 	do {
-		__io_uring_cancel_task_requests(ctx, NULL);
+		io_uring_try_cancel_requests(ctx, NULL, NULL);
 	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
@@ -8931,6 +8931,40 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx,
 	}
 }
 
+static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
+					 struct task_struct *task,
+					 struct files_struct *files)
+{
+	struct io_task_cancel cancel = { .task = task, .files = files, };
+
+	while (1) {
+		enum io_wq_cancel cret;
+		bool ret = false;
+
+		if (ctx->io_wq) {
+			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
+					       &cancel, true);
+			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
+		}
+
+		/* SQPOLL thread does its own polling */
+		if (!(ctx->flags & IORING_SETUP_SQPOLL) && !files) {
+			while (!list_empty_careful(&ctx->iopoll_list)) {
+				io_iopoll_try_reap_events(ctx);
+				ret = true;
+			}
+		}
+
+		ret |= io_poll_remove_all(ctx, task, files);
+		ret |= io_kill_timeouts(ctx, task, files);
+		ret |= io_run_task_work();
+		io_cqring_overflow_flush(ctx, true, task, files);
+		if (!ret)
+			break;
+		cond_resched();
+	}
+}
+
 static int io_uring_count_inflight(struct io_ring_ctx *ctx,
 				   struct task_struct *task,
 				   struct files_struct *files)
@@ -8950,7 +8984,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				  struct files_struct *files)
 {
 	while (!list_empty_careful(&ctx->inflight_list)) {
-		struct io_task_cancel cancel = { .task = task, .files = files };
 		DEFINE_WAIT(wait);
 		int inflight;
 
@@ -8958,13 +8991,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		if (!inflight)
 			break;
 
-		io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true);
-		io_poll_remove_all(ctx, task, files);
-		io_kill_timeouts(ctx, task, files);
-		io_cqring_overflow_flush(ctx, true, task, files);
-		/* cancellations _may_ trigger task work */
-		io_run_task_work();
-
+		io_uring_try_cancel_requests(ctx, task, files);
 		prepare_to_wait(&task->io_uring->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (inflight == io_uring_count_inflight(ctx, task, files))
@@ -8973,37 +9000,6 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 	}
 }
 
-static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
-					    struct task_struct *task)
-{
-	while (1) {
-		struct io_task_cancel cancel = { .task = task, .files = NULL, };
-		enum io_wq_cancel cret;
-		bool ret = false;
-
-		if (ctx->io_wq) {
-			cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb,
-					       &cancel, true);
-			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
-		}
-
-		/* SQPOLL thread does its own polling */
-		if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
-			while (!list_empty_careful(&ctx->iopoll_list)) {
-				io_iopoll_try_reap_events(ctx);
-				ret = true;
-			}
-		}
-
-		ret |= io_poll_remove_all(ctx, task, NULL);
-		ret |= io_kill_timeouts(ctx, task, NULL);
-		ret |= io_run_task_work();
-		if (!ret)
-			break;
-		cond_resched();
-	}
-}
-
 static void io_disable_sqo_submit(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -9033,11 +9029,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
 	}
 
 	io_cancel_defer_files(ctx, task, files);
-	io_cqring_overflow_flush(ctx, true, task, files);
 
 	io_uring_cancel_files(ctx, task, files);
 	if (!files)
-		__io_uring_cancel_task_requests(ctx, task);
+		io_uring_try_cancel_requests(ctx, task, NULL);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
 		atomic_dec(&task->io_uring->in_idle);

From c1d5a224683b333ddbe278e455d639ccd4f5ca2b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:51:57 +0000
Subject: [PATCH 090/183] io_uring: refactor scheduling in io_cqring_wait

schedule_timeout() with timeout=MAX_SCHEDULE_TIMEOUT is guaranteed to
work just as schedule(), so instead of hand-coding it based on arguments
always use the timeout version and simplify code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a750c504366d..5b735635b8f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7213,9 +7213,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		.to_wait	= min_events,
 	};
 	struct io_rings *rings = ctx->rings;
-	struct timespec64 ts;
-	signed long timeout = 0;
-	int ret = 0;
+	signed long timeout = MAX_SCHEDULE_TIMEOUT;
+	int ret;
 
 	do {
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
@@ -7239,6 +7238,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	}
 
 	if (uts) {
+		struct timespec64 ts;
+
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
 		timeout = timespec64_to_jiffies(&ts);
@@ -7264,14 +7265,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			finish_wait(&ctx->wait, &iowq.wq);
 			continue;
 		}
-		if (uts) {
-			timeout = schedule_timeout(timeout);
-			if (timeout == 0) {
-				ret = -ETIME;
-				break;
-			}
-		} else {
-			schedule();
+		timeout = schedule_timeout(timeout);
+		if (timeout == 0) {
+			ret = -ETIME;
+			break;
 		}
 	} while (1);
 	finish_wait(&ctx->wait, &iowq.wq);

From eeb60b9ab4000d20261973642dfc9fb0e4b5d073 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:51:58 +0000
Subject: [PATCH 091/183] io_uring: refactor io_cqring_wait

It's easy to make a mistake in io_cqring_wait() because for all
break/continue clauses we need to watch for prepare/finish_wait to be
used correctly. Extract all those into a new helper
io_cqring_wait_schedule(), and transform the loop into a simple series
of func calls: prepare(); check_and_schedule(); finish();

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5b735635b8f0..dcb9e937daa3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7195,6 +7195,25 @@ static int io_run_task_work_sig(void)
 	return -EINTR;
 }
 
+/* when returns >0, the caller should retry */
+static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
+					  struct io_wait_queue *iowq,
+					  signed long *timeout)
+{
+	int ret;
+
+	/* make sure we run task_work before checking for signals */
+	ret = io_run_task_work_sig();
+	if (ret || io_should_wake(iowq))
+		return ret;
+	/* let the caller flush overflows, retry */
+	if (test_bit(0, &ctx->cq_check_overflow))
+		return 1;
+
+	*timeout = schedule_timeout(*timeout);
+	return !*timeout ? -ETIME : 1;
+}
+
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
@@ -7251,27 +7270,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		io_cqring_overflow_flush(ctx, false, NULL, NULL);
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
-		/* make sure we run task_work before checking for signals */
-		ret = io_run_task_work_sig();
-		if (ret > 0) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		else if (ret < 0)
-			break;
-		if (io_should_wake(&iowq))
-			break;
-		if (test_bit(0, &ctx->cq_check_overflow)) {
-			finish_wait(&ctx->wait, &iowq.wq);
-			continue;
-		}
-		timeout = schedule_timeout(timeout);
-		if (timeout == 0) {
-			ret = -ETIME;
-			break;
-		}
-	} while (1);
-	finish_wait(&ctx->wait, &iowq.wq);
+		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+		finish_wait(&ctx->wait, &iowq.wq);
+	} while (ret > 0);
 
 	restore_saved_sigmask_unless(ret == -EINTR);
 

From 6713e7a6145a4b5a61e33a37f0b4d06ca6d2c6d8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:51:59 +0000
Subject: [PATCH 092/183] io_uring: refactor io_read for unsupported nowait

The !io_file_supports_async() case of io_read() is hard to read: it
jumps somewhere in the middle of the function just to do async setup and
fail on a similar check. Call io_setup_async_rw() directly for this
case; it's much easier to follow.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcb9e937daa3..866e0ea83dbe 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3506,7 +3506,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t io_size, ret, ret2;
-	bool no_async;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3527,9 +3526,12 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
 	/* If the file doesn't support async, just async punt */
-	no_async = force_nonblock && !io_file_supports_async(req->file, READ);
-	if (no_async)
-		goto copy_iov;
+	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
+		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
+		if (!ret)
+			return -EAGAIN;
+		goto out_free;
+	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
 	if (unlikely(ret))
@@ -3568,8 +3570,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		ret = ret2;
 		goto out_free;
 	}
-	if (no_async)
-		return -EAGAIN;
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;

From 1a2cc0ce8d18c9e5592733cb6381e9ff5c23d916 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:00 +0000
Subject: [PATCH 093/183] io_uring: further simplify do_read error parsing

First, instead of checking iov_iter_count(iter) for 0 to find out that
all needed bytes were read, just compare returned code against io_size.
It's more reliable and arguably cleaner.

Also, place the half-read case into an else branch and delete an extra
label.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 866e0ea83dbe..1d1fa1f77332 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3552,19 +3552,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-		goto copy_iov;
-	} else if (ret <= 0) {
+	} else if (ret <= 0 || ret == io_size) {
 		/* make sure -ERESTARTSYS -> -EINTR is done */
 		goto done;
+	} else {
+		/* we did blocking attempt. no retry. */
+		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
+		    !(req->flags & REQ_F_ISREG))
+			goto done;
+
+		io_size -= ret;
 	}
 
-	/* read it all, or we did blocking attempt. no retry. */
-	if (!iov_iter_count(iter) || !force_nonblock ||
-	    (req->file->f_flags & O_NONBLOCK) || !(req->flags & REQ_F_ISREG))
-		goto done;
-
-	io_size -= ret;
-copy_iov:
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
 	if (ret2) {
 		ret = ret2;

From 6bf985dc50dd882a95fffa9c7eef0d1416f512e6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:01 +0000
Subject: [PATCH 094/183] io_uring: let io_setup_async_rw take care of iovec

Now we give out ownership of iovec into io_setup_async_rw(), so it
either sets request's context right or frees the iovec on error itself.
Makes our life a bit easier at call sites.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1d1fa1f77332..f8492d62b6a1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2721,11 +2721,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 	ret = io_import_iovec(rw, req, &iovec, &iter, false);
 	if (ret < 0)
 		return false;
-	ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
-	if (!ret)
-		return true;
-	kfree(iovec);
-	return false;
+	return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
 }
 #endif
 
@@ -3366,8 +3362,10 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 	if (!force && !io_op_defs[req->opcode].needs_async_data)
 		return 0;
 	if (!req->async_data) {
-		if (__io_alloc_async_data(req))
+		if (__io_alloc_async_data(req)) {
+			kfree(iovec);
 			return -ENOMEM;
+		}
 
 		io_req_map_rw(req, iovec, fast_iov, iter);
 	}
@@ -3528,9 +3526,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ)) {
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-		if (!ret)
-			return -EAGAIN;
-		goto out_free;
+		return ret ?: -EAGAIN;
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
@@ -3565,10 +3561,9 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
-	if (ret2) {
-		ret = ret2;
-		goto out_free;
-	}
+	if (ret2)
+		return ret2;
+
 	rw = req->async_data;
 	/* it's copied and will be cleaned with ->io */
 	iovec = NULL;
@@ -3703,8 +3698,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
-		if (!ret)
-			return -EAGAIN;
+		return ret ?: -EAGAIN;
 	}
 out_free:
 	/* it's reportedly faster than delegating the null check to kfree() */

From 7335e3bf9d0a92be09bb4f38d06ab22c40f0fead Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:02 +0000
Subject: [PATCH 095/183] io_uring: don't forget to adjust io_size

io_read() keeps an invariant: the number of bytes we're still trying to
read is tracked both in the iter and in the io_size variable. The latter
controls the decision about whether to do read retries. However, io_size
is modified only after the first read attempt, so if we happen to go for
a third retry in a single call to io_read(), io_size will be greater
than what is left in the iterator, which may lead to various side
effects, up to live-locking.

Modify io_size each time.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f8492d62b6a1..25fffff27c76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3548,16 +3548,11 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
-	} else if (ret <= 0 || ret == io_size) {
-		/* make sure -ERESTARTSYS -> -EINTR is done */
+	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
+		   (req->file->f_flags & O_NONBLOCK) ||
+		   !(req->flags & REQ_F_ISREG)) {
+		/* read all, failed, already did sync or don't want to retry */
 		goto done;
-	} else {
-		/* we did blocking attempt. no retry. */
-		if (!force_nonblock || (req->file->f_flags & O_NONBLOCK) ||
-		    !(req->flags & REQ_F_ISREG))
-			goto done;
-
-		io_size -= ret;
 	}
 
 	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
@@ -3570,6 +3565,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
+	io_size -= ret;
 	rw->bytes_done += ret;
 	/* if we can retry, do so with the callbacks armed */
 	if (!io_rw_should_retry(req)) {

From 5ea5dd45844d1b727ab2a76f47d6e9aa65d1e921 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:03 +0000
Subject: [PATCH 096/183] io_uring: inline io_read()'s iovec freeing

io_read() doesn't have the simplest control flow: it has a lot of jumps
and is hard to read. One of those is the out_free: label, which frees
iovec. However, from the middle of io_read() onwards iovec is NULL'ed,
so kfree(iovec) is a no-op. That leaves us with two places where we can
inline it and further clean up the code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 25fffff27c76..35ad889afaec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3530,14 +3530,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
-	if (unlikely(ret))
-		goto out_free;
+	if (unlikely(ret)) {
+		kfree(iovec);
+		return ret;
+	}
 
 	ret = io_iter_do_read(req, iter);
 
 	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
+		/* it's faster to check here then delegate to kfree */
+		if (iovec)
+			kfree(iovec);
+		return 0;
 	} else if (ret == -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -3560,8 +3564,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		return ret2;
 
 	rw = req->async_data;
-	/* it's copied and will be cleaned with ->io */
-	iovec = NULL;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
 retry:
@@ -3580,21 +3582,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	 * do, then just retry at the new offset.
 	 */
 	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED) {
-		ret = 0;
-		goto out_free;
-	} else if (ret > 0 && ret < io_size) {
-		/* we got some bytes, but not all. retry. */
+	if (ret == -EIOCBQUEUED)
+		return 0;
+	/* we got some bytes, but not all. retry. */
+	if (ret > 0 && ret < io_size)
 		goto retry;
-	}
 done:
 	kiocb_done(kiocb, ret, cs);
-	ret = 0;
-out_free:
-	/* it's reportedly faster than delegating the null check to kfree() */
-	if (iovec)
-		kfree(iovec);
-	return ret;
+	return 0;
 }
 
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)

From b23df91bff954ebd8aee39eb22e5028f41cd9e56 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:04 +0000
Subject: [PATCH 097/183] io_uring: highlight read-retry loop

We already have an implicit do-while for read retries, but with a goto
at the end. Convert it to an actual do-while; that highlights the loop,
making it a bit more understandable, and is cleaner in general.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 35ad889afaec..bbf8ea8370d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3566,27 +3566,27 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	rw = req->async_data;
 	/* now use our persistent iterator, if we aren't already */
 	iter = &rw->iter;
-retry:
-	io_size -= ret;
-	rw->bytes_done += ret;
-	/* if we can retry, do so with the callbacks armed */
-	if (!io_rw_should_retry(req)) {
-		kiocb->ki_flags &= ~IOCB_WAITQ;
-		return -EAGAIN;
-	}
 
-	/*
-	 * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
-	 * get -EIOCBQUEUED, then we'll get a notification when the desired
-	 * page gets unlocked. We can also get a partial read here, and if we
-	 * do, then just retry at the new offset.
-	 */
-	ret = io_iter_do_read(req, iter);
-	if (ret == -EIOCBQUEUED)
-		return 0;
-	/* we got some bytes, but not all. retry. */
-	if (ret > 0 && ret < io_size)
-		goto retry;
+	do {
+		io_size -= ret;
+		rw->bytes_done += ret;
+		/* if we can retry, do so with the callbacks armed */
+		if (!io_rw_should_retry(req)) {
+			kiocb->ki_flags &= ~IOCB_WAITQ;
+			return -EAGAIN;
+		}
+
+		/*
+		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
+		 * we get -EIOCBQUEUED, then we'll get a notification when the
+		 * desired page gets unlocked. We can also get a partial read
+		 * here, and if we do, then just retry at the new offset.
+		 */
+		ret = io_iter_do_read(req, iter);
+		if (ret == -EIOCBQUEUED)
+			return 0;
+		/* we got some bytes, but not all. retry. */
+	} while (ret > 0 && ret < io_size);
 done:
 	kiocb_done(kiocb, ret, cs);
 	return 0;

From 75c668cdd6ca05dd9c7138a5a080c0088d72cf51 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:05 +0000
Subject: [PATCH 098/183] io_uring: treat NONBLOCK and RWF_NOWAIT similarly

Make decision making of whether we need to retry read/write similar for
O_NONBLOCK and RWF_NOWAIT. Set REQ_F_NOWAIT when either is specified and
use it for all relevant checks. Also fix resubmitting NOWAIT requests
via io_rw_reissue().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bbf8ea8370d6..ce2ea3f55f65 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2734,7 +2734,9 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	if (res != -EAGAIN && res != -EOPNOTSUPP)
 		return false;
 	mode = file_inode(req->file)->i_mode;
-	if ((!S_ISBLK(mode) && !S_ISREG(mode)) || io_wq_current_is_worker())
+	if (!S_ISBLK(mode) && !S_ISREG(mode))
+		return false;
+	if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
 		return false;
 
 	lockdep_assert_held(&req->ctx->uring_lock);
@@ -2907,16 +2909,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw.kiocb;
+	struct file *file = req->file;
 	unsigned ioprio;
 	int ret;
 
-	if (S_ISREG(file_inode(req->file)->i_mode))
+	if (S_ISREG(file_inode(file)->i_mode))
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
-	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+	if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) {
 		req->flags |= REQ_F_CUR_POS;
-		kiocb->ki_pos = req->file->f_pos;
+		kiocb->ki_pos = file->f_pos;
 	}
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
@@ -2924,6 +2927,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(ret))
 		return ret;
 
+	/* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */
+	if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK))
+		req->flags |= REQ_F_NOWAIT;
+
 	ioprio = READ_ONCE(sqe->ioprio);
 	if (ioprio) {
 		ret = ioprio_check_cap(ioprio);
@@ -2934,10 +2941,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	} else
 		kiocb->ki_ioprio = get_current_ioprio();
 
-	/* don't allow async punt if RWF_NOWAIT was requested */
-	if (kiocb->ki_flags & IOCB_NOWAIT)
-		req->flags |= REQ_F_NOWAIT;
-
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
 		    !kiocb->ki_filp->f_op->iopoll)
@@ -3546,15 +3549,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto done;
-		/* no retry on NONBLOCK marked file */
-		if (req->file->f_flags & O_NONBLOCK)
+		/* no retry on NONBLOCK nor RWF_NOWAIT */
+		if (req->flags & REQ_F_NOWAIT)
 			goto done;
 		/* some cases will consume bytes even on error returns */
 		iov_iter_revert(iter, io_size - iov_iter_count(iter));
 		ret = 0;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->file->f_flags & O_NONBLOCK) ||
-		   !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -3675,8 +3677,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	 */
 	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 		ret2 = -EAGAIN;
-	/* no retry on NONBLOCK marked file */
-	if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+	/* no retry on NONBLOCK nor RWF_NOWAIT */
+	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
 		goto done;
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */

From 847595de1732a6e928f241929d24dde2e9ffaf15 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:06 +0000
Subject: [PATCH 099/183] io_uring: io_import_iovec return type cleanup

io_import_iovec() doesn't return IO size anymore, only error code. Make
it more apparent by returning int instead of ssize_t and clean up
leftovers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce2ea3f55f65..24cc00ff7155 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1030,9 +1030,8 @@ static struct file *io_file_get(struct io_submit_state *state,
 static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
 static void io_rsrc_put_work(struct work_struct *work);
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-			       struct iovec **iovec, struct iov_iter *iter,
-			       bool needs_lock);
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock);
 static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
 			     struct iov_iter *iter, bool force);
@@ -2693,9 +2692,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
-	ssize_t ret = -ECANCELED;
+	int rw, ret = -ECANCELED;
 	struct iov_iter iter;
-	int rw;
 
 	/* already prepared */
 	if (req->async_data)
@@ -3004,8 +3002,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 		io_rw_done(kiocb, ret);
 }
 
-static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
-			       struct iov_iter *iter)
+static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	size_t len = req->rw.len;
@@ -3069,7 +3066,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
 		}
 	}
 
-	return len;
+	return 0;
 }
 
 static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
@@ -3210,16 +3207,14 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 	return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-				 struct iovec **iovec, struct iov_iter *iter,
-				 bool needs_lock)
+static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
+			   struct iov_iter *iter, bool needs_lock)
 {
 	void __user *buf = u64_to_user_ptr(req->rw.addr);
 	size_t sqe_len = req->rw.len;
+	u8 opcode = req->opcode;
 	ssize_t ret;
-	u8 opcode;
 
-	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
 		return io_import_fixed(req, rw, iter);
@@ -3244,10 +3239,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		ret = io_iov_buffer_select(req, *iovec, needs_lock);
-		if (!ret) {
-			ret = (*iovec)->iov_len;
-			iov_iter_init(iter, rw, *iovec, 1, ret);
-		}
+		if (!ret)
+			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
 		*iovec = NULL;
 		return ret;
 	}
@@ -3379,7 +3372,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
 {
 	struct io_async_rw *iorw = req->async_data;
 	struct iovec *iov = iorw->fast_iov;
-	ssize_t ret;
+	int ret;
 
 	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
 	if (unlikely(ret < 0))
@@ -3518,7 +3511,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	}
 	io_size = iov_iter_count(iter);
 	req->result = io_size;
-	ret = 0;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)

From ea64ec02b31d5b05ae94ac4d57e38f8a02117c76 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:07 +0000
Subject: [PATCH 100/183] io_uring: deduplicate file table slot calculation

Extract a helper io_fixed_file_slot() returning a place in our fixed
files table, so we don't hand-code it three times in the code.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24cc00ff7155..5ee6a9273fca 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7740,6 +7740,15 @@ static void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+static struct file **io_fixed_file_slot(struct fixed_rsrc_data *file_data,
+					unsigned i)
+{
+	struct fixed_rsrc_table *table;
+
+	table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
+	return &table->files[i & IORING_FILE_TABLE_MASK];
+}
+
 static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 {
 	struct fixed_rsrc_ref_node *ref_node;
@@ -7808,6 +7817,7 @@ static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node)
 	kfree(ref_node);
 }
 
+
 static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 				 unsigned nr_args)
 {
@@ -7840,9 +7850,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		goto out_free;
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
 			ret = -EFAULT;
 			goto out_fput;
@@ -7867,9 +7874,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto out_fput;
 		}
-		table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		table->files[index] = file;
+		*io_fixed_file_slot(file_data, i) = file;
 	}
 
 	ret = io_sqe_files_scm(ctx);
@@ -7972,7 +7977,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	struct fixed_rsrc_data *data = ctx->file_data;
 	struct fixed_rsrc_ref_node *ref_node;
-	struct file *file;
+	struct file *file, **file_slot;
 	__s32 __user *fds;
 	int fd, i, err;
 	__u32 done;
@@ -7990,9 +7995,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 
 	fds = u64_to_user_ptr(up->data);
 	for (done = 0; done < nr_args; done++) {
-		struct fixed_rsrc_table *table;
-		unsigned index;
-
 		err = 0;
 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
 			err = -EFAULT;
@@ -8002,14 +8004,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		index = i & IORING_FILE_TABLE_MASK;
-		if (table->files[index]) {
-			file = table->files[index];
-			err = io_queue_file_removal(data, file);
+		file_slot = io_fixed_file_slot(ctx->file_data, i);
+
+		if (*file_slot) {
+			err = io_queue_file_removal(data, *file_slot);
 			if (err)
 				break;
-			table->files[index] = NULL;
+			*file_slot = NULL;
 			needs_switch = true;
 		}
 		if (fd != -1) {
@@ -8031,13 +8032,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			table->files[index] = file;
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
-				table->files[index] = NULL;
 				fput(file);
 				break;
 			}
+			*file_slot = file;
 		}
 	}
 
@@ -9488,11 +9488,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
-		struct fixed_rsrc_table *table;
-		struct file *f;
+		struct file *f = *io_fixed_file_slot(ctx->file_data, i);
 
-		table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
-		f = table->files[i & IORING_FILE_TABLE_MASK];
 		if (f)
 			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
 		else

From 5280f7e530f71ba85baf90169393196976ad0e52 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 4 Feb 2021 13:52:08 +0000
Subject: [PATCH 101/183] io_uring/io-wq: return 2-step work swap scheme

Saving one lock/unlock for io-wq is not super important, but it adds
some ugliness to the code. More importantly, atomic decs that don't turn
the refcount to zero won't give the right ordering/barriers on some
archs, so io_steal_work() may pretty easily get subtly and completely
broken.

Bring back the 2-step io-wq work exchange and clean it up.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    | 16 ++++++----------
 fs/io-wq.h    |  4 ++--
 fs/io_uring.c | 26 ++++----------------------
 3 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 2e2f14f42bf2..63ef195b1acb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -555,23 +555,21 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 		/* handle a whole dependent link */
 		do {
-			struct io_wq_work *old_work, *next_hashed, *linked;
+			struct io_wq_work *next_hashed, *linked;
 			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
 			io_impersonate_work(worker, work);
+			wq->do_work(work);
+			io_assign_current_work(worker, NULL);
 
-			old_work = work;
-			linked = wq->do_work(work);
-
+			linked = wq->free_work(work);
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
 				work = linked;
 				linked = NULL;
 			}
 			io_assign_current_work(worker, work);
-			wq->free_work(old_work);
-
 			if (linked)
 				io_wqe_enqueue(wqe, linked);
 
@@ -850,11 +848,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 	struct io_wq *wq = wqe->wq;
 
 	do {
-		struct io_wq_work *old_work = work;
-
 		work->flags |= IO_WQ_WORK_CANCEL;
-		work = wq->do_work(work);
-		wq->free_work(old_work);
+		wq->do_work(work);
+		work = wq->free_work(work);
 	} while (work);
 }
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index e1ffb80a4a1d..e37a0f217cc8 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -106,8 +106,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 	return container_of(work->list.next, struct io_wq_work, list);
 }
 
-typedef void (free_work_fn)(struct io_wq_work *);
-typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef void (io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
 	struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ee6a9273fca..b740a39110d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2379,22 +2379,6 @@ static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
 		io_free_req_deferred(req);
 }
 
-static struct io_wq_work *io_steal_work(struct io_kiocb *req)
-{
-	struct io_kiocb *nxt;
-
-	/*
-	 * A ref is owned by io-wq in which context we're. So, if that's the
-	 * last one, it's safe to steal next work. False negatives are Ok,
-	 * it just will be re-punted async in io_put_work()
-	 */
-	if (refcount_read(&req->refs) != 1)
-		return NULL;
-
-	nxt = io_req_find_next(req);
-	return nxt ? &nxt->work : NULL;
-}
-
 static void io_double_put_req(struct io_kiocb *req)
 {
 	/* drop both submit and complete references */
@@ -6343,7 +6327,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
+static void io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	struct io_kiocb *timeout;
@@ -6394,8 +6378,6 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 		if (lock_ctx)
 			mutex_unlock(&lock_ctx->uring_lock);
 	}
-
-	return io_steal_work(req);
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
@@ -8067,12 +8049,12 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
 	return __io_sqe_files_update(ctx, &up, nr_args);
 }
 
-static void io_free_work(struct io_wq_work *work)
+static struct io_wq_work *io_free_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 
-	/* Consider that io_steal_work() relies on this ref */
-	io_put_req(req);
+	req = io_put_req_find_next(req);
+	return req ? &req->work : NULL;
 }
 
 static int io_init_wq_offload(struct io_ring_ctx *ctx,

From 2a7808024b195a342779fb5d7b7df1c4af45cc71 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 5 Feb 2021 00:57:58 +0000
Subject: [PATCH 102/183] io_uring: set msg_name on msg fixup

io_setup_async_msg() should fully prepare io_async_msghdr, let it also
handle assigning msg_name and don't hand code it in [send,recv]msg().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b740a39110d6..39bc1df9bb64 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4558,6 +4558,7 @@ static int io_setup_async_msg(struct io_kiocb *req,
 	async_msg = req->async_data;
 	req->flags |= REQ_F_NEED_CLEANUP;
 	memcpy(async_msg, kmsg, sizeof(*kmsg));
+	async_msg->msg.msg_name = &async_msg->addr;
 	return -EAGAIN;
 }
 
@@ -4610,7 +4611,6 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 
 	if (req->async_data) {
 		kmsg = req->async_data;
-		kmsg->msg.msg_name = &kmsg->addr;
 		/* if iov is set, it's allocated already */
 		if (!kmsg->iov)
 			kmsg->iov = kmsg->fast_iov;
@@ -4839,7 +4839,6 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 
 	if (req->async_data) {
 		kmsg = req->async_data;
-		kmsg->msg.msg_name = &kmsg->addr;
 		/* if iov is set, it's allocated already */
 		if (!kmsg->iov)
 			kmsg->iov = kmsg->fast_iov;

From 5476dfed29ad9b19d4e187685ab71bb9c496f965 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 5 Feb 2021 00:57:59 +0000
Subject: [PATCH 103/183] io_uring: clean iov usage for recvmsg buf select

Don't pretend we don't know that REQ_F_BUFFER_SELECT for recvmsg always
uses fast_iov -- clean up confusing intermixing kmsg->iov and
kmsg->fast_iov for buffer select.

Also don't init iter with garbage in __io_recvmsg_copy_hdr() only for it
to be set shortly after in io_recvmsg().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 39bc1df9bb64..e07a7fa15cfa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4701,11 +4701,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		if (iov_len > 1)
 			return -EINVAL;
-		if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
+		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
 			return -EFAULT;
-		sr->len = iomsg->iov[0].iov_len;
-		iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
-				sr->len);
+		sr->len = iomsg->fast_iov[0].iov_len;
 		iomsg->iov = NULL;
 	} else {
 		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
@@ -4748,7 +4746,6 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 		if (clen < 0)
 			return -EINVAL;
 		sr->len = clen;
-		iomsg->iov[0].iov_len = clen;
 		iomsg->iov = NULL;
 	} else {
 		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
@@ -4855,7 +4852,8 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
 		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
-		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
+		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
 				1, req->sr_msg.len);
 	}
 

From 257e84a5377fbbc336ff563833a8712619acce56 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 5 Feb 2021 00:58:00 +0000
Subject: [PATCH 104/183] io_uring: refactor sendmsg/recvmsg iov managing

The current iov handling for recvmsg/sendmsg may be confusing. First, make
a rule for msg->iov: either it points to an allocated iov that has to be
kfree()'d later, or it's NULL and we use fast_iov. That's much better
than the current 3-state scheme (where it can also point to fast_iov).
Also rename it to free_iov for uniformity with read/write.

Also, the fixup of msg.msg_iter.iov after copying struct io_async_msghdr
has been happening in io_recvmsg()/io_sendmsg() instead of right after
the copy. Move it into io_setup_async_msg(), which is the right place.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: add comment on NULL check before kfree()]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 57 ++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e07a7fa15cfa..7242cc48e97b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -594,7 +594,8 @@ struct io_async_connect {
 
 struct io_async_msghdr {
 	struct iovec			fast_iov[UIO_FASTIOV];
-	struct iovec			*iov;
+	/* points to an allocated iov, if NULL we use fast_iov instead */
+	struct iovec			*free_iov;
 	struct sockaddr __user		*uaddr;
 	struct msghdr			msg;
 	struct sockaddr_storage		addr;
@@ -4551,24 +4552,27 @@ static int io_setup_async_msg(struct io_kiocb *req,
 	if (async_msg)
 		return -EAGAIN;
 	if (io_alloc_async_data(req)) {
-		if (kmsg->iov != kmsg->fast_iov)
-			kfree(kmsg->iov);
+		kfree(kmsg->free_iov);
 		return -ENOMEM;
 	}
 	async_msg = req->async_data;
 	req->flags |= REQ_F_NEED_CLEANUP;
 	memcpy(async_msg, kmsg, sizeof(*kmsg));
 	async_msg->msg.msg_name = &async_msg->addr;
+	/* if were using fast_iov, set it to the new one */
+	if (!async_msg->free_iov)
+		async_msg->msg.msg_iter.iov = async_msg->fast_iov;
+
 	return -EAGAIN;
 }
 
 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
 			       struct io_async_msghdr *iomsg)
 {
-	iomsg->iov = iomsg->fast_iov;
 	iomsg->msg.msg_name = &iomsg->addr;
+	iomsg->free_iov = iomsg->fast_iov;
 	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
-				   req->sr_msg.msg_flags, &iomsg->iov);
+				   req->sr_msg.msg_flags, &iomsg->free_iov);
 }
 
 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4609,13 +4613,8 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	if (req->async_data) {
-		kmsg = req->async_data;
-		/* if iov is set, it's allocated already */
-		if (!kmsg->iov)
-			kmsg->iov = kmsg->fast_iov;
-		kmsg->msg.msg_iter.iov = kmsg->iov;
-	} else {
+	kmsg = req->async_data;
+	if (!kmsg) {
 		ret = io_sendmsg_copy_hdr(req, &iomsg);
 		if (ret)
 			return ret;
@@ -4634,8 +4633,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	if (kmsg->iov != kmsg->fast_iov)
-		kfree(kmsg->iov);
+	/* fast path, check for non-NULL to avoid function call */
+	if (kmsg->free_iov)
+		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -4704,10 +4704,11 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
 		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
 			return -EFAULT;
 		sr->len = iomsg->fast_iov[0].iov_len;
-		iomsg->iov = NULL;
+		iomsg->free_iov = NULL;
 	} else {
+		iomsg->free_iov = iomsg->fast_iov;
 		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
-				     &iomsg->iov, &iomsg->msg.msg_iter,
+				     &iomsg->free_iov, &iomsg->msg.msg_iter,
 				     false);
 		if (ret > 0)
 			ret = 0;
@@ -4746,10 +4747,11 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 		if (clen < 0)
 			return -EINVAL;
 		sr->len = clen;
-		iomsg->iov = NULL;
+		iomsg->free_iov = NULL;
 	} else {
+		iomsg->free_iov = iomsg->fast_iov;
 		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
-				   UIO_FASTIOV, &iomsg->iov,
+				   UIO_FASTIOV, &iomsg->free_iov,
 				   &iomsg->msg.msg_iter, true);
 		if (ret < 0)
 			return ret;
@@ -4763,7 +4765,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 			       struct io_async_msghdr *iomsg)
 {
 	iomsg->msg.msg_name = &iomsg->addr;
-	iomsg->iov = iomsg->fast_iov;
 
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
@@ -4834,13 +4835,8 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	if (req->async_data) {
-		kmsg = req->async_data;
-		/* if iov is set, it's allocated already */
-		if (!kmsg->iov)
-			kmsg->iov = kmsg->fast_iov;
-		kmsg->msg.msg_iter.iov = kmsg->iov;
-	} else {
+	kmsg = req->async_data;
+	if (!kmsg) {
 		ret = io_recvmsg_copy_hdr(req, &iomsg);
 		if (ret)
 			return ret;
@@ -4872,8 +4868,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_recv_kbuf(req);
-	if (kmsg->iov != kmsg->fast_iov)
-		kfree(kmsg->iov);
+	/* fast path, check for non-NULL to avoid function call */
+	if (kmsg->free_iov)
+		kfree(kmsg->free_iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -6166,8 +6163,8 @@ static void __io_clean_op(struct io_kiocb *req)
 		case IORING_OP_RECVMSG:
 		case IORING_OP_SENDMSG: {
 			struct io_async_msghdr *io = req->async_data;
-			if (io->iov != io->fast_iov)
-				kfree(io->iov);
+
+			kfree(io->free_iov);
 			break;
 			}
 		case IORING_OP_SPLICE:

From 0e9ddb39b7d964d716cddd6e6bd1aab3f800066e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 7 Feb 2021 22:34:26 +0000
Subject: [PATCH 105/183] io_uring: cleanup up cancel SQPOLL reqs across exec

For SQPOLL rings tctx_inflight() always returns zero, so it might skip
doing full cancellation. It's fine because we jam all sqpoll submissions
in any case and do go through files cancel for them, but it's not nice.

Do the intended full cancellation by mimicking the waiting done in
__io_uring_task_cancel(), but impersonating the SQPOLL task.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 57 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 21 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7242cc48e97b..9c77fbc0c395 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9083,29 +9083,39 @@ void __io_uring_files_cancel(struct files_struct *files)
 
 static s64 tctx_inflight(struct io_uring_task *tctx)
 {
-	unsigned long index;
-	struct file *file;
+	return percpu_counter_sum(&tctx->inflight);
+}
+
+static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx)
+{
+	struct io_uring_task *tctx;
 	s64 inflight;
+	DEFINE_WAIT(wait);
 
-	inflight = percpu_counter_sum(&tctx->inflight);
-	if (!tctx->sqpoll)
-		return inflight;
+	if (!ctx->sq_data)
+		return;
+	tctx = ctx->sq_data->thread->io_uring;
+	io_disable_sqo_submit(ctx);
 
-	/*
-	 * If we have SQPOLL rings, then we need to iterate and find them, and
-	 * add the pending count for those.
-	 */
-	xa_for_each(&tctx->xa, index, file) {
-		struct io_ring_ctx *ctx = file->private_data;
+	atomic_inc(&tctx->in_idle);
+	do {
+		/* read completions before cancelations */
+		inflight = tctx_inflight(tctx);
+		if (!inflight)
+			break;
+		io_uring_cancel_task_requests(ctx, NULL);
 
-		if (ctx->flags & IORING_SETUP_SQPOLL) {
-			struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
-
-			inflight += percpu_counter_sum(&__tctx->inflight);
-		}
-	}
-
-	return inflight;
+		prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
+		/*
+		 * If we've seen completions, retry without waiting. This
+		 * avoids a race where a completion comes in before we did
+		 * prepare_to_wait().
+		 */
+		if (inflight == tctx_inflight(tctx))
+			schedule();
+		finish_wait(&tctx->wait, &wait);
+	} while (1);
+	atomic_dec(&tctx->in_idle);
 }
 
 /*
@@ -9122,8 +9132,13 @@ void __io_uring_task_cancel(void)
 	atomic_inc(&tctx->in_idle);
 
 	/* trigger io_disable_sqo_submit() */
-	if (tctx->sqpoll)
-		__io_uring_files_cancel(NULL);
+	if (tctx->sqpoll) {
+		struct file *file;
+		unsigned long index;
+
+		xa_for_each(&tctx->xa, index, file)
+			io_uring_cancel_sqpoll(file->private_data);
+	}
 
 	do {
 		/* read completions before cancelations */

From c1f664d2400e73d5ca0fcd067fa5847d2c789c11 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Tue, 9 Feb 2021 15:10:51 +0800
Subject: [PATCH 106/183] irqchip/loongson-pch-msi: Use bitmap_zalloc() to
 allocate bitmap

Currently we use bitmap_alloc() to allocate the MSI bitmap, which should
be initialized to zero. This is obviously wrong, but it works because MSI
can fall back to legacy interrupt mode. So use bitmap_zalloc() instead.

Fixes: 632dcc2c75ef6de3272aa ("irqchip: Add Loongson PCH MSI controller")
Cc: stable@vger.kernel.org
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210209071051.2078435-1-chenhuacai@loongson.cn
---
 drivers/irqchip/irq-loongson-pch-msi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-loongson-pch-msi.c b/drivers/irqchip/irq-loongson-pch-msi.c
index 12aeeab43289..32562b7e681b 100644
--- a/drivers/irqchip/irq-loongson-pch-msi.c
+++ b/drivers/irqchip/irq-loongson-pch-msi.c
@@ -225,7 +225,7 @@ static int pch_msi_init(struct device_node *node,
 		goto err_priv;
 	}
 
-	priv->msi_map = bitmap_alloc(priv->num_irqs, GFP_KERNEL);
+	priv->msi_map = bitmap_zalloc(priv->num_irqs, GFP_KERNEL);
 	if (!priv->msi_map) {
 		ret = -ENOMEM;
 		goto err_priv;

From 45d189c6062922ffe272e98013ba464b355dede7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:07 +0000
Subject: [PATCH 107/183] io_uring: replace force_nonblock with flags

Replace bool force_nonblock with flags. This serves the long-standing goal
of differentiating the context from which we execute. Currently we have
some subtle places where invariants, like holding of uring_lock, are
only implicitly inferred.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 178 +++++++++++++++++++++++++++-----------------------
 1 file changed, 96 insertions(+), 82 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9c77fbc0c395..862121c48cee 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -187,6 +187,10 @@ struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
+enum io_uring_cmd_flags {
+	IO_URING_F_NONBLOCK		= 1,
+};
+
 struct io_mapped_ubuf {
 	u64		ubuf;
 	size_t		len;
@@ -3477,7 +3481,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 		return -EINVAL;
 }
 
-static int io_read(struct io_kiocb *req, bool force_nonblock,
+static int io_read(struct io_kiocb *req, unsigned int issue_flags,
 		   struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3485,6 +3489,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t io_size, ret, ret2;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3588,7 +3593,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return io_rw_prep_async(req, WRITE);
 }
 
-static int io_write(struct io_kiocb *req, bool force_nonblock,
+static int io_write(struct io_kiocb *req, unsigned int issue_flags,
 		    struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3596,6 +3601,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	struct iov_iter __iter, *iter = &__iter;
 	struct io_async_rw *rw = req->async_data;
 	ssize_t ret, ret2, io_size;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	if (rw) {
 		iter = &rw->iter;
@@ -3706,12 +3712,12 @@ static int io_renameat_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_renameat(struct io_kiocb *req, bool force_nonblock)
+static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_rename *ren = &req->rename;
 	int ret;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
@@ -3748,12 +3754,12 @@ static int io_unlinkat_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_unlinkat(struct io_kiocb *req, bool force_nonblock)
+static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_unlink *un = &req->unlink;
 	int ret;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	if (un->flags & AT_REMOVEDIR)
@@ -3785,13 +3791,13 @@ static int io_shutdown_prep(struct io_kiocb *req,
 #endif
 }
 
-static int io_shutdown(struct io_kiocb *req, bool force_nonblock)
+static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
 {
 #if defined(CONFIG_NET)
 	struct socket *sock;
 	int ret;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	sock = sock_from_file(req->file);
@@ -3850,7 +3856,7 @@ static int io_tee_prep(struct io_kiocb *req,
 	return __io_splice_prep(req, sqe);
 }
 
-static int io_tee(struct io_kiocb *req, bool force_nonblock)
+static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_splice *sp = &req->splice;
 	struct file *in = sp->file_in;
@@ -3858,7 +3864,7 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock)
 	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
 	long ret = 0;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 	if (sp->len)
 		ret = do_tee(in, out, sp->len, flags);
@@ -3881,7 +3887,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return __io_splice_prep(req, sqe);
 }
 
-static int io_splice(struct io_kiocb *req, bool force_nonblock)
+static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_splice *sp = &req->splice;
 	struct file *in = sp->file_in;
@@ -3890,7 +3896,7 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	loff_t *poff_in, *poff_out;
 	long ret = 0;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
@@ -3943,13 +3949,13 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_fsync(struct io_kiocb *req, bool force_nonblock)
+static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
 {
 	loff_t end = req->sync.off + req->sync.len;
 	int ret;
 
 	/* fsync always requires a blocking context */
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	ret = vfs_fsync_range(req->file, req->sync.off,
@@ -3975,12 +3981,12 @@ static int io_fallocate_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
+static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
 {
 	int ret;
 
 	/* fallocate always requiring blocking context */
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 				req->sync.len);
@@ -4050,7 +4056,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return __io_openat_prep(req, sqe);
 }
 
-static int io_openat2(struct io_kiocb *req, bool force_nonblock)
+static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct open_flags op;
 	struct file *file;
@@ -4063,7 +4069,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 		goto err;
 	nonblock_set = op.open_flag & O_NONBLOCK;
 	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
-	if (force_nonblock) {
+	if (issue_flags & IO_URING_F_NONBLOCK) {
 		/*
 		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
 		 * it'll always -EAGAIN
@@ -4080,7 +4086,8 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 
 	file = do_filp_open(req->open.dfd, req->open.filename, &op);
 	/* only retry if RESOLVE_CACHED wasn't already set by application */
-	if ((!resolve_nonblock && force_nonblock) && file == ERR_PTR(-EAGAIN)) {
+	if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) &&
+	    file == ERR_PTR(-EAGAIN)) {
 		/*
 		 * We could hang on to this 'fd', but seems like marginal
 		 * gain for something that is now known to be a slower path.
@@ -4094,7 +4101,7 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 		put_unused_fd(ret);
 		ret = PTR_ERR(file);
 	} else {
-		if (force_nonblock && !nonblock_set)
+		if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
 			file->f_flags &= ~O_NONBLOCK;
 		fsnotify_open(file);
 		fd_install(ret, file);
@@ -4108,9 +4115,9 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
 	return 0;
 }
 
-static int io_openat(struct io_kiocb *req, bool force_nonblock)
+static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
 {
-	return io_openat2(req, force_nonblock);
+	return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK);
 }
 
 static int io_remove_buffers_prep(struct io_kiocb *req,
@@ -4158,13 +4165,14 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 	return i;
 }
 
-static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
+static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags,
 			     struct io_comp_state *cs)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer *head;
 	int ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	io_ring_submit_lock(ctx, !force_nonblock);
 
@@ -4242,13 +4250,14 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 	return i ? i : -ENOMEM;
 }
 
-static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
+static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags,
 			      struct io_comp_state *cs)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_buffer *head, *list;
 	int ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	io_ring_submit_lock(ctx, !force_nonblock);
 
@@ -4310,12 +4319,13 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #endif
 }
 
-static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
+static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags,
 			struct io_comp_state *cs)
 {
 #if defined(CONFIG_EPOLL)
 	struct io_epoll *ie = &req->epoll;
 	int ret;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
 	if (force_nonblock && ret == -EAGAIN)
@@ -4347,13 +4357,13 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 #endif
 }
 
-static int io_madvise(struct io_kiocb *req, bool force_nonblock)
+static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
 {
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 	struct io_madvise *ma = &req->madvise;
 	int ret;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
@@ -4379,12 +4389,12 @@ static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
+static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_fadvise *fa = &req->fadvise;
 	int ret;
 
-	if (force_nonblock) {
+	if (issue_flags & IO_URING_F_NONBLOCK) {
 		switch (fa->advice) {
 		case POSIX_FADV_NORMAL:
 		case POSIX_FADV_RANDOM:
@@ -4420,12 +4430,12 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_statx(struct io_kiocb *req, bool force_nonblock)
+static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_statx *ctx = &req->statx;
 	int ret;
 
-	if (force_nonblock) {
+	if (issue_flags & IO_URING_F_NONBLOCK) {
 		/* only need file table for an actual valid fd */
 		if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
 			req->flags |= REQ_F_NO_FILE_TABLE;
@@ -4455,7 +4465,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_close(struct io_kiocb *req, bool force_nonblock,
+static int io_close(struct io_kiocb *req, unsigned int issue_flags,
 		    struct io_comp_state *cs)
 {
 	struct files_struct *files = current->files;
@@ -4485,7 +4495,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	/* if the file has a flush method, be safe and punt to async */
-	if (file->f_op->flush && force_nonblock) {
+	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
 		spin_unlock(&files->file_lock);
 		return -EAGAIN;
 	}
@@ -4527,12 +4537,12 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
+static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
 {
 	int ret;
 
 	/* sync_file_range always requires a blocking context */
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
@@ -4601,7 +4611,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ret;
 }
 
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	struct io_async_msghdr iomsg, *kmsg;
@@ -4624,11 +4634,11 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 	flags = req->sr_msg.msg_flags;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
-	else if (force_nonblock)
+	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-	if (force_nonblock && ret == -EAGAIN)
+	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 		return io_setup_async_msg(req, kmsg);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
@@ -4643,7 +4653,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static int io_send(struct io_kiocb *req, bool force_nonblock,
+static int io_send(struct io_kiocb *req, unsigned int issue_flags,
 		   struct io_comp_state *cs)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
@@ -4669,12 +4679,12 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
 	flags = req->sr_msg.msg_flags;
 	if (flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
-	else if (force_nonblock)
+	else if (issue_flags & IO_URING_F_NONBLOCK)
 		flags |= MSG_DONTWAIT;
 
 	msg.msg_flags = flags;
 	ret = sock_sendmsg(sock, &msg);
-	if (force_nonblock && ret == -EAGAIN)
+	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
 		return -EAGAIN;
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
@@ -4822,7 +4832,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return ret;
 }
 
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	struct io_async_msghdr iomsg, *kmsg;
@@ -4830,6 +4840,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	struct io_buffer *kbuf;
 	unsigned flags;
 	int ret, cflags = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
@@ -4878,7 +4889,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static int io_recv(struct io_kiocb *req, bool force_nonblock,
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags,
 		   struct io_comp_state *cs)
 {
 	struct io_buffer *kbuf;
@@ -4889,6 +4900,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 	struct iovec iov;
 	unsigned flags;
 	int ret, cflags = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
@@ -4948,10 +4960,11 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_accept(struct io_kiocb *req, bool force_nonblock,
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags,
 		     struct io_comp_state *cs)
 {
 	struct io_accept *accept = &req->accept;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 	int ret;
 
@@ -4992,12 +5005,13 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 					&io->address);
 }
 
-static int io_connect(struct io_kiocb *req, bool force_nonblock,
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	struct io_async_connect __io, *io;
 	unsigned file_flags;
 	int ret;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
 	if (req->async_data) {
 		io = req->async_data;
@@ -5039,13 +5053,13 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_send(struct io_kiocb *req, bool force_nonblock,
+static int io_send(struct io_kiocb *req, unsigned int issue_flags,
 		   struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
@@ -5057,13 +5071,13 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return -EOPNOTSUPP;
 }
 
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_recv(struct io_kiocb *req, bool force_nonblock,
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags,
 		   struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
@@ -5074,7 +5088,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_accept(struct io_kiocb *req, bool force_nonblock,
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags,
 		     struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
@@ -5085,7 +5099,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_connect(struct io_kiocb *req, bool force_nonblock,
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags,
 		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
@@ -5963,14 +5977,14 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+static int io_files_update(struct io_kiocb *req, unsigned int issue_flags,
 			   struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_rsrc_update up;
 	int ret;
 
-	if (force_nonblock)
+	if (issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	up.offset = req->rsrc_update.offset;
@@ -6189,7 +6203,7 @@ static void __io_clean_op(struct io_kiocb *req)
 	}
 }
 
-static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 			struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -6202,15 +6216,15 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
 	case IORING_OP_READ:
-		ret = io_read(req, force_nonblock, cs);
+		ret = io_read(req, issue_flags, cs);
 		break;
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
-		ret = io_write(req, force_nonblock, cs);
+		ret = io_write(req, issue_flags, cs);
 		break;
 	case IORING_OP_FSYNC:
-		ret = io_fsync(req, force_nonblock);
+		ret = io_fsync(req, issue_flags);
 		break;
 	case IORING_OP_POLL_ADD:
 		ret = io_poll_add(req);
@@ -6219,19 +6233,19 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 		ret = io_poll_remove(req);
 		break;
 	case IORING_OP_SYNC_FILE_RANGE:
-		ret = io_sync_file_range(req, force_nonblock);
+		ret = io_sync_file_range(req, issue_flags);
 		break;
 	case IORING_OP_SENDMSG:
-		ret = io_sendmsg(req, force_nonblock, cs);
+		ret = io_sendmsg(req, issue_flags, cs);
 		break;
 	case IORING_OP_SEND:
-		ret = io_send(req, force_nonblock, cs);
+		ret = io_send(req, issue_flags, cs);
 		break;
 	case IORING_OP_RECVMSG:
-		ret = io_recvmsg(req, force_nonblock, cs);
+		ret = io_recvmsg(req, issue_flags, cs);
 		break;
 	case IORING_OP_RECV:
-		ret = io_recv(req, force_nonblock, cs);
+		ret = io_recv(req, issue_flags, cs);
 		break;
 	case IORING_OP_TIMEOUT:
 		ret = io_timeout(req);
@@ -6240,61 +6254,61 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
 		ret = io_timeout_remove(req);
 		break;
 	case IORING_OP_ACCEPT:
-		ret = io_accept(req, force_nonblock, cs);
+		ret = io_accept(req, issue_flags, cs);
 		break;
 	case IORING_OP_CONNECT:
-		ret = io_connect(req, force_nonblock, cs);
+		ret = io_connect(req, issue_flags, cs);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
 		ret = io_async_cancel(req);
 		break;
 	case IORING_OP_FALLOCATE:
-		ret = io_fallocate(req, force_nonblock);
+		ret = io_fallocate(req, issue_flags);
 		break;
 	case IORING_OP_OPENAT:
-		ret = io_openat(req, force_nonblock);
+		ret = io_openat(req, issue_flags);
 		break;
 	case IORING_OP_CLOSE:
-		ret = io_close(req, force_nonblock, cs);
+		ret = io_close(req, issue_flags, cs);
 		break;
 	case IORING_OP_FILES_UPDATE:
-		ret = io_files_update(req, force_nonblock, cs);
+		ret = io_files_update(req, issue_flags, cs);
 		break;
 	case IORING_OP_STATX:
-		ret = io_statx(req, force_nonblock);
+		ret = io_statx(req, issue_flags);
 		break;
 	case IORING_OP_FADVISE:
-		ret = io_fadvise(req, force_nonblock);
+		ret = io_fadvise(req, issue_flags);
 		break;
 	case IORING_OP_MADVISE:
-		ret = io_madvise(req, force_nonblock);
+		ret = io_madvise(req, issue_flags);
 		break;
 	case IORING_OP_OPENAT2:
-		ret = io_openat2(req, force_nonblock);
+		ret = io_openat2(req, issue_flags);
 		break;
 	case IORING_OP_EPOLL_CTL:
-		ret = io_epoll_ctl(req, force_nonblock, cs);
+		ret = io_epoll_ctl(req, issue_flags, cs);
 		break;
 	case IORING_OP_SPLICE:
-		ret = io_splice(req, force_nonblock);
+		ret = io_splice(req, issue_flags);
 		break;
 	case IORING_OP_PROVIDE_BUFFERS:
-		ret = io_provide_buffers(req, force_nonblock, cs);
+		ret = io_provide_buffers(req, issue_flags, cs);
 		break;
 	case IORING_OP_REMOVE_BUFFERS:
-		ret = io_remove_buffers(req, force_nonblock, cs);
+		ret = io_remove_buffers(req, issue_flags, cs);
 		break;
 	case IORING_OP_TEE:
-		ret = io_tee(req, force_nonblock);
+		ret = io_tee(req, issue_flags);
 		break;
 	case IORING_OP_SHUTDOWN:
-		ret = io_shutdown(req, force_nonblock);
+		ret = io_shutdown(req, issue_flags);
 		break;
 	case IORING_OP_RENAMEAT:
-		ret = io_renameat(req, force_nonblock);
+		ret = io_renameat(req, issue_flags);
 		break;
 	case IORING_OP_UNLINKAT:
-		ret = io_unlinkat(req, force_nonblock);
+		ret = io_unlinkat(req, issue_flags);
 		break;
 	default:
 		ret = -EINVAL;
@@ -6336,7 +6350,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
 
 	if (!ret) {
 		do {
-			ret = io_issue_sqe(req, false, NULL);
+			ret = io_issue_sqe(req, 0, NULL);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -6499,7 +6513,7 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 			old_creds = override_creds(req->work.identity->creds);
 	}
 
-	ret = io_issue_sqe(req, true, cs);
+	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK, cs);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file

From 61e98203047983fd959cfef889b328a57315847c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:08 +0000
Subject: [PATCH 108/183] io_uring: make op handlers always take issue flags

Make opcode handler interfaces a bit more consistent by always passing
in issue flags. Bulky but pretty easy and mechanical change.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 862121c48cee..ac233d04ee71 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3917,7 +3917,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
-static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
+static int io_nop(struct io_kiocb *req, unsigned int issue_flags,
+		  struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
@@ -5581,7 +5582,7 @@ static int io_poll_remove_prep(struct io_kiocb *req,
  * Find a running poll command that matches one specified in sqe->addr,
  * and remove it if found.
  */
-static int io_poll_remove(struct io_kiocb *req)
+static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -5632,7 +5633,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	return 0;
 }
 
-static int io_poll_add(struct io_kiocb *req)
+static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_poll_iocb *poll = &req->poll;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5772,7 +5773,7 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
 /*
  * Remove or update an existing timeout command
  */
-static int io_timeout_remove(struct io_kiocb *req)
+static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_timeout_rem *tr = &req->timeout_rem;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5828,7 +5829,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static int io_timeout(struct io_kiocb *req)
+static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data = req->async_data;
@@ -5951,7 +5952,7 @@ static int io_async_cancel_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_async_cancel(struct io_kiocb *req)
+static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
@@ -6211,7 +6212,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
-		ret = io_nop(req, cs);
+		ret = io_nop(req, issue_flags, cs);
 		break;
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
@@ -6227,10 +6228,10 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_fsync(req, issue_flags);
 		break;
 	case IORING_OP_POLL_ADD:
-		ret = io_poll_add(req);
+		ret = io_poll_add(req, issue_flags);
 		break;
 	case IORING_OP_POLL_REMOVE:
-		ret = io_poll_remove(req);
+		ret = io_poll_remove(req, issue_flags);
 		break;
 	case IORING_OP_SYNC_FILE_RANGE:
 		ret = io_sync_file_range(req, issue_flags);
@@ -6248,10 +6249,10 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_recv(req, issue_flags, cs);
 		break;
 	case IORING_OP_TIMEOUT:
-		ret = io_timeout(req);
+		ret = io_timeout(req, issue_flags);
 		break;
 	case IORING_OP_TIMEOUT_REMOVE:
-		ret = io_timeout_remove(req);
+		ret = io_timeout_remove(req, issue_flags);
 		break;
 	case IORING_OP_ACCEPT:
 		ret = io_accept(req, issue_flags, cs);
@@ -6260,7 +6261,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_connect(req, issue_flags, cs);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
-		ret = io_async_cancel(req);
+		ret = io_async_cancel(req, issue_flags);
 		break;
 	case IORING_OP_FALLOCATE:
 		ret = io_fallocate(req, issue_flags);

From 889fca73287b0ae21c9d8712379c9ae5a3b27d08 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:09 +0000
Subject: [PATCH 109/183] io_uring: don't propagate io_comp_state

There is no reason to drag io_comp_state into opcode handlers; we just
need a flag, and the actual work will be done in __io_queue_sqe().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 164 ++++++++++++++++++++++----------------------------
 1 file changed, 73 insertions(+), 91 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ac233d04ee71..273ebbac0654 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -189,6 +189,7 @@ struct io_rings {
 
 enum io_uring_cmd_flags {
 	IO_URING_F_NONBLOCK		= 1,
+	IO_URING_F_COMPLETE_DEFER	= 2,
 };
 
 struct io_mapped_ubuf {
@@ -1018,7 +1019,7 @@ static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
 				     struct fixed_rsrc_ref_node *ref_node);
 
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
-			     struct io_comp_state *cs);
+			     unsigned int issue_flags);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req, int nr);
@@ -1957,7 +1958,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 }
 
 static void io_req_complete_state(struct io_kiocb *req, long res,
-				  unsigned int cflags, struct io_comp_state *cs)
+				  unsigned int cflags)
 {
 	io_clean_op(req);
 	req->result = res;
@@ -1965,18 +1966,18 @@ static void io_req_complete_state(struct io_kiocb *req, long res,
 	req->flags |= REQ_F_COMPLETE_INLINE;
 }
 
-static inline void __io_req_complete(struct io_kiocb *req, long res,
-				     unsigned cflags, struct io_comp_state *cs)
+static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
+				     long res, unsigned cflags)
 {
-	if (!cs)
-		io_req_complete_nostate(req, res, cflags);
+	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
+		io_req_complete_state(req, res, cflags);
 	else
-		io_req_complete_state(req, res, cflags, cs);
+		io_req_complete_nostate(req, res, cflags);
 }
 
 static inline void io_req_complete(struct io_kiocb *req, long res)
 {
-	__io_req_complete(req, res, 0, NULL);
+	__io_req_complete(req, 0, res, 0);
 }
 
 static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -2449,7 +2450,7 @@ static void io_iopoll_queue(struct list_head *again)
 	do {
 		req = list_first_entry(again, struct io_kiocb, inflight_entry);
 		list_del(&req->inflight_entry);
-		__io_complete_rw(req, -EAGAIN, 0, NULL);
+		__io_complete_rw(req, -EAGAIN, 0, 0);
 	} while (!list_empty(again));
 }
 
@@ -2662,7 +2663,7 @@ static void kiocb_end_write(struct io_kiocb *req)
 }
 
 static void io_complete_rw_common(struct kiocb *kiocb, long res,
-				  struct io_comp_state *cs)
+				  unsigned int issue_flags)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -2674,7 +2675,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_rw_kbuf(req);
-	__io_req_complete(req, res, cflags, cs);
+	__io_req_complete(req, issue_flags, res, cflags);
 }
 
 #ifdef CONFIG_BLOCK
@@ -2741,17 +2742,17 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 }
 
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
-			     struct io_comp_state *cs)
+			     unsigned int issue_flags)
 {
 	if (!io_rw_reissue(req, res))
-		io_complete_rw_common(&req->rw.kiocb, res, cs);
+		io_complete_rw_common(&req->rw.kiocb, res, issue_flags);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	__io_complete_rw(req, res, res2, NULL);
+	__io_complete_rw(req, res, res2, 0);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2970,7 +2971,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 }
 
 static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
-		       struct io_comp_state *cs)
+		       unsigned int issue_flags)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	struct io_async_rw *io = req->async_data;
@@ -2986,7 +2987,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
 	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
-		__io_complete_rw(req, ret, 0, cs);
+		__io_complete_rw(req, ret, 0, issue_flags);
 	else
 		io_rw_done(kiocb, ret);
 }
@@ -3481,8 +3482,7 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 		return -EINVAL;
 }
 
-static int io_read(struct io_kiocb *req, unsigned int issue_flags,
-		   struct io_comp_state *cs)
+static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
@@ -3572,7 +3572,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags,
 		/* we got some bytes, but not all. retry. */
 	} while (ret > 0 && ret < io_size);
 done:
-	kiocb_done(kiocb, ret, cs);
+	kiocb_done(kiocb, ret, issue_flags);
 	return 0;
 }
 
@@ -3593,8 +3593,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return io_rw_prep_async(req, WRITE);
 }
 
-static int io_write(struct io_kiocb *req, unsigned int issue_flags,
-		    struct io_comp_state *cs)
+static int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
@@ -3668,7 +3667,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags,
 		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
 			goto copy_iov;
 done:
-		kiocb_done(kiocb, ret2, cs);
+		kiocb_done(kiocb, ret2, issue_flags);
 	} else {
 copy_iov:
 		/* some cases will consume bytes even on error returns */
@@ -3917,15 +3916,14 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
-static int io_nop(struct io_kiocb *req, unsigned int issue_flags,
-		  struct io_comp_state *cs)
+static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	__io_req_complete(req, 0, 0, cs);
+	__io_req_complete(req, issue_flags, 0, 0);
 	return 0;
 }
 
@@ -4166,8 +4164,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 	return i;
 }
 
-static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags,
-			     struct io_comp_state *cs)
+static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -4188,11 +4185,11 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags,
 
 	/* need to hold the lock to complete IOPOLL requests */
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		__io_req_complete(req, ret, 0, cs);
+		__io_req_complete(req, issue_flags, ret, 0);
 		io_ring_submit_unlock(ctx, !force_nonblock);
 	} else {
 		io_ring_submit_unlock(ctx, !force_nonblock);
-		__io_req_complete(req, ret, 0, cs);
+		__io_req_complete(req, issue_flags, ret, 0);
 	}
 	return 0;
 }
@@ -4251,8 +4248,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 	return i ? i : -ENOMEM;
 }
 
-static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags,
-			      struct io_comp_state *cs)
+static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -4284,11 +4280,11 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags,
 
 	/* need to hold the lock to complete IOPOLL requests */
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		__io_req_complete(req, ret, 0, cs);
+		__io_req_complete(req, issue_flags, ret, 0);
 		io_ring_submit_unlock(ctx, !force_nonblock);
 	} else {
 		io_ring_submit_unlock(ctx, !force_nonblock);
-		__io_req_complete(req, ret, 0, cs);
+		__io_req_complete(req, issue_flags, ret, 0);
 	}
 	return 0;
 }
@@ -4320,8 +4316,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #endif
 }
 
-static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags,
-			struct io_comp_state *cs)
+static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
 {
 #if defined(CONFIG_EPOLL)
 	struct io_epoll *ie = &req->epoll;
@@ -4334,7 +4329,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags,
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -4466,8 +4461,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_close(struct io_kiocb *req, unsigned int issue_flags,
-		    struct io_comp_state *cs)
+static int io_close(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct files_struct *files = current->files;
 	struct io_close *close = &req->close;
@@ -4516,7 +4510,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags,
 		req_set_fail_links(req);
 	if (file)
 		fput(file);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 
@@ -4612,8 +4606,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ret;
 }
 
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
@@ -4650,12 +4643,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags,
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 
-static int io_send(struct io_kiocb *req, unsigned int issue_flags,
-		   struct io_comp_state *cs)
+static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
 	struct msghdr msg;
@@ -4692,7 +4684,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags,
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 
@@ -4833,8 +4825,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return ret;
 }
 
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
@@ -4886,12 +4877,11 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags,
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, cflags, cs);
+	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
 }
 
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags,
-		   struct io_comp_state *cs)
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_buffer *kbuf;
 	struct io_sr_msg *sr = &req->sr_msg;
@@ -4941,7 +4931,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags,
 		cflags = io_put_recv_kbuf(req);
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, cflags, cs);
+	__io_req_complete(req, issue_flags, ret, cflags);
 	return 0;
 }
 
@@ -4961,8 +4951,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags,
-		     struct io_comp_state *cs)
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_accept *accept = &req->accept;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -4982,7 +4971,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags,
 			ret = -EINTR;
 		req_set_fail_links(req);
 	}
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 
@@ -5006,8 +4995,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 					&io->address);
 }
 
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_async_connect __io, *io;
 	unsigned file_flags;
@@ -5045,7 +5033,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags,
 out:
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 #else /* !CONFIG_NET */
@@ -5054,14 +5042,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_send(struct io_kiocb *req, unsigned int issue_flags,
-		   struct io_comp_state *cs)
+static int io_send(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
@@ -5072,14 +5058,12 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return -EOPNOTSUPP;
 }
 
-static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_recv(struct io_kiocb *req, unsigned int issue_flags,
-		   struct io_comp_state *cs)
+static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
@@ -5089,8 +5073,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_accept(struct io_kiocb *req, unsigned int issue_flags,
-		     struct io_comp_state *cs)
+static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
@@ -5100,8 +5083,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_connect(struct io_kiocb *req, unsigned int issue_flags,
-		      struct io_comp_state *cs)
+static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 {
 	return -EOPNOTSUPP;
 }
@@ -5978,8 +5960,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_files_update(struct io_kiocb *req, unsigned int issue_flags,
-			   struct io_comp_state *cs)
+static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_rsrc_update up;
@@ -5997,7 +5978,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags,
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, 0, cs);
+	__io_req_complete(req, issue_flags, ret, 0);
 	return 0;
 }
 
@@ -6204,25 +6185,24 @@ static void __io_clean_op(struct io_kiocb *req)
 	}
 }
 
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
-			struct io_comp_state *cs)
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
-		ret = io_nop(req, issue_flags, cs);
+		ret = io_nop(req, issue_flags);
 		break;
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
 	case IORING_OP_READ:
-		ret = io_read(req, issue_flags, cs);
+		ret = io_read(req, issue_flags);
 		break;
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
 	case IORING_OP_WRITE:
-		ret = io_write(req, issue_flags, cs);
+		ret = io_write(req, issue_flags);
 		break;
 	case IORING_OP_FSYNC:
 		ret = io_fsync(req, issue_flags);
@@ -6237,16 +6217,16 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_sync_file_range(req, issue_flags);
 		break;
 	case IORING_OP_SENDMSG:
-		ret = io_sendmsg(req, issue_flags, cs);
+		ret = io_sendmsg(req, issue_flags);
 		break;
 	case IORING_OP_SEND:
-		ret = io_send(req, issue_flags, cs);
+		ret = io_send(req, issue_flags);
 		break;
 	case IORING_OP_RECVMSG:
-		ret = io_recvmsg(req, issue_flags, cs);
+		ret = io_recvmsg(req, issue_flags);
 		break;
 	case IORING_OP_RECV:
-		ret = io_recv(req, issue_flags, cs);
+		ret = io_recv(req, issue_flags);
 		break;
 	case IORING_OP_TIMEOUT:
 		ret = io_timeout(req, issue_flags);
@@ -6255,10 +6235,10 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_timeout_remove(req, issue_flags);
 		break;
 	case IORING_OP_ACCEPT:
-		ret = io_accept(req, issue_flags, cs);
+		ret = io_accept(req, issue_flags);
 		break;
 	case IORING_OP_CONNECT:
-		ret = io_connect(req, issue_flags, cs);
+		ret = io_connect(req, issue_flags);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
 		ret = io_async_cancel(req, issue_flags);
@@ -6270,10 +6250,10 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_openat(req, issue_flags);
 		break;
 	case IORING_OP_CLOSE:
-		ret = io_close(req, issue_flags, cs);
+		ret = io_close(req, issue_flags);
 		break;
 	case IORING_OP_FILES_UPDATE:
-		ret = io_files_update(req, issue_flags, cs);
+		ret = io_files_update(req, issue_flags);
 		break;
 	case IORING_OP_STATX:
 		ret = io_statx(req, issue_flags);
@@ -6288,16 +6268,16 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
 		ret = io_openat2(req, issue_flags);
 		break;
 	case IORING_OP_EPOLL_CTL:
-		ret = io_epoll_ctl(req, issue_flags, cs);
+		ret = io_epoll_ctl(req, issue_flags);
 		break;
 	case IORING_OP_SPLICE:
 		ret = io_splice(req, issue_flags);
 		break;
 	case IORING_OP_PROVIDE_BUFFERS:
-		ret = io_provide_buffers(req, issue_flags, cs);
+		ret = io_provide_buffers(req, issue_flags);
 		break;
 	case IORING_OP_REMOVE_BUFFERS:
-		ret = io_remove_buffers(req, issue_flags, cs);
+		ret = io_remove_buffers(req, issue_flags);
 		break;
 	case IORING_OP_TEE:
 		ret = io_tee(req, issue_flags);
@@ -6351,7 +6331,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
 
 	if (!ret) {
 		do {
-			ret = io_issue_sqe(req, 0, NULL);
+			ret = io_issue_sqe(req, 0);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -6498,8 +6478,10 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 {
 	struct io_kiocb *linked_timeout;
 	const struct cred *old_creds = NULL;
-	int ret;
+	int ret, issue_flags = IO_URING_F_NONBLOCK;
 
+	if (cs)
+		issue_flags |= IO_URING_F_COMPLETE_DEFER;
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
@@ -6514,7 +6496,7 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 			old_creds = override_creds(req->work.identity->creds);
 	}
 
-	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK, cs);
+	ret = io_issue_sqe(req, issue_flags);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file

From 258b29a93bfe74a57c01e1b10b698d5b62e173fe Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:10 +0000
Subject: [PATCH 110/183] io_uring: don't keep submit_state on stack

struct io_submit_state is quite big (168 bytes) and going to grow. It's
better to not keep it on stack as it is now. Move it to context, it's
always protected by uring_lock, so it's fine to have only one instance
of it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 90 ++++++++++++++++++++++++++-------------------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 273ebbac0654..743090364890 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -264,6 +264,39 @@ struct io_sq_data {
 	unsigned		sq_thread_idle;
 };
 
+#define IO_IOPOLL_BATCH			8
+
+struct io_comp_state {
+	unsigned int		nr;
+	struct list_head	list;
+	struct io_ring_ctx	*ctx;
+};
+
+struct io_submit_state {
+	struct blk_plug		plug;
+
+	/*
+	 * io_kiocb alloc cache
+	 */
+	void			*reqs[IO_IOPOLL_BATCH];
+	unsigned int		free_reqs;
+
+	bool			plug_started;
+
+	/*
+	 * Batch completion logic
+	 */
+	struct io_comp_state	comp;
+
+	/*
+	 * File reference cache
+	 */
+	struct file		*file;
+	unsigned int		fd;
+	unsigned int		file_refs;
+	unsigned int		ios_left;
+};
+
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref	refs;
@@ -406,6 +439,7 @@ struct io_ring_ctx {
 
 	struct work_struct		exit_work;
 	struct io_restriction		restrictions;
+	struct io_submit_state		submit_state;
 };
 
 /*
@@ -758,39 +792,6 @@ struct io_defer_entry {
 	u32			seq;
 };
 
-#define IO_IOPOLL_BATCH			8
-
-struct io_comp_state {
-	unsigned int		nr;
-	struct list_head	list;
-	struct io_ring_ctx	*ctx;
-};
-
-struct io_submit_state {
-	struct blk_plug		plug;
-
-	/*
-	 * io_kiocb alloc cache
-	 */
-	void			*reqs[IO_IOPOLL_BATCH];
-	unsigned int		free_reqs;
-
-	bool			plug_started;
-
-	/*
-	 * Batch completion logic
-	 */
-	struct io_comp_state	comp;
-
-	/*
-	 * File reference cache
-	 */
-	struct file		*file;
-	unsigned int		fd;
-	unsigned int		file_refs;
-	unsigned int		ios_left;
-};
-
 struct io_op_def {
 	/* needs req->file assigned */
 	unsigned		needs_file : 1;
@@ -1997,9 +1998,10 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
-static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
-				     struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
+	struct io_submit_state *state = &ctx->submit_state;
+
 	if (!state->free_reqs) {
 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 		size_t sz;
@@ -6758,9 +6760,9 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx,
 				IOSQE_BUFFER_SELECT)
 
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-		       const struct io_uring_sqe *sqe,
-		       struct io_submit_state *state)
+		       const struct io_uring_sqe *sqe)
 {
+	struct io_submit_state *state;
 	unsigned int sqe_flags;
 	int id, ret;
 
@@ -6812,6 +6814,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	/* same numerical values with corresponding REQ_F_*, safe to copy */
 	req->flags |= sqe_flags;
+	state = &ctx->submit_state;
 
 	/*
 	 * Plug now if we have more than 1 IO left after this, and the target
@@ -6838,7 +6841,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 {
-	struct io_submit_state state;
 	struct io_submit_link link;
 	int i, submitted = 0;
 
@@ -6857,7 +6859,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	percpu_counter_add(&current->io_uring->inflight, nr);
 	refcount_add(nr, &current->usage);
 
-	io_submit_state_start(&state, ctx, nr);
+	io_submit_state_start(&ctx->submit_state, ctx, nr);
 	link.head = NULL;
 
 	for (i = 0; i < nr; i++) {
@@ -6870,7 +6872,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, &state);
+		req = io_alloc_req(ctx);
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
@@ -6880,7 +6882,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 
-		err = io_init_req(ctx, req, sqe, &state);
+		err = io_init_req(ctx, req, sqe);
 		if (unlikely(err)) {
 fail_req:
 			io_put_req(req);
@@ -6890,7 +6892,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 					true, ctx->flags & IORING_SETUP_SQPOLL);
-		err = io_submit_sqe(req, sqe, &link, &state.comp);
+		err = io_submit_sqe(req, sqe, &link, &ctx->submit_state.comp);
 		if (err)
 			goto fail_req;
 	}
@@ -6905,8 +6907,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		put_task_struct_many(current, unused);
 	}
 	if (link.head)
-		io_queue_link_head(link.head, &state.comp);
-	io_submit_state_end(&state);
+		io_queue_link_head(link.head, &ctx->submit_state.comp);
+	io_submit_state_end(&ctx->submit_state);
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 	io_commit_sqring(ctx);

From ba88ff112bdfde8103a8143f867bcdc46bc0e50f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:11 +0000
Subject: [PATCH 111/183] io_uring: remove ctx from comp_state

Completion state is closely bound to ctx, so we don't need to store ctx
inside it, as we always have it around to pass to flush.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 743090364890..6e800f9df292 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -269,7 +269,6 @@ struct io_sq_data {
 struct io_comp_state {
 	unsigned int		nr;
 	struct list_head	list;
-	struct io_ring_ctx	*ctx;
 };
 
 struct io_submit_state {
@@ -1924,10 +1923,9 @@ static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
 	io_put_req(req);
 }
 
-static void io_submit_flush_completions(struct io_comp_state *cs)
+static void io_submit_flush_completions(struct io_comp_state *cs,
+					struct io_ring_ctx *ctx)
 {
-	struct io_ring_ctx *ctx = cs->ctx;
-
 	spin_lock_irq(&ctx->completion_lock);
 	while (!list_empty(&cs->list)) {
 		struct io_kiocb *req;
@@ -6520,7 +6518,7 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 		if (req->flags & REQ_F_COMPLETE_INLINE) {
 			list_add_tail(&req->compl.list, &cs->list);
 			if (++cs->nr >= 32)
-				io_submit_flush_completions(cs);
+				io_submit_flush_completions(cs, req->ctx);
 			req = NULL;
 		} else {
 			req = io_put_req_find_next(req);
@@ -6655,10 +6653,11 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
-static void io_submit_state_end(struct io_submit_state *state)
+static void io_submit_state_end(struct io_submit_state *state,
+				struct io_ring_ctx *ctx)
 {
 	if (!list_empty(&state->comp.list))
-		io_submit_flush_completions(&state->comp);
+		io_submit_flush_completions(&state->comp, ctx);
 	if (state->plug_started)
 		blk_finish_plug(&state->plug);
 	io_state_file_put(state);
@@ -6670,12 +6669,11 @@ static void io_submit_state_end(struct io_submit_state *state)
  * Start submission side cache.
  */
 static void io_submit_state_start(struct io_submit_state *state,
-				  struct io_ring_ctx *ctx, unsigned int max_ios)
+				  unsigned int max_ios)
 {
 	state->plug_started = false;
 	state->comp.nr = 0;
 	INIT_LIST_HEAD(&state->comp.list);
-	state->comp.ctx = ctx;
 	state->free_reqs = 0;
 	state->file_refs = 0;
 	state->ios_left = max_ios;
@@ -6859,7 +6857,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	percpu_counter_add(&current->io_uring->inflight, nr);
 	refcount_add(nr, &current->usage);
 
-	io_submit_state_start(&ctx->submit_state, ctx, nr);
+	io_submit_state_start(&ctx->submit_state, nr);
 	link.head = NULL;
 
 	for (i = 0; i < nr; i++) {
@@ -6908,7 +6906,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	}
 	if (link.head)
 		io_queue_link_head(link.head, &ctx->submit_state.comp);
-	io_submit_state_end(&ctx->submit_state);
+	io_submit_state_end(&ctx->submit_state, ctx);
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 	io_commit_sqring(ctx);

From 5087275dba02943179720bd95d1d6c7047007550 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:12 +0000
Subject: [PATCH 112/183] io_uring: don't reinit submit state every time

Now that submit_state is retained across syscalls, we can save ourselves
from initialising it from the ground up for each io_submit_sqes(). Set
some fields during ctx allocation, and just keep them always consistent.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: remove unnecessary zeroing of ctx members]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e800f9df292..6bce9094280c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1345,6 +1345,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
+	INIT_LIST_HEAD(&submit_state->comp.list);
 	return ctx;
 err:
 	if (ctx->fallback_req)
@@ -6661,8 +6662,10 @@ static void io_submit_state_end(struct io_submit_state *state,
 	if (state->plug_started)
 		blk_finish_plug(&state->plug);
 	io_state_file_put(state);
-	if (state->free_reqs)
+	if (state->free_reqs) {
 		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
+		state->free_reqs = 0;
+	}
 }
 
 /*
@@ -6672,10 +6675,6 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	state->plug_started = false;
-	state->comp.nr = 0;
-	INIT_LIST_HEAD(&state->comp.list);
-	state->free_reqs = 0;
-	state->file_refs = 0;
 	state->ios_left = max_ios;
 }
 

From 6dd0be1e2481b32c39870e187840ade6c2a11a72 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:13 +0000
Subject: [PATCH 113/183] io_uring: replace list with array for compl batch

Reincarnation of an old patch that replaces the list in struct
io_comp_state with an array. It's needed to avoid hooking requests via
their compl.list, because it won't always be available in the future.

It's also nice to split io_submit_flush_completions() into two passes to
avoid freeing requests under the lock, and to remove the unlock/lock
dance along with the long comment describing when it can be done.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6bce9094280c..aef640616edb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -265,10 +265,11 @@ struct io_sq_data {
 };
 
 #define IO_IOPOLL_BATCH			8
+#define IO_COMPL_BATCH			32
 
 struct io_comp_state {
 	unsigned int		nr;
-	struct list_head	list;
+	struct io_kiocb		*reqs[IO_COMPL_BATCH];
 };
 
 struct io_submit_state {
@@ -1345,7 +1346,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
-	INIT_LIST_HEAD(&submit_state->comp.list);
 	return ctx;
 err:
 	if (ctx->fallback_req)
@@ -1927,33 +1927,20 @@ static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
 static void io_submit_flush_completions(struct io_comp_state *cs,
 					struct io_ring_ctx *ctx)
 {
+	int i, nr = cs->nr;
+
 	spin_lock_irq(&ctx->completion_lock);
-	while (!list_empty(&cs->list)) {
-		struct io_kiocb *req;
+	for (i = 0; i < nr; i++) {
+		struct io_kiocb *req = cs->reqs[i];
 
-		req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
-		list_del(&req->compl.list);
 		__io_cqring_fill_event(req, req->result, req->compl.cflags);
-
-		/*
-		 * io_free_req() doesn't care about completion_lock unless one
-		 * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
-		 * because of a potential deadlock with req->work.fs->lock
-		 * We defer both, completion and submission refs.
-		 */
-		if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
-				 |REQ_F_WORK_INITIALIZED)) {
-			spin_unlock_irq(&ctx->completion_lock);
-			io_double_put_req(req);
-			spin_lock_irq(&ctx->completion_lock);
-		} else {
-			io_double_put_req(req);
-		}
 	}
 	io_commit_cqring(ctx);
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
+	for (i = 0; i < nr; i++)
+		io_double_put_req(cs->reqs[i]);
 	cs->nr = 0;
 }
 
@@ -6517,8 +6504,8 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 	} else if (likely(!ret)) {
 		/* drop submission reference */
 		if (req->flags & REQ_F_COMPLETE_INLINE) {
-			list_add_tail(&req->compl.list, &cs->list);
-			if (++cs->nr >= 32)
+			cs->reqs[cs->nr++] = req;
+			if (cs->nr == IO_COMPL_BATCH)
 				io_submit_flush_completions(cs, req->ctx);
 			req = NULL;
 		} else {
@@ -6657,7 +6644,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 static void io_submit_state_end(struct io_submit_state *state,
 				struct io_ring_ctx *ctx)
 {
-	if (!list_empty(&state->comp.list))
+	if (state->comp.nr)
 		io_submit_flush_completions(&state->comp, ctx);
 	if (state->plug_started)
 		blk_finish_plug(&state->plug);

From 905c172f32c56f0740630b639ca5c10ba3689da0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:14 +0000
Subject: [PATCH 114/183] io_uring: submit-completion free batching

io_submit_flush_completions() does completion batching, but may also use
free batching as iopoll does. The main beneficiaries should be buffered
reads/writes and send/recv.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 49 +++++++++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index aef640616edb..885dd16893c0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1924,26 +1924,6 @@ static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
 	io_put_req(req);
 }
 
-static void io_submit_flush_completions(struct io_comp_state *cs,
-					struct io_ring_ctx *ctx)
-{
-	int i, nr = cs->nr;
-
-	spin_lock_irq(&ctx->completion_lock);
-	for (i = 0; i < nr; i++) {
-		struct io_kiocb *req = cs->reqs[i];
-
-		__io_cqring_fill_event(req, req->result, req->compl.cflags);
-	}
-	io_commit_cqring(ctx);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	io_cqring_ev_posted(ctx);
-	for (i = 0; i < nr; i++)
-		io_double_put_req(cs->reqs[i]);
-	cs->nr = 0;
-}
-
 static void io_req_complete_state(struct io_kiocb *req, long res,
 				  unsigned int cflags)
 {
@@ -2329,6 +2309,35 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 		__io_req_free_batch_flush(req->ctx, rb);
 }
 
+static void io_submit_flush_completions(struct io_comp_state *cs,
+					struct io_ring_ctx *ctx)
+{
+	int i, nr = cs->nr;
+	struct io_kiocb *req;
+	struct req_batch rb;
+
+	io_init_req_batch(&rb);
+	spin_lock_irq(&ctx->completion_lock);
+	for (i = 0; i < nr; i++) {
+		req = cs->reqs[i];
+		__io_cqring_fill_event(req, req->result, req->compl.cflags);
+	}
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	for (i = 0; i < nr; i++) {
+		req = cs->reqs[i];
+
+		/* submission and completion refs */
+		if (refcount_sub_and_test(2, &req->refs))
+			io_req_free_batch(&rb, req);
+	}
+
+	io_req_free_batch_finish(ctx, &rb);
+	cs->nr = 0;
+}
+
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.

From 3893f39f2245eec04b8052cd441c2cb8a9ea3447 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:15 +0000
Subject: [PATCH 115/183] io_uring: remove fallback_req

Remove fallback_req for now, it gets in the way of other changes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 885dd16893c0..a0b5f2c6d8ea 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -386,9 +386,6 @@ struct io_ring_ctx {
 	struct completion	ref_comp;
 	struct completion	sq_thread_comp;
 
-	/* if all else fails... */
-	struct io_kiocb		*fallback_req;
-
 #if defined(CONFIG_UNIX)
 	struct socket		*ring_sock;
 #endif
@@ -1302,10 +1299,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	if (!ctx)
 		return NULL;
 
-	ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
-	if (!ctx->fallback_req)
-		goto err;
-
 	/*
 	 * Use 5 bits less than the max cq entries, that should give us around
 	 * 32 entries per hash list if totally full and uniformly spread.
@@ -1348,8 +1341,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_llist_head(&ctx->rsrc_put_llist);
 	return ctx;
 err:
-	if (ctx->fallback_req)
-		kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx->cancel_hash);
 	kfree(ctx);
 	return NULL;
@@ -1947,23 +1938,6 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
 	__io_req_complete(req, 0, res, 0);
 }
 
-static inline bool io_is_fallback_req(struct io_kiocb *req)
-{
-	return req == (struct io_kiocb *)
-			((unsigned long) req->ctx->fallback_req & ~1UL);
-}
-
-static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
-{
-	struct io_kiocb *req;
-
-	req = ctx->fallback_req;
-	if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
-		return req;
-
-	return NULL;
-}
-
 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
@@ -1983,7 +1957,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 		if (unlikely(ret <= 0)) {
 			state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
 			if (!state->reqs[0])
-				return io_get_fallback_req(ctx);
+				return NULL;
 			ret = 1;
 		}
 		state->free_reqs = ret;
@@ -2030,10 +2004,7 @@ static void __io_free_req(struct io_kiocb *req)
 	io_dismantle_req(req);
 	io_put_task(req->task, 1);
 
-	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
-	else
-		clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+	kmem_cache_free(req_cachep, req);
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -2289,10 +2260,6 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 {
-	if (unlikely(io_is_fallback_req(req))) {
-		io_free_req(req);
-		return;
-	}
 	io_queue_next(req);
 
 	if (req->task != rb->task) {
@@ -8695,7 +8662,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
-	kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx);
 }
 

From 9ae7246321d2b735867f6767e0fab96dd248c555 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:16 +0000
Subject: [PATCH 116/183] io_uring: count ctx refs separately from reqs

Currently batch free handles request memory freeing and ctx ref putting
together. Separate them and use different counters, which will be needed
for reusing request memory.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a0b5f2c6d8ea..9b84d6314c11 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2227,6 +2227,7 @@ static void io_free_req(struct io_kiocb *req)
 struct req_batch {
 	void *reqs[IO_IOPOLL_BATCH];
 	int to_free;
+	int ctx_refs;
 
 	struct task_struct	*task;
 	int			task_refs;
@@ -2236,6 +2237,7 @@ static inline void io_init_req_batch(struct req_batch *rb)
 {
 	rb->to_free = 0;
 	rb->task_refs = 0;
+	rb->ctx_refs = 0;
 	rb->task = NULL;
 }
 
@@ -2243,7 +2245,6 @@ static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
 				      struct req_batch *rb)
 {
 	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
-	percpu_ref_put_many(&ctx->refs, rb->to_free);
 	rb->to_free = 0;
 }
 
@@ -2256,6 +2257,8 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 		io_put_task(rb->task, rb->task_refs);
 		rb->task = NULL;
 	}
+	if (rb->ctx_refs)
+		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
 }
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
@@ -2269,6 +2272,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 		rb->task_refs = 0;
 	}
 	rb->task_refs++;
+	rb->ctx_refs++;
 
 	io_dismantle_req(req);
 	rb->reqs[rb->to_free++] = req;

From bf019da7fcbe7e42372582cc339fd1fb8e1e4fa5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:17 +0000
Subject: [PATCH 117/183] io_uring: persistent req cache

Don't free batch-allocated requests across syscalls.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9b84d6314c11..1f0b3b332d32 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -266,6 +266,8 @@ struct io_sq_data {
 
 #define IO_IOPOLL_BATCH			8
 #define IO_COMPL_BATCH			32
+#define IO_REQ_CACHE_SIZE		8
+#define IO_REQ_ALLOC_BATCH		8
 
 struct io_comp_state {
 	unsigned int		nr;
@@ -278,7 +280,7 @@ struct io_submit_state {
 	/*
 	 * io_kiocb alloc cache
 	 */
-	void			*reqs[IO_IOPOLL_BATCH];
+	void			*reqs[IO_REQ_CACHE_SIZE];
 	unsigned int		free_reqs;
 
 	bool			plug_started;
@@ -1942,13 +1944,14 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
 
+	BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
+
 	if (!state->free_reqs) {
 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
-		size_t sz;
 		int ret;
 
-		sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
-		ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
+		ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
+					    state->reqs);
 
 		/*
 		 * Bulk alloc is all-or-nothing. If we fail to get a batch,
@@ -6629,10 +6632,6 @@ static void io_submit_state_end(struct io_submit_state *state,
 	if (state->plug_started)
 		blk_finish_plug(&state->plug);
 	io_state_file_put(state);
-	if (state->free_reqs) {
-		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
-		state->free_reqs = 0;
-	}
 }
 
 /*
@@ -8632,6 +8631,8 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
+	struct io_submit_state *submit_state = &ctx->submit_state;
+
 	io_finish_async(ctx);
 	io_sqe_buffers_unregister(ctx);
 
@@ -8642,6 +8643,10 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 
+	if (submit_state->free_reqs)
+		kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
+				     submit_state->reqs);
+
 #ifdef CONFIG_BLK_CGROUP
 	if (ctx->sqo_blkcg_css)
 		css_put(ctx->sqo_blkcg_css);

From 6ff119a6e4c3fe900e75e6667930dc086f185f2b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:18 +0000
Subject: [PATCH 118/183] io_uring: feed reqs back into alloc cache

Make io_req_free_batch(), which is used for inline executed requests and
IOPOLL, return requests back into the allocation cache, so that we avoid
most of kmalloc()/kfree() for those cases.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1f0b3b332d32..fe07af756186 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -266,7 +266,7 @@ struct io_sq_data {
 
 #define IO_IOPOLL_BATCH			8
 #define IO_COMPL_BATCH			32
-#define IO_REQ_CACHE_SIZE		8
+#define IO_REQ_CACHE_SIZE		32
 #define IO_REQ_ALLOC_BATCH		8
 
 struct io_comp_state {
@@ -2264,7 +2264,8 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
 }
 
-static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
+			      struct io_submit_state *state)
 {
 	io_queue_next(req);
 
@@ -2278,9 +2279,13 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 	rb->ctx_refs++;
 
 	io_dismantle_req(req);
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		__io_req_free_batch_flush(req->ctx, rb);
+	if (state->free_reqs != ARRAY_SIZE(state->reqs)) {
+		state->reqs[state->free_reqs++] = req;
+	} else {
+		rb->reqs[rb->to_free++] = req;
+		if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+			__io_req_free_batch_flush(req->ctx, rb);
+	}
 }
 
 static void io_submit_flush_completions(struct io_comp_state *cs,
@@ -2305,7 +2310,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs,
 
 		/* submission and completion refs */
 		if (refcount_sub_and_test(2, &req->refs))
-			io_req_free_batch(&rb, req);
+			io_req_free_batch(&rb, req, &ctx->submit_state);
 	}
 
 	io_req_free_batch_finish(ctx, &rb);
@@ -2458,7 +2463,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs))
-			io_req_free_batch(&rb, req);
+			io_req_free_batch(&rb, req, &ctx->submit_state);
 	}
 
 	io_commit_cqring(ctx);

From 1b4c351f6eb7467c77fc19e0cd7e5f0083ecd847 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Feb 2021 00:03:19 +0000
Subject: [PATCH 119/183] io_uring: use persistent request cache

Now that we have the submit_state in the ring itself, we can have io_kiocb
allocations that are persistent across invocations. This reduces the time
spent doing slab allocations and frees.

[sil: rebased]
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fe07af756186..87a4b727fe1c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -270,8 +270,9 @@ struct io_sq_data {
 #define IO_REQ_ALLOC_BATCH		8
 
 struct io_comp_state {
-	unsigned int		nr;
 	struct io_kiocb		*reqs[IO_COMPL_BATCH];
+	unsigned int		nr;
+	struct list_head	free_list;
 };
 
 struct io_submit_state {
@@ -1341,6 +1342,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
+	INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
 	return ctx;
 err:
 	kfree(ctx->cancel_hash);
@@ -1946,6 +1948,15 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 
 	BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
 
+	if (!list_empty(&state->comp.free_list)) {
+		struct io_kiocb *req;
+
+		req = list_first_entry(&state->comp.free_list, struct io_kiocb,
+					compl.list);
+		list_del(&req->compl.list);
+		return req;
+	}
+
 	if (!state->free_reqs) {
 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 		int ret;
@@ -2228,34 +2239,21 @@ static void io_free_req(struct io_kiocb *req)
 }
 
 struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-	int ctx_refs;
-
 	struct task_struct	*task;
 	int			task_refs;
+	int			ctx_refs;
 };
 
 static inline void io_init_req_batch(struct req_batch *rb)
 {
-	rb->to_free = 0;
 	rb->task_refs = 0;
 	rb->ctx_refs = 0;
 	rb->task = NULL;
 }
 
-static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
-				      struct req_batch *rb)
-{
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
-	rb->to_free = 0;
-}
-
 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 				     struct req_batch *rb)
 {
-	if (rb->to_free)
-		__io_req_free_batch_flush(ctx, rb);
 	if (rb->task) {
 		io_put_task(rb->task, rb->task_refs);
 		rb->task = NULL;
@@ -2282,9 +2280,9 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 	if (state->free_reqs != ARRAY_SIZE(state->reqs)) {
 		state->reqs[state->free_reqs++] = req;
 	} else {
-		rb->reqs[rb->to_free++] = req;
-		if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-			__io_req_free_batch_flush(req->ctx, rb);
+		struct io_comp_state *cs = &req->ctx->submit_state.comp;
+
+		list_add(&req->compl.list, &cs->free_list);
 	}
 }
 
@@ -8634,6 +8632,19 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 	idr_destroy(&ctx->io_buffer_idr);
 }
 
+static void io_req_cache_free(struct io_ring_ctx *ctx)
+{
+	struct io_comp_state *cs = &ctx->submit_state.comp;
+
+	while (!list_empty(&cs->free_list)) {
+		struct io_kiocb *req;
+
+		req = list_first_entry(&cs->free_list, struct io_kiocb, compl.list);
+		list_del(&req->compl.list);
+		kmem_cache_free(req_cachep, req);
+	}
+}
+
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *submit_state = &ctx->submit_state;
@@ -8676,6 +8687,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
+	io_req_cache_free(ctx);
 	kfree(ctx);
 }
 

From 7cbf1722d5fc5779946ee8f338e9e38b5de15856 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Feb 2021 00:03:20 +0000
Subject: [PATCH 120/183] io_uring: provide FIFO ordering for task_work

task_work is a LIFO list, due to how it's implemented as a lockless
list. For long chains of task_work, this can be problematic as the
first entry added is the last one processed. Similarly, we'd waste
a lot of CPU cycles reversing this list.

Wrap the task_work so we have a single task_work entry per task per
ctx, and use that to run it in the right order.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h               |   9 ----
 fs/io_uring.c            | 101 ++++++++++++++++++++++++++++++++++++---
 include/linux/io_uring.h |  14 ++++++
 3 files changed, 108 insertions(+), 16 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index e37a0f217cc8..096f1021018e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -27,15 +27,6 @@ enum io_wq_cancel {
 	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
 };
 
-struct io_wq_work_node {
-	struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-	struct io_wq_work_node *first;
-	struct io_wq_work_node *last;
-};
-
 static inline void wq_list_add_after(struct io_wq_work_node *node,
 				     struct io_wq_work_node *pos,
 				     struct io_wq_work_list *list)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 87a4b727fe1c..bfc8fcd93504 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -721,6 +721,11 @@ struct async_poll {
 	struct io_poll_iocb	*double_poll;
 };
 
+struct io_task_work {
+	struct io_wq_work_node	node;
+	task_work_func_t	func;
+};
+
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -779,7 +784,10 @@ struct io_kiocb {
 	 * 2. to track reqs with ->files (see io_op_def::file_table)
 	 */
 	struct list_head		inflight_entry;
-	struct callback_head		task_work;
+	union {
+		struct io_task_work	io_task_work;
+		struct callback_head	task_work;
+	};
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	struct hlist_node		hash_node;
 	struct async_poll		*apoll;
@@ -2129,6 +2137,81 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return __io_req_find_next(req);
 }
 
+static bool __tctx_task_work(struct io_uring_task *tctx)
+{
+	struct io_wq_work_list list;
+	struct io_wq_work_node *node;
+
+	if (wq_list_empty(&tctx->task_list))
+		return false;
+
+	spin_lock(&tctx->task_lock);
+	list = tctx->task_list;
+	INIT_WQ_LIST(&tctx->task_list);
+	spin_unlock(&tctx->task_lock);
+
+	node = list.first;
+	while (node) {
+		struct io_wq_work_node *next = node->next;
+		struct io_kiocb *req;
+
+		req = container_of(node, struct io_kiocb, io_task_work.node);
+		req->task_work.func(&req->task_work);
+		node = next;
+	}
+
+	return list.first != NULL;
+}
+
+static void tctx_task_work(struct callback_head *cb)
+{
+	struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work);
+
+	while (__tctx_task_work(tctx))
+		cond_resched();
+
+	clear_bit(0, &tctx->task_state);
+}
+
+static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
+			    enum task_work_notify_mode notify)
+{
+	struct io_uring_task *tctx = tsk->io_uring;
+	struct io_wq_work_node *node, *prev;
+	int ret;
+
+	WARN_ON_ONCE(!tctx);
+
+	spin_lock(&tctx->task_lock);
+	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
+	spin_unlock(&tctx->task_lock);
+
+	/* task_work already pending, we're done */
+	if (test_bit(0, &tctx->task_state) ||
+	    test_and_set_bit(0, &tctx->task_state))
+		return 0;
+
+	if (!task_work_add(tsk, &tctx->task_work, notify))
+		return 0;
+
+	/*
+	 * Slow path - we failed, find and delete work. if the work is not
+	 * in the list, it got run and we're fine.
+	 */
+	ret = 0;
+	spin_lock(&tctx->task_lock);
+	wq_list_for_each(node, prev, &tctx->task_list) {
+		if (&req->io_task_work.node == node) {
+			wq_list_del(&tctx->task_list, node, prev);
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&tctx->task_lock);
+	clear_bit(0, &tctx->task_state);
+	return ret;
+}
+
 static int io_req_task_work_add(struct io_kiocb *req)
 {
 	struct task_struct *tsk = req->task;
@@ -2149,7 +2232,7 @@ static int io_req_task_work_add(struct io_kiocb *req)
 	if (!(ctx->flags & IORING_SETUP_SQPOLL))
 		notify = TWA_SIGNAL;
 
-	ret = task_work_add(tsk, &req->task_work, notify);
+	ret = io_task_work_add(tsk, req, notify);
 	if (!ret)
 		wake_up_process(tsk);
 
@@ -2157,7 +2240,7 @@ static int io_req_task_work_add(struct io_kiocb *req)
 }
 
 static void io_req_task_work_add_fallback(struct io_kiocb *req,
-					  void (*cb)(struct callback_head *))
+					  task_work_func_t cb)
 {
 	struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq);
 
@@ -2216,7 +2299,7 @@ static void io_req_task_queue(struct io_kiocb *req)
 {
 	int ret;
 
-	init_task_work(&req->task_work, io_req_task_submit);
+	req->task_work.func = io_req_task_submit;
 	percpu_ref_get(&req->ctx->refs);
 
 	ret = io_req_task_work_add(req);
@@ -2347,7 +2430,7 @@ static void io_free_req_deferred(struct io_kiocb *req)
 {
 	int ret;
 
-	init_task_work(&req->task_work, io_put_req_deferred_cb);
+	req->task_work.func = io_put_req_deferred_cb;
 	ret = io_req_task_work_add(req);
 	if (unlikely(ret))
 		io_req_task_work_add_fallback(req, io_put_req_deferred_cb);
@@ -3392,7 +3475,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 	list_del_init(&wait->entry);
 
-	init_task_work(&req->task_work, io_req_task_submit);
+	req->task_work.func = io_req_task_submit;
 	percpu_ref_get(&req->ctx->refs);
 
 	/* submit ref gets dropped, acquire a new one */
@@ -5083,7 +5166,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	list_del_init(&poll->wait.entry);
 
 	req->result = mask;
-	init_task_work(&req->task_work, func);
+	req->task_work.func = func;
 	percpu_ref_get(&req->ctx->refs);
 
 	/*
@@ -8086,6 +8169,10 @@ static int io_uring_alloc_task_context(struct task_struct *task)
 	io_init_identity(&tctx->__identity);
 	tctx->identity = &tctx->__identity;
 	task->io_uring = tctx;
+	spin_lock_init(&tctx->task_lock);
+	INIT_WQ_LIST(&tctx->task_list);
+	tctx->task_state = 0;
+	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
 
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 35b2d845704d..2eb6d19de336 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -22,6 +22,15 @@ struct io_identity {
 	refcount_t			count;
 };
 
+struct io_wq_work_node {
+	struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+	struct io_wq_work_node *first;
+	struct io_wq_work_node *last;
+};
+
 struct io_uring_task {
 	/* submission side */
 	struct xarray		xa;
@@ -32,6 +41,11 @@ struct io_uring_task {
 	struct io_identity	*identity;
 	atomic_t		in_idle;
 	bool			sqpoll;
+
+	spinlock_t		task_lock;
+	struct io_wq_work_list	task_list;
+	unsigned long		task_state;
+	struct callback_head	task_work;
 };
 
 #if defined(CONFIG_IO_URING)

From 65453d1efbd20f3825beba2a9c93ffb2ec729ece Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 10 Feb 2021 00:03:21 +0000
Subject: [PATCH 121/183] io_uring: enable req cache for task_work items

task_work is run without utilizing the req alloc cache, so any deferred
items don't get to take advantage of either the alloc or free side of it.
With task_work now being wrapped by io_uring, we can use the ctx
completion state to both use the req cache and the completion flush
batching.

With this, the only request type that cannot take advantage of the req
cache is IRQ driven IO for regular files / block devices. Anything else,
including IOPOLL polled IO to those same types, will take advantage of it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bfc8fcd93504..fe8921a728b0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1051,6 +1051,8 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 			     const struct iovec *fast_iov,
 			     struct iov_iter *iter, bool force);
 static void io_req_task_queue(struct io_kiocb *req);
+static void io_submit_flush_completions(struct io_comp_state *cs,
+					struct io_ring_ctx *ctx);
 
 static struct kmem_cache *req_cachep;
 
@@ -2139,6 +2141,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 
 static bool __tctx_task_work(struct io_uring_task *tctx)
 {
+	struct io_ring_ctx *ctx = NULL;
 	struct io_wq_work_list list;
 	struct io_wq_work_node *node;
 
@@ -2153,11 +2156,28 @@ static bool __tctx_task_work(struct io_uring_task *tctx)
 	node = list.first;
 	while (node) {
 		struct io_wq_work_node *next = node->next;
+		struct io_ring_ctx *this_ctx;
 		struct io_kiocb *req;
 
 		req = container_of(node, struct io_kiocb, io_task_work.node);
+		this_ctx = req->ctx;
 		req->task_work.func(&req->task_work);
 		node = next;
+
+		if (!ctx) {
+			ctx = this_ctx;
+		} else if (ctx != this_ctx) {
+			mutex_lock(&ctx->uring_lock);
+			io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+			mutex_unlock(&ctx->uring_lock);
+			ctx = this_ctx;
+		}
+	}
+
+	if (ctx && ctx->submit_state.comp.nr) {
+		mutex_lock(&ctx->uring_lock);
+		io_submit_flush_completions(&ctx->submit_state.comp, ctx);
+		mutex_unlock(&ctx->uring_lock);
 	}
 
 	return list.first != NULL;
@@ -2280,7 +2300,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
 	if (!ctx->sqo_dead &&
 	    !__io_sq_thread_acquire_mm(ctx) &&
 	    !__io_sq_thread_acquire_files(ctx))
-		__io_queue_sqe(req, NULL);
+		__io_queue_sqe(req, &ctx->submit_state.comp);
 	else
 		__io_req_task_cancel(req, -EFAULT);
 	mutex_unlock(&ctx->uring_lock);

From c5eef2b9449ba267f53bfa7cf63d2bc93acbee32 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:22 +0000
Subject: [PATCH 122/183] io_uring: take comp_state from ctx

__io_queue_sqe() is always called with a non-NULL comp_state, which is
taken directly from context. Don't pass it around but infer from ctx.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fe8921a728b0..bff5bc4a2b6e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1042,7 +1042,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static void __io_clean_op(struct io_kiocb *req);
 static struct file *io_file_get(struct io_submit_state *state,
 				struct io_kiocb *req, int fd, bool fixed);
-static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
+static void __io_queue_sqe(struct io_kiocb *req);
 static void io_rsrc_put_work(struct work_struct *work);
 
 static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
@@ -2300,7 +2300,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
 	if (!ctx->sqo_dead &&
 	    !__io_sq_thread_acquire_mm(ctx) &&
 	    !__io_sq_thread_acquire_files(ctx))
-		__io_queue_sqe(req, &ctx->submit_state.comp);
+		__io_queue_sqe(req);
 	else
 		__io_req_task_cancel(req, -EFAULT);
 	mutex_unlock(&ctx->uring_lock);
@@ -6551,14 +6551,12 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	return nxt;
 }
 
-static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
+static void __io_queue_sqe(struct io_kiocb *req)
 {
 	struct io_kiocb *linked_timeout;
 	const struct cred *old_creds = NULL;
-	int ret, issue_flags = IO_URING_F_NONBLOCK;
+	int ret;
 
-	if (cs)
-		issue_flags |= IO_URING_F_COMPLETE_DEFER;
 again:
 	linked_timeout = io_prep_linked_timeout(req);
 
@@ -6573,7 +6571,7 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 			old_creds = override_creds(req->work.identity->creds);
 	}
 
-	ret = io_issue_sqe(req, issue_flags);
+	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -6593,9 +6591,12 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 	} else if (likely(!ret)) {
 		/* drop submission reference */
 		if (req->flags & REQ_F_COMPLETE_INLINE) {
+			struct io_ring_ctx *ctx = req->ctx;
+			struct io_comp_state *cs = &ctx->submit_state.comp;
+
 			cs->reqs[cs->nr++] = req;
 			if (cs->nr == IO_COMPL_BATCH)
-				io_submit_flush_completions(cs, req->ctx);
+				io_submit_flush_completions(cs, ctx);
 			req = NULL;
 		} else {
 			req = io_put_req_find_next(req);
@@ -6621,8 +6622,7 @@ static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
 		revert_creds(old_creds);
 }
 
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			 struct io_comp_state *cs)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	int ret;
 
@@ -6647,18 +6647,17 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (unlikely(ret))
 				goto fail_req;
 		}
-		__io_queue_sqe(req, cs);
+		__io_queue_sqe(req);
 	}
 }
 
-static inline void io_queue_link_head(struct io_kiocb *req,
-				      struct io_comp_state *cs)
+static inline void io_queue_link_head(struct io_kiocb *req)
 {
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
 		io_put_req(req);
 		io_req_complete(req, -ECANCELED);
 	} else
-		io_queue_sqe(req, NULL, cs);
+		io_queue_sqe(req, NULL);
 }
 
 struct io_submit_link {
@@ -6667,7 +6666,7 @@ struct io_submit_link {
 };
 
 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			 struct io_submit_link *link, struct io_comp_state *cs)
+			 struct io_submit_link *link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -6705,7 +6704,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 		/* last request of a link, enqueue the link */
 		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
-			io_queue_link_head(head, cs);
+			io_queue_link_head(head);
 			link->head = NULL;
 		}
 	} else {
@@ -6720,7 +6719,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			link->head = req;
 			link->last = req;
 		} else {
-			io_queue_sqe(req, sqe, cs);
+			io_queue_sqe(req, sqe);
 		}
 	}
 
@@ -6961,7 +6960,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 					true, ctx->flags & IORING_SETUP_SQPOLL);
-		err = io_submit_sqe(req, sqe, &link, &ctx->submit_state.comp);
+		err = io_submit_sqe(req, sqe, &link);
 		if (err)
 			goto fail_req;
 	}
@@ -6976,7 +6975,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		put_task_struct_many(current, unused);
 	}
 	if (link.head)
-		io_queue_link_head(link.head, &ctx->submit_state.comp);
+		io_queue_link_head(link.head);
 	io_submit_state_end(&ctx->submit_state, ctx);
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */

From e5d1bc0a91f16959aa279aa3ee9fdc246d4bb382 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 00:03:23 +0000
Subject: [PATCH 123/183] io_uring: defer flushing cached reqs

While there are requests in the allocation cache, use them; only when
those run out go for the stashed memory in comp.free_list. As list
manipulations are generally heavy and not good for caches, flush them
all, or as many as possible, in one go.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: return success/failure from io_flush_cached_reqs()]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bff5bc4a2b6e..4a28032ba35b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1952,25 +1952,35 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
 	__io_req_complete(req, 0, res, 0);
 }
 
+static bool io_flush_cached_reqs(struct io_submit_state *state)
+{
+	struct io_kiocb *req = NULL;
+
+	while (!list_empty(&state->comp.free_list)) {
+		req = list_first_entry(&state->comp.free_list, struct io_kiocb,
+					compl.list);
+		list_del(&req->compl.list);
+		state->reqs[state->free_reqs++] = req;
+		if (state->free_reqs == ARRAY_SIZE(state->reqs))
+			break;
+	}
+
+	return req != NULL;
+}
+
 static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *state = &ctx->submit_state;
 
 	BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs));
 
-	if (!list_empty(&state->comp.free_list)) {
-		struct io_kiocb *req;
-
-		req = list_first_entry(&state->comp.free_list, struct io_kiocb,
-					compl.list);
-		list_del(&req->compl.list);
-		return req;
-	}
-
 	if (!state->free_reqs) {
 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 		int ret;
 
+		if (io_flush_cached_reqs(state))
+			goto got_req;
+
 		ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
 					    state->reqs);
 
@@ -1986,7 +1996,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 		}
 		state->free_reqs = ret;
 	}
-
+got_req:
 	state->free_reqs--;
 	return state->reqs[state->free_reqs];
 }

From ed670c3f90a67d9e16ab6d8893be6f072d79cd4c Mon Sep 17 00:00:00 2001
From: Hao Xu <haoxu@linux.alibaba.com>
Date: Fri, 5 Feb 2021 16:34:21 +0800
Subject: [PATCH 124/183] io_uring: fix possible deadlock in io_uring_poll

Abaci reported the following issue:

[   30.615891] ======================================================
[   30.616648] WARNING: possible circular locking dependency detected
[   30.617423] 5.11.0-rc3-next-20210115 #1 Not tainted
[   30.618035] ------------------------------------------------------
[   30.618914] a.out/1128 is trying to acquire lock:
[   30.619520] ffff88810b063868 (&ep->mtx){+.+.}-{3:3}, at: __ep_eventpoll_poll+0x9f/0x220
[   30.620505]
[   30.620505] but task is already holding lock:
[   30.621218] ffff88810e952be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[   30.622349]
[   30.622349] which lock already depends on the new lock.
[   30.622349]
[   30.623289]
[   30.623289] the existing dependency chain (in reverse order) is:
[   30.624243]
[   30.624243] -> #1 (&ctx->uring_lock){+.+.}-{3:3}:
[   30.625263]        lock_acquire+0x2c7/0x390
[   30.625868]        __mutex_lock+0xae/0x9f0
[   30.626451]        io_cqring_overflow_flush.part.95+0x6d/0x70
[   30.627278]        io_uring_poll+0xcb/0xd0
[   30.627890]        ep_item_poll.isra.14+0x4e/0x90
[   30.628531]        do_epoll_ctl+0xb7e/0x1120
[   30.629122]        __x64_sys_epoll_ctl+0x70/0xb0
[   30.629770]        do_syscall_64+0x2d/0x40
[   30.630332]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   30.631187]
[   30.631187] -> #0 (&ep->mtx){+.+.}-{3:3}:
[   30.631985]        check_prevs_add+0x226/0xb00
[   30.632584]        __lock_acquire+0x1237/0x13a0
[   30.633207]        lock_acquire+0x2c7/0x390
[   30.633740]        __mutex_lock+0xae/0x9f0
[   30.634258]        __ep_eventpoll_poll+0x9f/0x220
[   30.634879]        __io_arm_poll_handler+0xbf/0x220
[   30.635462]        io_issue_sqe+0xa6b/0x13e0
[   30.635982]        __io_queue_sqe+0x10b/0x550
[   30.636648]        io_queue_sqe+0x235/0x470
[   30.637281]        io_submit_sqes+0xcce/0xf10
[   30.637839]        __x64_sys_io_uring_enter+0x3fb/0x5b0
[   30.638465]        do_syscall_64+0x2d/0x40
[   30.638999]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   30.639643]
[   30.639643] other info that might help us debug this:
[   30.639643]
[   30.640618]  Possible unsafe locking scenario:
[   30.640618]
[   30.641402]        CPU0                    CPU1
[   30.641938]        ----                    ----
[   30.642664]   lock(&ctx->uring_lock);
[   30.643425]                                lock(&ep->mtx);
[   30.644498]                                lock(&ctx->uring_lock);
[   30.645668]   lock(&ep->mtx);
[   30.646321]
[   30.646321]  *** DEADLOCK ***
[   30.646321]
[   30.647642] 1 lock held by a.out/1128:
[   30.648424]  #0: ffff88810e952be8 (&ctx->uring_lock){+.+.}-{3:3}, at: __x64_sys_io_uring_enter+0x3f0/0x5b0
[   30.649954]
[   30.649954] stack backtrace:
[   30.650592] CPU: 1 PID: 1128 Comm: a.out Not tainted 5.11.0-rc3-next-20210115 #1
[   30.651554] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[   30.652290] Call Trace:
[   30.652688]  dump_stack+0xac/0xe3
[   30.653164]  check_noncircular+0x11e/0x130
[   30.653747]  ? check_prevs_add+0x226/0xb00
[   30.654303]  check_prevs_add+0x226/0xb00
[   30.654845]  ? add_lock_to_list.constprop.49+0xac/0x1d0
[   30.655564]  __lock_acquire+0x1237/0x13a0
[   30.656262]  lock_acquire+0x2c7/0x390
[   30.656788]  ? __ep_eventpoll_poll+0x9f/0x220
[   30.657379]  ? __io_queue_proc.isra.88+0x180/0x180
[   30.658014]  __mutex_lock+0xae/0x9f0
[   30.658524]  ? __ep_eventpoll_poll+0x9f/0x220
[   30.659112]  ? mark_held_locks+0x5a/0x80
[   30.659648]  ? __ep_eventpoll_poll+0x9f/0x220
[   30.660229]  ? _raw_spin_unlock_irqrestore+0x2d/0x40
[   30.660885]  ? trace_hardirqs_on+0x46/0x110
[   30.661471]  ? __io_queue_proc.isra.88+0x180/0x180
[   30.662102]  ? __ep_eventpoll_poll+0x9f/0x220
[   30.662696]  __ep_eventpoll_poll+0x9f/0x220
[   30.663273]  ? __ep_eventpoll_poll+0x220/0x220
[   30.663875]  __io_arm_poll_handler+0xbf/0x220
[   30.664463]  io_issue_sqe+0xa6b/0x13e0
[   30.664984]  ? __lock_acquire+0x782/0x13a0
[   30.665544]  ? __io_queue_proc.isra.88+0x180/0x180
[   30.666170]  ? __io_queue_sqe+0x10b/0x550
[   30.666725]  __io_queue_sqe+0x10b/0x550
[   30.667252]  ? __fget_files+0x131/0x260
[   30.667791]  ? io_req_prep+0xd8/0x1090
[   30.668316]  ? io_queue_sqe+0x235/0x470
[   30.668868]  io_queue_sqe+0x235/0x470
[   30.669398]  io_submit_sqes+0xcce/0xf10
[   30.669931]  ? xa_load+0xe4/0x1c0
[   30.670425]  __x64_sys_io_uring_enter+0x3fb/0x5b0
[   30.671051]  ? lockdep_hardirqs_on_prepare+0xde/0x180
[   30.671719]  ? syscall_enter_from_user_mode+0x2b/0x80
[   30.672380]  do_syscall_64+0x2d/0x40
[   30.672901]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   30.673503] RIP: 0033:0x7fd89c813239
[   30.673962] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05  3d 01 f0 ff ff 73 01 c3 48 8b 0d 27 ec 2c 00 f7 d8 64 89 01 48
[   30.675920] RSP: 002b:00007ffc65a7c628 EFLAGS: 00000217 ORIG_RAX: 00000000000001aa
[   30.676791] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd89c813239
[   30.677594] RDX: 0000000000000000 RSI: 0000000000000014 RDI: 0000000000000003
[   30.678678] RBP: 00007ffc65a7c720 R08: 0000000000000000 R09: 0000000003000000
[   30.679492] R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000400ff0
[   30.680282] R13: 00007ffc65a7c840 R14: 0000000000000000 R15: 0000000000000000

This might happen if we do epoll_wait on a uring fd while reading/writing
the same epoll fd in an sqe in the same uring instance.
So let's not flush the cqring overflow list; just do a simple check.

Reported-by: Abaci <abaci@linux.alibaba.com>
Fixes: 6c503150ae33 ("io_uring: patch up IOPOLL overflow_flush sync")
Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4a28032ba35b..e73ca37c6a3b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8820,8 +8820,21 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	smp_rmb();
 	if (!io_sqring_full(ctx))
 		mask |= EPOLLOUT | EPOLLWRNORM;
-	io_cqring_overflow_flush(ctx, false, NULL, NULL);
-	if (io_cqring_events(ctx))
+
+	/*
+	 * Don't flush cqring overflow list here, just do a simple check.
+	 * Otherwise there could possible be ABBA deadlock:
+	 *      CPU0                    CPU1
+	 *      ----                    ----
+	 * lock(&ctx->uring_lock);
+	 *                              lock(&ep->mtx);
+	 *                              lock(&ctx->uring_lock);
+	 * lock(&ep->mtx);
+	 *
+	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
+	 * pushs them to do the flush.
+	 */
+	if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;

From c7dae4ba46c9d7d56430b800907b708711995414 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 9 Feb 2021 19:53:37 -0700
Subject: [PATCH 125/183] io_uring: enable req cache for IRQ driven IO

This is the last class of requests that cannot utilize the req alloc
cache. Add a per-ctx req cache that is protected by the completion_lock,
and refill our submit side cache when it gets over our batch count.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 71 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e73ca37c6a3b..2c7ff0b1b086 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -272,7 +272,11 @@ struct io_sq_data {
 struct io_comp_state {
 	struct io_kiocb		*reqs[IO_COMPL_BATCH];
 	unsigned int		nr;
+	unsigned int		locked_free_nr;
+	/* inline/task_work completion list, under ->uring_lock */
 	struct list_head	free_list;
+	/* IRQ completion list, under ->completion_lock */
+	struct list_head	locked_free_list;
 };
 
 struct io_submit_state {
@@ -1033,6 +1037,9 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req, int nr);
 static void io_double_put_req(struct io_kiocb *req);
+static void io_dismantle_req(struct io_kiocb *req);
+static void io_put_task(struct task_struct *task, int nr);
+static void io_queue_next(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void __io_queue_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -1353,6 +1360,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
 	init_llist_head(&ctx->rsrc_put_llist);
 	INIT_LIST_HEAD(&ctx->submit_state.comp.free_list);
+	INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
 	return ctx;
 err:
 	kfree(ctx->cancel_hash);
@@ -1908,8 +1916,8 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	__io_cqring_fill_event(req, res, 0);
 }
 
-static void io_req_complete_post(struct io_kiocb *req, long res,
-				 unsigned int cflags)
+static inline void io_req_complete_post(struct io_kiocb *req, long res,
+					unsigned int cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
@@ -1917,16 +1925,26 @@ static void io_req_complete_post(struct io_kiocb *req, long res,
 	spin_lock_irqsave(&ctx->completion_lock, flags);
 	__io_cqring_fill_event(req, res, cflags);
 	io_commit_cqring(ctx);
+	/*
+	 * If we're the last reference to this request, add to our locked
+	 * free_list cache.
+	 */
+	if (refcount_dec_and_test(&req->refs)) {
+		struct io_comp_state *cs = &ctx->submit_state.comp;
+
+		io_dismantle_req(req);
+		io_put_task(req->task, 1);
+		list_add(&req->compl.list, &cs->locked_free_list);
+		cs->locked_free_nr++;
+	} else
+		req = NULL;
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	io_cqring_ev_posted(ctx);
-}
-
-static inline void io_req_complete_nostate(struct io_kiocb *req, long res,
-					   unsigned int cflags)
-{
-	io_req_complete_post(req, res, cflags);
-	io_put_req(req);
+	if (req) {
+		io_queue_next(req);
+		percpu_ref_put(&ctx->refs);
+	}
 }
 
 static void io_req_complete_state(struct io_kiocb *req, long res,
@@ -1944,7 +1962,7 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
 	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
 		io_req_complete_state(req, res, cflags);
 	else
-		io_req_complete_nostate(req, res, cflags);
+		io_req_complete_post(req, res, cflags);
 }
 
 static inline void io_req_complete(struct io_kiocb *req, long res)
@@ -1952,12 +1970,26 @@ static inline void io_req_complete(struct io_kiocb *req, long res)
 	__io_req_complete(req, 0, res, 0);
 }
 
-static bool io_flush_cached_reqs(struct io_submit_state *state)
+static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
 {
+	struct io_submit_state *state = &ctx->submit_state;
+	struct io_comp_state *cs = &state->comp;
 	struct io_kiocb *req = NULL;
 
-	while (!list_empty(&state->comp.free_list)) {
-		req = list_first_entry(&state->comp.free_list, struct io_kiocb,
+	/*
+	 * If we have more than a batch's worth of requests in our IRQ side
+	 * locked cache, grab the lock and move them over to our submission
+	 * side cache.
+	 */
+	if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) {
+		spin_lock_irq(&ctx->completion_lock);
+		list_splice_init(&cs->locked_free_list, &cs->free_list);
+		cs->locked_free_nr = 0;
+		spin_unlock_irq(&ctx->completion_lock);
+	}
+
+	while (!list_empty(&cs->free_list)) {
+		req = list_first_entry(&cs->free_list, struct io_kiocb,
 					compl.list);
 		list_del(&req->compl.list);
 		state->reqs[state->free_reqs++] = req;
@@ -1978,7 +2010,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
 		gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 		int ret;
 
-		if (io_flush_cached_reqs(state))
+		if (io_flush_cached_reqs(ctx))
 			goto got_req;
 
 		ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
@@ -8748,14 +8780,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 	idr_destroy(&ctx->io_buffer_idr);
 }
 
-static void io_req_cache_free(struct io_ring_ctx *ctx)
+static void io_req_cache_free(struct list_head *list)
 {
-	struct io_comp_state *cs = &ctx->submit_state.comp;
-
-	while (!list_empty(&cs->free_list)) {
+	while (!list_empty(list)) {
 		struct io_kiocb *req;
 
-		req = list_first_entry(&cs->free_list, struct io_kiocb, compl.list);
+		req = list_first_entry(list, struct io_kiocb, compl.list);
 		list_del(&req->compl.list);
 		kmem_cache_free(req_cachep, req);
 	}
@@ -8803,7 +8833,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
-	io_req_cache_free(ctx);
+	io_req_cache_free(&ctx->submit_state.comp.free_list);
+	io_req_cache_free(&ctx->submit_state.comp.locked_free_list);
 	kfree(ctx);
 }
 

From 91f245d5d5de0802428a478802ec051f7de2f5d6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 9 Feb 2021 13:48:50 -0700
Subject: [PATCH 126/183] io_uring: enable kmemcg account for io_uring requests

This puts io_uring under the memory cgroups accounting and limits for
requests.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c7ff0b1b086..bffed6aa5722 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10350,7 +10350,8 @@ static int __init io_uring_init(void)
 
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
-	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+				SLAB_ACCOUNT);
 	return 0;
 };
 __initcall(io_uring_init);

From 26bfa89e25f42d2b26fe951bbcf04bb13937fbba Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 9 Feb 2021 20:14:12 -0700
Subject: [PATCH 127/183] io_uring: place ring SQ/CQ arrays under memcg memory
 limits

Instead of imposing rlimit memlock limits for the rings themselves,
ensure that we account them properly under memcg with __GFP_ACCOUNT.
We retain rlimit memlock for registered buffers, this is just for the
ring arrays themselves.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 85 ++++++---------------------------------------------
 1 file changed, 10 insertions(+), 75 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bffed6aa5722..7a1e4ecf5f94 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1017,11 +1017,6 @@ static const struct io_op_def io_op_defs[] = {
 	},
 };
 
-enum io_mem_account {
-	ACCT_LOCKED,
-	ACCT_PINNED,
-};
-
 static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct task_struct *task,
 					 struct files_struct *files);
@@ -8355,25 +8350,16 @@ static inline int __io_account_mem(struct user_struct *user,
 	return 0;
 }
 
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
-			     enum io_mem_account acct)
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
 	if (ctx->limit_mem)
 		__io_unaccount_mem(ctx->user, nr_pages);
 
-	if (ctx->mm_account) {
-		if (acct == ACCT_LOCKED) {
-			mmap_write_lock(ctx->mm_account);
-			ctx->mm_account->locked_vm -= nr_pages;
-			mmap_write_unlock(ctx->mm_account);
-		}else if (acct == ACCT_PINNED) {
-			atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
-		}
-	}
+	if (ctx->mm_account)
+		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
 }
 
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
-			  enum io_mem_account acct)
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
 	int ret;
 
@@ -8383,15 +8369,8 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
 			return ret;
 	}
 
-	if (ctx->mm_account) {
-		if (acct == ACCT_LOCKED) {
-			mmap_write_lock(ctx->mm_account);
-			ctx->mm_account->locked_vm += nr_pages;
-			mmap_write_unlock(ctx->mm_account);
-		} else if (acct == ACCT_PINNED) {
-			atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
-		}
-	}
+	if (ctx->mm_account)
+		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
 
 	return 0;
 }
@@ -8411,7 +8390,7 @@ static void io_mem_free(void *ptr)
 static void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
-				__GFP_NORETRY;
+				__GFP_NORETRY | __GFP_ACCOUNT;
 
 	return (void *) __get_free_pages(gfp_flags, get_order(size));
 }
@@ -8445,18 +8424,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	return off;
 }
 
-static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
-{
-	size_t pages;
-
-	pages = (size_t)1 << get_order(
-		rings_size(sq_entries, cq_entries, NULL));
-	pages += (size_t)1 << get_order(
-		array_size(sizeof(struct io_uring_sqe), sq_entries));
-
-	return pages;
-}
-
 static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	int i, j;
@@ -8471,7 +8438,7 @@ static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 			unpin_user_page(imu->bvec[j].bv_page);
 
 		if (imu->acct_pages)
-			io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
+			io_unaccount_mem(ctx, imu->acct_pages);
 		kvfree(imu->bvec);
 		imu->nr_bvecs = 0;
 	}
@@ -8569,7 +8536,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	if (!imu->acct_pages)
 		return 0;
 
-	ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
+	ret = io_account_mem(ctx, imu->acct_pages);
 	if (ret)
 		imu->acct_pages = 0;
 	return ret;
@@ -8949,14 +8916,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	/* if we failed setting up the ctx, we might not have any rings */
 	io_iopoll_try_reap_events(ctx);
 
-	/*
-	 * Do this upfront, so we won't have a grace period where the ring
-	 * is closed but resources aren't reaped yet. This can cause
-	 * spurious failure in setting up a new ring.
-	 */
-	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
-			 ACCT_LOCKED);
-
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 	/*
 	 * Use system_unbound_wq to avoid spawning tons of event kworkers
@@ -9780,7 +9739,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	struct user_struct *user = NULL;
 	struct io_ring_ctx *ctx;
 	struct file *file;
-	bool limit_mem;
 	int ret;
 
 	if (!entries)
@@ -9821,26 +9779,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	}
 
 	user = get_uid(current_user());
-	limit_mem = !capable(CAP_IPC_LOCK);
-
-	if (limit_mem) {
-		ret = __io_account_mem(user,
-				ring_pages(p->sq_entries, p->cq_entries));
-		if (ret) {
-			free_uid(user);
-			return ret;
-		}
-	}
 
 	ctx = io_ring_ctx_alloc(p);
 	if (!ctx) {
-		if (limit_mem)
-			__io_unaccount_mem(user, ring_pages(p->sq_entries,
-								p->cq_entries));
 		free_uid(user);
 		return -ENOMEM;
 	}
 	ctx->compat = in_compat_syscall();
+	ctx->limit_mem = !capable(CAP_IPC_LOCK);
 	ctx->user = user;
 	ctx->creds = get_current_cred();
 #ifdef CONFIG_AUDIT
@@ -9876,17 +9822,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 	}
 #endif
-
-	/*
-	 * Account memory _before_ installing the file descriptor. Once
-	 * the descriptor is installed, it can get closed at any time. Also
-	 * do this before hitting the general error path, as ring freeing
-	 * will un-account as well.
-	 */
-	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
-		       ACCT_LOCKED);
-	ctx->limit_mem = limit_mem;
-
 	ret = io_allocate_scq_urings(ctx, p);
 	if (ret)
 		goto err;

From 71dda2a5625f31bc3410cb69c3d31376a2b66f28 Mon Sep 17 00:00:00 2001
From: dongdong tao <dongdong.tao@canonical.com>
Date: Wed, 10 Feb 2021 13:07:23 +0800
Subject: [PATCH 128/183] bcache: consider the fragmentation when update the
 writeback rate

The current way of calculating the writeback rate only considers the
dirty sectors. This usually works fine when the fragmentation
is not high, but it gives an unreasonably small rate when
very few dirty sectors consume a lot of dirty buckets. In some cases
the dirty buckets can reach CUTOFF_WRITEBACK_SYNC while the dirty
data (sectors) has not even reached writeback_percent; the writeback
rate will still be the minimum value (4k), which causes all the writes
to be stuck in a non-writeback mode because of the slow writeback.

We accelerate the rate in 3 stages with different aggressiveness:
the first stage starts when the dirty buckets percentage rises above
BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW (50), the second at
BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID (57), and the third at
BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH (64). By default
the first stage tries to write back the amount of dirty data
in one bucket (on average) in (1 / (dirty_buckets_percent - 50)) seconds,
the second stage tries to write back the amount of dirty data in one bucket
in (1 / (dirty_buckets_percent - 57)) * 100 milliseconds, and the third
stage tries to write back the amount of dirty data in one bucket in
(1 / (dirty_buckets_percent - 64)) milliseconds.

The initial rate at each stage can be controlled by 3 configurable
parameters, writeback_rate_fp_term_{low|mid|high}, which default to
1, 10 and 1000. The IO throughput that these values are trying
to achieve is described in the paragraph above. I chose
those values as defaults based on testing and
production data; some details follow:

A. When it comes to the low stage, we are still some way from the 70
   threshold, so we only want to give it a little push by setting the
   term to 1. This means the initial rate will be 170 if the fragment is 6;
   it is calculated as bucket_size/fragment. This rate is very small,
   but still much more reasonable than the minimum of 8.
   For a production bcache with a light workload, if the cache device
   is bigger than 1 TB, it may take hours to consume 1% of the buckets,
   so it is very possible to reclaim enough dirty buckets in this stage
   and thus avoid entering the next stage.

B. If the dirty buckets ratio didn't turn around during the first stage,
   we enter the mid stage. It is necessary for the mid stage
   to be more aggressive than the low stage, so I chose the initial rate
   to be 10 times that of the low stage, which means 1700 as the initial
   rate if the fragment is 6. This is a normal rate
   we usually see for a normal workload when writeback happens
   because of writeback_percent.

C. If the dirty buckets ratio didn't turn around during the low and mid
   stages, we enter the third stage, which is the last chance
   to turn around and avoid the horrible cutoff writeback sync issue.
   Here we choose to be 100 times more aggressive than the mid stage, which
   means 170000 as the initial rate if the fragment is 6. This is also
   inferred from a production bcache: I've got one week's writeback rate
   data from a production bcache which has quite heavy workloads;
   again, the writeback is triggered by the writeback percent, and
   the highest rate area is around 100000 to 240000, so I believe this
   kind of aggressiveness at this stage is reasonable for production.
   It should be mostly enough because the hint is trying to reclaim
   1000 buckets per second, and in that heavy production env
   it is consuming 50 buckets per second on average in one week's data.

Option writeback_consider_fragment is to control whether we want
this feature to be on or off, it's on by default.

Lastly, below is the performance data for all the test results,
including the data from the production env:
https://docs.google.com/document/d/1AmbIEa_2MhB9bqhC3rfga9tp7n9YX9PLn0jSUxscVW0/edit?usp=sharing

Signed-off-by: dongdong tao <dongdong.tao@canonical.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h    |  4 ++++
 drivers/md/bcache/sysfs.c     | 23 +++++++++++++++++++
 drivers/md/bcache/writeback.c | 42 +++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.h |  4 ++++
 4 files changed, 73 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1d57f48307e6..d7a84327b7f1 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -373,6 +373,7 @@ struct cached_dev {
 	unsigned int		partial_stripes_expensive:1;
 	unsigned int		writeback_metadata:1;
 	unsigned int		writeback_running:1;
+	unsigned int		writeback_consider_fragment:1;
 	unsigned char		writeback_percent;
 	unsigned int		writeback_delay;
 
@@ -385,6 +386,9 @@ struct cached_dev {
 	unsigned int		writeback_rate_update_seconds;
 	unsigned int		writeback_rate_i_term_inverse;
 	unsigned int		writeback_rate_p_term_inverse;
+	unsigned int		writeback_rate_fp_term_low;
+	unsigned int		writeback_rate_fp_term_mid;
+	unsigned int		writeback_rate_fp_term_high;
 	unsigned int		writeback_rate_minimum;
 
 	enum stop_on_failure	stop_when_cache_set_failed;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 00a520c03f41..eef15f8022ba 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -117,10 +117,14 @@ rw_attribute(writeback_running);
 rw_attribute(writeback_percent);
 rw_attribute(writeback_delay);
 rw_attribute(writeback_rate);
+rw_attribute(writeback_consider_fragment);
 
 rw_attribute(writeback_rate_update_seconds);
 rw_attribute(writeback_rate_i_term_inverse);
 rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_fp_term_low);
+rw_attribute(writeback_rate_fp_term_mid);
+rw_attribute(writeback_rate_fp_term_high);
 rw_attribute(writeback_rate_minimum);
 read_attribute(writeback_rate_debug);
 
@@ -195,6 +199,7 @@ SHOW(__bch_cached_dev)
 	var_printf(bypass_torture_test,	"%i");
 	var_printf(writeback_metadata,	"%i");
 	var_printf(writeback_running,	"%i");
+	var_printf(writeback_consider_fragment,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
 	sysfs_hprint(writeback_rate,
@@ -205,6 +210,9 @@ SHOW(__bch_cached_dev)
 	var_print(writeback_rate_update_seconds);
 	var_print(writeback_rate_i_term_inverse);
 	var_print(writeback_rate_p_term_inverse);
+	var_print(writeback_rate_fp_term_low);
+	var_print(writeback_rate_fp_term_mid);
+	var_print(writeback_rate_fp_term_high);
 	var_print(writeback_rate_minimum);
 
 	if (attr == &sysfs_writeback_rate_debug) {
@@ -303,6 +311,7 @@ STORE(__cached_dev)
 	sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
 	sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
 	sysfs_strtoul_bool(writeback_running, dc->writeback_running);
+	sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment);
 	sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
@@ -331,6 +340,16 @@ STORE(__cached_dev)
 	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
 			    dc->writeback_rate_p_term_inverse,
 			    1, UINT_MAX);
+	sysfs_strtoul_clamp(writeback_rate_fp_term_low,
+			    dc->writeback_rate_fp_term_low,
+			    1, dc->writeback_rate_fp_term_mid - 1);
+	sysfs_strtoul_clamp(writeback_rate_fp_term_mid,
+			    dc->writeback_rate_fp_term_mid,
+			    dc->writeback_rate_fp_term_low + 1,
+			    dc->writeback_rate_fp_term_high - 1);
+	sysfs_strtoul_clamp(writeback_rate_fp_term_high,
+			    dc->writeback_rate_fp_term_high,
+			    dc->writeback_rate_fp_term_mid + 1, UINT_MAX);
 	sysfs_strtoul_clamp(writeback_rate_minimum,
 			    dc->writeback_rate_minimum,
 			    1, UINT_MAX);
@@ -499,9 +518,13 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_writeback_delay,
 	&sysfs_writeback_percent,
 	&sysfs_writeback_rate,
+	&sysfs_writeback_consider_fragment,
 	&sysfs_writeback_rate_update_seconds,
 	&sysfs_writeback_rate_i_term_inverse,
 	&sysfs_writeback_rate_p_term_inverse,
+	&sysfs_writeback_rate_fp_term_low,
+	&sysfs_writeback_rate_fp_term_mid,
+	&sysfs_writeback_rate_fp_term_high,
 	&sysfs_writeback_rate_minimum,
 	&sysfs_writeback_rate_debug,
 	&sysfs_io_errors,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index a129e4d2707c..82d4e0880a99 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc)
 	int64_t integral_scaled;
 	uint32_t new_rate;
 
+	/*
+	 * We need to consider the number of dirty buckets as well
+	 * when calculating proportional_scaled. Otherwise we might
+	 * have an unreasonably small writeback rate in a highly fragmented situation
+	 * where very few dirty sectors consume a lot of dirty buckets. The
+	 * worst case is when dirty buckets reach cutoff_writeback_sync while
+	 * dirty data has not even reached writeback percent, so the rate
+	 * will still be at the minimum value, which will cause writes
+	 * to be stuck in non-writeback mode.
+	 */
+	struct cache_set *c = dc->disk.c;
+
+	int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
+
+	if (dc->writeback_consider_fragment &&
+		c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
+		int64_t fragment =
+			div_s64((dirty_buckets *  c->cache->sb.bucket_size), dirty);
+		int64_t fp_term;
+		int64_t fps;
+
+		if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
+			fp_term = dc->writeback_rate_fp_term_low *
+			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
+		} else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
+			fp_term = dc->writeback_rate_fp_term_mid *
+			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
+		} else {
+			fp_term = dc->writeback_rate_fp_term_high *
+			(c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
+		}
+		fps = div_s64(dirty, dirty_buckets) * fp_term;
+		if (fragment > 3 && fps > proportional_scaled) {
+			/* Only overwrite the p when fragment > 3 */
+			proportional_scaled = fps;
+		}
+	}
+
 	if ((error < 0 && dc->writeback_rate_integral > 0) ||
 	    (error > 0 && time_before64(local_clock(),
 			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
@@ -977,6 +1015,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 	dc->writeback_metadata		= true;
 	dc->writeback_running		= false;
+	dc->writeback_consider_fragment = true;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
 	atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -984,6 +1023,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
 	dc->writeback_rate_p_term_inverse = 40;
+	dc->writeback_rate_fp_term_low = 1;
+	dc->writeback_rate_fp_term_mid = 10;
+	dc->writeback_rate_fp_term_high = 1000;
 	dc->writeback_rate_i_term_inverse = 10000;
 
 	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 3f1230e22de0..02b2f9df73f6 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -16,6 +16,10 @@
 
 #define BCH_AUTO_GC_DIRTY_THRESHOLD	50
 
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64
+
 #define BCH_DIRTY_INIT_THRD_MAX	64
 /*
  * 14 (16384ths) is chosen here as something that each backing device

From d7fae7b4fa152795ab70c680d3a63c7843c9368c Mon Sep 17 00:00:00 2001
From: Kai Krakow <kai@kaishome.de>
Date: Wed, 10 Feb 2021 13:07:24 +0800
Subject: [PATCH 129/183] bcache: Fix register_device_aync typo

Should be `register_device_async`.

Cc: Coly Li <colyli@suse.de>
Signed-off-by: Kai Krakow <kai@kaishome.de>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 193fe7652329..dfbaf6aa3e4f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2517,7 +2517,7 @@ static void register_cache_worker(struct work_struct *work)
 	module_put(THIS_MODULE);
 }
 
-static void register_device_aync(struct async_reg_args *args)
+static void register_device_async(struct async_reg_args *args)
 {
 	if (SB_IS_BDEV(args->sb))
 		INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
@@ -2611,7 +2611,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		args->sb	= sb;
 		args->sb_disk	= sb_disk;
 		args->bdev	= bdev;
-		register_device_aync(args);
+		register_device_async(args);
 		/* No wait and returns to user space */
 		goto async_done;
 	}

From 9f233ffe02e5cef611100cd8c5bcf4de26ca7bef Mon Sep 17 00:00:00 2001
From: Kai Krakow <kai@kaishome.de>
Date: Wed, 10 Feb 2021 13:07:25 +0800
Subject: [PATCH 130/183] Revert "bcache: Kill btree_io_wq"

This reverts commit 56b30770b27d54d68ad51eccc6d888282b568cee.

With the btree using the `system_wq`, I seem to see a lot more desktop
latency than I should.

After some more investigation, it looks like the original assumption
of 56b3077 no longer is true, and bcache has a very high potential of
congesting the `system_wq`. In turn, this introduces laggy desktop
performance, IO stalls (at least with btrfs), and input events may be
delayed.

So let's revert this. It's important to note that the semantics of
using `system_wq` previously mean that `btree_io_wq` should be created
before and destroyed after other bcache wqs to keep the same
assumptions.

Cc: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Kai Krakow <kai@kaishome.de>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h |  2 ++
 drivers/md/bcache/btree.c  | 21 +++++++++++++++++++--
 drivers/md/bcache/super.c  |  4 ++++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d7a84327b7f1..2b8c7dd2cfae 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1046,5 +1046,7 @@ void bch_debug_exit(void);
 void bch_debug_init(void);
 void bch_request_exit(void);
 int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
 
 #endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 910df242c83d..952f022db5a5 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -99,6 +99,8 @@
 #define PTR_HASH(c, k)							\
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
+static struct workqueue_struct *btree_io_wq;
+
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 
 
@@ -308,7 +310,7 @@ static void __btree_node_write_done(struct closure *cl)
 	btree_complete_write(b, w);
 
 	if (btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -481,7 +483,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 	BUG_ON(!i->keys);
 
 	if (!btree_node_dirty(b))
-		schedule_delayed_work(&b->work, 30 * HZ);
+		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
 
 	set_btree_node_dirty(b);
 
@@ -2764,3 +2766,18 @@ void bch_keybuf_init(struct keybuf *buf)
 	spin_lock_init(&buf->lock);
 	array_allocator_init(&buf->freelist);
 }
+
+void bch_btree_exit(void)
+{
+	if (btree_io_wq)
+		destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
+	if (!btree_io_wq)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index dfbaf6aa3e4f..97405aec4b51 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2821,6 +2821,7 @@ static void bcache_exit(void)
 		destroy_workqueue(bcache_wq);
 	if (bch_journal_wq)
 		destroy_workqueue(bch_journal_wq);
+	bch_btree_exit();
 
 	if (bcache_major)
 		unregister_blkdev(bcache_major, "bcache");
@@ -2876,6 +2877,9 @@ static int __init bcache_init(void)
 		return bcache_major;
 	}
 
+	if (bch_btree_init())
+		goto err;
+
 	bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
 	if (!bcache_wq)
 		goto err;

From d797bd9897e3559eb48d68368550d637d32e468c Mon Sep 17 00:00:00 2001
From: Kai Krakow <kai@kaishome.de>
Date: Wed, 10 Feb 2021 13:07:26 +0800
Subject: [PATCH 131/183] bcache: Give btree_io_wq correct semantics again

Before killing `btree_io_wq`, the queue was allocated using
`create_singlethread_workqueue()` which has `WQ_MEM_RECLAIM`. After
killing it, it no longer had this property but `system_wq` is not
single threaded.

Let's combine both worlds and make it multi threaded but able to
reclaim memory.

Cc: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Kai Krakow <kai@kaishome.de>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 952f022db5a5..fe6dce125aba 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2775,7 +2775,7 @@ void bch_btree_exit(void)
 
 int __init bch_btree_init(void)
 {
-	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
+	btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
 	if (!btree_io_wq)
 		return -ENOMEM;
 

From afe78ab46f638ecdf80a35b122ffc92c20d9ae5d Mon Sep 17 00:00:00 2001
From: Kai Krakow <kai@kaishome.de>
Date: Wed, 10 Feb 2021 13:07:27 +0800
Subject: [PATCH 132/183] bcache: Move journal work to new flush wq

This is potentially long running and not latency sensitive, let's get
it out of the way of other latency sensitive events.

As observed in the previous commit, the `system_wq` becomes easily
congested by bcache, and this fixes a few more stalls I was observing
every once in a while.

Let's not make this `WQ_MEM_RECLAIM` as it was shown to reduce performance
of boot and file system operations in my tests. Also, without
`WQ_MEM_RECLAIM`, I no longer see desktop stalls. This matches the
previous behavior as `system_wq` also does no memory reclaim:

> // workqueue.c:
> system_wq = alloc_workqueue("events", 0, 0);

Cc: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Kai Krakow <kai@kaishome.de>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h  |  1 +
 drivers/md/bcache/journal.c |  4 ++--
 drivers/md/bcache/super.c   | 16 ++++++++++++++++
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2b8c7dd2cfae..848dd4db1659 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1005,6 +1005,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
 
 extern struct workqueue_struct *bcache_wq;
 extern struct workqueue_struct *bch_journal_wq;
+extern struct workqueue_struct *bch_flush_wq;
 extern struct mutex bch_register_lock;
 extern struct list_head bch_cache_sets;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index aefbdb7e003b..c6613e817333 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -932,8 +932,8 @@ atomic_t *bch_journal(struct cache_set *c,
 		journal_try_write(c);
 	} else if (!w->dirty) {
 		w->dirty = true;
-		schedule_delayed_work(&c->journal.work,
-				      msecs_to_jiffies(c->journal_delay_ms));
+		queue_delayed_work(bch_flush_wq, &c->journal.work,
+				   msecs_to_jiffies(c->journal_delay_ms));
 		spin_unlock(&c->journal.lock);
 	} else {
 		spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 97405aec4b51..71691f32959b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -49,6 +49,7 @@ static int bcache_major;
 static DEFINE_IDA(bcache_device_idx);
 static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_flush_wq;
 struct workqueue_struct *bch_journal_wq;
 
 
@@ -2821,6 +2822,8 @@ static void bcache_exit(void)
 		destroy_workqueue(bcache_wq);
 	if (bch_journal_wq)
 		destroy_workqueue(bch_journal_wq);
+	if (bch_flush_wq)
+		destroy_workqueue(bch_flush_wq);
 	bch_btree_exit();
 
 	if (bcache_major)
@@ -2884,6 +2887,19 @@ static int __init bcache_init(void)
 	if (!bcache_wq)
 		goto err;
 
+	/*
+	 * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
+	 *
+	 * 1. It used `system_wq` before which also does no memory reclaim.
+	 * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
+	 *    reduced throughput can be observed.
+	 *
+	 * We still want to use our own queue to not congest the `system_wq`.
+	 */
+	bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+	if (!bch_flush_wq)
+		goto err;
+
 	bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
 	if (!bch_journal_wq)
 		goto err;

From 6751c1e3cff3aa763c760c08862627069a37b50e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 10 Feb 2021 13:07:28 +0800
Subject: [PATCH 133/183] bcache: Avoid comma separated statements

Use semicolons and braces.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bset.c  | 12 ++++++++----
 drivers/md/bcache/sysfs.c |  6 ++++--
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 67a2c47f4201..94d38e8a59b3 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -712,8 +712,10 @@ void bch_bset_build_written_tree(struct btree_keys *b)
 	for (j = inorder_next(0, t->size);
 	     j;
 	     j = inorder_next(j, t->size)) {
-		while (bkey_to_cacheline(t, k) < cacheline)
-			prev = k, k = bkey_next(k);
+		while (bkey_to_cacheline(t, k) < cacheline) {
+			prev = k;
+			k = bkey_next(k);
+		}
 
 		t->prev[j] = bkey_u64s(prev);
 		t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
@@ -901,8 +903,10 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
 	status = BTREE_INSERT_STATUS_INSERT;
 
 	while (m != bset_bkey_last(i) &&
-	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
-		prev = m, m = bkey_next(m);
+	       bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) {
+		prev = m;
+		m = bkey_next(m);
+	}
 
 	/* prev is in the tree, if we merge we're done */
 	status = BTREE_INSERT_STATUS_BACK_MERGE;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index eef15f8022ba..cc89f3156d1a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -1094,8 +1094,10 @@ SHOW(__bch_cache)
 			--n;
 
 		while (cached < p + n &&
-		       *cached == BTREE_PRIO)
-			cached++, n--;
+		       *cached == BTREE_PRIO) {
+			cached++;
+			n--;
+		}
 
 		for (i = 0; i < n; i++)
 			sum += INITIAL_PRIO - cached[i];

From f720a8edbc6470fad8b47d0d4ae092a6c63340bb Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 2 Feb 2021 15:06:17 +0800
Subject: [PATCH 134/183] nvme: convert sysfs sprintf/snprintf family to
 sysfs_emit

Fix the following coccicheck warning:

./drivers/nvme/host/core.c:3580:8-16: WARNING: use scnprintf or sprintf.
./drivers/nvme/host/core.c:3570:8-16: WARNING: use scnprintf or sprintf.
./drivers/nvme/host/core.c:3560:8-16: WARNING: use scnprintf or sprintf.
./drivers/nvme/host/core.c:3526:8-16: WARNING: use scnprintf or sprintf.
./drivers/nvme/host/core.c:2833:8-16: WARNING: use scnprintf or sprintf.

Reported-by: Abaci Robot<abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4e8e310033c9..0befaad788a0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2848,7 +2848,7 @@ static ssize_t nvme_subsys_show_nqn(struct device *dev,
 	struct nvme_subsystem *subsys =
 		container_of(dev, struct nvme_subsystem, dev);
 
-	return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
+	return sysfs_emit(buf, "%s\n", subsys->subnqn);
 }
 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
 
@@ -3541,7 +3541,7 @@ static ssize_t nvme_sysfs_show_transport(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
-	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
+	return sysfs_emit(buf, "%s\n", ctrl->ops->name);
 }
 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
 
@@ -3575,7 +3575,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
-	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
+	return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn);
 }
 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
 
@@ -3585,7 +3585,7 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
-	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
+	return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn);
 }
 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
 
@@ -3595,7 +3595,7 @@ static ssize_t nvme_sysfs_show_hostid(struct device *dev,
 {
 	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
 
-	return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
+	return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id);
 }
 static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
 

From 83fba8c8114748a18e20391565cfdfdf8466075c Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Mon, 1 Feb 2021 11:49:38 +0800
Subject: [PATCH 135/183] blk-mq: introduce blk_mq_set_request_complete

nvme drivers need to set the state of a request to MQ_RQ_COMPLETE when
directly completing the request in queue_rq.
So add blk_mq_set_request_complete.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/blk-mq.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aabbf6830ffc..2c473c9b8990 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -490,6 +490,18 @@ static inline int blk_mq_request_completed(struct request *rq)
 	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
 }
 
+/*
+ * 
+ * Set the state to complete when completing a request from inside ->queue_rq.
+ * This is used by drivers that want to ensure special complete actions that
+ * need access to the request are called on failure, e.g. by nvme for
+ * multipathing.
+ */
+static inline void blk_mq_set_request_complete(struct request *rq)
+{
+	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+}
+
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);

From dda3248e7fc306e0ce3612ae96bdd9a36e2ab04f Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Thu, 4 Feb 2021 08:55:11 +0100
Subject: [PATCH 136/183] nvme: introduce a nvme_host_path_error helper

When using nvme native multipathing, if a path related error occurs
during ->queue_rq, the request needs to be completed with
NVME_SC_HOST_PATH_ERROR so that the request can be failed over.

Introduce a helper to complete the command from ->queue_rq in a way
that invokes nvme_complete_rq.

Signed-off-by: Chao Leng <lengchao@huawei.com>
[hch: renamed, added a return value to clean up the callers a bit]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 15 +++++++++++++++
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0befaad788a0..02579f4f776c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -355,6 +355,21 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+/*
+ * Called to unwind from ->queue_rq on a failed command submission so that the
+ * multipathing code gets called to potentially failover to another path.
+ * The caller needs to unwind all transport specific resource allocations and
+ * must propagate the return value.
+ */
+blk_status_t nvme_host_path_error(struct request *req)
+{
+	nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+	blk_mq_set_request_complete(req);
+	nvme_complete_rq(req);
+	return BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_host_path_error);
+
 bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a72f07181091..5819f0381041 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -575,6 +575,7 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);

From ea5e5f42cd2c80d19862dd63a2f3a4e7a99c6a20 Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Mon, 1 Feb 2021 11:49:39 +0800
Subject: [PATCH 137/183] nvme-fabrics: avoid double completions in
 nvmf_fail_nonready_command

When reconnecting, the request may be completed with
NVME_SC_HOST_PATH_ERROR in nvmf_fail_nonready_command, which currently
sets the state of the request to MQ_RQ_IN_FLIGHT before calling
nvme_complete_rq.  When this happens for a request that is freed by
the caller, such as nvme_submit_user_cmd, in the worst case the request
could be completed again in the teardown process.

Instead of calling blk_mq_start_request from nvmf_fail_nonready_command,
just use the new nvme_host_path_error helper to complete the command
without starting it.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 72ac00173500..5dfd806fc2d2 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -552,11 +552,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
 	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;
-
-	nvme_req(rq)->status = NVME_SC_HOST_PATH_ERROR;
-	blk_mq_start_request(rq);
-	nvme_complete_rq(rq);
-	return BLK_STS_OK;
+	return nvme_host_path_error(rq);
 }
 EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command);
 

From 62eca39722fd997e3621fc903229917b9f0fb271 Mon Sep 17 00:00:00 2001
From: Chao Leng <lengchao@huawei.com>
Date: Mon, 1 Feb 2021 11:49:40 +0800
Subject: [PATCH 138/183] nvme-rdma: handle nvme_rdma_post_send failures better

nvme_rdma_post_send failing is a path related error and should bounce
to another path when using nvme-multipath.  Call nvme_host_path_error
when nvme_rdma_post_send returns -EIO to ensure nvme_complete_rq gets
invoked to fail over to another path if there is one.

Signed-off-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6700d8bab68a..53ac4d7442ba 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2098,7 +2098,9 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 err_unmap:
 	nvme_rdma_unmap_data(queue, rq);
 err:
-	if (err == -ENOMEM || err == -EAGAIN)
+	if (err == -EIO)
+		ret = nvme_host_path_error(rq);
+	else if (err == -ENOMEM || err == -EAGAIN)
 		ret = BLK_STS_RESOURCE;
 	else
 		ret = BLK_STS_IOERR;

From fda871c0ba5d2eed2cd1c881573168129da70058 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 3 Feb 2021 15:00:01 -0800
Subject: [PATCH 139/183] nvmet-tcp: fix receive data digest calculation for
 multiple h2cdata PDUs

When a host sends multiple h2cdata PDUs for a single command, we
should verify the data digest calculation per PDU and not
per command.

Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver")
Reported-by: Narayan Ayalasomayajula <Narayan.Ayalasomayajula@wdc.com>
Tested-by: Narayan Ayalasomayajula <Narayan.Ayalasomayajula@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/tcp.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index dc1f0f647189..c3da50f776fa 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -378,7 +378,7 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
 	return NVME_SC_INTERNAL;
 }
 
-static void nvmet_tcp_ddgst(struct ahash_request *hash,
+static void nvmet_tcp_send_ddgst(struct ahash_request *hash,
 		struct nvmet_tcp_cmd *cmd)
 {
 	ahash_request_set_crypt(hash, cmd->req.sg,
@@ -386,6 +386,23 @@ static void nvmet_tcp_ddgst(struct ahash_request *hash,
 	crypto_ahash_digest(hash);
 }
 
+static void nvmet_tcp_recv_ddgst(struct ahash_request *hash,
+		struct nvmet_tcp_cmd *cmd)
+{
+	struct scatterlist sg;
+	struct kvec *iov;
+	int i;
+
+	crypto_ahash_init(hash);
+	for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) {
+		sg_init_one(&sg, iov->iov_base, iov->iov_len);
+		ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
+		crypto_ahash_update(hash);
+	}
+	ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0);
+	crypto_ahash_final(hash);
+}
+
 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
 {
 	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
@@ -410,7 +427,7 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
 
 	if (queue->data_digest) {
 		pdu->hdr.flags |= NVME_TCP_F_DDGST;
-		nvmet_tcp_ddgst(queue->snd_hash, cmd);
+		nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
 	}
 
 	if (cmd->queue->hdr_digest) {
@@ -1059,7 +1076,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
 {
 	struct nvmet_tcp_queue *queue = cmd->queue;
 
-	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
+	nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd);
 	queue->offset = 0;
 	queue->left = NVME_TCP_DIGEST_LENGTH;
 	queue->rcv_state = NVMET_TCP_RECV_DDGST;
@@ -1080,14 +1097,14 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
 		cmd->rbytes_done += ret;
 	}
 
+	if (queue->data_digest) {
+		nvmet_tcp_prep_recv_ddgst(cmd);
+		return 0;
+	}
 	nvmet_tcp_unmap_pdu_iovec(cmd);
 
 	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
 	    cmd->rbytes_done == cmd->req.transfer_len) {
-		if (queue->data_digest) {
-			nvmet_tcp_prep_recv_ddgst(cmd);
-			return 0;
-		}
 		cmd->req.execute(&cmd->req);
 	}
 

From 0fbcfb089a3f2f2a731d01f0aec8f7697a849c28 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 5 Feb 2021 11:47:25 -0800
Subject: [PATCH 140/183] nvmet-tcp: fix potential race of tcp socket closing
 accept_work

When we accept a TCP connection and allocate an nvmet-tcp queue we should
make sure not to fully establish it or reference it as the connection may
be already closing, which triggers queue release work, which does not
fence against queue establishment.

In order to address such a race, we make sure to check the sk_state and
contain the queue reference to be done underneath the sk_callback_lock
such that the queue release work correctly fences against it.

Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver")
Reported-by: Elad Grupi <elad.grupi@dell.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/tcp.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index c3da50f776fa..ac2d9ed23cea 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1484,17 +1484,27 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	if (inet->rcv_tos > 0)
 		ip_sock_set_tos(sock->sk, inet->rcv_tos);
 
+	ret = 0;
 	write_lock_bh(&sock->sk->sk_callback_lock);
-	sock->sk->sk_user_data = queue;
-	queue->data_ready = sock->sk->sk_data_ready;
-	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
-	queue->state_change = sock->sk->sk_state_change;
-	sock->sk->sk_state_change = nvmet_tcp_state_change;
-	queue->write_space = sock->sk->sk_write_space;
-	sock->sk->sk_write_space = nvmet_tcp_write_space;
+	if (sock->sk->sk_state != TCP_ESTABLISHED) {
+		/*
+		 * If the socket is already closing, don't even start
+		 * consuming it
+		 */
+		ret = -ENOTCONN;
+	} else {
+		sock->sk->sk_user_data = queue;
+		queue->data_ready = sock->sk->sk_data_ready;
+		sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+		queue->state_change = sock->sk->sk_state_change;
+		sock->sk->sk_state_change = nvmet_tcp_state_change;
+		queue->write_space = sock->sk->sk_write_space;
+		sock->sk->sk_write_space = nvmet_tcp_write_space;
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+	}
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 
-	return 0;
+	return ret;
 }
 
 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
@@ -1542,8 +1552,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_destroy_sq;
 
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
-
 	return 0;
 out_destroy_sq:
 	mutex_lock(&nvmet_tcp_queue_mutex);

From 73a1a2298f3e9df24cea7a9aab412ba9470f6159 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Fri, 5 Feb 2021 11:50:02 -0800
Subject: [PATCH 141/183] nvme-multipath: set nr_zones for zoned namespaces

The bio based drivers only require that the request_queue's nr_zones is
set, so set this field in the head if the namespace path is zoned.

Fixes: 240e6ee272c07 ("nvme: support for zoned namespaces")
Reported-by: Minwoo Im <minwoo.im.dev@gmail.com>
Cc: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/multipath.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 65bd6efa5e1c..0696319adaf6 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -677,6 +677,10 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
 				   ns->head->disk->queue);
+#ifdef CONFIG_BLK_DEV_ZONED
+	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
+		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
+#endif
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)

From b5df8e79a293739f031f25eb45de350165033ea4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 7 Feb 2021 17:17:34 +0100
Subject: [PATCH 142/183] nvmet-fc: add a missing __rcu annotation to
 nvmet_fc_tgt_assoc.queues

Make sparse happy after the recent conversion to RCU lookups.

Fixes: 4e2f02bf77da ("nvmet-fc: use RCU proctection for assoc_list")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
---
 drivers/nvme/target/fc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index c14c60bfdf85..d375745fc4ed 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -165,7 +165,7 @@ struct nvmet_fc_tgt_assoc {
 	struct nvmet_fc_hostport	*hostport;
 	struct nvmet_fc_ls_iod		*rcv_disconn;
 	struct list_head		a_list;
-	struct nvmet_fc_tgt_queue	*queues[NVMET_NR_QUEUES + 1];
+	struct nvmet_fc_tgt_queue __rcu	*queues[NVMET_NR_QUEUES + 1];
 	struct kref			ref;
 	struct work_struct		del_work;
 	struct rcu_head			rcu;

From 40244ad36bcfb796a6bb9e95bdcbf8ddf3134509 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:52 -0800
Subject: [PATCH 143/183] nvmet: set status to 0 in case for invalid nsid

For unallocated namespace in nvmet_execute_identify_ns() don't set the
status to NVME_SC_INVALID_NS, set it to zero.

Fixes: bffcd507780e ("nvmet: set right status on error in id-ns handler")
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 613a4d8feac1..5070ea5cf260 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -485,7 +485,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	/* return an all zeroed buffer if we can't find an active namespace */
 	req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid);
 	if (!req->ns) {
-		status = NVME_SC_INVALID_NS;
+		status = 0;
 		goto done;
 	}
 

From aa0aff604a60627b9f6c51c99dd5f63634322668 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:53 -0800
Subject: [PATCH 144/183] nvmet: return uniform error for invalid ns

For nvmet_find_namespace() error case we have inconsistent error code
mapping in the function nvmet_get_smart_log_nsid() and
nvmet_set_feat_write_protect().

There is no point in retrying for the invalid namespace from the host
side. Set the error code to the NVME_SC_INVALID_NS | NVME_SC_DNR which
matches what we have in nvmet_execute_identify_desclist().

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 5070ea5cf260..e938064254a5 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -82,7 +82,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		pr_err("Could not find namespace id : %d\n",
 				le32_to_cpu(req->cmd->get_log_page.nsid));
 		req->error_loc = offsetof(struct nvme_rw_command, nsid);
-		return NVME_SC_INVALID_NS;
+		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}
 
 	/* we don't have the right data for file backed ns */
@@ -697,7 +697,7 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return status;
+		return status = NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}
 
 	mutex_lock(&subsys->lock);

From 3a1f7c79ae6d3dfdc16082daa44b3cf8dbe4f238 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:54 -0800
Subject: [PATCH 145/183] nvmet: make nvmet_find_namespace() req based

The six callers of nvmet_find_namespace() duplicate the error log page
update and status setting code for each call on failure.

All callers are nvmet requests based functions, so we can pass req
to the nvmet_find_namespace() & derive ctrl from req, that'll allow us
to update the error log page in nvmet_find_namespace(). Now that we
pass the request we can also get rid of the local variable in
nvmet_find_namespace() and use the req->ns and return the error code.

Replace the ctrl parameter with nvmet_req for nvmet_find_namespace(),
centralize the error log page update for non-allocated namespaces, and
return uniform error for non-allocated namespace.

The nvmet_find_namespace() takes nsid parameter which is from NVMe
commands structures such as get_log_page, identify, rw and common. All
these commands have same offset for the nsid field.

Derive nsid from req->cmd->common.nsid & remove the extra parameter
from the nvmet_find_namespace().

Lastly now we associate the ns to the req parameter that we pass to the
nvmet_find_namespace(), rename nvmet_find_namespace() to
nvmet_req_find_ns().

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 46 +++++++++++++--------------------
 drivers/nvme/target/core.c      | 24 +++++++++--------
 drivers/nvme/target/nvmet.h     |  2 +-
 3 files changed, 32 insertions(+), 40 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index e938064254a5..f32533480e66 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -75,15 +75,11 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		struct nvme_smart_log *slog)
 {
 	u64 host_reads, host_writes, data_units_read, data_units_written;
+	u16 status;
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl,
-				       req->cmd->get_log_page.nsid);
-	if (!req->ns) {
-		pr_err("Could not find namespace id : %d\n",
-				le32_to_cpu(req->cmd->get_log_page.nsid));
-		req->error_loc = offsetof(struct nvme_rw_command, nsid);
-		return NVME_SC_INVALID_NS | NVME_SC_DNR;
-	}
+	status = nvmet_req_find_ns(req);
+	if (status)
+		return status;
 
 	/* we don't have the right data for file backed ns */
 	if (!req->ns->bdev)
@@ -468,7 +464,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvme_id_ns *id;
-	u16 status = 0;
+	u16 status;
 
 	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
 		req->error_loc = offsetof(struct nvme_identify, nsid);
@@ -483,8 +479,8 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	}
 
 	/* return an all zeroed buffer if we can't find an active namespace */
-	req->ns = nvmet_find_namespace(ctrl, req->cmd->identify.nsid);
-	if (!req->ns) {
+	status = nvmet_req_find_ns(req);
+	if (status) {
 		status = 0;
 		goto done;
 	}
@@ -604,15 +600,12 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len,
 
 static void nvmet_execute_identify_desclist(struct nvmet_req *req)
 {
-	u16 status = 0;
 	off_t off = 0;
+	u16 status;
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
-	if (!req->ns) {
-		req->error_loc = offsetof(struct nvme_identify, nsid);
-		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
+	status = nvmet_req_find_ns(req);
+	if (status)
 		goto out;
-	}
 
 	if (memchr_inv(&req->ns->uuid, 0, sizeof(req->ns->uuid))) {
 		status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID,
@@ -692,13 +685,11 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 {
 	u32 write_protect = le32_to_cpu(req->cmd->common.cdw11);
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
-	u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
+	u16 status;
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
-	if (unlikely(!req->ns)) {
-		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return status = NVME_SC_INVALID_NS | NVME_SC_DNR;
-	}
+	status = nvmet_req_find_ns(req);
+	if (status)
+		return status;
 
 	mutex_lock(&subsys->lock);
 	switch (write_protect) {
@@ -799,11 +790,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
 	u32 result;
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
-	if (!req->ns)  {
-		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return NVME_SC_INVALID_NS | NVME_SC_DNR;
-	}
+	result = nvmet_req_find_ns(req);
+	if (result)
+		return result;
+
 	mutex_lock(&subsys->lock);
 	if (req->ns->readonly == true)
 		result = NVME_NS_WRITE_PROTECT;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 8ce4d59cc9e7..95b58d4b1af2 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -417,15 +417,18 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 	cancel_delayed_work_sync(&ctrl->ka_work);
 }
 
-struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
+u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
-	struct nvmet_ns *ns;
+	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
 
-	ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
-	if (ns)
-		percpu_ref_get(&ns->ref);
+	req->ns = xa_load(&req->sq->ctrl->subsys->namespaces, nsid);
+	if (unlikely(!req->ns)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
+		return NVME_SC_INVALID_NS | NVME_SC_DNR;
+	}
 
-	return ns;
+	percpu_ref_get(&req->ns->ref);
+	return NVME_SC_SUCCESS;
 }
 
 static void nvmet_destroy_namespace(struct percpu_ref *ref)
@@ -862,11 +865,10 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 	if (nvmet_req_passthru_ctrl(req))
 		return nvmet_parse_passthru_io_cmd(req);
 
-	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
-	if (unlikely(!req->ns)) {
-		req->error_loc = offsetof(struct nvme_common_command, nsid);
-		return NVME_SC_INVALID_NS | NVME_SC_DNR;
-	}
+	ret = nvmet_req_find_ns(req);
+	if (unlikely(ret))
+		return ret;
+
 	ret = nvmet_check_ana_state(req->port, req->ns);
 	if (unlikely(ret)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8776dd1a0490..954b3d8451f5 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -443,7 +443,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 void nvmet_subsys_put(struct nvmet_subsys *subsys);
 void nvmet_subsys_del_ctrls(struct nvmet_subsys *subsys);
 
-struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid);
+u16 nvmet_req_find_ns(struct nvmet_req *req);
 void nvmet_put_namespace(struct nvmet_ns *ns);
 int nvmet_ns_enable(struct nvmet_ns *ns);
 void nvmet_ns_disable(struct nvmet_ns *ns);

From 3999434b6ce6fa452128c36cbb5017f0cd347615 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:55 -0800
Subject: [PATCH 146/183] nvmet: remove extra variable in id-ns handler

In nvmet_execute_identify_ns() local variable ctrl is accessed only in
one place, remove that and directly use it from nvmet_req->sq->ctrl.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index f32533480e66..552da813da18 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -462,7 +462,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 static void nvmet_execute_identify_ns(struct nvmet_req *req)
 {
-	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvme_id_ns *id;
 	u16 status;
 
@@ -523,7 +522,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 
 	id->lbaf[0].ds = req->ns->blksize_shift;
 
-	if (ctrl->pi_support && nvmet_ns_has_pi(req->ns)) {
+	if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns)) {
 		id->dpc = NVME_NS_DPC_PI_FIRST | NVME_NS_DPC_PI_LAST |
 			  NVME_NS_DPC_PI_TYPE1 | NVME_NS_DPC_PI_TYPE2 |
 			  NVME_NS_DPC_PI_TYPE3;

From d81d57cf1b4702b7c2fa8ce8f1d5c6961a0c20b5 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:56 -0800
Subject: [PATCH 147/183] nvmet: add helper to report invalid opcode

In the NVMeOF block device backend, file backend, and passthru backend
we reject and report the commands if opcode is not handled.

Add a helper and use it in block device backend to keep the code
and error message uniform.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c        | 9 +++++++++
 drivers/nvme/target/io-cmd-bdev.c | 5 +----
 drivers/nvme/target/nvmet.h       | 1 +
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 95b58d4b1af2..35ad96261b8f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -82,6 +82,15 @@ inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
 	return status;
 }
 
+u16 nvmet_report_invalid_opcode(struct nvmet_req *req)
+{
+	pr_debug("unhandled cmd %d on qid %d\n", req->cmd->common.opcode,
+		 req->sq->qid);
+
+	req->error_loc = offsetof(struct nvme_common_command, opcode);
+	return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+}
+
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn);
 
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 23095bdfce06..105ef2b125cf 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -449,9 +449,6 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 		req->execute = nvmet_bdev_execute_write_zeroes;
 		return 0;
 	default:
-		pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
-		       req->sq->qid);
-		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		return nvmet_report_invalid_opcode(req);
 	}
 }
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 954b3d8451f5..00f78e41d8c8 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -589,6 +589,7 @@ nvmet_req_passthru_ctrl(struct nvmet_req *req)
 }
 
 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
+u16 nvmet_report_invalid_opcode(struct nvmet_req *req);
 
 /* Convert a 32-bit number to a 16-bit 0's based number */
 static inline __le16 to0based(u32 a)

From 1c2c76136875d2329339275d431484a33dbb612d Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:57 -0800
Subject: [PATCH 148/183] nvmet: use invalid cmd opcode helper

In the NVMeOF block device backend, file backend, and passthru backend
we reject and report the commands if opcode is not handled.

Use the previously introduced helper in file backend to reduce the
duplicate code and make the error message uniform.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/io-cmd-file.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..715d4376c997 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -400,9 +400,6 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
 		req->execute = nvmet_file_execute_write_zeroes;
 		return 0;
 	default:
-		pr_err("unhandled cmd for file ns %d on qid %d\n",
-				cmd->common.opcode, req->sq->qid);
-		req->error_loc = offsetof(struct nvme_common_command, opcode);
-		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		return nvmet_report_invalid_opcode(req);
 	}
 }

From 07116ea50fd3a3b58725389e4abaf1c03bcae641 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:58 -0800
Subject: [PATCH 149/183] nvmet: use invalid cmd opcode helper

In the NVMeOF block device backend, file backend, and passthru backend
we reject and report the commands if opcode is not handled.

Use the previously introduced helper in the passthru backend to make the
error message uniform.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/passthru.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index cbc88acdd233..3b22f4a868f4 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -494,7 +494,7 @@ u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
 		return nvmet_setup_passthru_command(req);
 	default:
 		/* Reject commands not in the allowlist above */
-		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		return nvmet_report_invalid_opcode(req);
 	}
 }
 

From d86481e924a7d6e8a40477ffa98077c6c0d77ed5 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:47:59 -0800
Subject: [PATCH 150/183] nvmet: use min of device_path and disk len
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In function __assign_req_name() instead of using the DISK_NAME_LEN in
strncpy() use min of DISK_NAME_LEN and strlen(req->ns->device_path).

This is needed to turn off the following warnings:-

In file included from drivers/nvme/target/core.c:14:
In function ‘__assign_req_name’,
    inlined from ‘trace_event_raw_event_nvmet_req_init’ at drivers/nvme/target/./trace.h:58:1:
drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation]
   strncpy(name, req->ns->device_path, DISK_NAME_LEN);
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In function ‘__assign_req_name’,
    inlined from ‘perf_trace_nvmet_req_complete’ at drivers/nvme/target/./trace.h:100:1:
drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation]
   strncpy(name, req->ns->device_path, DISK_NAME_LEN);
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In function ‘__assign_req_name’,
    inlined from ‘perf_trace_nvmet_req_init’ at drivers/nvme/target/./trace.h:58:1:
drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation]
   strncpy(name, req->ns->device_path, DISK_NAME_LEN);
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In function ‘__assign_req_name’,
    inlined from ‘trace_event_raw_event_nvmet_req_complete’ at drivers/nvme/target/./trace.h:100:1:
drivers/nvme/target/trace.h:52:3: warning: ‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation]
   strncpy(name, req->ns->device_path, DISK_NAME_LEN);
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/trace.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h
index c14e3249a14d..6109b3806b12 100644
--- a/drivers/nvme/target/trace.h
+++ b/drivers/nvme/target/trace.h
@@ -48,10 +48,13 @@ static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req)
 
 static inline void __assign_req_name(char *name, struct nvmet_req *req)
 {
-	if (req->ns)
-		strncpy(name, req->ns->device_path, DISK_NAME_LEN);
-	else
+	if (!req->ns) {
 		memset(name, 0, DISK_NAME_LEN);
+		return;
+	}
+
+	strncpy(name, req->ns->device_path,
+		min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path)));
 }
 #endif
 

From 20c2c3bb83f26c42bf62cc773f96f30848ed11a2 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:48:01 -0800
Subject: [PATCH 151/183] nvmet: add nvmet_req_subsys() helper

Just like what we have to get the passthru ctrl from the req, add a
helper to get the subsystem associated with the nvmet_req instead
of open coding the chain of structures.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 10 +++++-----
 drivers/nvme/target/core.c      |  2 +-
 drivers/nvme/target/nvmet.h     |  7 ++++++-
 drivers/nvme/target/passthru.c  |  4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 552da813da18..bc6a774f2124 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -683,7 +683,7 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
 static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 {
 	u32 write_protect = le32_to_cpu(req->cmd->common.cdw11);
-	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 	u16 status;
 
 	status = nvmet_req_find_ns(req);
@@ -742,7 +742,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
 
 void nvmet_execute_set_features(struct nvmet_req *req)
 {
-	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
 	u16 status = 0;
@@ -786,7 +786,7 @@ void nvmet_execute_set_features(struct nvmet_req *req)
 
 static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
 {
-	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 	u32 result;
 
 	result = nvmet_req_find_ns(req);
@@ -816,7 +816,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
 
 void nvmet_execute_get_features(struct nvmet_req *req)
 {
-	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u16 status = 0;
 
@@ -923,7 +923,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
 	if (nvme_is_fabrics(cmd))
 		return nvmet_parse_fabrics_cmd(req);
-	if (req->sq->ctrl->subsys->type == NVME_NQN_DISC)
+	if (nvmet_req_subsys(req)->type == NVME_NQN_DISC)
 		return nvmet_parse_discovery_cmd(req);
 
 	ret = nvmet_check_ctrl_status(req, cmd);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 35ad96261b8f..7e3b194203a4 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -430,7 +430,7 @@ u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
 	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
 
-	req->ns = xa_load(&req->sq->ctrl->subsys->namespaces, nsid);
+	req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 00f78e41d8c8..cdfa537b1c0a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -551,6 +551,11 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
 		sizeof(struct nvme_dsm_range);
 }
 
+static inline struct nvmet_subsys *nvmet_req_subsys(struct nvmet_req *req)
+{
+	return req->sq->ctrl->subsys;
+}
+
 #ifdef CONFIG_NVME_TARGET_PASSTHRU
 void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
 int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
@@ -585,7 +590,7 @@ static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
 static inline struct nvme_ctrl *
 nvmet_req_passthru_ctrl(struct nvmet_req *req)
 {
-	return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
+	return nvmet_passthru_ctrl(nvmet_req_subsys(req));
 }
 
 u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index 3b22f4a868f4..f50c7b2bf21c 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -239,9 +239,9 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
 		}
 
 		q = ns->queue;
-		timeout = req->sq->ctrl->subsys->io_timeout;
+		timeout = nvmet_req_subsys(req)->io_timeout;
 	} else {
-		timeout = req->sq->ctrl->subsys->admin_timeout;
+		timeout = nvmet_req_subsys(req)->admin_timeout;
 	}
 
 	rq = nvme_alloc_request(q, req->cmd, 0);

From 295a39f5a56f3276bae6a0ae5c26ce06bb8aa21c Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Tue, 9 Feb 2021 21:48:02 -0800
Subject: [PATCH 152/183] nvmet: remove else at the end of the function

The function nvmet_parse_io_cmd() returns value from
nvmet_file_parse_io_cmd() or nvmet_bdev_parse_io_cmd() based on which
backend is set for the request. Remove the else and just return the
value from nvmet_bdev_parse_io_cmd().

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 7e3b194203a4..67bbf0e3b507 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -891,8 +891,8 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 
 	if (req->ns->file)
 		return nvmet_file_parse_io_cmd(req);
-	else
-		return nvmet_bdev_parse_io_cmd(req);
+
+	return nvmet_bdev_parse_io_cmd(req);
 }
 
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,

From ed7770f6628691c13c9423bce7eee7cff2399c12 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Tue, 19 Jan 2021 07:43:18 +0100
Subject: [PATCH 153/183] nvme-hwmon: rework to avoid devm allocation

The original design to use device-managed resource allocation
doesn't really work as the NVMe controller has a vastly different
lifetime than the hwmon sysfs attributes, causing warnings about
duplicate sysfs entries upon reconnection.
This patch reworks the hwmon allocation to avoid device-managed
resource allocation, and uses the NVMe controller as parent for
the sysfs attributes.

Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Tested-by: Enzo Matsumiya <ematsumiya@suse.de>
Tested-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c  |  1 +
 drivers/nvme/host/hwmon.c | 31 +++++++++++++++++++++----------
 drivers/nvme/host/nvme.h  |  8 ++++++++
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 02579f4f776c..d77f3f26d8d3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4471,6 +4471,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	nvme_hwmon_exit(ctrl);
 	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
 	cdev_device_del(&ctrl->cdev, ctrl->device);
diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c
index 552dbc04567b..8f9e96986780 100644
--- a/drivers/nvme/host/hwmon.c
+++ b/drivers/nvme/host/hwmon.c
@@ -223,12 +223,12 @@ static const struct hwmon_chip_info nvme_hwmon_chip_info = {
 
 int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 {
-	struct device *dev = ctrl->dev;
+	struct device *dev = ctrl->device;
 	struct nvme_hwmon_data *data;
 	struct device *hwmon;
 	int err;
 
-	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return 0;
 
@@ -237,19 +237,30 @@ int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 
 	err = nvme_hwmon_get_smart_log(data);
 	if (err) {
-		dev_warn(ctrl->device,
-			"Failed to read smart log (error %d)\n", err);
-		devm_kfree(dev, data);
+		dev_warn(dev, "Failed to read smart log (error %d)\n", err);
+		kfree(data);
 		return err;
 	}
 
-	hwmon = devm_hwmon_device_register_with_info(dev, "nvme", data,
-						     &nvme_hwmon_chip_info,
-						     NULL);
+	hwmon = hwmon_device_register_with_info(dev, "nvme",
+						data, &nvme_hwmon_chip_info,
+						NULL);
 	if (IS_ERR(hwmon)) {
 		dev_warn(dev, "Failed to instantiate hwmon device\n");
-		devm_kfree(dev, data);
+		kfree(data);
 	}
-
+	ctrl->hwmon_device = hwmon;
 	return 0;
 }
+
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->hwmon_device) {
+		struct nvme_hwmon_data *data =
+			dev_get_drvdata(ctrl->hwmon_device);
+
+		hwmon_device_unregister(ctrl->hwmon_device);
+		ctrl->hwmon_device = NULL;
+		kfree(data);
+	}
+}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5819f0381041..2efb87642d18 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -246,6 +246,9 @@ struct nvme_ctrl {
 	struct rw_semaphore namespaces_rwsem;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
+#ifdef CONFIG_NVME_HWMON
+	struct device *hwmon_device;
+#endif
 	struct cdev cdev;
 	struct work_struct reset_work;
 	struct work_struct delete_work;
@@ -812,11 +815,16 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
 
 #ifdef CONFIG_NVME_HWMON
 int nvme_hwmon_init(struct nvme_ctrl *ctrl);
+void nvme_hwmon_exit(struct nvme_ctrl *ctrl);
 #else
 static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
 {
 	return 0;
 }
+
+static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
+{
+}
 #endif
 
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,

From 4bdf260362b3be529d170b04662638fd6dc52241 Mon Sep 17 00:00:00 2001
From: Filippo Sironi <sironi@amazon.de>
Date: Wed, 10 Feb 2021 01:39:42 +0100
Subject: [PATCH 154/183] nvme: add 48-bit DMA address quirk for Amazon NVMe
 controllers

Some Amazon NVMe controllers do not follow the NVMe specification
and are limited to 48-bit DMA addresses.  Add a quirk to force
bounce buffering if needed and limit the IOVA allocation for these
devices.

This affects all current Amazon NVMe controllers that expose EBS
volumes (0x0061, 0x0065, 0x8061) and local instance storage
(0xcd00, 0xcd01, 0xcd02).

Signed-off-by: Filippo Sironi <sironi@amazon.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/nvme.h |  6 ++++++
 drivers/nvme/host/pci.c  | 21 ++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2efb87642d18..07b34175c6ce 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -144,6 +144,12 @@ enum nvme_quirks {
 	 * NVMe 1.3 compliance.
 	 */
 	NVME_QUIRK_NO_NS_DESC_LIST		= (1 << 15),
+
+	/*
+	 * The controller does not properly handle DMA addresses over
+	 * 48 bits.
+	 */
+	NVME_QUIRK_DMA_ADDRESS_BITS_48		= (1 << 16),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5b78e68be9a1..0045c5edf629 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2362,13 +2362,16 @@ static int nvme_pci_enable(struct nvme_dev *dev)
 {
 	int result = -ENOMEM;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	int dma_address_bits = 64;
 
 	if (pci_enable_device_mem(pdev))
 		return result;
 
 	pci_set_master(pdev);
 
-	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
+	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
+		dma_address_bits = 48;
+	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
 		goto disable;
 
 	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
@@ -3257,6 +3260,22 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
 		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
+		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },

From a2d52a6c1b6764031b6cac7cc156530cbb38248c Mon Sep 17 00:00:00 2001
From: Liao Pingfang <winndows@163.com>
Date: Sat, 6 Feb 2021 15:10:55 +0800
Subject: [PATCH 155/183] nbd: Convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

Signed-off-by: Liao Pingfang <winndows@163.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 28 ++++------------------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6727358e147d..b076a0a53fb1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1521,17 +1521,7 @@ static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
 	return 0;
 }
 
-static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_tasks_ops = {
-	.open = nbd_dbg_tasks_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
 
 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 {
@@ -1556,17 +1546,7 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
 	return 0;
 }
 
-static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nbd_dbg_flags_show, inode->i_private);
-}
-
-static const struct file_operations nbd_dbg_flags_ops = {
-	.open = nbd_dbg_flags_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
 
 static int nbd_dev_dbg_init(struct nbd_device *nbd)
 {
@@ -1584,11 +1564,11 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
 	}
 	config->dbg_dir = dir;
 
-	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
+	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
 	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
 	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
-	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
+	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
 
 	return 0;
 }

From 34343786ecc5ff493ca4d1f873b4386759ba52ee Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 10 Feb 2021 11:45:42 +0000
Subject: [PATCH 156/183] io_uring: unpark SQPOLL thread for cancelation

We park SQPOLL task before going into io_uring_cancel_files(), so the
task won't run task_works including those that might be important for
the cancellation passes. In this case it's io_poll_remove_one(), which
frees requests via io_put_req_deferred().

Unpark it while waiting; that's ok as we disable submissions
beforehand, so no new requests will be generated.

INFO: task syz-executor893:8493 blocked for more than 143 seconds.
Call Trace:
 context_switch kernel/sched/core.c:4327 [inline]
 __schedule+0x90c/0x21a0 kernel/sched/core.c:5078
 schedule+0xcf/0x270 kernel/sched/core.c:5157
 io_uring_cancel_files fs/io_uring.c:8912 [inline]
 io_uring_cancel_task_requests+0xe70/0x11a0 fs/io_uring.c:8979
 __io_uring_files_cancel+0x110/0x1b0 fs/io_uring.c:9067
 io_uring_files_cancel include/linux/io_uring.h:51 [inline]
 do_exit+0x2fe/0x2ae0 kernel/exit.c:780
 do_group_exit+0x125/0x310 kernel/exit.c:922
 __do_sys_exit_group kernel/exit.c:933 [inline]
 __se_sys_exit_group kernel/exit.c:931 [inline]
 __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:931
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Cc: stable@vger.kernel.org # 5.5+
Reported-by: syzbot+695b03d82fa8e4901b06@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7a1e4ecf5f94..9ed79509f389 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9047,11 +9047,16 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 			break;
 
 		io_uring_try_cancel_requests(ctx, task, files);
+
+		if (ctx->sq_data)
+			io_sq_thread_unpark(ctx->sq_data);
 		prepare_to_wait(&task->io_uring->wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (inflight == io_uring_count_inflight(ctx, task, files))
 			schedule();
 		finish_wait(&task->io_uring->wait, &wait);
+		if (ctx->sq_data)
+			io_sq_thread_park(ctx->sq_data);
 	}
 }
 

From 4a245479c2312e6b51862c21af134d4191ab9cf7 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 10 Feb 2021 20:00:07 +0000
Subject: [PATCH 157/183] io_uring: remove redundant initialization of variable
 ret

The variable ret is being initialized with a value that is never read
and it is being updated later with a new value.  The initialization is
redundant and can be removed.

Addresses-Coverity: ("Unused value")
Fixes: b63534c41e20 ("io_uring: re-issue block requests that failed because of resources")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9ed79509f389..f730af32c17a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2796,7 +2796,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 static bool io_resubmit_prep(struct io_kiocb *req)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
-	int rw, ret = -ECANCELED;
+	int rw, ret;
 	struct iov_iter iter;
 
 	/* already prepared */

From 597886836164ef18b76faea7304357556fe29da9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 10 Feb 2021 15:51:59 -0800
Subject: [PATCH 158/183] block: Replace lkml.org links with lore

As started by commit 05a5f51ca566 ("Documentation: Replace lkml.org
links with lore"), replace lkml.org links with lore to better use a
single source that's more likely to stay available long-term.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/aoe/aoecmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index ac720bdcd983..ecd77897a761 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1046,7 +1046,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
 
 	__blk_mq_end_request(rq, err);
 
-	/* cf. http://lkml.org/lkml/2006/10/31/28 */
+	/* cf. https://lore.kernel.org/lkml/20061031071040.GS14055@kernel.dk/ */
 	if (!fastfail)
 		blk_mq_run_hw_queues(q, true);
 }

From e11e5116171dedeaf63735931e72ad5de0f30ed5 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 10 Feb 2021 14:04:00 -0800
Subject: [PATCH 159/183] nvme-tcp: fix crash triggered with a dataless request
 submission

write-zeros has a bio, but does not have any data buffers associated
with it. Hence we should not initialize the request iter for it (doing
so attempts to reference the bi_io_vec and crashes).
--
 run blktests nvme/012 at 2021-02-05 21:53:34
 BUG: kernel NULL pointer dereference, address: 0000000000000008
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: 0000 [#1] SMP NOPTI
 CPU: 15 PID: 12069 Comm: kworker/15:2H Tainted: G S        I       5.11.0-rc6+ #1
 Hardware name: Dell Inc. PowerEdge R640/06NR82, BIOS 2.10.0 11/12/2020
 Workqueue: kblockd blk_mq_run_work_fn
 RIP: 0010:nvme_tcp_init_iter+0x7d/0xd0 [nvme_tcp]
 RSP: 0018:ffffbd084447bd18 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffffa0bba9f3ce80 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000002000000
 RBP: ffffa0ba8ac6fec0 R08: 0000000002000000 R09: 0000000000000000
 R10: 0000000002800809 R11: 0000000000000000 R12: 0000000000000000
 R13: ffffa0bba9f3cf90 R14: 0000000000000000 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffffa0c9ff9c0000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000008 CR3: 00000001c9c6c005 CR4: 00000000007706e0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 PKRU: 55555554
 Call Trace:
  nvme_tcp_queue_rq+0xef/0x330 [nvme_tcp]
  blk_mq_dispatch_rq_list+0x11c/0x7c0
  ? blk_mq_flush_busy_ctxs+0xf6/0x110
  __blk_mq_sched_dispatch_requests+0x12b/0x170
  blk_mq_sched_dispatch_requests+0x30/0x60
  __blk_mq_run_hw_queue+0x2b/0x60
  process_one_work+0x1cb/0x360
  ? process_one_work+0x360/0x360
  worker_thread+0x30/0x370
  ? process_one_work+0x360/0x360
  kthread+0x116/0x130
  ? kthread_park+0x80/0x80
  ret_from_fork+0x1f/0x30
--

Fixes: cb9b870fba3e ("nvme-tcp: fix wrong setting of request iov_iter")
Reported-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 619b0d8f6e38..69f59d2c5799 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2271,7 +2271,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	req->data_len = blk_rq_nr_phys_segments(rq) ?
 				blk_rq_payload_bytes(rq) : 0;
 	req->curr_bio = rq->bio;
-	if (req->curr_bio)
+	if (req->curr_bio && req->data_len)
 		nvme_tcp_init_iter(req, rq_data_dir(rq));
 
 	if (rq_data_dir(rq) == WRITE &&

From e68a3ff8c342b655f01f74a577c15605eec9aa12 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Feb 2021 07:45:08 -0700
Subject: [PATCH 160/183] io_uring: assign file_slot prior to calling
 io_sqe_file_register()

We use the assigned slot in io_sqe_file_register(), and a previous
patch moved the assignment to after we have called it. This isn't
super pretty, and will get cleaned up in the future. For now, fix
the regression by restoring the previous assignment/clear of the
file_slot.

Fixes: ea64ec02b31d ("io_uring: deduplicate file table slot calculation")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f730af32c17a..cd9c4c05f6f5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8112,12 +8112,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
+			*file_slot = file;
 			err = io_sqe_file_register(ctx, file, i);
 			if (err) {
+				*file_slot = NULL;
 				fput(file);
 				break;
 			}
-			*file_slot = file;
 		}
 	}
 

From 3c1a2ead915c1bcb7b1f9e902469ea0ee1f7857f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Feb 2021 10:48:03 -0700
Subject: [PATCH 161/183] io_uring: move submit side state closer in the ring

We recently added the submit side req cache, but it was placed at the
end of the struct. Move it near the other submission state for better
memory placement, and reshuffle a few other members at the same time.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cd9c4c05f6f5..8be7a24aa10e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -346,6 +346,13 @@ struct io_ring_ctx {
 		struct io_uring_sqe	*sq_sqes;
 	} ____cacheline_aligned_in_smp;
 
+	struct {
+		struct mutex		uring_lock;
+		wait_queue_head_t	wait;
+	} ____cacheline_aligned_in_smp;
+
+	struct io_submit_state		submit_state;
+
 	struct io_rings	*rings;
 
 	/* IO offload */
@@ -413,11 +420,6 @@ struct io_ring_ctx {
 		struct eventfd_ctx	*cq_ev_fd;
 	} ____cacheline_aligned_in_smp;
 
-	struct {
-		struct mutex		uring_lock;
-		wait_queue_head_t	wait;
-	} ____cacheline_aligned_in_smp;
-
 	struct {
 		spinlock_t		completion_lock;
 
@@ -441,9 +443,10 @@ struct io_ring_ctx {
 	struct list_head		rsrc_ref_list;
 	spinlock_t			rsrc_ref_lock;
 
-	struct work_struct		exit_work;
 	struct io_restriction		restrictions;
-	struct io_submit_state		submit_state;
+
+	/* Keep this last, we don't need it for the fast path */
+	struct work_struct		exit_work;
 };
 
 /*

From 6e833d538b3123767393c987d11c40b7728b3f79 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 11 Feb 2021 18:28:20 +0000
Subject: [PATCH 162/183] io_uring: clean up io_req_free_batch_finish()

io_req_free_batch_finish() is final and does not permit struct req_batch
to be reused without re-init. To be more consistent don't clear ->task
there.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8be7a24aa10e..fe06ca43e832 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2397,10 +2397,8 @@ static inline void io_init_req_batch(struct req_batch *rb)
 static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 				     struct req_batch *rb)
 {
-	if (rb->task) {
+	if (rb->task)
 		io_put_task(rb->task, rb->task_refs);
-		rb->task = NULL;
-	}
 	if (rb->ctx_refs)
 		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
 }

From f161340d9e85b9038031b497b32383e50ff00ca1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 11 Feb 2021 18:28:21 +0000
Subject: [PATCH 163/183] io_uring: simplify iopoll reissuing

Don't stash -EAGAIN'ed iopoll requests into a list to reissue them later,
do it eagerly. It removes the overhead of keeping and checking that list,
and, in case of failure, allows these requests to be completed through
the normal iopoll completion path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fe06ca43e832..18e449b5d632 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1029,8 +1029,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
 				     struct fixed_rsrc_ref_node *ref_node);
 
-static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
-			     unsigned int issue_flags);
+static bool io_rw_reissue(struct io_kiocb *req, long res);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req, int nr);
@@ -2558,17 +2557,6 @@ static inline bool io_run_task_work(void)
 	return false;
 }
 
-static void io_iopoll_queue(struct list_head *again)
-{
-	struct io_kiocb *req;
-
-	do {
-		req = list_first_entry(again, struct io_kiocb, inflight_entry);
-		list_del(&req->inflight_entry);
-		__io_complete_rw(req, -EAGAIN, 0, 0);
-	} while (!list_empty(again));
-}
-
 /*
  * Find and free completed poll iocbs
  */
@@ -2577,7 +2565,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 {
 	struct req_batch rb;
 	struct io_kiocb *req;
-	LIST_HEAD(again);
 
 	/* order with ->result store in io_complete_rw_iopoll() */
 	smp_rmb();
@@ -2587,14 +2574,14 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		int cflags = 0;
 
 		req = list_first_entry(done, struct io_kiocb, inflight_entry);
-		if (READ_ONCE(req->result) == -EAGAIN) {
-			req->result = 0;
-			req->iopoll_completed = 0;
-			list_move_tail(&req->inflight_entry, &again);
-			continue;
-		}
 		list_del(&req->inflight_entry);
 
+		if (READ_ONCE(req->result) == -EAGAIN) {
+			req->iopoll_completed = 0;
+			if (io_rw_reissue(req, -EAGAIN))
+				continue;
+		}
+
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			cflags = io_put_rw_kbuf(req);
 
@@ -2608,9 +2595,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	io_commit_cqring(ctx);
 	io_cqring_ev_posted_iopoll(ctx);
 	io_req_free_batch_finish(ctx, &rb);
-
-	if (!list_empty(&again))
-		io_iopoll_queue(&again);
 }
 
 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,

From 23faba36ce287e4af9018dea51893a1067701508 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 11 Feb 2021 18:28:22 +0000
Subject: [PATCH 164/183] io_uring: move res check out of io_rw_reissue()

We pass return code into io_rw_reissue() only to be able to check if it's
-EAGAIN. That's not the cleanest approach and may prevent inlining of the
non-EAGAIN fast path, so do it at call sites.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 18e449b5d632..c873ec113bcc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1029,7 +1029,7 @@ static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node(
 static void init_fixed_file_ref_node(struct io_ring_ctx *ctx,
 				     struct fixed_rsrc_ref_node *ref_node);
 
-static bool io_rw_reissue(struct io_kiocb *req, long res);
+static bool io_rw_reissue(struct io_kiocb *req);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_put_req_deferred(struct io_kiocb *req, int nr);
@@ -2578,7 +2578,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 
 		if (READ_ONCE(req->result) == -EAGAIN) {
 			req->iopoll_completed = 0;
-			if (io_rw_reissue(req, -EAGAIN))
+			if (io_rw_reissue(req))
 				continue;
 		}
 
@@ -2812,15 +2812,12 @@ static bool io_resubmit_prep(struct io_kiocb *req)
 }
 #endif
 
-static bool io_rw_reissue(struct io_kiocb *req, long res)
+static bool io_rw_reissue(struct io_kiocb *req)
 {
 #ifdef CONFIG_BLOCK
-	umode_t mode;
+	umode_t mode = file_inode(req->file)->i_mode;
 	int ret;
 
-	if (res != -EAGAIN && res != -EOPNOTSUPP)
-		return false;
-	mode = file_inode(req->file)->i_mode;
 	if (!S_ISBLK(mode) && !S_ISREG(mode))
 		return false;
 	if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
@@ -2843,8 +2840,10 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 			     unsigned int issue_flags)
 {
-	if (!io_rw_reissue(req, res))
-		io_complete_rw_common(&req->rw.kiocb, res, issue_flags);
+	if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
+		return;
+
+	io_complete_rw_common(&req->rw.kiocb, res, issue_flags);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)

From 2f8e45f16c57360dd4d8b1310c2952a29a8fa890 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 11 Feb 2021 18:28:23 +0000
Subject: [PATCH 165/183] io_uring: inline io_complete_rw_common()

__io_complete_rw() casts request to kiocb for it to be immediately
container_of()'ed by io_complete_rw_common(). And the last function's name
doesn't do a great job of illuminating its purposes, so just inline it in
its only user.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c873ec113bcc..7b1979624320 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2761,22 +2761,6 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
-static void io_complete_rw_common(struct kiocb *kiocb, long res,
-				  unsigned int issue_flags)
-{
-	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
-	int cflags = 0;
-
-	if (kiocb->ki_flags & IOCB_WRITE)
-		kiocb_end_write(req);
-
-	if (res != req->result)
-		req_set_fail_links(req);
-	if (req->flags & REQ_F_BUFFER_SELECTED)
-		cflags = io_put_rw_kbuf(req);
-	__io_req_complete(req, issue_flags, res, cflags);
-}
-
 #ifdef CONFIG_BLOCK
 static bool io_resubmit_prep(struct io_kiocb *req)
 {
@@ -2840,10 +2824,18 @@ static bool io_rw_reissue(struct io_kiocb *req)
 static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 			     unsigned int issue_flags)
 {
+	int cflags = 0;
+
 	if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req))
 		return;
+	if (res != req->result)
+		req_set_fail_links(req);
 
-	io_complete_rw_common(&req->rw.kiocb, res, issue_flags);
+	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
+		kiocb_end_write(req);
+	if (req->flags & REQ_F_BUFFER_SELECTED)
+		cflags = io_put_rw_kbuf(req);
+	__io_req_complete(req, issue_flags, res, cflags);
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)

From bd75904590de1c2bbdff55180cef209b13bd50fa Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 03:23:50 +0000
Subject: [PATCH 166/183] io_uring: take compl state from submit state

Completion and submission states are now coupled together, it's weird to
get one from argument and another from ctx, do it consistently for
io_req_free_batch(). It's also faster as we already have @state cached
in registers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7b1979624320..8c2613bf54d3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2417,13 +2417,10 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
 	rb->ctx_refs++;
 
 	io_dismantle_req(req);
-	if (state->free_reqs != ARRAY_SIZE(state->reqs)) {
+	if (state->free_reqs != ARRAY_SIZE(state->reqs))
 		state->reqs[state->free_reqs++] = req;
-	} else {
-		struct io_comp_state *cs = &req->ctx->submit_state.comp;
-
-		list_add(&req->compl.list, &cs->free_list);
-	}
+	else
+		list_add(&req->compl.list, &state->comp.free_list);
 }
 
 static void io_submit_flush_completions(struct io_comp_state *cs,

From d3d7298d05cb026305b0f5033acc9c9c4f281e14 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 03:23:51 +0000
Subject: [PATCH 167/183] io_uring: optimise out unlikely link queue

__io_queue_sqe() tries to issue as many requests of a link as it can,
and uses io_put_req_find_next() to extract the next one, targeting inline
completed requests. As __io_queue_sqe() is now always used together with
struct io_comp_state, next propagation is left with only a small window,
and only for async reqs, which doesn't justify its existence.

Remove it and make __io_queue_sqe() issue only the head request. It
simplifies the code and will allow other optimisations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 42 ++++++++++--------------------------------
 1 file changed, 10 insertions(+), 32 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8c2613bf54d3..26d1080217e5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6563,26 +6563,20 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 
 static void __io_queue_sqe(struct io_kiocb *req)
 {
-	struct io_kiocb *linked_timeout;
+	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
 	const struct cred *old_creds = NULL;
 	int ret;
 
-again:
-	linked_timeout = io_prep_linked_timeout(req);
-
 	if ((req->flags & REQ_F_WORK_INITIALIZED) &&
 	    (req->work.flags & IO_WQ_WORK_CREDS) &&
-	    req->work.identity->creds != current_cred()) {
-		if (old_creds)
-			revert_creds(old_creds);
-		if (old_creds == req->work.identity->creds)
-			old_creds = NULL; /* restored original creds */
-		else
-			old_creds = override_creds(req->work.identity->creds);
-	}
+	    req->work.identity->creds != current_cred())
+		old_creds = override_creds(req->work.identity->creds);
 
 	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
 
+	if (old_creds)
+		revert_creds(old_creds);
+
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
 	 * doesn't support non-blocking read/write attempts
@@ -6595,9 +6589,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			 */
 			io_queue_async_work(req);
 		}
-
-		if (linked_timeout)
-			io_queue_linked_timeout(linked_timeout);
 	} else if (likely(!ret)) {
 		/* drop submission reference */
 		if (req->flags & REQ_F_COMPLETE_INLINE) {
@@ -6605,31 +6596,18 @@ static void __io_queue_sqe(struct io_kiocb *req)
 			struct io_comp_state *cs = &ctx->submit_state.comp;
 
 			cs->reqs[cs->nr++] = req;
-			if (cs->nr == IO_COMPL_BATCH)
+			if (cs->nr == ARRAY_SIZE(cs->reqs))
 				io_submit_flush_completions(cs, ctx);
-			req = NULL;
 		} else {
-			req = io_put_req_find_next(req);
-		}
-
-		if (linked_timeout)
-			io_queue_linked_timeout(linked_timeout);
-
-		if (req) {
-			if (!(req->flags & REQ_F_FORCE_ASYNC))
-				goto again;
-			io_queue_async_work(req);
+			io_put_req(req);
 		}
 	} else {
-		/* un-prep timeout, so it'll be killed as any other linked */
-		req->flags &= ~REQ_F_LINK_TIMEOUT;
 		req_set_fail_links(req);
 		io_put_req(req);
 		io_req_complete(req, ret);
 	}
-
-	if (old_creds)
-		revert_creds(old_creds);
+	if (linked_timeout)
+		io_queue_linked_timeout(linked_timeout);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)

From 4e32635834a30b8aa9583d3899a8ecc6416023fb Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 03:23:52 +0000
Subject: [PATCH 168/183] io_uring: optimise SQPOLL mm/files grabbing

There are two reasons for this. The first is to optimise
io_sq_thread_acquire_mm_files() for the non-SQPOLL case, which currently
does too many checks and function calls in the hot path, e.g. in
io_init_req().

The second is to not grab mm/files when they are not needed. As
__io_queue_sqe() issues only one request now, we can reuse
io_sq_thread_acquire_mm_files() instead of unconditionally acquiring
mm/files.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 26d1080217e5..813d1ccd7a69 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1145,9 +1145,6 @@ static void io_sq_thread_drop_mm_files(void)
 
 static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx)
 {
-	if (current->flags & PF_EXITING)
-		return -EFAULT;
-
 	if (!current->files) {
 		struct files_struct *files;
 		struct nsproxy *nsproxy;
@@ -1175,15 +1172,9 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 {
 	struct mm_struct *mm;
 
-	if (current->flags & PF_EXITING)
-		return -EFAULT;
 	if (current->mm)
 		return 0;
 
-	/* Should never happen */
-	if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
-		return -EFAULT;
-
 	task_lock(ctx->sqo_task);
 	mm = ctx->sqo_task->mm;
 	if (unlikely(!mm || !mmget_not_zero(mm)))
@@ -1198,8 +1189,8 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 	return -EFAULT;
 }
 
-static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
-					 struct io_kiocb *req)
+static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+					   struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 	int ret;
@@ -1219,6 +1210,16 @@ static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
+						struct io_kiocb *req)
+{
+	if (unlikely(current->flags & PF_EXITING))
+		return -EFAULT;
+	if (!(ctx->flags & IORING_SETUP_SQPOLL))
+		return 0;
+	return __io_sq_thread_acquire_mm_files(ctx, req);
+}
+
 static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
 					 struct cgroup_subsys_state **cur_css)
 
@@ -2336,9 +2337,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	mutex_lock(&ctx->uring_lock);
-	if (!ctx->sqo_dead &&
-	    !__io_sq_thread_acquire_mm(ctx) &&
-	    !__io_sq_thread_acquire_files(ctx))
+	if (!ctx->sqo_dead && !io_sq_thread_acquire_mm_files(ctx, req))
 		__io_queue_sqe(req);
 	else
 		__io_req_task_cancel(req, -EFAULT);

From 921b9054e0c4c443c479c21800f6c4c8b43fa1b0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 03:23:53 +0000
Subject: [PATCH 169/183] io_uring: don't duplicate io_req_task_queue()

Don't hand code io_req_task_queue() inside of io_async_buf_func(), just
call it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 813d1ccd7a69..5c0b1a7dba80 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3494,7 +3494,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	struct wait_page_queue *wpq;
 	struct io_kiocb *req = wait->private;
 	struct wait_page_key *key = arg;
-	int ret;
 
 	wpq = container_of(wait, struct wait_page_queue, wait);
 
@@ -3504,14 +3503,9 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
 	list_del_init(&wait->entry);
 
-	req->task_work.func = io_req_task_submit;
-	percpu_ref_get(&req->ctx->refs);
-
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
-	ret = io_req_task_work_add(req);
-	if (unlikely(ret))
-		io_req_task_work_add_fallback(req, io_req_task_cancel);
+	io_req_task_queue(req);
 	return 1;
 }
 

From 04fc6c802dfacba800f5a5d00bea0ebfcc60f840 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 03:23:54 +0000
Subject: [PATCH 170/183] io_uring: save ctx put/get for task_work submit

Do a little trick in io_ring_ctx_free() briefly taking uring_lock, that
will wait for everyone currently holding it, so we can skip pinning ctx
with ctx->refs for __io_req_task_submit(), which is executed and loses
its refs/reqs while holding the lock.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5c0b1a7dba80..87f2f8e660e8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2336,6 +2336,7 @@ static void __io_req_task_submit(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
+	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
 	mutex_lock(&ctx->uring_lock);
 	if (!ctx->sqo_dead && !io_sq_thread_acquire_mm_files(ctx, req))
 		__io_queue_sqe(req);
@@ -2347,10 +2348,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
 static void io_req_task_submit(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
-	struct io_ring_ctx *ctx = req->ctx;
 
 	__io_req_task_submit(req);
-	percpu_ref_put(&ctx->refs);
 }
 
 static void io_req_task_queue(struct io_kiocb *req)
@@ -2358,11 +2357,11 @@ static void io_req_task_queue(struct io_kiocb *req)
 	int ret;
 
 	req->task_work.func = io_req_task_submit;
-	percpu_ref_get(&req->ctx->refs);
-
 	ret = io_req_task_work_add(req);
-	if (unlikely(ret))
+	if (unlikely(ret)) {
+		percpu_ref_get(&req->ctx->refs);
 		io_req_task_work_add_fallback(req, io_req_task_cancel);
+	}
 }
 
 static inline void io_queue_next(struct io_kiocb *req)
@@ -8707,6 +8706,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	struct io_submit_state *submit_state = &ctx->submit_state;
 
+	/*
+	 * Some may use context even when all refs and requests have been put,
+	 * and they are free to do so while still holding uring_lock, see
+	 * __io_req_task_submit(). Wait for them to finish.
+	 */
+	mutex_lock(&ctx->uring_lock);
+	mutex_unlock(&ctx->uring_lock);
+
 	io_finish_async(ctx);
 	io_sqe_buffers_unregister(ctx);
 

From 4fccfcbb733794634d4e873e7973c1847beca5bf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 11:55:17 +0000
Subject: [PATCH 171/183] io_uring: don't split out consume out of SQE get

Remove io_consume_sqe() and inline it back into io_get_sqe(). It
requires req dealloc on error, but in exchange we get cleaner
io_submit_sqes() and better locality for cached_sq_head.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 87f2f8e660e8..9c58be0579f3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6762,7 +6762,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	 * 2) allows the kernel side to track the head on its own, even
 	 *    though the application is the one updating it.
 	 */
-	head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
+	head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]);
 	if (likely(head < ctx->sq_entries))
 		return &ctx->sq_sqes[head];
 
@@ -6772,11 +6772,6 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
-static inline void io_consume_sqe(struct io_ring_ctx *ctx)
-{
-	ctx->cached_sq_head++;
-}
-
 /*
  * Check SQE restrictions (opcode and flags).
  *
@@ -6915,18 +6910,17 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 		struct io_kiocb *req;
 		int err;
 
-		sqe = io_get_sqe(ctx);
-		if (unlikely(!sqe)) {
-			io_consume_sqe(ctx);
-			break;
-		}
 		req = io_alloc_req(ctx);
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
-		io_consume_sqe(ctx);
+		sqe = io_get_sqe(ctx);
+		if (unlikely(!sqe)) {
+			kmem_cache_free(req_cachep, req);
+			break;
+		}
 		/* will complete beyond this point, count as submitted */
 		submitted++;
 

From dc0eced5d92052a84d58df03a3bc6382f64fecfa Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 18:41:15 +0000
Subject: [PATCH 172/183] io_uring: don't check PF_EXITING from syscall

io_sq_thread_acquire_mm_files() can find a PF_EXITING task only when
it's called from task_work context. Don't check it in all other cases,
i.e. when we're called from io_uring_enter().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9c58be0579f3..66bbb0dc50af 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1213,8 +1213,6 @@ static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
 static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx,
 						struct io_kiocb *req)
 {
-	if (unlikely(current->flags & PF_EXITING))
-		return -EFAULT;
 	if (!(ctx->flags & IORING_SETUP_SQPOLL))
 		return 0;
 	return __io_sq_thread_acquire_mm_files(ctx, req);
@@ -2338,7 +2336,8 @@ static void __io_req_task_submit(struct io_kiocb *req)
 
 	/* ctx stays valid until unlock, even if we drop all ours ctx->refs */
 	mutex_lock(&ctx->uring_lock);
-	if (!ctx->sqo_dead && !io_sq_thread_acquire_mm_files(ctx, req))
+	if (!ctx->sqo_dead && !(current->flags & PF_EXITING) &&
+	    !io_sq_thread_acquire_mm_files(ctx, req))
 		__io_queue_sqe(req);
 	else
 		__io_req_task_cancel(req, -EFAULT);

From cdbff98223330cdb6c57ead1533ce066dddd61b7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 18:41:16 +0000
Subject: [PATCH 173/183] io_uring: clean io_req_find_next() fast check

Indirectly io_req_find_next() is called for every request, optimise the
check by testing flags as it was long before -- __io_req_find_next()
tolerates false-positives well (i.e. link==NULL), and those should be
really rare.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 66bbb0dc50af..776531f6e18b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2172,7 +2172,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
 
 static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 {
-	if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT)))
+	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
 		return NULL;
 	return __io_req_find_next(req);
 }

From 5be9ad1e4287e1742fd8d253267c86446441bdaf Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 12 Feb 2021 18:41:17 +0000
Subject: [PATCH 174/183] io_uring: optimise io_init_req() flags setting

Invalid req->flags are tolerated by free/put well, avoid this dancing
needlessly presetting it to zero, and then not even resetting but
modifying it, i.e. "|=".

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 776531f6e18b..2e8cb739c835 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6806,14 +6806,15 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	struct io_submit_state *state;
 	unsigned int sqe_flags;
-	int id, ret;
+	int id, ret = 0;
 
 	req->opcode = READ_ONCE(sqe->opcode);
+	/* same numerical values with corresponding REQ_F_*, safe to copy */
+	req->flags = sqe_flags = READ_ONCE(sqe->flags);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->async_data = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
-	req->flags = 0;
 	req->link = NULL;
 	req->fixed_rsrc_refs = NULL;
 	/* one is dropped after submission, the other at completion */
@@ -6821,17 +6822,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->task = current;
 	req->result = 0;
 
+	/* enforce forwards compatibility on users */
+	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+		return -EINVAL;
+
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
 
 	if (unlikely(io_sq_thread_acquire_mm_files(ctx, req)))
 		return -EFAULT;
 
-	sqe_flags = READ_ONCE(sqe->flags);
-	/* enforce forwards compatibility on users */
-	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
-		return -EINVAL;
-
 	if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
 		return -EACCES;
 
@@ -6854,8 +6854,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		req->work.flags |= IO_WQ_WORK_CREDS;
 	}
 
-	/* same numerical values with corresponding REQ_F_*, safe to copy */
-	req->flags |= sqe_flags;
 	state = &ctx->submit_state;
 
 	/*
@@ -6868,7 +6866,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		state->plug_started = true;
 	}
 
-	ret = 0;
 	if (io_op_defs[req->opcode].needs_file) {
 		bool fixed = req->flags & REQ_F_FIXED_FILE;
 

From e06aa2e94f0532d04bad7713eb7c6a32ab9ba674 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 12 Feb 2021 14:02:54 -0700
Subject: [PATCH 175/183] io-wq: clear out worker ->fs and ->files

By default, kernel threads have init_fs and init_files assigned. In the
past, this has triggered security problems, as commands that don't ask
for (and hence don't get assigned) fs/files from the originating task
can then attempt path resolution etc with access to parts of the system
they should not be able to.

Rather than add checks in the fs code for misuse, just set these to
NULL. If we do attempt to use them, then the resulting code will oops
rather than provide access to something that it should not permit.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 63ef195b1acb..c36bbcd823ce 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -64,9 +64,7 @@ struct io_worker {
 #endif
 	const struct cred *cur_creds;
 	const struct cred *saved_creds;
-	struct files_struct *restore_files;
 	struct nsproxy *restore_nsproxy;
-	struct fs_struct *restore_fs;
 };
 
 #if BITS_PER_LONG == 64
@@ -156,19 +154,19 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 		worker->cur_creds = worker->saved_creds = NULL;
 	}
 
-	if (current->files != worker->restore_files) {
+	if (current->files) {
 		__acquire(&wqe->lock);
 		raw_spin_unlock_irq(&wqe->lock);
 		dropped_lock = true;
 
 		task_lock(current);
-		current->files = worker->restore_files;
+		current->files = NULL;
 		current->nsproxy = worker->restore_nsproxy;
 		task_unlock(current);
 	}
 
-	if (current->fs != worker->restore_fs)
-		current->fs = worker->restore_fs;
+	if (current->fs)
+		current->fs = NULL;
 
 	/*
 	 * If we have an active mm, we need to drop the wq lock before unusing
@@ -329,11 +327,11 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
 	allow_kernel_signal(SIGINT);
 
 	current->flags |= PF_IO_WORKER;
+	current->fs = NULL;
+	current->files = NULL;
 
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-	worker->restore_files = current->files;
 	worker->restore_nsproxy = current->nsproxy;
-	worker->restore_fs = current->fs;
 	io_wqe_inc_running(wqe, worker);
 }
 

From 68e68ee6e359318c40891f614612616d219066d0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Feb 2021 09:00:02 -0700
Subject: [PATCH 176/183] io_uring: allow task match to be passed to
 io_req_cache_free()

No changes in this patch, just allows a caller to pass in a targeted
task that we must match for freeing requests in the cache.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e8cb739c835..9cd7b03a6f34 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8681,12 +8681,13 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 	idr_destroy(&ctx->io_buffer_idr);
 }
 
-static void io_req_cache_free(struct list_head *list)
+static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
 {
-	while (!list_empty(list)) {
-		struct io_kiocb *req;
+	struct io_kiocb *req, *nxt;
 
-		req = list_first_entry(list, struct io_kiocb, compl.list);
+	list_for_each_entry_safe(req, nxt, list, compl.list) {
+		if (tsk && req->task != tsk)
+			continue;
 		list_del(&req->compl.list);
 		kmem_cache_free(req_cachep, req);
 	}
@@ -8742,8 +8743,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
-	io_req_cache_free(&ctx->submit_state.comp.free_list);
-	io_req_cache_free(&ctx->submit_state.comp.locked_free_list);
+	io_req_cache_free(&ctx->submit_state.comp.free_list, NULL);
+	io_req_cache_free(&ctx->submit_state.comp.locked_free_list, NULL);
 	kfree(ctx);
 }
 

From 9a4fdbd8ee0d8aca0cb5692446e5ca583b230cd7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Feb 2021 09:09:44 -0700
Subject: [PATCH 177/183] io_uring: add helper to free all request caches

We have three different ones, put it in a helper for easy calling. This
is in preparation for doing it outside of ring freeing as well. With
that in mind, also ensure that we do the proper locking for safe calling
from a context where the ring is still live.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9cd7b03a6f34..1895fc132252 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8693,10 +8693,27 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk)
 	}
 }
 
-static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk)
 {
 	struct io_submit_state *submit_state = &ctx->submit_state;
 
+	mutex_lock(&ctx->uring_lock);
+
+	if (submit_state->free_reqs)
+		kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
+				     submit_state->reqs);
+
+	io_req_cache_free(&submit_state->comp.free_list, NULL);
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_req_cache_free(&submit_state->comp.locked_free_list, NULL);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	mutex_unlock(&ctx->uring_lock);
+}
+
+static void io_ring_ctx_free(struct io_ring_ctx *ctx)
+{
 	/*
 	 * Some may use context even when all refs and requests have been put,
 	 * and they are free to do so while still holding uring_lock, see
@@ -8715,10 +8732,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 
-	if (submit_state->free_reqs)
-		kmem_cache_free_bulk(req_cachep, submit_state->free_reqs,
-				     submit_state->reqs);
-
 #ifdef CONFIG_BLK_CGROUP
 	if (ctx->sqo_blkcg_css)
 		css_put(ctx->sqo_blkcg_css);
@@ -8742,9 +8755,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
+	io_req_caches_free(ctx, NULL);
 	kfree(ctx->cancel_hash);
-	io_req_cache_free(&ctx->submit_state.comp.free_list, NULL);
-	io_req_cache_free(&ctx->submit_state.comp.locked_free_list, NULL);
 	kfree(ctx);
 }
 

From 41be53e94fb04cc69fdf2f524c2a05d8069e047b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Feb 2021 09:11:04 -0700
Subject: [PATCH 178/183] io_uring: kill cached requests from exiting task
 closing the ring

Be nice and prune these upfront, in case the ring is being shared and
one of the tasks is going away. This is a bit more important now that
we account the allocations.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1895fc132252..a9d094f7060f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -9232,8 +9232,10 @@ static int io_uring_flush(struct file *file, void *data)
 	struct io_uring_task *tctx = current->io_uring;
 	struct io_ring_ctx *ctx = file->private_data;
 
-	if (fatal_signal_pending(current) || (current->flags & PF_EXITING))
+	if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) {
 		io_uring_cancel_task_requests(ctx, NULL);
+		io_req_caches_free(ctx, current);
+	}
 
 	if (!tctx)
 		return 0;

From a890caeb2ba40ca183969230e204ab144f258357 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 8 Feb 2021 15:56:05 +0100
Subject: [PATCH 179/183] irqchip/imx: IMX_INTMUX should not default to y,
 unconditionally

Merely enabling CONFIG_COMPILE_TEST should not enable additional code.
To fix this, restrict the automatic enabling of IMX_INTMUX to ARCH_MXC,
and ask the user in case of compile-testing.

Fixes: 66968d7dfc3f5451 ("irqchip: Add COMPILE_TEST support for IMX_INTMUX")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20210208145605.422943-1-geert+renesas@glider.be
---
 drivers/irqchip/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 030895cc6f13..da7b3cf63b07 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -452,7 +452,8 @@ config IMX_IRQSTEER
 	  Support for the i.MX IRQSTEER interrupt multiplexer/remapper.
 
 config IMX_INTMUX
-	def_bool y if ARCH_MXC || COMPILE_TEST
+	bool "i.MX INTMUX support" if COMPILE_TEST
+	default y if ARCH_MXC
 	select IRQ_DOMAIN
 	help
 	  Support for the i.MX INTMUX interrupt multiplexer.

From 4cf29e43afc0dea7ccf6b09a20bd598fad47bf60 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Sun, 14 Feb 2021 10:31:02 +0000
Subject: [PATCH 180/183] lightnvm: fix unnecessary NULL check warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove NULL checks before vfree() to fix these warnings:
./drivers/lightnvm/pblk-gc.c:27:2-7: WARNING: NULL check before some
freeing functions is not needed.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-gc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 2581eebcfc41..b31658be35a7 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -23,8 +23,7 @@
 
 static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
 {
-	if (gc_rq->data)
-		vfree(gc_rq->data);
+	vfree(gc_rq->data);
 	kfree(gc_rq);
 }
 

From f4b64ae6745177642cd9610cfd7df0041e7fca58 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sun, 14 Feb 2021 10:31:03 +0000
Subject: [PATCH 181/183] lightnvm: pblk: Replace guid_copy() with
 export_guid()/import_guid()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is a specific API to treat raw data as GUID, i.e. export_guid()
and import_guid(). Use them instead of guid_copy() with explicit casting.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c     | 5 ++---
 drivers/lightnvm/pblk-recovery.c | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 1dddba11e721..33d39d3dd343 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -988,7 +988,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
 	bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
 
 	smeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
-	guid_copy((guid_t *)&smeta_buf->header.uuid, &pblk->instance_uuid);
+	export_guid(smeta_buf->header.uuid, &pblk->instance_uuid);
 	smeta_buf->header.id = cpu_to_le32(line->id);
 	smeta_buf->header.type = cpu_to_le16(line->type);
 	smeta_buf->header.version_major = SMETA_VERSION_MAJOR;
@@ -1803,8 +1803,7 @@ void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
 
 	if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC) {
 		emeta_buf->header.identifier = cpu_to_le32(PBLK_MAGIC);
-		guid_copy((guid_t *)&emeta_buf->header.uuid,
-							&pblk->instance_uuid);
+		export_guid(emeta_buf->header.uuid, &pblk->instance_uuid);
 		emeta_buf->header.id = cpu_to_le32(line->id);
 		emeta_buf->header.type = cpu_to_le16(line->type);
 		emeta_buf->header.version_major = EMETA_VERSION_MAJOR;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 299ef47a17b2..0e6f0c76e930 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -706,8 +706,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 
 		/* The first valid instance uuid is used for initialization */
 		if (!valid_uuid) {
-			guid_copy(&pblk->instance_uuid,
-				  (guid_t *)&smeta_buf->header.uuid);
+			import_guid(&pblk->instance_uuid, smeta_buf->header.uuid);
 			valid_uuid = 1;
 		}
 

From 0d4370cfe36b7f1719123b621a4ec4d9c7a25f89 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 14 Feb 2021 13:21:43 -0700
Subject: [PATCH 182/183] proc: don't allow async path resolution of
 /proc/thread-self components

If this is attempted by an io-wq kthread, then return -EOPNOTSUPP as we
don't currently support that. Once we can get task_pid_ptr() doing the
right thing, then this can go away again.

Use PF_IO_WORKER for this to specifically target the io_uring workers.
Modify the /proc/self/ check to use PF_IO_WORKER as well.

Cc: stable@vger.kernel.org
Fixes: 8d4c3e76e3be ("proc: don't allow async path resolution of /proc/self components")
Reported-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/proc/self.c        | 2 +-
 fs/proc/thread_self.c | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/proc/self.c b/fs/proc/self.c
index cc71ce3466dc..a4012154e109 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -20,7 +20,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
 	 * Not currently supported. Once we can inherit all of struct pid,
 	 * we can allow this.
 	 */
-	if (current->flags & PF_KTHREAD)
+	if (current->flags & PF_IO_WORKER)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	if (!tgid)
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..d56681d86d28 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
 	pid_t pid = task_pid_nr_ns(current, ns);
 	char *name;
 
+	/*
+	 * Not currently supported. Once we can inherit all of struct pid,
+	 * we can allow this.
+	 */
+	if (current->flags & PF_IO_WORKER)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (!pid)
 		return ERR_PTR(-ENOENT);
 	name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);

From 0b81e80c813f92520667c872d499a2dba8377be6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Feb 2021 10:33:53 -0700
Subject: [PATCH 183/183] io_uring: tctx->task_lock should be IRQ safe

We add task_work from any context, hence we need to ensure that we can
tolerate it being from IRQ context as well.

Fixes: 7cbf1722d5fc ("io_uring: provide FIFO ordering for task_work")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a9d094f7060f..58dd10481106 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2186,10 +2186,10 @@ static bool __tctx_task_work(struct io_uring_task *tctx)
 	if (wq_list_empty(&tctx->task_list))
 		return false;
 
-	spin_lock(&tctx->task_lock);
+	spin_lock_irq(&tctx->task_lock);
 	list = tctx->task_list;
 	INIT_WQ_LIST(&tctx->task_list);
-	spin_unlock(&tctx->task_lock);
+	spin_unlock_irq(&tctx->task_lock);
 
 	node = list.first;
 	while (node) {
@@ -2236,13 +2236,14 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
 {
 	struct io_uring_task *tctx = tsk->io_uring;
 	struct io_wq_work_node *node, *prev;
+	unsigned long flags;
 	int ret;
 
 	WARN_ON_ONCE(!tctx);
 
-	spin_lock(&tctx->task_lock);
+	spin_lock_irqsave(&tctx->task_lock, flags);
 	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
-	spin_unlock(&tctx->task_lock);
+	spin_unlock_irqrestore(&tctx->task_lock, flags);
 
 	/* task_work already pending, we're done */
 	if (test_bit(0, &tctx->task_state) ||
@@ -2257,7 +2258,7 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
 	 * in the list, it got run and we're fine.
 	 */
 	ret = 0;
-	spin_lock(&tctx->task_lock);
+	spin_lock_irqsave(&tctx->task_lock, flags);
 	wq_list_for_each(node, prev, &tctx->task_list) {
 		if (&req->io_task_work.node == node) {
 			wq_list_del(&tctx->task_list, node, prev);
@@ -2265,7 +2266,7 @@ static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req,
 			break;
 		}
 	}
-	spin_unlock(&tctx->task_lock);
+	spin_unlock_irqrestore(&tctx->task_lock, flags);
 	clear_bit(0, &tctx->task_state);
 	return ret;
 }