From 291b5c9870fc546376d69cf792b7885cd0c9c1b3 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 21 Dec 2020 19:59:31 -0700
Subject: [PATCH 01/69] i3c/master/mipi-i3c-hci: Fix position of __maybe_unused
 in i3c_hci_of_match

Clang warns:

 ../drivers/i3c/master/mipi-i3c-hci/core.c:780:21: warning: attribute
 declaration must precede definition [-Wignored-attributes]
 static const struct __maybe_unused of_device_id i3c_hci_of_match[] = {
                     ^
 ../include/linux/compiler_attributes.h:267:56: note: expanded from macro
 '__maybe_unused'
 #define __maybe_unused                  __attribute__((__unused__))
                                                        ^
 ../include/linux/mod_devicetable.h:262:8: note: previous definition is
 here
 struct of_device_id {
        ^
1 warning generated.

'struct of_device_id' should not be split, as it is a type. Move the
__maybe_unused attribute after the static and const qualifiers so that
there are no warnings about this variable, period.

Fixes: 95393f3e07ab ("i3c/master/mipi-i3c-hci: quiet maybe-unused variable warning")
Link: https://github.com/ClangBuiltLinux/linux/issues/1221
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Acked-by: Nicolas Pitre <npitre@baylibre.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20201222025931.3043480-1-natechancellor@gmail.com
---
 drivers/i3c/master/mipi-i3c-hci/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c
index 500abd27fb22..1b73647cc3b1 100644
--- a/drivers/i3c/master/mipi-i3c-hci/core.c
+++ b/drivers/i3c/master/mipi-i3c-hci/core.c
@@ -777,7 +777,7 @@ static int i3c_hci_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct __maybe_unused of_device_id i3c_hci_of_match[] = {
+static const __maybe_unused struct of_device_id i3c_hci_of_match[] = {
 	{ .compatible = "mipi-i3c-hci", },
 	{},
 };

From 16e19e11228ba660d9e322035635e7dcf160d5c2 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 15 Jan 2021 14:52:52 -0700
Subject: [PATCH 02/69] dmaengine: idxd: Fix list corruption in description
 completion

Sanjay reported the following kernel splat after running dmatest for stress
testing. The current code is giving up the spinlock in the middle of
a completion list walk, and that opens up opportunity for list corruption
if another thread touches the list at the same time. In order to make sure
the list is always protected, the hardware completed descriptors will be
put on a local list to be completed with callbacks from the outside of
the list lock.

kernel: general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI
kernel: CPU: 62 PID: 1814 Comm: irq/89-idxd-por Tainted: G        W         5.10.0-intel-next_10_16+ #1
kernel: Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SBT.4915.D02.2012070418 12/07/2020
kernel: RIP: 0010:irq_process_work_list+0xcd/0x170 [idxd]
kernel: Code: e8 18 65 5c d3 8b 45 d4 85 c0 75 b3 4c 89 f7 e8 b9 fe ff ff 84 c0 74 bf 4c 89 e7 e8 dd 6b 5c d3 49 8b 3f 49 8b 4f 08 48 89 c6 <48> 89 4f 08 48 89 39 4c 89 e7 48 b9 00 01 00 00 00 00 ad de 49 89
kernel: RSP: 0018:ff256768c4353df8 EFLAGS: 00010046
kernel: RAX: 0000000000000202 RBX: dead000000000100 RCX: dead000000000122
kernel: RDX: 0000000000000001 RSI: 0000000000000202 RDI: dead000000000100
kernel: RBP: ff256768c4353e40 R08: 0000000000000001 R09: 0000000000000000
kernel: R10: 0000000000000000 R11: 00000000ffffffff R12: ff1fdf9fd06b3e48
kernel: R13: 0000000000000005 R14: ff1fdf9fc4275980 R15: ff1fdf9fc4275a00
kernel: FS:  0000000000000000(0000) GS:ff1fdfa32f880000(0000) knlGS:0000000000000000
kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kernel: CR2: 00007f87f24012a0 CR3: 000000010f630004 CR4: 0000000003771ee0
kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
kernel: DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
kernel: PKRU: 55555554
kernel: Call Trace:
kernel: ? irq_thread+0xa9/0x1b0
kernel: idxd_wq_thread+0x34/0x90 [idxd]
kernel: irq_thread_fn+0x24/0x60
kernel: irq_thread+0x10f/0x1b0
kernel: ? irq_forced_thread_fn+0x80/0x80
kernel: ? wake_threads_waitq+0x30/0x30
kernel: ? irq_thread_check_affinity+0xe0/0xe0
kernel: kthread+0x142/0x160
kernel: ? kthread_park+0x90/0x90
kernel: ret_from_fork+0x1f/0x30
kernel: Modules linked in: idxd_ktest dmatest intel_rapl_msr idxd_mdev iTCO_wdt vfio_pci vfio_virqfd iTCO_vendor_support intel_rapl_common i10nm_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper rapl msr pcspkr snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_intel snd_intel_dspcfg ofpart snd_hda_codec snd_hda_core snd_hwdep cmdlinepart snd_seq snd_seq_device idxd snd_pcm intel_spi_pci intel_spi snd_timer spi_nor input_leds joydev snd i2c_i801 mtd soundcore i2c_smbus mei_me mei i2c_ismt ipmi_ssif acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler mac_hid sunrpc nls_iso8859_1 sch_fq_codel ip_tables x_tables xfs libcrc32c ast drm_vram_helper drm_ttm_helper ttm igc wmi pinctrl_sunrisepoint hid_generic usbmouse usbkbd usbhid hid
kernel: ---[ end trace cd5ca950ef0db25f ]---

Reported-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
Fixes: e4f4d8cdeb9a ("dmaengine: idxd: Clean up descriptors with fault error")
Link: https://lore.kernel.org/r/161074757267.2183951.17912830858060607266.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/irq.c | 86 +++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 42 deletions(-)

diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 593a2f6ed16c..fe996cec7c74 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -219,29 +219,27 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 	return IRQ_HANDLED;
 }
 
-static bool process_fault(struct idxd_desc *desc, u64 fault_addr)
+static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr)
 {
 	/*
 	 * Completion address can be bad as well. Check fault address match for descriptor
 	 * and completion address.
 	 */
-	if ((u64)desc->hw == fault_addr ||
-	    (u64)desc->completion == fault_addr) {
-		idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL);
+	if ((u64)desc->hw == fault_addr || (u64)desc->completion == fault_addr) {
+		struct idxd_device *idxd = desc->wq->idxd;
+		struct device *dev = &idxd->pdev->dev;
+
+		dev_warn(dev, "desc with fault address: %#llx\n", fault_addr);
 		return true;
 	}
 
 	return false;
 }
 
-static bool complete_desc(struct idxd_desc *desc)
+static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason)
 {
-	if (desc->completion->status) {
-		idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL);
-		return true;
-	}
-
-	return false;
+	idxd_dma_complete_txd(desc, reason);
+	idxd_free_desc(desc->wq, desc);
 }
 
 static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
@@ -251,25 +249,25 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
 	struct idxd_desc *desc, *t;
 	struct llist_node *head;
 	int queued = 0;
-	bool completed = false;
 	unsigned long flags;
+	enum idxd_complete_type reason;
 
 	*processed = 0;
 	head = llist_del_all(&irq_entry->pending_llist);
 	if (!head)
 		goto out;
 
-	llist_for_each_entry_safe(desc, t, head, llnode) {
-		if (wtype == IRQ_WORK_NORMAL)
-			completed = complete_desc(desc);
-		else if (wtype == IRQ_WORK_PROCESS_FAULT)
-			completed = process_fault(desc, data);
+	if (wtype == IRQ_WORK_NORMAL)
+		reason = IDXD_COMPLETE_NORMAL;
+	else
+		reason = IDXD_COMPLETE_DEV_FAIL;
 
-		if (completed) {
-			idxd_free_desc(desc->wq, desc);
+	llist_for_each_entry_safe(desc, t, head, llnode) {
+		if (desc->completion->status) {
+			if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS)
+				match_fault(desc, data);
+			complete_desc(desc, reason);
 			(*processed)++;
-			if (wtype == IRQ_WORK_PROCESS_FAULT)
-				break;
 		} else {
 			spin_lock_irqsave(&irq_entry->list_lock, flags);
 			list_add_tail(&desc->list,
@@ -287,42 +285,46 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry,
 				 enum irq_work_type wtype,
 				 int *processed, u64 data)
 {
-	struct list_head *node, *next;
 	int queued = 0;
-	bool completed = false;
 	unsigned long flags;
+	LIST_HEAD(flist);
+	struct idxd_desc *desc, *n;
+	enum idxd_complete_type reason;
 
 	*processed = 0;
+	if (wtype == IRQ_WORK_NORMAL)
+		reason = IDXD_COMPLETE_NORMAL;
+	else
+		reason = IDXD_COMPLETE_DEV_FAIL;
+
+	/*
+	 * This lock protects list corruption from access of list outside of the irq handler
+	 * thread.
+	 */
 	spin_lock_irqsave(&irq_entry->list_lock, flags);
-	if (list_empty(&irq_entry->work_list))
-		goto out;
-
-	list_for_each_safe(node, next, &irq_entry->work_list) {
-		struct idxd_desc *desc =
-			container_of(node, struct idxd_desc, list);
-
+	if (list_empty(&irq_entry->work_list)) {
 		spin_unlock_irqrestore(&irq_entry->list_lock, flags);
-		if (wtype == IRQ_WORK_NORMAL)
-			completed = complete_desc(desc);
-		else if (wtype == IRQ_WORK_PROCESS_FAULT)
-			completed = process_fault(desc, data);
+		return 0;
+	}
 
-		if (completed) {
-			spin_lock_irqsave(&irq_entry->list_lock, flags);
+	list_for_each_entry_safe(desc, n, &irq_entry->work_list, list) {
+		if (desc->completion->status) {
 			list_del(&desc->list);
-			spin_unlock_irqrestore(&irq_entry->list_lock, flags);
-			idxd_free_desc(desc->wq, desc);
 			(*processed)++;
-			if (wtype == IRQ_WORK_PROCESS_FAULT)
-				return queued;
+			list_add_tail(&desc->list, &flist);
 		} else {
 			queued++;
 		}
-		spin_lock_irqsave(&irq_entry->list_lock, flags);
 	}
 
- out:
 	spin_unlock_irqrestore(&irq_entry->list_lock, flags);
+
+	list_for_each_entry(desc, &flist, list) {
+		if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS)
+			match_fault(desc, data);
+		complete_desc(desc, reason);
+	}
+
 	return queued;
 }
 

From f5cc9ace24fbdf41b4814effbb2f9bad7046e988 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 15 Jan 2021 14:52:33 -0700
Subject: [PATCH 03/69] dmaengine: idxd: fix misc interrupt completion

Nikhil reported the misc interrupt handler can sometimes miss handling
the command interrupt when an error interrupt happens near the same time.
Have the irq handling thread continue to process the misc interrupts until
all interrupts are processed. This is a low usage interrupt and is not
expected to handle high volume traffic. Therefore there is no concern of
this thread running for a long time.

Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support")
Reported-by: Nikhil Rao <nikhil.rao@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/161074755329.2183844.13295528344116907983.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/irq.c | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index fe996cec7c74..a60ca11a5784 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -111,19 +111,14 @@ irqreturn_t idxd_irq_handler(int vec, void *data)
 	return IRQ_WAKE_THREAD;
 }
 
-irqreturn_t idxd_misc_thread(int vec, void *data)
+static int process_misc_interrupts(struct idxd_device *idxd, u32 cause)
 {
-	struct idxd_irq_entry *irq_entry = data;
-	struct idxd_device *idxd = irq_entry->idxd;
 	struct device *dev = &idxd->pdev->dev;
 	union gensts_reg gensts;
-	u32 cause, val = 0;
+	u32 val = 0;
 	int i;
 	bool err = false;
 
-	cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET);
-	iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET);
-
 	if (cause & IDXD_INTC_ERR) {
 		spin_lock_bh(&idxd->dev_lock);
 		for (i = 0; i < 4; i++)
@@ -181,7 +176,7 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 			      val);
 
 	if (!err)
-		goto out;
+		return 0;
 
 	/*
 	 * This case should rarely happen and typically is due to software
@@ -211,10 +206,33 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 				gensts.reset_type == IDXD_DEVICE_RESET_FLR ?
 				"FLR" : "system reset");
 			spin_unlock_bh(&idxd->dev_lock);
+			return -ENXIO;
 		}
 	}
 
- out:
+	return 0;
+}
+
+irqreturn_t idxd_misc_thread(int vec, void *data)
+{
+	struct idxd_irq_entry *irq_entry = data;
+	struct idxd_device *idxd = irq_entry->idxd;
+	int rc;
+	u32 cause;
+
+	cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET);
+	if (cause)
+		iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET);
+
+	while (cause) {
+		rc = process_misc_interrupts(idxd, cause);
+		if (rc < 0)
+			break;
+		cause = ioread32(idxd->reg_base + IDXD_INTCAUSE_OFFSET);
+		if (cause)
+			iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET);
+	}
+
 	idxd_unmask_msix_vector(idxd, irq_entry->id);
 	return IRQ_HANDLED;
 }

From e594443196d6e0ef3d3b30320c49b3a4d4f9a547 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Mon, 18 Jan 2021 10:28:44 -0700
Subject: [PATCH 04/69] dmaengine: move channel device_node deletion to driver

Channel device_node deletion is managed by the device driver rather than
the dmaengine core. The deletion was accidentally introduced when making
channel unregister dynamic. It causes xilinx_dma module to crash on unload
as reported by Radhey. Remove chan->device_node delete in dmaengine and
also fix up idxd driver.

[   42.142705] Internal error: Oops: 96000044 [#1] SMP
[   42.147566] Modules linked in: xilinx_dma(-) clk_xlnx_clock_wizard uio_pdrv_genirq
[   42.155139] CPU: 1 PID: 2075 Comm: rmmod Not tainted 5.10.1-00026-g3a2e6dd7a05-dirty #192
[   42.163302] Hardware name: Enclustra XU5 SOM (DT)
[   42.167992] pstate: 40000005 (nZcv daif -PAN -UAO -TCO BTYPE=--)
[   42.173996] pc : xilinx_dma_chan_remove+0x74/0xa0 [xilinx_dma]
[   42.179815] lr : xilinx_dma_chan_remove+0x70/0xa0 [xilinx_dma]
[   42.185636] sp : ffffffc01112bca0
[   42.188935] x29: ffffffc01112bca0 x28: ffffff80402ea640

xilinx_dma_chan_remove+0x74/0xa0:
__list_del at ./include/linux/list.h:112 (inlined by)
__list_del_entry at./include/linux/list.h:135 (inlined by)
list_del at ./include/linux/list.h:146 (inlined by)
xilinx_dma_chan_remove at drivers/dma/xilinx/xilinx_dma.c:2546

Fixes: e81274cd6b52 ("dmaengine: add support to dynamic register/unregister of channels")
Reported-by: Radhey Shyam Pandey <radheys@xilinx.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
Link: https://lore.kernel.org/r/161099092469.2495902.5064826526660062342.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Cc: stable@vger.kernel.org # 5.9+
---
 drivers/dma/dmaengine.c | 1 -
 drivers/dma/idxd/dma.c  | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 962cbb5e5f7f..fe6a460c4373 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -1110,7 +1110,6 @@ static void __dma_async_device_channel_unregister(struct dma_device *device,
 		  "%s called while %d clients hold a reference\n",
 		  __func__, chan->client_count);
 	mutex_lock(&dma_list_mutex);
-	list_del(&chan->device_node);
 	device->chancnt--;
 	chan->dev->chan = NULL;
 	mutex_unlock(&dma_list_mutex);
diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c
index 8ed2773d8285..71fd6e4c42cd 100644
--- a/drivers/dma/idxd/dma.c
+++ b/drivers/dma/idxd/dma.c
@@ -205,5 +205,8 @@ int idxd_register_dma_channel(struct idxd_wq *wq)
 
 void idxd_unregister_dma_channel(struct idxd_wq *wq)
 {
-	dma_async_device_channel_unregister(&wq->idxd->dma_dev, &wq->dma_chan);
+	struct dma_chan *chan = &wq->dma_chan;
+
+	dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan);
+	list_del(&chan->device_node);
 }

From fed1b6a00a191cad4dd843519b590e3d6ad9f843 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 24 Jan 2021 08:09:23 +0100
Subject: [PATCH 05/69] dmaengine: ti: k3-udma: Fix a resource leak in an error
 handling path

In 'dma_pool_create()', we return -ENOMEM, but don't release the resources
already allocated, as in all the other error handling paths.

Go to 'err_res_free' instead of returning directly.

Fixes: 017794739702 ("dmaengine: ti: k3-udma: Initial support for K3 BCDMA")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Link: https://lore.kernel.org/r/20210124070923.724479-1-christophe.jaillet@wanadoo.fr
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 298460438bb4..f474a1232335 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -2401,7 +2401,8 @@ static int bcdma_alloc_chan_resources(struct dma_chan *chan)
 			dev_err(ud->ddev.dev,
 				"Descriptor pool allocation failed\n");
 			uc->use_dma_pool = false;
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto err_res_free;
 		}
 
 		uc->use_dma_pool = true;

From b64acb28da8394485f0762e657470c9fc33aca4d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 25 Jan 2021 12:36:42 +0100
Subject: [PATCH 06/69] ath9k: fix build error with LEDS_CLASS=m

When CONFIG_ATH9K is built-in but LED support is in a loadable
module, both ath9k drivers fails to link:

x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_deinit_leds':
gpio.c:(.text+0x36): undefined reference to `led_classdev_unregister'
x86_64-linux-ld: drivers/net/wireless/ath/ath9k/gpio.o: in function `ath_init_leds':
gpio.c:(.text+0x179): undefined reference to `led_classdev_register_ext'

The problem is that the 'imply' keyword does not enforce any dependency
but is only a weak hint to Kconfig to enable another symbol from a
defconfig file.

Change imply to a 'depends on LEDS_CLASS' that prevents the incorrect
configuration but still allows building the driver without LED support.

The 'select MAC80211_LEDS' is now ensures that the LED support is
actually used if it is present, and the added Kconfig dependency
on MAC80211_LEDS ensures that it cannot be enabled manually when it
has no effect.

Fixes: 197f466e93f5 ("ath9k_htc: Do not select MAC80211_LEDS by default")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/20210125113654.2408057-1-arnd@kernel.org
---
 drivers/net/wireless/ath/ath9k/Kconfig | 8 ++------
 net/mac80211/Kconfig                   | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/Kconfig b/drivers/net/wireless/ath/ath9k/Kconfig
index a84bb9b6573f..e150d82eddb6 100644
--- a/drivers/net/wireless/ath/ath9k/Kconfig
+++ b/drivers/net/wireless/ath/ath9k/Kconfig
@@ -21,11 +21,9 @@ config ATH9K_BTCOEX_SUPPORT
 config ATH9K
 	tristate "Atheros 802.11n wireless cards support"
 	depends on MAC80211 && HAS_DMA
+	select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211
 	select ATH9K_HW
 	select ATH9K_COMMON
-	imply NEW_LEDS
-	imply LEDS_CLASS
-	imply MAC80211_LEDS
 	help
 	  This module adds support for wireless adapters based on
 	  Atheros IEEE 802.11n AR5008, AR9001 and AR9002 family
@@ -176,11 +174,9 @@ config ATH9K_PCI_NO_EEPROM
 config ATH9K_HTC
 	tristate "Atheros HTC based wireless cards support"
 	depends on USB && MAC80211
+	select MAC80211_LEDS if LEDS_CLASS=y || LEDS_CLASS=MAC80211
 	select ATH9K_HW
 	select ATH9K_COMMON
-	imply NEW_LEDS
-	imply LEDS_CLASS
-	imply MAC80211_LEDS
 	help
 	  Support for Atheros HTC based cards.
 	  Chipsets supported: AR9271
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index cd9a9bd242ba..51ec8256b7fa 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -69,7 +69,7 @@ config MAC80211_MESH
 config MAC80211_LEDS
 	bool "Enable LED triggers"
 	depends on MAC80211
-	depends on LEDS_CLASS
+	depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211
 	select LEDS_TRIGGERS
 	help
 	  This option enables a few LED triggers for different

From 93a1d4791c10d443bc67044def7efee2991d48b7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 26 Jan 2021 12:02:13 +0100
Subject: [PATCH 07/69] mt76: dma: fix a possible memory leak in
 mt76_add_fragment()

Fix a memory leak in mt76_add_fragment routine returning the buffer
to the page_frag_cache when we receive a new fragment and the
skb_shared_info frag array is full.

Fixes: b102f0c522cf6 ("mt76: fix array overflow on receiving too many fragments for a packet")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Acked-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
Link: https://lore.kernel.org/r/4f9dd73407da88b2a552517ce8db242d86bf4d5c.1611616130.git.lorenzo@kernel.org
---
 drivers/net/wireless/mediatek/mt76/dma.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index 73eeb00d5aa6..e81dfaf99bcb 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -509,15 +509,17 @@ static void
 mt76_add_fragment(struct mt76_dev *dev, struct mt76_queue *q, void *data,
 		  int len, bool more)
 {
-	struct page *page = virt_to_head_page(data);
-	int offset = data - page_address(page);
 	struct sk_buff *skb = q->rx_head;
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 
 	if (shinfo->nr_frags < ARRAY_SIZE(shinfo->frags)) {
-		offset += q->buf_offset;
+		struct page *page = virt_to_head_page(data);
+		int offset = data - page_address(page) + q->buf_offset;
+
 		skb_add_rx_frag(skb, shinfo->nr_frags, page, offset, len,
 				q->buf_size);
+	} else {
+		skb_free_frag(data);
 	}
 
 	if (more)

From 89e3becd8f821e507052e012d2559dcda59f538e Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Mon, 1 Feb 2021 08:26:14 -0700
Subject: [PATCH 08/69] dmaengine: idxd: check device state before issue
 command

Add device state check before executing command. Without the check the
command can be issued while device is in halt state and causes the driver to
block while waiting for the completion of the command.

Reported-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Sanjay Kumar <sanjay.k.kumar@intel.com>
Fixes: 0d5c10b4c84d ("dmaengine: idxd: add work queue drain support")
Link: https://lore.kernel.org/r/161219313921.2976211.12222625226450097465.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/device.c | 23 ++++++++++++++++++++++-
 drivers/dma/idxd/idxd.h   |  2 +-
 drivers/dma/idxd/init.c   |  5 ++++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 95f94a3ed6be..84a6ea60ecf0 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -398,17 +398,31 @@ static inline bool idxd_is_enabled(struct idxd_device *idxd)
 	return false;
 }
 
+static inline bool idxd_device_is_halted(struct idxd_device *idxd)
+{
+	union gensts_reg gensts;
+
+	gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET);
+
+	return (gensts.state == IDXD_DEVICE_STATE_HALT);
+}
+
 /*
  * This is function is only used for reset during probe and will
  * poll for completion. Once the device is setup with interrupts,
  * all commands will be done via interrupt completion.
  */
-void idxd_device_init_reset(struct idxd_device *idxd)
+int idxd_device_init_reset(struct idxd_device *idxd)
 {
 	struct device *dev = &idxd->pdev->dev;
 	union idxd_command_reg cmd;
 	unsigned long flags;
 
+	if (idxd_device_is_halted(idxd)) {
+		dev_warn(&idxd->pdev->dev, "Device is HALTED!\n");
+		return -ENXIO;
+	}
+
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.cmd = IDXD_CMD_RESET_DEVICE;
 	dev_dbg(dev, "%s: sending reset for init.\n", __func__);
@@ -419,6 +433,7 @@ void idxd_device_init_reset(struct idxd_device *idxd)
 	       IDXD_CMDSTS_ACTIVE)
 		cpu_relax();
 	spin_unlock_irqrestore(&idxd->dev_lock, flags);
+	return 0;
 }
 
 static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
@@ -428,6 +443,12 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand,
 	DECLARE_COMPLETION_ONSTACK(done);
 	unsigned long flags;
 
+	if (idxd_device_is_halted(idxd)) {
+		dev_warn(&idxd->pdev->dev, "Device is HALTED!\n");
+		*status = IDXD_CMDSTS_HW_ERR;
+		return;
+	}
+
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.cmd = cmd_code;
 	cmd.operand = operand;
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 5a50e91c71bf..81a0e65fd316 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -326,7 +326,7 @@ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id);
 void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id);
 
 /* device control */
-void idxd_device_init_reset(struct idxd_device *idxd);
+int idxd_device_init_reset(struct idxd_device *idxd);
 int idxd_device_enable(struct idxd_device *idxd);
 int idxd_device_disable(struct idxd_device *idxd);
 void idxd_device_reset(struct idxd_device *idxd);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 2c051e07c34c..fa04acd5582a 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -335,7 +335,10 @@ static int idxd_probe(struct idxd_device *idxd)
 	int rc;
 
 	dev_dbg(dev, "%s entered and resetting device\n", __func__);
-	idxd_device_init_reset(idxd);
+	rc = idxd_device_init_reset(idxd);
+	if (rc < 0)
+		return rc;
+
 	dev_dbg(dev, "IDXD reset complete\n");
 
 	if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) {

From 548f1191d86ccb9bde2a5305988877b7584c01eb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 2 Feb 2021 23:06:36 -0800
Subject: [PATCH 09/69] bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called
 via do_int3

The commit 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()")
converted do_int3 handler to be "NMI-like".
That made old if (in_nmi()) check abort execution of bpf programs
attached to kprobe when kprobe is firing via int3
(For example when kprobe is placed in the middle of the function).
Remove the check to restore user visible behavior.

Fixes: 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()")
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lore.kernel.org/bpf/20210203070636.70926-1-alexei.starovoitov@gmail.com
---
 kernel/trace/bpf_trace.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 6c0018abe68a..764400260eb6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	unsigned int ret;
 
-	if (in_nmi()) /* not supported yet */
-		return 1;
-
 	cant_sleep();
 
 	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {

From 6183f4d3a0a2ad230511987c6c362ca43ec0055f Mon Sep 17 00:00:00 2001
From: Bui Quang Minh <minhquangbui99@gmail.com>
Date: Wed, 27 Jan 2021 06:36:53 +0000
Subject: [PATCH 10/69] bpf: Check for integer overflow when using
 roundup_pow_of_two()

On 32-bit architecture, roundup_pow_of_two() can return 0 when the argument
has upper most bit set due to resulting 1UL << 32. Add a check for this case.

Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE")
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210127063653.3576-1-minhquangbui99@gmail.com
---
 kernel/bpf/stackmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index aea96b638473..bfafbf115bf3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -115,6 +115,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* hash table size must be power of 2 */
 	n_buckets = roundup_pow_of_two(attr->max_entries);
+	if (!n_buckets)
+		return ERR_PTR(-E2BIG);
 
 	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
 	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));

From a4dc7eee9106a9d2a6e08b442db19677aa9699c7 Mon Sep 17 00:00:00 2001
From: Christoph Schemmel <christoph.schemmel@gmail.com>
Date: Tue, 2 Feb 2021 09:45:23 +0100
Subject: [PATCH 11/69] NET: usb: qmi_wwan: Adding support for Cinterion MV31

Adding support for Cinterion MV31 with PID 0x00B7.

T:  Bus=04 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 11 Spd=5000 MxCh= 0
D:  Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs=  1
P:  Vendor=1e2d ProdID=00b7 Rev=04.14
S:  Manufacturer=Cinterion
S:  Product=Cinterion USB Mobile Broadband
S:  SerialNumber=b3246eed
C:  #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=896mA
I:  If#=0x0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan
I:  If#=0x1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option
I:  If#=0x2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option
I:  If#=0x3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option

Signed-off-by: Christoph Schemmel <christoph.schemmel@gmail.com>
Link: https://lore.kernel.org/r/20210202084523.4371-1-christoph.schemmel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index cc4819282820..5a05add9b4e6 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1309,6 +1309,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x1e2d, 0x0082, 5)},	/* Cinterion PHxx,PXxx (2 RmNet) */
 	{QMI_FIXED_INTF(0x1e2d, 0x0083, 4)},	/* Cinterion PHxx,PXxx (1 RmNet + USB Audio)*/
 	{QMI_QUIRK_SET_DTR(0x1e2d, 0x00b0, 4)},	/* Cinterion CLS8 */
+	{QMI_FIXED_INTF(0x1e2d, 0x00b7, 0)},	/* Cinterion MV31 RmNet */
 	{QMI_FIXED_INTF(0x413c, 0x81a2, 8)},	/* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
 	{QMI_FIXED_INTF(0x413c, 0x81a3, 8)},	/* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
 	{QMI_FIXED_INTF(0x413c, 0x81a4, 8)},	/* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */

From b1bdde33b72366da20d10770ab7a49fe87b5e190 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@mail.kfki.hu>
Date: Fri, 29 Jan 2021 20:57:43 +0100
Subject: [PATCH 12/69] netfilter: xt_recent: Fix attempt to update deleted
 entry

When both --reap and --update flag are specified, there's a code
path at which the entry to be updated is reaped beforehand,
which then leads to kernel crash. Reap only entries which won't be
updated.

Fixes kernel bugzilla #207773.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=207773
Reported-by: Reindl Harald <h.reindl@thelounge.net>
Fixes: 0079c5aee348 ("netfilter: xt_recent: add an entry reaper")
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_recent.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 606411869698..0446307516cd 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -152,7 +152,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e)
 /*
  * Drop entries with timestamps older then 'time'.
  */
-static void recent_entry_reap(struct recent_table *t, unsigned long time)
+static void recent_entry_reap(struct recent_table *t, unsigned long time,
+			      struct recent_entry *working, bool update)
 {
 	struct recent_entry *e;
 
@@ -161,6 +162,12 @@ static void recent_entry_reap(struct recent_table *t, unsigned long time)
 	 */
 	e = list_entry(t->lru_list.next, struct recent_entry, lru_list);
 
+	/*
+	 * Do not reap the entry which are going to be updated.
+	 */
+	if (e == working && update)
+		return;
+
 	/*
 	 * The last time stamp is the most recent.
 	 */
@@ -303,7 +310,8 @@ recent_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 		/* info->seconds must be non-zero */
 		if (info->check_set & XT_RECENT_REAP)
-			recent_entry_reap(t, time);
+			recent_entry_reap(t, time, e,
+				info->check_set & XT_RECENT_UPDATE && ret);
 	}
 
 	if (info->check_set & XT_RECENT_SET ||

From a3005b0f83f217c888393c6bf9cd36e3d1616bca Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Sat, 30 Jan 2021 11:14:25 +0100
Subject: [PATCH 13/69] selftests: netfilter: fix current year

use date %Y instead of %G to read current year
Problem appeared when running lkp-tests on 01/01/2021

Fixes: 48d072c4e8cd ("selftests: netfilter: add time counter check")
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_meta.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh
index 087f0e6e71ce..f33154c04d34 100755
--- a/tools/testing/selftests/netfilter/nft_meta.sh
+++ b/tools/testing/selftests/netfilter/nft_meta.sh
@@ -23,7 +23,7 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo
 
 trap cleanup EXIT
 
-currentyear=$(date +%G)
+currentyear=$(date +%Y)
 lastyear=$((currentyear-1))
 ip netns exec "$ns0" nft -f /dev/stdin <<EOF
 table inet filter {

From 767d1216bff82507c945e92fe719dff2083bb2f4 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 2 Feb 2021 16:07:37 +0100
Subject: [PATCH 14/69] netfilter: nftables: fix possible UAF over chains from
 packet path in netns

Although hooks are released via call_rcu(), chain and rule objects are
immediately released while packets are still walking over these bits.

This patch adds the .pre_exit callback which is invoked before
synchronize_rcu() in the netns framework to stay safe.

Remove a comment which is not valid anymore since the core does not use
synchronize_net() anymore since 8c873e219970 ("netfilter: core: free
hooks with call_rcu").

Suggested-by: Florian Westphal <fw@strlen.de>
Fixes: df05ef874b28 ("netfilter: nf_tables: release objects on netns destruction")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8d3aa97b52e7..43fe80f10313 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8949,6 +8949,17 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(__nft_release_basechain);
 
+static void __nft_release_hooks(struct net *net)
+{
+	struct nft_table *table;
+	struct nft_chain *chain;
+
+	list_for_each_entry(table, &net->nft.tables, list) {
+		list_for_each_entry(chain, &table->chains, list)
+			nf_tables_unregister_hook(net, table, chain);
+	}
+}
+
 static void __nft_release_tables(struct net *net)
 {
 	struct nft_flowtable *flowtable, *nf;
@@ -8964,10 +8975,6 @@ static void __nft_release_tables(struct net *net)
 
 	list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
 		ctx.family = table->family;
-
-		list_for_each_entry(chain, &table->chains, list)
-			nf_tables_unregister_hook(net, table, chain);
-		/* No packets are walking on these chains anymore. */
 		ctx.table = table;
 		list_for_each_entry(chain, &table->chains, list) {
 			ctx.chain = chain;
@@ -9016,6 +9023,11 @@ static int __net_init nf_tables_init_net(struct net *net)
 	return 0;
 }
 
+static void __net_exit nf_tables_pre_exit_net(struct net *net)
+{
+	__nft_release_hooks(net);
+}
+
 static void __net_exit nf_tables_exit_net(struct net *net)
 {
 	mutex_lock(&net->nft.commit_mutex);
@@ -9029,8 +9041,9 @@ static void __net_exit nf_tables_exit_net(struct net *net)
 }
 
 static struct pernet_operations nf_tables_net_ops = {
-	.init	= nf_tables_init_net,
-	.exit	= nf_tables_exit_net,
+	.init		= nf_tables_init_net,
+	.pre_exit	= nf_tables_pre_exit_net,
+	.exit		= nf_tables_exit_net,
 };
 
 static int __init nf_tables_module_init(void)

From 8d6bca156e47d68551750a384b3ff49384c67be3 Mon Sep 17 00:00:00 2001
From: Sven Auhagen <sven.auhagen@voleatech.de>
Date: Tue, 2 Feb 2021 18:01:16 +0100
Subject: [PATCH 15/69] netfilter: flowtable: fix tcp and udp header checksum
 update

When updating the tcp or udp header checksum on port nat the function
inet_proto_csum_replace2 with the last parameter pseudohdr as true.
This leads to an error in the case that GRO is used and packets are
split up in GSO. The tcp or udp checksum of all packets is incorrect.

The error is probably masked due to the fact the most network driver
implement tcp/udp checksum offloading. It also only happens when GRO is
applied and not on single packets.

The error is most visible when using a pppoe connection which is not
triggering the tcp/udp checksum offload.

Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure")
Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 513f78db3cb2..4a4acbba78ff 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -399,7 +399,7 @@ static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 		return -1;
 
 	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
+	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
 
 	return 0;
 }
@@ -415,7 +415,7 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
 	udph = (void *)(skb_network_header(skb) + thoff);
 	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
 		inet_proto_csum_replace2(&udph->check, skb, port,
-					 new_port, true);
+					 new_port, false);
 		if (!udph->check)
 			udph->check = CSUM_MANGLED_0;
 	}

From 2a80c15812372e554474b1dba0b1d8e467af295d Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Date: Tue, 2 Feb 2021 15:20:59 +0600
Subject: [PATCH 16/69] net/qrtr: restrict user-controlled length in
 qrtr_tun_write_iter()

syzbot found WARNING in qrtr_tun_write_iter [1] when write_iter length
exceeds KMALLOC_MAX_SIZE causing order >= MAX_ORDER condition.

Additionally, there is no check for 0 length write.

[1]
WARNING: mm/page_alloc.c:5011
[..]
Call Trace:
 alloc_pages_current+0x18c/0x2a0 mm/mempolicy.c:2267
 alloc_pages include/linux/gfp.h:547 [inline]
 kmalloc_order+0x2e/0xb0 mm/slab_common.c:837
 kmalloc_order_trace+0x14/0x120 mm/slab_common.c:853
 kmalloc include/linux/slab.h:557 [inline]
 kzalloc include/linux/slab.h:682 [inline]
 qrtr_tun_write_iter+0x8a/0x180 net/qrtr/tun.c:83
 call_write_iter include/linux/fs.h:1901 [inline]

Reported-by: syzbot+c2a7e5c5211605a90865@syzkaller.appspotmail.com
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Link: https://lore.kernel.org/r/20210202092059.1361381-1-snovitoll@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/qrtr/tun.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c
index 15ce9b642b25..b238c40a9984 100644
--- a/net/qrtr/tun.c
+++ b/net/qrtr/tun.c
@@ -80,6 +80,12 @@ static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 	void *kbuf;
 
+	if (!len)
+		return -EINVAL;
+
+	if (len > KMALLOC_MAX_SIZE)
+		return -ENOMEM;
+
 	kbuf = kzalloc(len, GFP_KERNEL);
 	if (!kbuf)
 		return -ENOMEM;

From d795cc02a297df80910cf4ba23147680d15d8a7d Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Wed, 3 Feb 2021 23:37:14 +0300
Subject: [PATCH 17/69] selftests/tls: fix selftest with CHACHA20-POLY1305

TLS selftests were broken also because of use of structure that
was not exported to UAPI. Fix by defining the union in tests.

Fixes: 4f336e88a870 (selftests/tls: add CHACHA20-POLY1305 to tls selftests)
Reported-by: Rong Chen <rong.a.chen@intel.com>
Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Link: https://lore.kernel.org/r/1612384634-5377-1-git-send-email-vfedorenko@novek.ru
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/tls.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index e0088c2d38a5..426d07875a48 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -133,7 +133,10 @@ FIXTURE_VARIANT_ADD(tls, 13_chacha)
 
 FIXTURE_SETUP(tls)
 {
-	union tls_crypto_context tls12;
+	union {
+		struct tls12_crypto_info_aes_gcm_128 aes128;
+		struct tls12_crypto_info_chacha20_poly1305 chacha20;
+	} tls12;
 	struct sockaddr_in addr;
 	socklen_t len;
 	int sfd, ret;
@@ -143,14 +146,16 @@ FIXTURE_SETUP(tls)
 	len = sizeof(addr);
 
 	memset(&tls12, 0, sizeof(tls12));
-	tls12.info.version = variant->tls_version;
-	tls12.info.cipher_type = variant->cipher_type;
 	switch (variant->cipher_type) {
 	case TLS_CIPHER_CHACHA20_POLY1305:
-		tls12_sz = sizeof(tls12_crypto_info_chacha20_poly1305);
+		tls12_sz = sizeof(struct tls12_crypto_info_chacha20_poly1305);
+		tls12.chacha20.info.version = variant->tls_version;
+		tls12.chacha20.info.cipher_type = variant->cipher_type;
 		break;
 	case TLS_CIPHER_AES_GCM_128:
-		tls12_sz = sizeof(tls12_crypto_info_aes_gcm_128);
+		tls12_sz = sizeof(struct tls12_crypto_info_aes_gcm_128);
+		tls12.aes128.info.version = variant->tls_version;
+		tls12.aes128.info.cipher_type = variant->cipher_type;
 		break;
 	default:
 		tls12_sz = 0;

From ec7d8e7dd3a59528e305a18e93f1cb98f7faf83b Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 2 Feb 2021 08:09:38 +0100
Subject: [PATCH 18/69] xen/netback: avoid race in
 xenvif_rx_ring_slots_available()

Since commit 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding")
xenvif_rx_ring_slots_available() is no longer called only from the rx
queue kernel thread, so it needs to access the rx queue with the
associated queue held.

Reported-by: Igor Druzhinin <igor.druzhinin@citrix.com>
Fixes: 23025393dbeb3b8b3 ("xen/netback: use lateeoi irq binding")
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Wei Liu <wl@xen.org>
Link: https://lore.kernel.org/r/20210202070938.7863-1-jgross@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/xen-netback/rx.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
index b8febe1d1bfd..accc991d153f 100644
--- a/drivers/net/xen-netback/rx.c
+++ b/drivers/net/xen-netback/rx.c
@@ -38,10 +38,15 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
 	RING_IDX prod, cons;
 	struct sk_buff *skb;
 	int needed;
+	unsigned long flags;
+
+	spin_lock_irqsave(&queue->rx_queue.lock, flags);
 
 	skb = skb_peek(&queue->rx_queue);
-	if (!skb)
+	if (!skb) {
+		spin_unlock_irqrestore(&queue->rx_queue.lock, flags);
 		return false;
+	}
 
 	needed = DIV_ROUND_UP(skb->len, XEN_PAGE_SIZE);
 	if (skb_is_gso(skb))
@@ -49,6 +54,8 @@ static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
 	if (skb->sw_hash)
 		needed++;
 
+	spin_unlock_irqrestore(&queue->rx_queue.lock, flags);
+
 	do {
 		prod = queue->rx.sring->req_prod;
 		cons = queue->rx.req_cons;

From 3401e4aa43a540881cc97190afead650e709c418 Mon Sep 17 00:00:00 2001
From: Raju Rangoju <rajur@chelsio.com>
Date: Tue, 2 Feb 2021 23:55:11 +0530
Subject: [PATCH 19/69] cxgb4: Add new T6 PCI device id 0x6092

Signed-off-by: Raju Rangoju <rajur@chelsio.com>
Link: https://lore.kernel.org/r/20210202182511.8109-1-rajur@chelsio.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
index 0c5373462ced..0b1b5f9c67d4 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
@@ -219,6 +219,7 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
 	CH_PCI_ID_TABLE_FENTRY(0x6089), /* Custom T62100-KR */
 	CH_PCI_ID_TABLE_FENTRY(0x608a), /* Custom T62100-CR */
 	CH_PCI_ID_TABLE_FENTRY(0x608b), /* Custom T6225-CR */
+	CH_PCI_ID_TABLE_FENTRY(0x6092), /* Custom T62100-CR-LOM */
 CH_PCI_DEVICE_ID_TABLE_DEFINE_END;
 
 #endif /* __T4_PCI_ID_TBL_H__ */

From 7b5eab57cac45e270a0ad624ba157c5b30b3d44d Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 3 Feb 2021 08:47:56 +0000
Subject: [PATCH 20/69] rxrpc: Fix clearance of Tx/Rx ring when releasing a
 call

At the end of rxrpc_release_call(), rxrpc_cleanup_ring() is called to clear
the Rx/Tx skbuff ring, but this doesn't lock the ring whilst it's accessing
it.  Unfortunately, rxrpc_resend() might be trying to retransmit a packet
concurrently with this - and whilst it does lock the ring, this isn't
protection against rxrpc_cleanup_call().

Fix this by removing the call to rxrpc_cleanup_ring() from
rxrpc_release_call().  rxrpc_cleanup_ring() will be called again anyway
from rxrpc_cleanup_call().  The earlier call is just an optimisation to
recycle skbuffs more quickly.

Alternative solutions include rxrpc_release_call() could try to cancel the
work item or wait for it to complete or rxrpc_cleanup_ring() could lock
when accessing the ring (which would require a bh lock).

This can produce a report like the following:

  BUG: KASAN: use-after-free in rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372
  Read of size 4 at addr ffff888011606e04 by task kworker/0:0/5
  ...
  Workqueue: krxrpcd rxrpc_process_call
  Call Trace:
   ...
   kasan_report.cold+0x79/0xd5 mm/kasan/report.c:413
   rxrpc_send_data_packet+0x19b4/0x1e70 net/rxrpc/output.c:372
   rxrpc_resend net/rxrpc/call_event.c:266 [inline]
   rxrpc_process_call+0x1634/0x1f60 net/rxrpc/call_event.c:412
   process_one_work+0x98d/0x15f0 kernel/workqueue.c:2275
   ...

  Allocated by task 2318:
   ...
   sock_alloc_send_pskb+0x793/0x920 net/core/sock.c:2348
   rxrpc_send_data+0xb51/0x2bf0 net/rxrpc/sendmsg.c:358
   rxrpc_do_sendmsg+0xc03/0x1350 net/rxrpc/sendmsg.c:744
   rxrpc_sendmsg+0x420/0x630 net/rxrpc/af_rxrpc.c:560
   ...

  Freed by task 2318:
   ...
   kfree_skb+0x140/0x3f0 net/core/skbuff.c:704
   rxrpc_free_skb+0x11d/0x150 net/rxrpc/skbuff.c:78
   rxrpc_cleanup_ring net/rxrpc/call_object.c:485 [inline]
   rxrpc_release_call+0x5dd/0x860 net/rxrpc/call_object.c:552
   rxrpc_release_calls_on_socket+0x21c/0x300 net/rxrpc/call_object.c:579
   rxrpc_release_sock net/rxrpc/af_rxrpc.c:885 [inline]
   rxrpc_release+0x263/0x5a0 net/rxrpc/af_rxrpc.c:916
   __sock_release+0xcd/0x280 net/socket.c:597
   ...

  The buggy address belongs to the object at ffff888011606dc0
   which belongs to the cache skbuff_head_cache of size 232

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Reported-by: syzbot+174de899852504e4a74a@syzkaller.appspotmail.com
Reported-by: syzbot+3d1c772efafd3c38d007@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Hillf Danton <hdanton@sina.com>
Link: https://lore.kernel.org/r/161234207610.653119.5287360098400436976.stgit@warthog.procyon.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/rxrpc/call_object.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c845594b663f..4eb91d958a48 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -548,8 +548,6 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
 		rxrpc_disconnect_call(call);
 	if (call->security)
 		call->security->free_call_crypto(call);
-
-	rxrpc_cleanup_ring(call);
 	_leave("");
 }
 

From 81b8be68ef8e8915d0cc6cedd2ac425c74a24813 Mon Sep 17 00:00:00 2001
From: Xie He <xie.he.0141@gmail.com>
Date: Tue, 2 Feb 2021 23:15:41 -0800
Subject: [PATCH 21/69] net: hdlc_x25: Return meaningful error code in x25_open

It's not meaningful to pass on LAPB error codes to HDLC code or other
parts of the system, because they will not understand the error codes.

Instead, use system-wide recognizable error codes.

Fixes: f362e5fe0f1f ("wan/hdlc_x25: make lapb params configurable")
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Xie He <xie.he.0141@gmail.com>
Acked-by: Martin Schiller <ms@dev.tdt.de>
Link: https://lore.kernel.org/r/20210203071541.86138-1-xie.he.0141@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wan/hdlc_x25.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wan/hdlc_x25.c b/drivers/net/wan/hdlc_x25.c
index bb164805804e..4aaa6388b9ee 100644
--- a/drivers/net/wan/hdlc_x25.c
+++ b/drivers/net/wan/hdlc_x25.c
@@ -169,11 +169,11 @@ static int x25_open(struct net_device *dev)
 
 	result = lapb_register(dev, &cb);
 	if (result != LAPB_OK)
-		return result;
+		return -ENOMEM;
 
 	result = lapb_getparms(dev, &params);
 	if (result != LAPB_OK)
-		return result;
+		return -EINVAL;
 
 	if (state(hdlc)->settings.dce)
 		params.mode = params.mode | LAPB_DCE;
@@ -188,7 +188,7 @@ static int x25_open(struct net_device *dev)
 
 	result = lapb_setparms(dev, &params);
 	if (result != LAPB_OK)
-		return result;
+		return -EINVAL;
 
 	return 0;
 }

From 1d23a56b0296d29e7047b41fe0a42a001036160d Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Wed, 3 Feb 2021 19:06:55 -0600
Subject: [PATCH 22/69] net: ipa: set error code in gsi_channel_setup()

In gsi_channel_setup(), we check to see if the configuration data
contains any information about channels that are not supported by
the hardware.  If one is found, we abort the setup process, but
the error code (ret) is not set in this case.  Fix this bug.

Fixes: 650d1603825d8 ("soc: qcom: ipa: the generic software interface")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20210204010655.15619-1-elder@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ipa/gsi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c
index 34e5f2155d62..b77f5fef7aec 100644
--- a/drivers/net/ipa/gsi.c
+++ b/drivers/net/ipa/gsi.c
@@ -1710,6 +1710,7 @@ static int gsi_channel_setup(struct gsi *gsi)
 		if (!channel->gsi)
 			continue;	/* Ignore uninitialized channels */
 
+		ret = -EINVAL;
 		dev_err(gsi->dev, "channel %u not supported by hardware\n",
 			channel_id - 1);
 		channel_id = gsi->channel_count;

From 52cbd23a119c6ebf40a527e53f3402d2ea38eccb Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 3 Feb 2021 14:29:52 -0500
Subject: [PATCH 23/69] udp: fix skb_copy_and_csum_datagram with odd segment
 sizes

When iteratively computing a checksum with csum_block_add, track the
offset "pos" to correctly rotate in csum_block_add when offset is odd.

The open coded implementation of skb_copy_and_csum_datagram did this.
With the switch to __skb_datagram_iter calling csum_and_copy_to_iter,
pos was reinitialized to 0 on each call.

Bring back the pos by passing it along with the csum to the callback.

Changes v1->v2
  - pass csum value, instead of csump pointer (Alexander Duyck)

Link: https://lore.kernel.org/netdev/20210128152353.GB27281@optiplex/
Fixes: 950fcaecd5cc ("datagram: consolidate datagram copy to iter helpers")
Reported-by: Oliver Graute <oliver.graute@gmail.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20210203192952.1849843-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/uio.h |  8 +++++++-
 lib/iov_iter.c      | 24 ++++++++++++++----------
 net/core/datagram.c | 12 ++++++++++--
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 72d88566694e..27ff8eb786dc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -260,7 +260,13 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
 	i->count = count;
 }
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
+
+struct csum_state {
+	__wsum csum;
+	size_t off;
+};
+
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index a21e6a5792c5..f0b2ccb1bb01 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -592,14 +592,15 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
 }
 
 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
-				__wsum *csum, struct iov_iter *i)
+					 struct csum_state *csstate,
+					 struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
 	unsigned int p_mask = pipe->ring_size - 1;
+	__wsum sum = csstate->csum;
+	size_t off = csstate->off;
 	unsigned int i_head;
 	size_t n, r;
-	size_t off = 0;
-	__wsum sum = *csum;
 
 	if (!sanity(i))
 		return 0;
@@ -621,7 +622,8 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 		i_head++;
 	} while (n);
 	i->count -= bytes;
-	*csum = sum;
+	csstate->csum = sum;
+	csstate->off = off;
 	return bytes;
 }
 
@@ -1522,18 +1524,19 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter_full);
 
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
 			     struct iov_iter *i)
 {
+	struct csum_state *csstate = _csstate;
 	const char *from = addr;
-	__wsum *csum = csump;
 	__wsum sum, next;
-	size_t off = 0;
+	size_t off;
 
 	if (unlikely(iov_iter_is_pipe(i)))
-		return csum_and_copy_to_pipe_iter(addr, bytes, csum, i);
+		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
 
-	sum = *csum;
+	sum = csstate->csum;
+	off = csstate->off;
 	if (unlikely(iov_iter_is_discard(i))) {
 		WARN_ON(1);	/* for now */
 		return 0;
@@ -1561,7 +1564,8 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
 		off += v.iov_len;
 	})
 	)
-	*csum = sum;
+	csstate->csum = sum;
+	csstate->off = off;
 	return bytes;
 }
 EXPORT_SYMBOL(csum_and_copy_to_iter);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 81809fa735a7..15ab9ffb27fe 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -721,8 +721,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      struct iov_iter *to, int len,
 				      __wsum *csump)
 {
-	return __skb_datagram_iter(skb, offset, to, len, true,
-			csum_and_copy_to_iter, csump);
+	struct csum_state csdata = { .csum = *csump };
+	int ret;
+
+	ret = __skb_datagram_iter(skb, offset, to, len, true,
+				  csum_and_copy_to_iter, &csdata);
+	if (ret)
+		return ret;
+
+	*csump = csdata.csum;
+	return 0;
 }
 
 /**

From 12bc8dfb83b5292fe387b795210018b7632ee08b Mon Sep 17 00:00:00 2001
From: "Andrea Parri (Microsoft)" <parri.andrea@gmail.com>
Date: Wed, 3 Feb 2021 12:36:02 +0100
Subject: [PATCH 24/69] hv_netvsc: Reset the RSC count if NVSP_STAT_FAIL in
 netvsc_receive()

Commit 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V
values") added validation to rndis_filter_receive_data() (and
rndis_filter_receive()) which introduced NVSP_STAT_FAIL-scenarios where
the count is not updated/reset.  Fix this omission, and prevent similar
scenarios from occurring in the future.

Reported-by: Juan Vazquez <juvazq@microsoft.com>
Signed-off-by: Andrea Parri (Microsoft) <parri.andrea@gmail.com>
Fixes: 44144185951a0f ("hv_netvsc: Add validation for untrusted Hyper-V values")
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Link: https://lore.kernel.org/r/20210203113602.558916-1-parri.andrea@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/hyperv/netvsc.c       | 5 ++++-
 drivers/net/hyperv/rndis_filter.c | 2 --
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 2350342b961f..13bd48a75db7 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1262,8 +1262,11 @@ static int netvsc_receive(struct net_device *ndev,
 		ret = rndis_filter_receive(ndev, net_device,
 					   nvchan, data, buflen);
 
-		if (unlikely(ret != NVSP_STAT_SUCCESS))
+		if (unlikely(ret != NVSP_STAT_SUCCESS)) {
+			/* Drop incomplete packet */
+			nvchan->rsc.cnt = 0;
 			status = NVSP_STAT_FAIL;
+		}
 	}
 
 	enq_receive_complete(ndev, net_device, q_idx,
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 598713c0d5a8..3aab2b867fc0 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -509,8 +509,6 @@ static int rndis_filter_receive_data(struct net_device *ndev,
 	return ret;
 
 drop:
-	/* Drop incomplete packet */
-	nvchan->rsc.cnt = 0;
 	return NVSP_STAT_FAIL;
 }
 

From 07bf34a50e327975b21a9dee64d220c3dcb72ee9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 4 Feb 2021 15:45:11 +0200
Subject: [PATCH 25/69] net: enetc: initialize the RFS and RSS memories

Michael tried to enable Advanced Error Reporting through the ENETC's
Root Complex Event Collector, and the system started spitting out single
bit correctable ECC errors coming from the ENETC interfaces:

pcieport 0000:00:1f.0: AER: Multiple Corrected error received: 0000:00:00.0
fsl_enetc 0000:00:00.0: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID)
fsl_enetc 0000:00:00.0:   device [1957:e100] error status/mask=00004000/00000000
fsl_enetc 0000:00:00.0:    [14] CorrIntErr
fsl_enetc 0000:00:00.1: PCIe Bus Error: severity=Corrected, type=Transaction Layer, (Receiver ID)
fsl_enetc 0000:00:00.1:   device [1957:e100] error status/mask=00004000/00000000
fsl_enetc 0000:00:00.1:    [14] CorrIntErr

Further investigating the port correctable memory error detect register
(PCMEDR) shows that these AER errors have an associated SOURCE_ID of 6
(RFS/RSS):

$ devmem 0x1f8010e10 32
0xC0000006
$ devmem 0x1f8050e10 32
0xC0000006

Discussion with the hardware design engineers reveals that on LS1028A,
the hardware does not do initialization of that RFS/RSS memory, and that
software should clear/initialize the entire table before starting to
operate. That comes as a bit of a surprise, since the driver does not do
initialization of the RFS memory. Also, the initialization of the
Receive Side Scaling is done only partially.

Even though the entire ENETC IP has a single shared flow steering
memory, the flow steering service should returns matches only for TCAM
entries that are within the range of the Station Interface that is doing
the search. Therefore, it should be sufficient for a Station Interface
to initialize all of its own entries in order to avoid any ECC errors,
and only the Station Interfaces in use should need initialization.

There are Physical Station Interfaces associated with PCIe PFs and
Virtual Station Interfaces associated with PCIe VFs. We let the PF
driver initialize the entire port's memory, which includes the RFS
entries which are going to be used by the VF.

Reported-by: Michael Walle <michael@walle.cc>
Fixes: d4fd0404c1c9 ("enetc: Introduce basic PF and VF ENETC ethernet drivers")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Tested-by: Michael Walle <michael@walle.cc>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Link: https://lore.kernel.org/r/20210204134511.2640309-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/freescale/enetc/enetc_hw.h   |  2 +
 .../net/ethernet/freescale/enetc/enetc_pf.c   | 59 +++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h
index e1e950d48c92..c71fe8d751d5 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h
+++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h
@@ -196,6 +196,8 @@ enum enetc_bdr_type {TX, RX};
 #define ENETC_CBS_BW_MASK	GENMASK(6, 0)
 #define ENETC_PTCCBSR1(n)	(0x1114 + (n) * 8) /* n = 0 to 7*/
 #define ENETC_RSSHASH_KEY_SIZE	40
+#define ENETC_PRSSCAPR		0x1404
+#define ENETC_PRSSCAPR_GET_NUM_RSS(val)	(BIT((val) & 0xf) * 32)
 #define ENETC_PRSSK(n)		(0x1410 + (n) * 4) /* n = [0..9] */
 #define ENETC_PSIVLANFMR	0x1700
 #define ENETC_PSIVLANFMR_VS	BIT(0)
diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
index ed8fcb8b486e..3eb5f1375bd4 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c
@@ -996,6 +996,51 @@ static void enetc_phylink_destroy(struct enetc_ndev_priv *priv)
 		phylink_destroy(priv->phylink);
 }
 
+/* Initialize the entire shared memory for the flow steering entries
+ * of this port (PF + VFs)
+ */
+static int enetc_init_port_rfs_memory(struct enetc_si *si)
+{
+	struct enetc_cmd_rfse rfse = {0};
+	struct enetc_hw *hw = &si->hw;
+	int num_rfs, i, err = 0;
+	u32 val;
+
+	val = enetc_port_rd(hw, ENETC_PRFSCAPR);
+	num_rfs = ENETC_PRFSCAPR_GET_NUM_RFS(val);
+
+	for (i = 0; i < num_rfs; i++) {
+		err = enetc_set_fs_entry(si, &rfse, i);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+static int enetc_init_port_rss_memory(struct enetc_si *si)
+{
+	struct enetc_hw *hw = &si->hw;
+	int num_rss, err;
+	int *rss_table;
+	u32 val;
+
+	val = enetc_port_rd(hw, ENETC_PRSSCAPR);
+	num_rss = ENETC_PRSSCAPR_GET_NUM_RSS(val);
+	if (!num_rss)
+		return 0;
+
+	rss_table = kcalloc(num_rss, sizeof(*rss_table), GFP_KERNEL);
+	if (!rss_table)
+		return -ENOMEM;
+
+	err = enetc_set_rss_table(si, rss_table, num_rss);
+
+	kfree(rss_table);
+
+	return err;
+}
+
 static int enetc_pf_probe(struct pci_dev *pdev,
 			  const struct pci_device_id *ent)
 {
@@ -1051,6 +1096,18 @@ static int enetc_pf_probe(struct pci_dev *pdev,
 		goto err_alloc_si_res;
 	}
 
+	err = enetc_init_port_rfs_memory(si);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize RFS memory\n");
+		goto err_init_port_rfs;
+	}
+
+	err = enetc_init_port_rss_memory(si);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to initialize RSS memory\n");
+		goto err_init_port_rss;
+	}
+
 	err = enetc_alloc_msix(priv);
 	if (err) {
 		dev_err(&pdev->dev, "MSIX alloc failed\n");
@@ -1079,6 +1136,8 @@ static int enetc_pf_probe(struct pci_dev *pdev,
 	enetc_mdiobus_destroy(pf);
 err_mdiobus_create:
 	enetc_free_msix(priv);
+err_init_port_rss:
+err_init_port_rfs:
 err_alloc_msix:
 	enetc_free_si_resources(priv);
 err_alloc_si_res:

From 8fd54a73b7cda11548154451bdb4bde6d8ff74c7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 4 Feb 2021 18:33:51 +0200
Subject: [PATCH 26/69] net: dsa: call teardown method on probe failure

Since teardown is supposed to undo the effects of the setup method, it
should be called in the error path for dsa_switch_setup, not just in
dsa_switch_teardown.

Fixes: 5e3f847a02aa ("net: dsa: Add teardown callback for drivers")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20210204163351.2929670-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/dsa/dsa2.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index a47e0f9b20d0..a04fd637b4cd 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -462,20 +462,23 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 		ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
 		if (!ds->slave_mii_bus) {
 			err = -ENOMEM;
-			goto unregister_notifier;
+			goto teardown;
 		}
 
 		dsa_slave_mii_bus_init(ds);
 
 		err = mdiobus_register(ds->slave_mii_bus);
 		if (err < 0)
-			goto unregister_notifier;
+			goto teardown;
 	}
 
 	ds->setup = true;
 
 	return 0;
 
+teardown:
+	if (ds->ops->teardown)
+		ds->ops->teardown(ds);
 unregister_notifier:
 	dsa_switch_unregister_notifier(ds);
 unregister_devlink_ports:

From 647b8dd5184665432cc8a2b5bca46a201f690c37 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Thu, 4 Feb 2021 20:50:34 +0300
Subject: [PATCH 27/69] selftests: txtimestamp: fix compilation issue

PACKET_TX_TIMESTAMP is defined in if_packet.h but it is not included in
test. Include it instead of <netpacket/packet.h> otherwise the error of
redefinition arrives.
Also fix the compiler warning about ambiguous control flow by adding
explicit braces.

Fixes: 8fe2f761cae9 ("net-timestamp: expand documentation")
Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Acked-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/1612461034-24524-1-git-send-email-vfedorenko@novek.ru
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/txtimestamp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
index 490a8cca708a..fabb1d555ee5 100644
--- a/tools/testing/selftests/net/txtimestamp.c
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -26,6 +26,7 @@
 #include <inttypes.h>
 #include <linux/errqueue.h>
 #include <linux/if_ether.h>
+#include <linux/if_packet.h>
 #include <linux/ipv6.h>
 #include <linux/net_tstamp.h>
 #include <netdb.h>
@@ -34,7 +35,6 @@
 #include <netinet/ip.h>
 #include <netinet/udp.h>
 #include <netinet/tcp.h>
-#include <netpacket/packet.h>
 #include <poll.h>
 #include <stdarg.h>
 #include <stdbool.h>
@@ -495,12 +495,12 @@ static void do_test(int family, unsigned int report_opt)
 	total_len = cfg_payload_len;
 	if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) {
 		total_len += sizeof(struct udphdr);
-		if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW)
+		if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) {
 			if (family == PF_INET)
 				total_len += sizeof(struct iphdr);
 			else
 				total_len += sizeof(struct ipv6hdr);
-
+		}
 		/* special case, only rawv6_sendmsg:
 		 * pass proto in sin6_port if not connected
 		 * also see ANK comment in net/ipv4/raw.c

From 8dc1c444df193701910f5e80b5d4caaf705a8fb0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 4 Feb 2021 13:31:46 -0800
Subject: [PATCH 28/69] net: gro: do not keep too many GRO packets in
 napi->rx_list

Commit c80794323e82 ("net: Fix packet reordering caused by GRO and
listified RX cooperation") had the unfortunate effect of adding
latencies in common workloads.

Before the patch, GRO packets were immediately passed to
upper stacks.

After the patch, we can accumulate quite a lot of GRO
packets (depdending on NAPI budget).

My fix is counting in napi->rx_count number of segments
instead of number of logical packets.

Fixes: c80794323e82 ("net: Fix packet reordering caused by GRO and listified RX cooperation")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Bisected-by: John Sperbeck <jsperbeck@google.com>
Tested-by: Jian Yang <jianyang@google.com>
Cc: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Alexander Lobakin <alobakin@pm.me>
Link: https://lore.kernel.org/r/20210204213146.4192368-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/dev.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index a979b86dbacd..449b45b843d4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5735,10 +5735,11 @@ static void gro_normal_list(struct napi_struct *napi)
 /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
  * pass the whole batch up to the stack.
  */
-static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
 {
 	list_add_tail(&skb->list, &napi->rx_list);
-	if (++napi->rx_count >= gro_normal_batch)
+	napi->rx_count += segs;
+	if (napi->rx_count >= gro_normal_batch)
 		gro_normal_list(napi);
 }
 
@@ -5777,7 +5778,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 	}
 
 out:
-	gro_normal_one(napi, skb);
+	gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
 	return NET_RX_SUCCESS;
 }
 
@@ -6067,7 +6068,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 {
 	switch (ret) {
 	case GRO_NORMAL:
-		gro_normal_one(napi, skb);
+		gro_normal_one(napi, skb, 1);
 		break;
 
 	case GRO_DROP:
@@ -6155,7 +6156,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 		__skb_push(skb, ETH_HLEN);
 		skb->protocol = eth_type_trans(skb, skb->dev);
 		if (ret == GRO_NORMAL)
-			gro_normal_one(napi, skb);
+			gro_normal_one(napi, skb, 1);
 		break;
 
 	case GRO_DROP:

From 275a9c72b420e5051b0e92e49b26bef06c196f29 Mon Sep 17 00:00:00 2001
From: Camelia Groza <camelia.groza@nxp.com>
Date: Thu, 4 Feb 2021 18:49:26 +0200
Subject: [PATCH 29/69] dpaa_eth: reserve space for the xdp_frame under the
 A050385 erratum

When the erratum workaround is triggered, the newly created xdp_frame
structure is stored at the start of the newly allocated buffer. Avoid
the structure from being overwritten by explicitly reserving enough
space in the buffer for storing it.

Account for the fact that the structure's size might increase in time by
aligning the headroom to DPAA_FD_DATA_ALIGNMENT bytes, thus guaranteeing
the data's alignment.

Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP")
Signed-off-by: Camelia Groza <camelia.groza@nxp.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Madalin Bucur <madalin.bucur@oss.nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/freescale/dpaa/dpaa_eth.c    | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 4360ce4d3fb6..f3a879937d8d 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2182,6 +2182,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv,
 	struct xdp_frame *new_xdpf, *xdpf = *init_xdpf;
 	void *new_buff;
 	struct page *p;
+	int headroom;
 
 	/* Check the data alignment and make sure the headroom is large
 	 * enough to store the xdpf backpointer. Use an aligned headroom
@@ -2197,19 +2198,34 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv,
 		return 0;
 	}
 
+	/* The new xdp_frame is stored in the new buffer. Reserve enough space
+	 * in the headroom for storing it along with the driver's private
+	 * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to
+	 * guarantee the data's alignment in the buffer.
+	 */
+	headroom = ALIGN(sizeof(*new_xdpf) + priv->tx_headroom,
+			 DPAA_FD_DATA_ALIGNMENT);
+
+	/* Assure the extended headroom and data don't overflow the buffer,
+	 * while maintaining the mandatory tailroom.
+	 */
+	if (headroom + xdpf->len > DPAA_BP_RAW_SIZE -
+			SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+		return -ENOMEM;
+
 	p = dev_alloc_pages(0);
 	if (unlikely(!p))
 		return -ENOMEM;
 
 	/* Copy the data to the new buffer at a properly aligned offset */
 	new_buff = page_address(p);
-	memcpy(new_buff + priv->tx_headroom, xdpf->data, xdpf->len);
+	memcpy(new_buff + headroom, xdpf->data, xdpf->len);
 
 	/* Create an XDP frame around the new buffer in a similar fashion
 	 * to xdp_convert_buff_to_frame.
 	 */
 	new_xdpf = new_buff;
-	new_xdpf->data = new_buff + priv->tx_headroom;
+	new_xdpf->data = new_buff + headroom;
 	new_xdpf->len = xdpf->len;
 	new_xdpf->headroom = priv->tx_headroom;
 	new_xdpf->frame_sz = DPAA_BP_RAW_SIZE;

From c2b0e8455eb76135f505dda81a8869e60f37a861 Mon Sep 17 00:00:00 2001
From: Camelia Groza <camelia.groza@nxp.com>
Date: Thu, 4 Feb 2021 18:49:27 +0200
Subject: [PATCH 30/69] dpaa_eth: reduce data alignment requirements for the
 A050385 erratum

The 256 byte data alignment is required for preventing DMA transaction
splits when crossing 4K page boundaries. Since XDP deals only with page
sized buffers or less, this restriction isn't needed. Instead, the data
only needs to be aligned to 64 bytes to prevent DMA transaction splits.

These lessened restrictions can increase performance by widening the pool
of permitted data alignments and preventing unnecessary realignments.

Fixes: ae680bcbd06a ("dpaa_eth: implement the A050385 erratum workaround for XDP")
Signed-off-by: Camelia Groza <camelia.groza@nxp.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Madalin Bucur <madalin.bucur@oss.nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index f3a879937d8d..2a2c7db23407 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2192,7 +2192,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv,
 	 * byte frame headroom. If the XDP program uses all of it, copy the
 	 * data to a new buffer and make room for storing the backpointer.
 	 */
-	if (PTR_IS_ALIGNED(xdpf->data, DPAA_A050385_ALIGN) &&
+	if (PTR_IS_ALIGNED(xdpf->data, DPAA_FD_DATA_ALIGNMENT) &&
 	    xdpf->headroom >= priv->tx_headroom) {
 		xdpf->headroom = priv->tx_headroom;
 		return 0;

From 0a9946cca1a30b7236a86757da9df2222eb73ee0 Mon Sep 17 00:00:00 2001
From: Camelia Groza <camelia.groza@nxp.com>
Date: Thu, 4 Feb 2021 18:49:28 +0200
Subject: [PATCH 31/69] dpaa_eth: try to move the data in place for the A050385
 erratum

The XDP frame's headroom might be large enough to accommodate the
xdpf backpointer as well as shifting the data to an aligned address.

Try this first before resorting to allocating a new buffer and copying
the data.

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Camelia Groza <camelia.groza@nxp.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Acked-by: Madalin Bucur <madalin.bucur@oss.nxp.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/freescale/dpaa/dpaa_eth.c    | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index 2a2c7db23407..6faa20bed488 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -2180,8 +2180,9 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv,
 				struct xdp_frame **init_xdpf)
 {
 	struct xdp_frame *new_xdpf, *xdpf = *init_xdpf;
-	void *new_buff;
+	void *new_buff, *aligned_data;
 	struct page *p;
+	u32 data_shift;
 	int headroom;
 
 	/* Check the data alignment and make sure the headroom is large
@@ -2198,6 +2199,23 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv,
 		return 0;
 	}
 
+	/* Try to move the data inside the buffer just enough to align it and
+	 * store the xdpf backpointer. If the available headroom isn't large
+	 * enough, resort to allocating a new buffer and copying the data.
+	 */
+	aligned_data = PTR_ALIGN_DOWN(xdpf->data, DPAA_FD_DATA_ALIGNMENT);
+	data_shift = xdpf->data - aligned_data;
+
+	/* The XDP frame's headroom needs to be large enough to accommodate
+	 * shifting the data as well as storing the xdpf backpointer.
+	 */
+	if (xdpf->headroom  >= data_shift + priv->tx_headroom) {
+		memmove(aligned_data, xdpf->data, xdpf->len);
+		xdpf->data = aligned_data;
+		xdpf->headroom = priv->tx_headroom;
+		return 0;
+	}
+
 	/* The new xdp_frame is stored in the new buffer. Reserve enough space
 	 * in the headroom for storing it along with the driver's private
 	 * info. The headroom needs to be aligned to DPAA_FD_DATA_ALIGNMENT to

From f317e2ea8c88737aa36228167b2292baef3f0430 Mon Sep 17 00:00:00 2001
From: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Date: Thu, 4 Feb 2021 22:03:16 +0800
Subject: [PATCH 32/69] net: stmmac: set TxQ mode back to DCB after disabling
 CBS

When disable CBS, mode_to_use parameter is not updated even the operation
mode of Tx Queue is changed to Data Centre Bridging (DCB). Therefore,
when tc_setup_cbs() function is called to re-enable CBS, the operation
mode of Tx Queue remains at DCB, which causing CBS fails to work.

This patch updates the value of mode_to_use parameter to MTL_QUEUE_DCB
after operation mode of Tx Queue is changed to DCB in stmmac_dma_qmode()
callback function.

Fixes: 1f705bc61aee ("net: stmmac: Add support for CBS QDISC")
Suggested-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: Mohammad Athari Bin Ismail <mohammad.athari.ismail@intel.com>
Signed-off-by: Song, Yoong Siang <yoong.siang.song@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Link: https://lore.kernel.org/r/1612447396-20351-1-git-send-email-yoong.siang.song@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 8ed3b2c834a0..56985542e202 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -324,7 +324,12 @@ static int tc_setup_cbs(struct stmmac_priv *priv,
 
 		priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_AVB;
 	} else if (!qopt->enable) {
-		return stmmac_dma_qmode(priv, priv->ioaddr, queue, MTL_QUEUE_DCB);
+		ret = stmmac_dma_qmode(priv, priv->ioaddr, queue,
+				       MTL_QUEUE_DCB);
+		if (ret)
+			return ret;
+
+		priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB;
 	}
 
 	/* Port Transmit Rate and Speed Divider */

From ef66a1eace968ff22a35f45e6e8ec36b668b6116 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Date: Tue, 2 Feb 2021 21:08:02 -0800
Subject: [PATCH 33/69] ibmvnic: Clear failover_pending if unable to schedule

Normally we clear the failover_pending flag when processing the reset.
But if we are unable to schedule a failover reset we must clear the
flag ourselves. We could fail to schedule the reset if we are in PROBING
state (eg: when booting via kexec) or because we could not allocate memory.

Thanks to Cris Forno for helping isolate the problem and for testing.

Fixes: 1d8504937478 ("powerpc/vnic: Extend "failover pending" window")
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Tested-by: Cristobal Forno <cforno12@linux.ibm.com>
Link: https://lore.kernel.org/r/20210203050802.680772-1-sukadev@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f79034c786c8..a536fdbf05e1 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -4918,7 +4918,22 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 				complete(&adapter->init_done);
 				adapter->init_done_rc = -EIO;
 			}
-			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
+			rc = ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
+			if (rc && rc != -EBUSY) {
+				/* We were unable to schedule the failover
+				 * reset either because the adapter was still
+				 * probing (eg: during kexec) or we could not
+				 * allocate memory. Clear the failover_pending
+				 * flag since no one else will. We ignore
+				 * EBUSY because it means either FAILOVER reset
+				 * is already scheduled or the adapter is
+				 * being removed.
+				 */
+				netdev_err(netdev,
+					   "Error %ld scheduling failover reset\n",
+					   rc);
+				adapter->failover_pending = false;
+			}
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");

From 5d1cbcc990f18edaddddef26677073c4e6fad7b7 Mon Sep 17 00:00:00 2001
From: Norbert Slusarek <nslusarek@gmx.net>
Date: Fri, 5 Feb 2021 13:12:06 +0100
Subject: [PATCH 34/69] net/vmw_vsock: fix NULL pointer dereference

In vsock_stream_connect(), a thread will enter schedule_timeout().
While being scheduled out, another thread can enter vsock_stream_connect()
as well and set vsk->transport to NULL. In case a signal was sent, the
first thread can leave schedule_timeout() and vsock_transport_cancel_pkt()
will be called right after. Inside vsock_transport_cancel_pkt(), a null
dereference will happen on transport->cancel_pkt.

Fixes: c0cfa2d8a788 ("vsock: add multi-transports support")
Signed-off-by: Norbert Slusarek <nslusarek@gmx.net>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/trinity-c2d6cede-bfb1-44e2-85af-1fbc7f541715-1612535117028@3c-app-gmx-bap12
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/af_vsock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 6894f21dc147..cb81cfb47a78 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1233,7 +1233,7 @@ static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
 {
 	const struct vsock_transport *transport = vsk->transport;
 
-	if (!transport->cancel_pkt)
+	if (!transport || !transport->cancel_pkt)
 		return -EOPNOTSUPP;
 
 	return transport->cancel_pkt(vsk);

From 3d0bc44d39bca615b72637e340317b7899b7f911 Mon Sep 17 00:00:00 2001
From: Norbert Slusarek <nslusarek@gmx.net>
Date: Fri, 5 Feb 2021 13:14:05 +0100
Subject: [PATCH 35/69] net/vmw_vsock: improve locking in
 vsock_connect_timeout()

A possible locking issue in vsock_connect_timeout() was recognized by
Eric Dumazet which might cause a null pointer dereference in
vsock_transport_cancel_pkt(). This patch assures that
vsock_transport_cancel_pkt() will be called within the lock, so a race
condition won't occur which could result in vsk->transport to be set to NULL.

Fixes: 380feae0def7 ("vsock: cancel packets when failing to connect")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Norbert Slusarek <nslusarek@gmx.net>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://lore.kernel.org/r/trinity-f8e0937a-cf0e-4d80-a76e-d9a958ba3ef1-1612535522360@3c-app-gmx-bap12
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/af_vsock.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index cb81cfb47a78..4ea301fc2bf0 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1243,7 +1243,6 @@ static void vsock_connect_timeout(struct work_struct *work)
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
-	int cancel = 0;
 
 	vsk = container_of(work, struct vsock_sock, connect_work.work);
 	sk = sk_vsock(vsk);
@@ -1254,11 +1253,9 @@ static void vsock_connect_timeout(struct work_struct *work)
 		sk->sk_state = TCP_CLOSE;
 		sk->sk_err = ETIMEDOUT;
 		sk->sk_error_report(sk);
-		cancel = 1;
+		vsock_transport_cancel_pkt(vsk);
 	}
 	release_sock(sk);
-	if (cancel)
-		vsock_transport_cancel_pkt(vsk);
 
 	sock_put(sk);
 }

From 225353c070fda18a23785e34e1eec2be508a3a3c Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Fri, 5 Feb 2021 21:51:14 +0200
Subject: [PATCH 36/69] net: ena: Update XDP verdict upon failure

The verdict returned from ena_xdp_execute() is used to determine the
fate of the RX buffer's page. In case of XDP Redirect/TX error the
verdict should be set to XDP_ABORTED, otherwise the page won't be freed.

Fixes: a318c70ad152 ("net: ena: introduce XDP redirect implementation")
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 06596fa1f9fe..a0596c073ddd 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -404,6 +404,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp)
 		if (unlikely(!xdpf)) {
 			trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict);
 			xdp_stat = &rx_ring->rx_stats.xdp_aborted;
+			verdict = XDP_ABORTED;
 			break;
 		}
 
@@ -424,7 +425,10 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp)
 			xdp_stat = &rx_ring->rx_stats.xdp_redirect;
 			break;
 		}
-		fallthrough;
+		trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict);
+		xdp_stat = &rx_ring->rx_stats.xdp_aborted;
+		verdict = XDP_ABORTED;
+		break;
 	case XDP_ABORTED:
 		trace_xdp_exception(rx_ring->netdev, xdp_prog, verdict);
 		xdp_stat = &rx_ring->rx_stats.xdp_aborted;

From b6c14d7a83802046f7098e9bae78fbde23affa74 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Wed, 3 Feb 2021 20:19:24 +0100
Subject: [PATCH 37/69] dmaengine dw: Revert "dmaengine: dw: Enable runtime PM"

This reverts commit 842067940a3e3fc008a60fee388e000219b32632.
For some solutions e.g. sound/soc/intel/catpt, DW DMA is part of a
compound device (in that very example, domains: ADSP, SSP0, SSP1, DMA0
and DMA1 are part of a single entity) rather than being a standalone
one. Driver for said device may enlist DMA to transfer data during
suspend or resume sequences.

Manipulating RPM explicitly in dw's DMA request and release channel
functions causes suspend() to also invoke resume() for the exact same
device. Similar situation occurs for resume() sequence. Effectively
renders device dysfunctional after first suspend() attempt. Revert the
change to address the problem.

Fixes: 842067940a3e ("dmaengine: dw: Enable runtime PM")
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Acked-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20210203191924.15706-1-cezary.rojewski@intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw/core.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 19a23767533a..7ab83fe601ed 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -982,11 +982,8 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 
 	dev_vdbg(chan2dev(chan), "%s\n", __func__);
 
-	pm_runtime_get_sync(dw->dma.dev);
-
 	/* ASSERT:  channel is idle */
 	if (dma_readl(dw, CH_EN) & dwc->mask) {
-		pm_runtime_put_sync_suspend(dw->dma.dev);
 		dev_dbg(chan2dev(chan), "DMA channel not idle?\n");
 		return -EIO;
 	}
@@ -1003,7 +1000,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	 * We need controller-specific data to set up slave transfers.
 	 */
 	if (chan->private && !dw_dma_filter(chan, chan->private)) {
-		pm_runtime_put_sync_suspend(dw->dma.dev);
 		dev_warn(chan2dev(chan), "Wrong controller-specific data\n");
 		return -EINVAL;
 	}
@@ -1047,8 +1043,6 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	if (!dw->in_use)
 		do_dw_dma_off(dw);
 
-	pm_runtime_put_sync_suspend(dw->dma.dev);
-
 	dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
 }
 

From 3c55e94c0adea4a5389c4b80f6ae9927dd6a4501 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 4 Feb 2021 18:25:37 +0100
Subject: [PATCH 38/69] cpufreq: ACPI: Extend frequency tables to cover boost
 frequencies

A severe performance regression on AMD EPYC processors when using
the schedutil scaling governor was discovered by Phoronix.com and
attributed to the following commits:

  41ea667227ba ("x86, sched: Calculate frequency invariance for AMD
  systems")

  976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for
  frequency invariance on AMD EPYC")

The source of the problem is that the maximum performance level taken
for computing the arch_max_freq_ratio value used in the x86 scale-
invariance code is higher than the one corresponding to the
cpuinfo.max_freq value coming from the acpi_cpufreq driver.

This effectively causes the scale-invariant utilization to fall below
100% even if the CPU runs at cpuinfo.max_freq or slightly faster, so
the schedutil governor selects a frequency below cpuinfo.max_freq
then.  That frequency corresponds to a frequency table entry below
the maximum performance level necessary to get to the "boost" range
of CPU frequencies.

However, if the cpuinfo.max_freq value coming from acpi_cpufreq was
higher, the schedutil governor would select higher frequencies which
in turn would allow acpi_cpufreq to set more adequate performance
levels and to get to the "boost" range of CPU frequencies more often.

This issue affects any systems where acpi_cpufreq is used and the
"boost" (or "turbo") frequencies are enabled, not just AMD EPYC.
Moreover, commit db865272d9c4 ("cpufreq: Avoid configuring old
governors as default with intel_pstate") from the 5.10 development
cycle made it extremely easy to default to schedutil even if the
preferred driver is acpi_cpufreq as long as intel_pstate is built
too, because the mere presence of the latter effectively removes the
ondemand governor from the defaults.  Distro kernels are likely to
include both intel_pstate and acpi_cpufreq on x86, so their users
who cannot use intel_pstate or choose to use acpi_cpufreq may
easily be affectecd by this issue.

To address this issue, extend the frequency table constructed by
acpi_cpufreq for each CPU to cover the entire range of available
frequencies (including the "boost" ones) if CPPC is available and
indicates that "boost" (or "turbo") frequencies are enabled.  That
causes cpuinfo.max_freq to become the maximum "boost" frequency of
the given CPU (instead of the maximum frequency returned by the ACPI
_PSS object that corresponds to the "nominal" performance level).

Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems")
Fixes: 976df7e5730e ("x86, sched: Use midpoint of max_boost and max_P for frequency invariance on AMD EPYC")
Fixes: db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate")
Link: https://www.phoronix.com/scan.php?page=article&item=linux511-amd-schedutil&num=1
Link: https://lore.kernel.org/linux-pm/20210203135321.12253-2-ggherdovich@suse.cz/
Reported-by: Michael Larabel <Michael@phoronix.com>
Diagnosed-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Reviewed-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Tested-by: Michael Larabel <Michael@phoronix.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 109 +++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 1e4fbb002a31..4614f1c6f50a 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -26,6 +26,7 @@
 #include <linux/uaccess.h>
 
 #include <acpi/processor.h>
+#include <acpi/cppc_acpi.h>
 
 #include <asm/msr.h>
 #include <asm/processor.h>
@@ -53,6 +54,7 @@ struct acpi_cpufreq_data {
 	unsigned int resume;
 	unsigned int cpu_feature;
 	unsigned int acpi_perf_cpu;
+	unsigned int first_perf_state;
 	cpumask_var_t freqdomain_cpus;
 	void (*cpu_freq_write)(struct acpi_pct_register *reg, u32 val);
 	u32 (*cpu_freq_read)(struct acpi_pct_register *reg);
@@ -221,10 +223,10 @@ static unsigned extract_msr(struct cpufreq_policy *policy, u32 msr)
 
 	perf = to_perf_data(data);
 
-	cpufreq_for_each_entry(pos, policy->freq_table)
+	cpufreq_for_each_entry(pos, policy->freq_table + data->first_perf_state)
 		if (msr == perf->states[pos->driver_data].status)
 			return pos->frequency;
-	return policy->freq_table[0].frequency;
+	return policy->freq_table[data->first_perf_state].frequency;
 }
 
 static unsigned extract_freq(struct cpufreq_policy *policy, u32 val)
@@ -363,6 +365,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	struct cpufreq_policy *policy;
 	unsigned int freq;
 	unsigned int cached_freq;
+	unsigned int state;
 
 	pr_debug("%s (%d)\n", __func__, cpu);
 
@@ -374,7 +377,11 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	if (unlikely(!data || !policy->freq_table))
 		return 0;
 
-	cached_freq = policy->freq_table[to_perf_data(data)->state].frequency;
+	state = to_perf_data(data)->state;
+	if (state < data->first_perf_state)
+		state = data->first_perf_state;
+
+	cached_freq = policy->freq_table[state].frequency;
 	freq = extract_freq(policy, get_cur_val(cpumask_of(cpu), data));
 	if (freq != cached_freq) {
 		/*
@@ -628,16 +635,54 @@ static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
 }
 #endif
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+static u64 get_max_boost_ratio(unsigned int cpu)
+{
+	struct cppc_perf_caps perf_caps;
+	u64 highest_perf, nominal_perf;
+	int ret;
+
+	if (acpi_pstate_strict)
+		return 0;
+
+	ret = cppc_get_perf_caps(cpu, &perf_caps);
+	if (ret) {
+		pr_debug("CPU%d: Unable to get performance capabilities (%d)\n",
+			 cpu, ret);
+		return 0;
+	}
+
+	highest_perf = perf_caps.highest_perf;
+	nominal_perf = perf_caps.nominal_perf;
+
+	if (!highest_perf || !nominal_perf) {
+		pr_debug("CPU%d: highest or nominal performance missing\n", cpu);
+		return 0;
+	}
+
+	if (highest_perf < nominal_perf) {
+		pr_debug("CPU%d: nominal performance above highest\n", cpu);
+		return 0;
+	}
+
+	return div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
+}
+#else
+static inline u64 get_max_boost_ratio(unsigned int cpu) { return 0; }
+#endif
+
 static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	unsigned int i;
-	unsigned int valid_states = 0;
-	unsigned int cpu = policy->cpu;
-	struct acpi_cpufreq_data *data;
-	unsigned int result = 0;
-	struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
-	struct acpi_processor_performance *perf;
 	struct cpufreq_frequency_table *freq_table;
+	struct acpi_processor_performance *perf;
+	struct acpi_cpufreq_data *data;
+	unsigned int cpu = policy->cpu;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned int valid_states = 0;
+	unsigned int result = 0;
+	unsigned int state_count;
+	u64 max_boost_ratio;
+	unsigned int i;
 #ifdef CONFIG_SMP
 	static int blacklisted;
 #endif
@@ -750,8 +795,20 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		goto err_unreg;
 	}
 
-	freq_table = kcalloc(perf->state_count + 1, sizeof(*freq_table),
-			     GFP_KERNEL);
+	state_count = perf->state_count + 1;
+
+	max_boost_ratio = get_max_boost_ratio(cpu);
+	if (max_boost_ratio) {
+		/*
+		 * Make a room for one more entry to represent the highest
+		 * available "boost" frequency.
+		 */
+		state_count++;
+		valid_states++;
+		data->first_perf_state = valid_states;
+	}
+
+	freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL);
 	if (!freq_table) {
 		result = -ENOMEM;
 		goto err_unreg;
@@ -785,6 +842,30 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		valid_states++;
 	}
 	freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+
+	if (max_boost_ratio) {
+		unsigned int state = data->first_perf_state;
+		unsigned int freq = freq_table[state].frequency;
+
+		/*
+		 * Because the loop above sorts the freq_table entries in the
+		 * descending order, freq is the maximum frequency in the table.
+		 * Assume that it corresponds to the CPPC nominal frequency and
+		 * use it to populate the frequency field of the extra "boost"
+		 * frequency entry.
+		 */
+		freq_table[0].frequency = freq * max_boost_ratio >> SCHED_CAPACITY_SHIFT;
+		/*
+		 * The purpose of the extra "boost" frequency entry is to make
+		 * the rest of cpufreq aware of the real maximum frequency, but
+		 * the way to request it is the same as for the first_perf_state
+		 * entry that is expected to cover the entire range of "boost"
+		 * frequencies of the CPU, so copy the driver_data value from
+		 * that entry.
+		 */
+		freq_table[0].driver_data = freq_table[state].driver_data;
+	}
+
 	policy->freq_table = freq_table;
 	perf->state = 0;
 
@@ -858,8 +939,10 @@ static void acpi_cpufreq_cpu_ready(struct cpufreq_policy *policy)
 {
 	struct acpi_processor_performance *perf = per_cpu_ptr(acpi_perf_data,
 							      policy->cpu);
+	struct acpi_cpufreq_data *data = policy->driver_data;
+	unsigned int freq = policy->freq_table[data->first_perf_state].frequency;
 
-	if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
+	if (perf->states[0].core_frequency * 1000 != freq)
 		pr_warn(FW_WARN "P-state 0 is not max freq\n");
 }
 

From d11a1d08a082a7dc0ada423d2b2e26e9b6f2525c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 4 Feb 2021 18:34:32 +0100
Subject: [PATCH 39/69] cpufreq: ACPI: Update arch scale-invariance max perf
 ratio if CPPC is not there

If the maximum performance level taken for computing the
arch_max_freq_ratio value used in the x86 scale-invariance code is
higher than the one corresponding to the cpuinfo.max_freq value
coming from the acpi_cpufreq driver, the scale-invariant utilization
falls below 100% even if the CPU runs at cpuinfo.max_freq or slightly
faster, which causes the schedutil governor to select a frequency
below cpuinfo.max_freq.  That frequency corresponds to a frequency
table entry below the maximum performance level necessary to get to
the "boost" range of CPU frequencies which prevents "boost"
frequencies from being used in some workloads.

While this issue is related to scale-invariance, it may be amplified
by commit db865272d9c4 ("cpufreq: Avoid configuring old governors as
default with intel_pstate") from the 5.10 development cycle which
made it extremely easy to default to schedutil even if the preferred
driver is acpi_cpufreq as long as intel_pstate is built too, because
the mere presence of the latter effectively removes the ondemand
governor from the defaults.  Distro kernels are likely to include
both intel_pstate and acpi_cpufreq on x86, so their users who cannot
use intel_pstate or choose to use acpi_cpufreq may easily be
affectecd by this issue.

If CPPC is available, it can be used to address this issue by
extending the frequency tables created by acpi_cpufreq to cover the
entire available frequency range (including "boost" frequencies) for
each CPU, but if CPPC is not there, acpi_cpufreq has no idea what
the maximum "boost" frequency is and the frequency tables created by
it cannot be extended in a meaningful way, so in that case make it
ask the arch scale-invariance code to to use the "nominal" performance
level for CPU utilization scaling in order to avoid the issue at hand.

Fixes: db865272d9c4 ("cpufreq: Avoid configuring old governors as default with intel_pstate")
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/smpboot.c      | 1 +
 drivers/cpufreq/acpi-cpufreq.c | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 117e24fbfd8a..02813a7f3a7c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1833,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled)
 	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
 					arch_turbo_freq_ratio;
 }
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
 
 static bool turbo_disabled(void)
 {
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 4614f1c6f50a..d3e5a6fceb61 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -806,6 +806,14 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		state_count++;
 		valid_states++;
 		data->first_perf_state = valid_states;
+	} else {
+		/*
+		 * If the maximum "boost" frequency is unknown, ask the arch
+		 * scale-invariance code to use the "nominal" performance for
+		 * CPU utilization scaling so as to prevent the schedutil
+		 * governor from selecting inadequate CPU frequencies.
+		 */
+		arch_set_max_freq_ratio(true);
 	}
 
 	freq_table = kcalloc(state_count, sizeof(*freq_table), GFP_KERNEL);

From fe0af09074bfeb46a35357e67635eefe33cdfc49 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sat, 6 Feb 2021 09:49:37 +0100
Subject: [PATCH 40/69] Revert "ACPICA: Interpreter: fix memory leak by using
 existing buffer"

This reverts commit 32cf1a12cad43358e47dac8014379c2f33dfbed4.

The 'exisitng buffer' in this case is the firmware provided table, and
we should not modify that in place. This fixes a crash on arm64 with
initrd table overrides, in which case the DSDT is not mapped with
read/write permissions.

Reported-by: Shawn Guo <shawn.guo@linaro.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Shawn Guo <shawn.guo@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/nsrepair2.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c
index d2c8d8279e7a..24c197d91f29 100644
--- a/drivers/acpi/acpica/nsrepair2.c
+++ b/drivers/acpi/acpica/nsrepair2.c
@@ -495,8 +495,9 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info,
 		   union acpi_operand_object **return_object_ptr)
 {
 	union acpi_operand_object *return_object = *return_object_ptr;
-	char *dest;
+	union acpi_operand_object *new_string;
 	char *source;
+	char *dest;
 
 	ACPI_FUNCTION_NAME(ns_repair_HID);
 
@@ -517,6 +518,13 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info,
 		return_ACPI_STATUS(AE_OK);
 	}
 
+	/* It is simplest to always create a new string object */
+
+	new_string = acpi_ut_create_string_object(return_object->string.length);
+	if (!new_string) {
+		return_ACPI_STATUS(AE_NO_MEMORY);
+	}
+
 	/*
 	 * Remove a leading asterisk if present. For some unknown reason, there
 	 * are many machines in the field that contains IDs like this.
@@ -526,7 +534,7 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info,
 	source = return_object->string.pointer;
 	if (*source == '*') {
 		source++;
-		return_object->string.length--;
+		new_string->string.length--;
 
 		ACPI_DEBUG_PRINT((ACPI_DB_REPAIR,
 				  "%s: Removed invalid leading asterisk\n",
@@ -541,11 +549,12 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info,
 	 * "NNNN####" where N is an uppercase letter or decimal digit, and
 	 * # is a hex digit.
 	 */
-	for (dest = return_object->string.pointer; *source; dest++, source++) {
+	for (dest = new_string->string.pointer; *source; dest++, source++) {
 		*dest = (char)toupper((int)*source);
 	}
-	return_object->string.pointer[return_object->string.length] = 0;
 
+	acpi_ut_remove_reference(return_object);
+	*return_object_ptr = new_string;
 	return_ACPI_STATUS(AE_OK);
 }
 

From af8085f3a4712c57d0dd415ad543bac85780375c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 5 Feb 2021 11:36:30 +1100
Subject: [PATCH 41/69] net: fix iteration for sctp transport seq_files

The sctp transport seq_file iterators take a reference to the transport
in the ->start and ->next functions and releases the reference in the
->show function.  The preferred handling for such resources is to
release them in the subsequent ->next or ->stop function call.

Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration
code and interface") there is no guarantee that ->show will be called
after ->next, so this function can now leak references.

So move the sctp_transport_put() call to ->next and ->stop.

Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface")
Reported-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/sctp/proc.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index f7da88ae20a5..982a87b3e11f 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -215,6 +215,12 @@ static void sctp_transport_seq_stop(struct seq_file *seq, void *v)
 {
 	struct sctp_ht_iter *iter = seq->private;
 
+	if (v && v != SEQ_START_TOKEN) {
+		struct sctp_transport *transport = v;
+
+		sctp_transport_put(transport);
+	}
+
 	sctp_transport_walk_stop(&iter->hti);
 }
 
@@ -222,6 +228,12 @@ static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct sctp_ht_iter *iter = seq->private;
 
+	if (v && v != SEQ_START_TOKEN) {
+		struct sctp_transport *transport = v;
+
+		sctp_transport_put(transport);
+	}
+
 	++*pos;
 
 	return sctp_transport_get_next(seq_file_net(seq), &iter->hti);
@@ -277,8 +289,6 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 		sk->sk_rcvbuf);
 	seq_printf(seq, "\n");
 
-	sctp_transport_put(transport);
-
 	return 0;
 }
 
@@ -354,8 +364,6 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "\n");
 	}
 
-	sctp_transport_put(transport);
-
 	return 0;
 }
 

From ce7536bc7398e2ae552d2fabb7e0e371a9f1fe46 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 8 Feb 2021 15:44:54 +0100
Subject: [PATCH 42/69] vsock/virtio: update credit only if socket is not
 closed

If the socket is closed or is being released, some resources used by
virtio_transport_space_update() such as 'vsk->trans' may be released.

To avoid a use after free bug we should only update the available credit
when we are sure the socket is still open and we have the lock held.

Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20210208144454.84438-1-sgarzare@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/virtio_transport_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 5956939eebb7..e4370b1b7494 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1130,8 +1130,6 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
 
 	vsk = vsock_sk(sk);
 
-	space_available = virtio_transport_space_update(sk, pkt);
-
 	lock_sock(sk);
 
 	/* Check if sk has been closed before lock_sock */
@@ -1142,6 +1140,8 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
 		goto free_pkt;
 	}
 
+	space_available = virtio_transport_space_update(sk, pkt);
+
 	/* Update CID in case it has changed after a transport reset event */
 	vsk->local_addr.svm_cid = dst.svm_cid;
 

From 07998281c268592963e1cd623fe6ab0270b65ae4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 5 Feb 2021 12:56:43 +0100
Subject: [PATCH 43/69] netfilter: conntrack: skip identical origin tuple in
 same zone only

The origin skip check needs to re-test the zone. Else, we might skip
a colliding tuple in the reply direction.

This only occurs when using 'directional zones' where origin tuples
reside in different zones but the reply tuples share the same zone.

This causes the new conntrack entry to be dropped at confirmation time
because NAT clash resolution was elided.

Fixes: 4e35c1cb9460240 ("netfilter: nf_nat: skip nat clash resolution for same-origin entries")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 234b7cab37c3..ff0168736f6e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1229,7 +1229,8 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 			 * Let nf_ct_resolve_clash() deal with this later.
 			 */
 			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
+					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+					      nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
 				continue;
 
 			NF_CT_STAT_INC_ATOMIC(net, found);

From 664899e85c1312e51d2761e7f8b2f25d053e8489 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 8 Feb 2021 13:20:47 +0100
Subject: [PATCH 44/69] netfilter: nftables: relax check for stateful
 expressions in set definition

Restore the original behaviour where users are allowed to add an element
with any stateful expression if the set definition specifies no stateful
expressions. Make sure upper maximum number of stateful expressions of
NFT_SET_EXPR_MAX is not reached.

Fixes: 8cfd9b0f8515 ("netfilter: nftables: generalize set expressions support")
Fixes: 48b0ae046ee9 ("netfilter: nftables: netlink support for several set element expressions")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 43fe80f10313..8ee9f40cc0ea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5281,6 +5281,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
 	u8 genmask = nft_genmask_next(ctx->net);
+	u32 flags = 0, size = 0, num_exprs = 0;
 	struct nft_set_ext_tmpl tmpl;
 	struct nft_set_ext *ext, *ext2;
 	struct nft_set_elem elem;
@@ -5290,7 +5291,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	struct nft_data_desc desc;
 	enum nft_registers dreg;
 	struct nft_trans *trans;
-	u32 flags = 0, size = 0;
 	u64 timeout;
 	u64 expiration;
 	int err, i;
@@ -5356,7 +5356,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (nla[NFTA_SET_ELEM_EXPR]) {
 		struct nft_expr *expr;
 
-		if (set->num_exprs != 1)
+		if (set->num_exprs && set->num_exprs != 1)
 			return -EOPNOTSUPP;
 
 		expr = nft_set_elem_expr_alloc(ctx, set,
@@ -5365,8 +5365,9 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			return PTR_ERR(expr);
 
 		expr_array[0] = expr;
+		num_exprs = 1;
 
-		if (set->exprs[0] && set->exprs[0]->ops != expr->ops) {
+		if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
 			err = -EOPNOTSUPP;
 			goto err_set_elem_expr;
 		}
@@ -5375,12 +5376,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		struct nlattr *tmp;
 		int left;
 
-		if (set->num_exprs == 0)
-			return -EOPNOTSUPP;
-
 		i = 0;
 		nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
-			if (i == set->num_exprs) {
+			if (i == NFT_SET_EXPR_MAX ||
+			    (set->num_exprs && set->num_exprs == i)) {
 				err = -E2BIG;
 				goto err_set_elem_expr;
 			}
@@ -5394,14 +5393,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 				goto err_set_elem_expr;
 			}
 			expr_array[i] = expr;
+			num_exprs++;
 
-			if (expr->ops != set->exprs[i]->ops) {
+			if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
 				err = -EOPNOTSUPP;
 				goto err_set_elem_expr;
 			}
 			i++;
 		}
-		if (set->num_exprs != i) {
+		if (set->num_exprs && set->num_exprs != i) {
 			err = -EOPNOTSUPP;
 			goto err_set_elem_expr;
 		}
@@ -5409,6 +5409,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		err = nft_set_elem_expr_clone(ctx, set, expr_array);
 		if (err < 0)
 			goto err_set_elem_expr_clone;
+
+		num_exprs = set->num_exprs;
 	}
 
 	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
@@ -5433,8 +5435,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
 	}
 
-	if (set->num_exprs) {
-		for (i = 0; i < set->num_exprs; i++)
+	if (num_exprs) {
+		for (i = 0; i < num_exprs; i++)
 			size += expr_array[i]->ops->size;
 
 		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
@@ -5522,7 +5524,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		*nft_set_ext_obj(ext) = obj;
 		obj->use++;
 	}
-	for (i = 0; i < set->num_exprs; i++)
+	for (i = 0; i < num_exprs; i++)
 		nft_set_elem_expr_setup(ext, i, expr_array);
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
@@ -5584,7 +5586,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 err_parse_key:
 	nft_data_release(&elem.key.val, NFT_DATA_VALUE);
 err_set_elem_expr:
-	for (i = 0; i < set->num_exprs && expr_array[i]; i++)
+	for (i = 0; i < num_exprs && expr_array[i]; i++)
 		nft_expr_destroy(ctx, expr_array[i]);
 err_set_elem_expr_clone:
 	return err;

From 3aa6bce9af0e25b735c9c1263739a5639a336ae8 Mon Sep 17 00:00:00 2001
From: Edwin Peer <edwin.peer@broadcom.com>
Date: Fri, 5 Feb 2021 17:37:32 -0800
Subject: [PATCH 45/69] net: watchdog: hold device global xmit lock during tx
 disable

Prevent netif_tx_disable() running concurrently with dev_watchdog() by
taking the device global xmit lock. Otherwise, the recommended:

	netif_carrier_off(dev);
	netif_tx_disable(dev);

driver shutdown sequence can happen after the watchdog has already
checked carrier, resulting in possible false alarms. This is because
netif_tx_lock() only sets the frozen bit without maintaining the locks
on the individual queues.

Fixes: c3f26a269c24 ("netdev: Fix lockdep warnings in multiqueue configurations.")
Signed-off-by: Edwin Peer <edwin.peer@broadcom.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 259be67644e3..5ff27c12ce68 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4352,6 +4352,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 
 	local_bh_disable();
 	cpu = smp_processor_id();
+	spin_lock(&dev->tx_global_lock);
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 
@@ -4359,6 +4360,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 		netif_tx_stop_queue(txq);
 		__netif_tx_unlock(txq);
 	}
+	spin_unlock(&dev->tx_global_lock);
 	local_bh_enable();
 }
 

From b2bdba1cbc84cadb14393d0101a5bfd38d342e0a Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Sat, 6 Feb 2021 22:47:33 +0100
Subject: [PATCH 46/69] bridge: mrp: Fix the usage of
 br_mrp_port_switchdev_set_state

The function br_mrp_port_switchdev_set_state was called both with MRP
port state and STP port state, which is an issue because they don't
match exactly.

Therefore, update the function to be used only with STP port state and
use the id SWITCHDEV_ATTR_ID_PORT_STP_STATE.

The choice of using STP over MRP is that the drivers already implement
SWITCHDEV_ATTR_ID_PORT_STP_STATE and already in SW we update the port
STP state.

Fixes: 9a9f26e8f7ea30 ("bridge: mrp: Connect MRP API with the switchdev API")
Fixes: fadd409136f0f2 ("bridge: switchdev: mrp: Implement MRP API for switchdev")
Fixes: 2f1a11ae11d222 ("bridge: mrp: Add MRP interface.")
Reported-by: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_mrp.c           | 9 ++++++---
 net/bridge/br_mrp_switchdev.c | 7 +++----
 net/bridge/br_private_mrp.h   | 3 +--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index cec2c4e4561d..5aeae6ad17b3 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -557,19 +557,22 @@ int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance)
 int br_mrp_set_port_state(struct net_bridge_port *p,
 			  enum br_mrp_port_state_type state)
 {
+	u32 port_state;
+
 	if (!p || !(p->flags & BR_MRP_AWARE))
 		return -EINVAL;
 
 	spin_lock_bh(&p->br->lock);
 
 	if (state == BR_MRP_PORT_STATE_FORWARDING)
-		p->state = BR_STATE_FORWARDING;
+		port_state = BR_STATE_FORWARDING;
 	else
-		p->state = BR_STATE_BLOCKING;
+		port_state = BR_STATE_BLOCKING;
 
+	p->state = port_state;
 	spin_unlock_bh(&p->br->lock);
 
-	br_mrp_port_switchdev_set_state(p, state);
+	br_mrp_port_switchdev_set_state(p, port_state);
 
 	return 0;
 }
diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c
index ed547e03ace1..75a7e8d0a268 100644
--- a/net/bridge/br_mrp_switchdev.c
+++ b/net/bridge/br_mrp_switchdev.c
@@ -169,13 +169,12 @@ int br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
 	return err;
 }
 
-int br_mrp_port_switchdev_set_state(struct net_bridge_port *p,
-				    enum br_mrp_port_state_type state)
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state)
 {
 	struct switchdev_attr attr = {
 		.orig_dev = p->dev,
-		.id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE,
-		.u.mrp_port_state = state,
+		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+		.u.stp_state = state,
 	};
 	int err;
 
diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h
index 32a48e5418da..2514954c1431 100644
--- a/net/bridge/br_private_mrp.h
+++ b/net/bridge/br_private_mrp.h
@@ -72,8 +72,7 @@ int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp,
 int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
 				    u32 interval, u8 max_miss, u32 period,
 				    bool monitor);
-int br_mrp_port_switchdev_set_state(struct net_bridge_port *p,
-				    enum br_mrp_port_state_type state);
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state);
 int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
 				   enum br_mrp_port_role_type role);
 int br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,

From 059d2a1004981dce19f0127dabc1b4ec927d202a Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Sat, 6 Feb 2021 22:47:34 +0100
Subject: [PATCH 47/69] switchdev: mrp: Remove SWITCHDEV_ATTR_ID_MRP_PORT_STAT

Now that MRP started to use also SWITCHDEV_ATTR_ID_PORT_STP_STATE to
notify HW, then SWITCHDEV_ATTR_ID_MRP_PORT_STAT is not used anywhere
else, therefore we can remove it.

Fixes: c284b545900830 ("switchdev: mrp: Extend switchdev API to offload MRP")
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/switchdev.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 99cd538d6519..afdf8bd1b4fe 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -42,7 +42,6 @@ enum switchdev_attr_id {
 	SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
 	SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
-	SWITCHDEV_ATTR_ID_MRP_PORT_STATE,
 	SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
 #endif
 };
@@ -62,7 +61,6 @@ struct switchdev_attr {
 		u16 vlan_protocol;			/* BRIDGE_VLAN_PROTOCOL */
 		bool mc_disabled;			/* MC_DISABLED */
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
-		u8 mrp_port_state;			/* MRP_PORT_STATE */
 		u8 mrp_port_role;			/* MRP_PORT_ROLE */
 #endif
 	} u;

From eb4733d7cffc547e08fe5a216e4f03663bb71108 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 8 Feb 2021 19:36:27 +0200
Subject: [PATCH 48/69] net: dsa: felix: implement port flushing on
 .phylink_mac_link_down

There are several issues which may be seen when the link goes down while
forwarding traffic, all of which can be attributed to the fact that the
port flushing procedure from the reference manual was not closely
followed.

With flow control enabled on both the ingress port and the egress port,
it may happen when a link goes down that Ethernet packets are in flight.
In flow control mode, frames are held back and not dropped. When there
is enough traffic in flight (example: iperf3 TCP), then the ingress port
might enter congestion and never exit that state. This is a problem,
because it is the egress port's link that went down, and that has caused
the inability of the ingress port to send packets to any other port.
This is solved by flushing the egress port's queues when it goes down.

There is also a problem when performing stream splitting for
IEEE 802.1CB traffic (not yet upstream, but a sort of multicast,
basically). There, if one port from the destination ports mask goes
down, splitting the stream towards the other destinations will no longer
be performed. This can be traced down to this line:

	ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG);

which should have been instead, as per the reference manual:

	ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA,
			 DEV_MAC_ENA_CFG);

Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not
DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is
the case, but apparently multicasting to several ports will cause issues
if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set.

I am not sure what the state of the Ocelot VSC7514 driver is, but
probably not as bad as Felix/Seville, since VSC7514 uses phylib and has
the following in ocelot_adjust_link:

	if (!phydev->link)
		return;

therefore the port is not really put down when the link is lost, unlike
the DSA drivers which use .phylink_mac_link_down for that.

Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it
needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h
which are not exported in include/soc/mscc/ and a bugfix patch should
probably not move headers around.

Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/ocelot/felix.c        | 17 ++++++++-
 drivers/net/ethernet/mscc/ocelot.c    | 54 +++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_io.c |  8 ++++
 include/soc/mscc/ocelot.h             |  2 +
 4 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
index 7dc230677b78..45fdb1256dbf 100644
--- a/drivers/net/dsa/ocelot/felix.c
+++ b/drivers/net/dsa/ocelot/felix.c
@@ -233,9 +233,24 @@ static void felix_phylink_mac_link_down(struct dsa_switch *ds, int port,
 {
 	struct ocelot *ocelot = ds->priv;
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	int err;
+
+	ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA,
+			 DEV_MAC_ENA_CFG);
 
-	ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG);
 	ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0);
+
+	err = ocelot_port_flush(ocelot, port);
+	if (err)
+		dev_err(ocelot->dev, "failed to flush port %d: %d\n",
+			port, err);
+
+	/* Put the port in reset. */
+	ocelot_port_writel(ocelot_port,
+			   DEV_CLOCK_CFG_MAC_TX_RST |
+			   DEV_CLOCK_CFG_MAC_RX_RST |
+			   DEV_CLOCK_CFG_LINK_SPEED(OCELOT_SPEED_1000),
+			   DEV_CLOCK_CFG);
 }
 
 static void felix_phylink_mac_link_up(struct dsa_switch *ds, int port,
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index ff87a0bc089c..c072eb5c0764 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -375,6 +375,60 @@ static void ocelot_vlan_init(struct ocelot *ocelot)
 	}
 }
 
+static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port)
+{
+	return ocelot_read_rix(ocelot, QSYS_SW_STATUS, port);
+}
+
+int ocelot_port_flush(struct ocelot *ocelot, int port)
+{
+	int err, val;
+
+	/* Disable dequeuing from the egress queues */
+	ocelot_rmw_rix(ocelot, QSYS_PORT_MODE_DEQUEUE_DIS,
+		       QSYS_PORT_MODE_DEQUEUE_DIS,
+		       QSYS_PORT_MODE, port);
+
+	/* Disable flow control */
+	ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0);
+
+	/* Disable priority flow control */
+	ocelot_fields_write(ocelot, port,
+			    QSYS_SWITCH_PORT_MODE_TX_PFC_ENA, 0);
+
+	/* Wait at least the time it takes to receive a frame of maximum length
+	 * at the port.
+	 * Worst-case delays for 10 kilobyte jumbo frames are:
+	 * 8 ms on a 10M port
+	 * 800 μs on a 100M port
+	 * 80 μs on a 1G port
+	 * 32 μs on a 2.5G port
+	 */
+	usleep_range(8000, 10000);
+
+	/* Disable half duplex backpressure. */
+	ocelot_rmw_rix(ocelot, 0, SYS_FRONT_PORT_MODE_HDX_MODE,
+		       SYS_FRONT_PORT_MODE, port);
+
+	/* Flush the queues associated with the port. */
+	ocelot_rmw_gix(ocelot, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG_FLUSH_ENA,
+		       REW_PORT_CFG, port);
+
+	/* Enable dequeuing from the egress queues. */
+	ocelot_rmw_rix(ocelot, 0, QSYS_PORT_MODE_DEQUEUE_DIS, QSYS_PORT_MODE,
+		       port);
+
+	/* Wait until flushing is complete. */
+	err = read_poll_timeout(ocelot_read_eq_avail, val, !val,
+				100, 2000000, false, ocelot, port);
+
+	/* Clear flushing again. */
+	ocelot_rmw_gix(ocelot, 0, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG, port);
+
+	return err;
+}
+EXPORT_SYMBOL(ocelot_port_flush);
+
 void ocelot_adjust_link(struct ocelot *ocelot, int port,
 			struct phy_device *phydev)
 {
diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c
index 0acb45948418..ea4e83410fe4 100644
--- a/drivers/net/ethernet/mscc/ocelot_io.c
+++ b/drivers/net/ethernet/mscc/ocelot_io.c
@@ -71,6 +71,14 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
 }
 EXPORT_SYMBOL(ocelot_port_writel);
 
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg)
+{
+	u32 cur = ocelot_port_readl(port, reg);
+
+	ocelot_port_writel(port, (cur & (~mask)) | val, reg);
+}
+EXPORT_SYMBOL(ocelot_port_rmwl);
+
 u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target,
 			    u32 reg, u32 offset)
 {
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2f4cd3288bcc..c34b9ccb6472 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -709,6 +709,7 @@ struct ocelot_policer {
 /* I/O */
 u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
 void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg);
 u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
 void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
 void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
@@ -737,6 +738,7 @@ int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset);
 int ocelot_get_ts_info(struct ocelot *ocelot, int port,
 		       struct ethtool_ts_info *info);
 void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs);
+int ocelot_port_flush(struct ocelot *ocelot, int port);
 void ocelot_adjust_link(struct ocelot *ocelot, int port,
 			struct phy_device *phydev);
 int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled,

From 67a69f84cab60484f02eb8cbc7a76edffbb28a25 Mon Sep 17 00:00:00 2001
From: Yufeng Mo <moyufeng@huawei.com>
Date: Tue, 9 Feb 2021 17:03:05 +0800
Subject: [PATCH 49/69] net: hns3: add a check for queue_id in
 hclge_reset_vf_queue()

The queue_id is received from vf, if use it directly,
an out-of-bound issue may be caused, so add a check for
this queue_id before using it in hclge_reset_vf_queue().

Fixes: 1a426f8b40fc ("net: hns3: fix the VF queue reset flow error")
Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index c242883fea5d..48549db23c52 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -9813,12 +9813,19 @@ int hclge_reset_tqp(struct hnae3_handle *handle, u16 queue_id)
 
 void hclge_reset_vf_queue(struct hclge_vport *vport, u16 queue_id)
 {
+	struct hnae3_handle *handle = &vport->nic;
 	struct hclge_dev *hdev = vport->back;
 	int reset_try_times = 0;
 	int reset_status;
 	u16 queue_gid;
 	int ret;
 
+	if (queue_id >= handle->kinfo.num_tqps) {
+		dev_warn(&hdev->pdev->dev, "Invalid vf queue id(%u)\n",
+			 queue_id);
+		return;
+	}
+
 	queue_gid = hclge_covert_handle_qid_global(&vport->nic, queue_id);
 
 	ret = hclge_send_reset_tqp_cmd(hdev, queue_gid, true);

From 326334aad024a60f46dc5e7dbe1efe32da3ca66f Mon Sep 17 00:00:00 2001
From: Yufeng Mo <moyufeng@huawei.com>
Date: Tue, 9 Feb 2021 17:03:06 +0800
Subject: [PATCH 50/69] net: hns3: add a check for tqp_index in
 hclge_get_ring_chain_from_mbx()

The tqp_index is received from vf, if use it directly,
an out-of-bound issue may be caused, so add a check for
this tqp_index before using it in hclge_get_ring_chain_from_mbx().

Fixes: 84e095d64ed9 ("net: hns3: Change PF to add ring-vect binding & resetQ to mailbox")
Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index 754c09ada901..ea2dea990283 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -158,21 +158,31 @@ static int hclge_get_ring_chain_from_mbx(
 			struct hclge_vport *vport)
 {
 	struct hnae3_ring_chain_node *cur_chain, *new_chain;
+	struct hclge_dev *hdev = vport->back;
 	int ring_num;
-	int i = 0;
+	int i;
 
 	ring_num = req->msg.ring_num;
 
 	if (ring_num > HCLGE_MBX_MAX_RING_CHAIN_PARAM_NUM)
 		return -ENOMEM;
 
+	for (i = 0; i < ring_num; i++) {
+		if (req->msg.param[i].tqp_index >= vport->nic.kinfo.rss_size) {
+			dev_err(&hdev->pdev->dev, "tqp index(%u) is out of range(0-%u)\n",
+				req->msg.param[i].tqp_index,
+				vport->nic.kinfo.rss_size - 1);
+			return -EINVAL;
+		}
+	}
+
 	hnae3_set_bit(ring_chain->flag, HNAE3_RING_TYPE_B,
-		      req->msg.param[i].ring_type);
+		      req->msg.param[0].ring_type);
 	ring_chain->tqp_index =
 		hclge_get_queue_id(vport->nic.kinfo.tqp
-				   [req->msg.param[i].tqp_index]);
+				   [req->msg.param[0].tqp_index]);
 	hnae3_set_field(ring_chain->int_gl_idx, HNAE3_RING_GL_IDX_M,
-			HNAE3_RING_GL_IDX_S, req->msg.param[i].int_gl_index);
+			HNAE3_RING_GL_IDX_S, req->msg.param[0].int_gl_index);
 
 	cur_chain = ring_chain;
 

From 532cfc0df1e4d68e74522ef4a0dcbf6ebbe68287 Mon Sep 17 00:00:00 2001
From: Yufeng Mo <moyufeng@huawei.com>
Date: Tue, 9 Feb 2021 17:03:07 +0800
Subject: [PATCH 51/69] net: hns3: add a check for index in hclge_get_rss_key()

The index is received from vf, if use it directly,
an out-of-bound issue may be caused, so add a check for
this index before using it in hclge_get_rss_key().

Fixes: a638b1d8cc87 ("net: hns3: fix get VF RSS issue")
Signed-off-by: Yufeng Mo <moyufeng@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c    | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
index ea2dea990283..ffb416e088a9 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c
@@ -607,6 +607,17 @@ static void hclge_get_rss_key(struct hclge_vport *vport,
 
 	index = mbx_req->msg.data[0];
 
+	/* Check the query index of rss_hash_key from VF, make sure no
+	 * more than the size of rss_hash_key.
+	 */
+	if (((index + 1) * HCLGE_RSS_MBX_RESP_LEN) >
+	      sizeof(vport[0].rss_hash_key)) {
+		dev_warn(&hdev->pdev->dev,
+			 "failed to get the rss hash key, the index(%u) invalid !\n",
+			 index);
+		return;
+	}
+
 	memcpy(resp_msg->data,
 	       &hdev->vport[0].rss_hash_key[index * HCLGE_RSS_MBX_RESP_LEN],
 	       HCLGE_RSS_MBX_RESP_LEN);

From 1c5fae9c9a092574398a17facc31c533791ef232 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Tue, 9 Feb 2021 09:52:19 +0100
Subject: [PATCH 52/69] vsock: fix locking in vsock_shutdown()

In vsock_shutdown() we touched some socket fields without holding the
socket lock, such as 'state' and 'sk_flags'.

Also, after the introduction of multi-transport, we are accessing
'vsk->transport' in vsock_send_shutdown() without holding the lock
and this call can be made while the connection is in progress, so
the transport can change in the meantime.

To avoid issues, we hold the socket lock when we enter in
vsock_shutdown() and release it when we leave.

Among the transports that implement the 'shutdown' callback, only
hyperv_transport acquired the lock. Since the caller now holds it,
we no longer take it.

Fixes: d021c344051a ("VSOCK: Introduce VM Sockets")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/af_vsock.c         | 8 +++++---
 net/vmw_vsock/hyperv_transport.c | 4 ----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 4ea301fc2bf0..5546710d8ac1 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -943,10 +943,12 @@ static int vsock_shutdown(struct socket *sock, int mode)
 	 */
 
 	sk = sock->sk;
+
+	lock_sock(sk);
 	if (sock->state == SS_UNCONNECTED) {
 		err = -ENOTCONN;
 		if (sk->sk_type == SOCK_STREAM)
-			return err;
+			goto out;
 	} else {
 		sock->state = SS_DISCONNECTING;
 		err = 0;
@@ -955,10 +957,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
 	/* Receive and send shutdowns are treated alike. */
 	mode = mode & (RCV_SHUTDOWN | SEND_SHUTDOWN);
 	if (mode) {
-		lock_sock(sk);
 		sk->sk_shutdown |= mode;
 		sk->sk_state_change(sk);
-		release_sock(sk);
 
 		if (sk->sk_type == SOCK_STREAM) {
 			sock_reset_flag(sk, SOCK_DONE);
@@ -966,6 +966,8 @@ static int vsock_shutdown(struct socket *sock, int mode)
 		}
 	}
 
+out:
+	release_sock(sk);
 	return err;
 }
 
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 630b851f8150..cc3bae2659e7 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -474,14 +474,10 @@ static void hvs_shutdown_lock_held(struct hvsock *hvs, int mode)
 
 static int hvs_shutdown(struct vsock_sock *vsk, int mode)
 {
-	struct sock *sk = sk_vsock(vsk);
-
 	if (!(mode & SEND_SHUTDOWN))
 		return 0;
 
-	lock_sock(sk);
 	hvs_shutdown_lock_held(vsk->trans, mode);
-	release_sock(sk);
 	return 0;
 }
 

From ee114dd64c0071500345439fc79dd5e0f9d106ed Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 5 Feb 2021 17:20:14 +0100
Subject: [PATCH 53/69] bpf: Fix verifier jsgt branch analysis on max bound

Fix incorrect is_branch{32,64}_taken() analysis for the jsgt case. The return
code for both will tell the caller whether a given conditional jump is taken
or not, e.g. 1 means branch will be taken [for the involved registers] and the
goto target will be executed, 0 means branch will not be taken and instead we
fall-through to the next insn, and last but not least a -1 denotes that it is
not known at verification time whether a branch will be taken or not. Now while
the jsgt has the branch-taken case correct with reg->s32_min_value > sval, the
branch-not-taken case is off-by-one when testing for reg->s32_max_value < sval
since the branch will also be taken for reg->s32_max_value == sval. The jgt
branch analysis, for example, gets this right.

Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking")
Fixes: 4f7b3e82589e ("bpf: improve verifier branch analysis")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e7368c5eacb7..67ea0ac3333c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6877,7 +6877,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
 	case BPF_JSGT:
 		if (reg->s32_min_value > sval)
 			return 1;
-		else if (reg->s32_max_value < sval)
+		else if (reg->s32_max_value <= sval)
 			return 0;
 		break;
 	case BPF_JLT:
@@ -6950,7 +6950,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
 	case BPF_JSGT:
 		if (reg->smin_value > sval)
 			return 1;
-		else if (reg->smax_value < sval)
+		else if (reg->smax_value <= sval)
 			return 0;
 		break;
 	case BPF_JLT:

From fd675184fc7abfd1e1c52d23e8e900676b5a1c1a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 5 Feb 2021 20:48:21 +0100
Subject: [PATCH 54/69] bpf: Fix verifier jmp32 pruning decision logic

Anatoly has been fuzzing with kBdysch harness and reported a hang in
one of the outcomes:

  func#0 @0
  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (b7) r0 = 808464450
  1: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (b4) w4 = 808464432
  2: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP808464432 R10=fp0
  2: (9c) w4 %= w0
  3: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
  3: (66) if w4 s> 0x30303030 goto pc+0
   R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0
  4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0
  4: (7f) r0 >>= r0
  5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0
  5: (9c) w4 %= w0
  6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  6: (66) if w0 s> 0x3030 goto pc+0
   R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0
  7: (d6) if w0 s<= 0x303030 goto pc+1
  9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0
  9: (95) exit
  propagating r0

  from 6 to 7: safe
  4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0
  4: (7f) r0 >>= r0
  5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0
  5: (9c) w4 %= w0
  6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  6: (66) if w0 s> 0x3030 goto pc+0
   R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  propagating r0
  7: safe
  propagating r0

  from 6 to 7: safe
  processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1

The underlying program was xlated as follows:

  # bpftool p d x i 10
   0: (b7) r0 = 808464450
   1: (b4) w4 = 808464432
   2: (bc) w0 = w0
   3: (15) if r0 == 0x0 goto pc+1
   4: (9c) w4 %= w0
   5: (66) if w4 s> 0x30303030 goto pc+0
   6: (7f) r0 >>= r0
   7: (bc) w0 = w0
   8: (15) if r0 == 0x0 goto pc+1
   9: (9c) w4 %= w0
  10: (66) if w0 s> 0x3030 goto pc+0
  11: (d6) if w0 s<= 0x303030 goto pc+1
  12: (05) goto pc-1
  13: (95) exit

The verifier rewrote original instructions it recognized as dead code with
'goto pc-1', but reality differs from verifier simulation in that we are
actually able to trigger a hang due to hitting the 'goto pc-1' instructions.

Taking a closer look at the verifier analysis, the reason is that it misjudges
its pruning decision at the first 'from 6 to 7: safe' occasion. What happens
is that while both old/cur registers are marked as precise, they get misjudged
for the jmp32 case as range_within() yields true, meaning that the prior
verification path with a wider register bound could be verified successfully
and therefore the current path with a narrower register bound is deemed safe
as well whereas in reality it's not. R0 old/cur path's bounds compare as
follows:

  old: smin_value=0x8000000000000000,smax_value=0x7fffffffffffffff,umin_value=0x0,umax_value=0xffffffffffffffff,var_off=(0x0; 0xffffffffffffffff)
  cur: smin_value=0x8000000000000000,smax_value=0x7fffffff7fffffff,umin_value=0x0,umax_value=0xffffffff7fffffff,var_off=(0x0; 0xffffffff7fffffff)

  old: s32_min_value=0x80000000,s32_max_value=0x00003030,u32_min_value=0x00000000,u32_max_value=0xffffffff
  cur: s32_min_value=0x00003031,s32_max_value=0x7fffffff,u32_min_value=0x00003031,u32_max_value=0x7fffffff

The 64 bit bounds generally look okay and while the information that got
propagated from 32 to 64 bit looks correct as well, it's not precise enough
for judging a conditional jmp32. Given the latter only operates on subregisters
we also need to take these into account as well for a range_within() probe
in order to be able to prune paths. Extending the range_within() constraint
to both bounds will be able to tell us that the old signed 32 bit bounds are
not wider than the cur signed 32 bit bounds.

With the fix in place, the program will now verify the 'goto' branch case as
it should have been:

  [...]
  6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  6: (66) if w0 s> 0x3030 goto pc+0
   R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0
  7: (d6) if w0 s<= 0x303030 goto pc+1
  9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0
  9: (95) exit

  7: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=12337,u32_min_value=12337,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  7: (d6) if w0 s<= 0x303030 goto pc+1
   R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  8: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0
  8: (30) r0 = *(u8 *)skb[808464432]
  BPF_LD_[ABS|IND] uses reserved fields
  processed 11 insns (limit 1000000) max_states_per_insn 1 total_states 1 peak_states 1 mark_read 1

The bug is quite subtle in the sense that when verifier would determine that
a given branch is dead code, it would (here: wrongly) remove these instructions
from the program and hard-wire the taken branch for privileged programs instead
of the 'goto pc-1' rewrites which will cause hard to debug problems.

Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking")
Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 67ea0ac3333c..24c0e9224f3c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8590,7 +8590,11 @@ static bool range_within(struct bpf_reg_state *old,
 	return old->umin_value <= cur->umin_value &&
 	       old->umax_value >= cur->umax_value &&
 	       old->smin_value <= cur->smin_value &&
-	       old->smax_value >= cur->smax_value;
+	       old->smax_value >= cur->smax_value &&
+	       old->u32_min_value <= cur->u32_min_value &&
+	       old->u32_max_value >= cur->u32_max_value &&
+	       old->s32_min_value <= cur->s32_min_value &&
+	       old->s32_max_value >= cur->s32_max_value;
 }
 
 /* Maximum number of register states that can exist at once */

From e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Feb 2021 18:46:10 +0000
Subject: [PATCH 55/69] bpf: Fix 32 bit src register truncation on div/mod

While reviewing a different fix, John and I noticed an oddity in one of the
BPF program dumps that stood out, for example:

  # bpftool p d x i 13
   0: (b7) r0 = 808464450
   1: (b4) w4 = 808464432
   2: (bc) w0 = w0
   3: (15) if r0 == 0x0 goto pc+1
   4: (9c) w4 %= w0
  [...]

In line 2 we noticed that the mov32 would 32 bit truncate the original src
register for the div/mod operation. While for the two operations the dst
register is typically marked unknown e.g. from adjust_scalar_min_max_vals()
the src register is not, and thus verifier keeps tracking original bounds,
simplified:

  0: R1=ctx(id=0,off=0,imm=0) R10=fp0
  0: (b7) r0 = -1
  1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0
  1: (b7) r1 = -1
  2: R0_w=invP-1 R1_w=invP-1 R10=fp0
  2: (3c) w0 /= w1
  3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0
  3: (77) r1 >>= 32
  4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0
  4: (bf) r0 = r1
  5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0
  5: (95) exit
  processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

Runtime result of r0 at exit is 0 instead of expected -1. Remove the
verifier mov32 src rewrite in div/mod and replace it with a jmp32 test
instead. After the fix, we result in the following code generation when
having dividend r1 and divisor r6:

  div, 64 bit:                             div, 32 bit:

   0: (b7) r6 = 8                           0: (b7) r6 = 8
   1: (b7) r1 = 8                           1: (b7) r1 = 8
   2: (55) if r6 != 0x0 goto pc+2           2: (56) if w6 != 0x0 goto pc+2
   3: (ac) w1 ^= w1                         3: (ac) w1 ^= w1
   4: (05) goto pc+1                        4: (05) goto pc+1
   5: (3f) r1 /= r6                         5: (3c) w1 /= w6
   6: (b7) r0 = 0                           6: (b7) r0 = 0
   7: (95) exit                             7: (95) exit

  mod, 64 bit:                             mod, 32 bit:

   0: (b7) r6 = 8                           0: (b7) r6 = 8
   1: (b7) r1 = 8                           1: (b7) r1 = 8
   2: (15) if r6 == 0x0 goto pc+1           2: (16) if w6 == 0x0 goto pc+1
   3: (9f) r1 %= r6                         3: (9c) w1 %= w6
   4: (b7) r0 = 0                           4: (b7) r0 = 0
   5: (95) exit                             5: (95) exit

x86 in particular can throw a 'divide error' exception for div
instruction not only for divisor being zero, but also for the case
when the quotient is too large for the designated register. For the
edx:eax and rdx:rax dividend pair it is not an issue in x86 BPF JIT
since we always zero edx (rdx). Hence really the only protection
needed is against divisor being zero.

Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero")
Co-developed-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 24c0e9224f3c..37581919e050 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -11003,30 +11003,28 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
 		    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
 			bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
-			struct bpf_insn mask_and_div[] = {
-				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+			bool isdiv = BPF_OP(insn->code) == BPF_DIV;
+			struct bpf_insn *patchlet;
+			struct bpf_insn chk_and_div[] = {
 				/* Rx div 0 -> 0 */
-				BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JNE | BPF_K, insn->src_reg,
+					     0, 2, 0),
 				BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
 				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
 				*insn,
 			};
-			struct bpf_insn mask_and_mod[] = {
-				BPF_MOV32_REG(insn->src_reg, insn->src_reg),
+			struct bpf_insn chk_and_mod[] = {
 				/* Rx mod 0 -> Rx */
-				BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
+				BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+					     BPF_JEQ | BPF_K, insn->src_reg,
+					     0, 1, 0),
 				*insn,
 			};
-			struct bpf_insn *patchlet;
 
-			if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
-			    insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
-				patchlet = mask_and_div + (is64 ? 1 : 0);
-				cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
-			} else {
-				patchlet = mask_and_mod + (is64 ? 1 : 0);
-				cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
-			}
+			patchlet = isdiv ? chk_and_div : chk_and_mod;
+			cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
+				      ARRAY_SIZE(chk_and_mod);
 
 			new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
 			if (!new_prog)

From e812cbbbbbb15adbbbee176baa1e8bda53059bf0 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Tue, 9 Feb 2021 13:41:50 -0800
Subject: [PATCH 56/69] squashfs: avoid out of bounds writes in decompressors

Patch series "Squashfs: fix BIO migration regression and add sanity checks".

Patch [1/4] fixes a regression introduced by the "migrate from
ll_rw_block usage to BIO" patch, which has produced a number of
Sysbot/Syzkaller reports.

Patches [2/4], [3/4], and [4/4] fix a number of filesystem corruption
issues which have produced Sysbot reports in the id, inode and xattr
lookup code.

Each patch has been tested against the Sysbot reproducers using the
given kernel configuration.  They have the appropriate "Reported-by:"
lines added.

Additionally, all of the reproducer filesystems are indirectly fixed by
patch [4/4] due to the fact they all have xattr corruption which is now
detected there.

Additional testing with other configurations and architectures (32bit,
big endian), and normal filesystems has also been done to trap any
inadvertent regressions caused by the additional sanity checks.

This patch (of 4):

This is a regression introduced by the patch "migrate from ll_rw_block
usage to BIO".

Sysbot/Syskaller has reported a number of "out of bounds writes" and
"unable to handle kernel paging request in squashfs_decompress" errors
which have been identified as a regression introduced by the above
patch.

Specifically, the patch removed the following sanity check

        if (length < 0 || length > output->length ||
		(index + length) > msblk->bytes_used)

This check did two things:

1. It ensured any reads were not beyond the end of the filesystem

2. It ensured that the "length" field read from the filesystem
   was within the expected maximum length.  Without this any
   corrupted values can over-run allocated buffers.

Link: https://lkml.kernel.org/r/20210204130249.4495-1-phillip@squashfs.org.uk
Link: https://lkml.kernel.org/r/20210204130249.4495-2-phillip@squashfs.org.uk
Fixes: 93e72b3c612adc ("squashfs: migrate from ll_rw_block usage to BIO")
Reported-by: syzbot+6fba78f99b9afd4b5634@syzkaller.appspotmail.com
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Cc: Philippe Liard <pliard@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/block.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 8a19773b5a0b..45f44425d856 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -196,9 +196,15 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
 		length = SQUASHFS_COMPRESSED_SIZE(length);
 		index += 2;
 
-		TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+		TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2,
 		      compressed ? "" : "un", length);
 	}
+	if (length < 0 || length > output->length ||
+			(index + length) > msblk->bytes_used) {
+		res = -EIO;
+		goto out;
+	}
+
 	if (next_index)
 		*next_index = index + length;
 

From f37aa4c7366e23f91b81d00bafd6a7ab54e4a381 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Tue, 9 Feb 2021 13:41:53 -0800
Subject: [PATCH 57/69] squashfs: add more sanity checks in id lookup

Sysbot has reported a number of "slab-out-of-bounds reads" and
"use-after-free read" errors which has been identified as being caused
by a corrupted index value read from the inode.  This could be because
the metadata block is uncompressed, or because the "compression" bit has
been corrupted (turning a compressed block into an uncompressed block).

This patch adds additional sanity checks to detect this, and the
following corruption.

1. It checks against corruption of the ids count.  This can either
   lead to a larger table to be read, or a smaller than expected
   table to be read.

   In the case of a too large ids count, this would often have been
   trapped by the existing sanity checks, but this patch introduces
   a more exact check, which can identify too small values.

2. It checks the contents of the index table for corruption.

Link: https://lkml.kernel.org/r/20210204130249.4495-3-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reported-by: syzbot+b06d57ba83f604522af2@syzkaller.appspotmail.com
Reported-by: syzbot+c021ba012da41ee9807c@syzkaller.appspotmail.com
Reported-by: syzbot+5024636e8b5fd19f0f19@syzkaller.appspotmail.com
Reported-by: syzbot+bcbc661df46657d0fa4f@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/id.c             | 40 ++++++++++++++++++++++++++++--------
 fs/squashfs/squashfs_fs_sb.h |  1 +
 fs/squashfs/super.c          |  6 +++---
 fs/squashfs/xattr.h          | 10 ++++++++-
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index 6be5afe7287d..11581bf31af4 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -35,10 +35,15 @@ int squashfs_get_id(struct super_block *sb, unsigned int index,
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	int block = SQUASHFS_ID_BLOCK(index);
 	int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
-	u64 start_block = le64_to_cpu(msblk->id_table[block]);
+	u64 start_block;
 	__le32 disk_id;
 	int err;
 
+	if (index >= msblk->ids)
+		return -EINVAL;
+
+	start_block = le64_to_cpu(msblk->id_table[block]);
+
 	err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
 							sizeof(disk_id));
 	if (err < 0)
@@ -56,7 +61,10 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
 		u64 id_table_start, u64 next_table, unsigned short no_ids)
 {
 	unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+	unsigned int indexes = SQUASHFS_ID_BLOCKS(no_ids);
+	int n;
 	__le64 *table;
+	u64 start, end;
 
 	TRACE("In read_id_index_table, length %d\n", length);
 
@@ -67,20 +75,36 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
 		return ERR_PTR(-EINVAL);
 
 	/*
-	 * length bytes should not extend into the next table - this check
-	 * also traps instances where id_table_start is incorrectly larger
-	 * than the next table start
+	 * The computed size of the index table (length bytes) should exactly
+	 * match the table start and end points
 	 */
-	if (id_table_start + length > next_table)
+	if (length != (next_table - id_table_start))
 		return ERR_PTR(-EINVAL);
 
 	table = squashfs_read_table(sb, id_table_start, length);
+	if (IS_ERR(table))
+		return table;
 
 	/*
-	 * table[0] points to the first id lookup table metadata block, this
-	 * should be less than id_table_start
+	 * table[0], table[1], ... table[indexes - 1] store the locations
+	 * of the compressed id blocks.   Each entry should be less than
+	 * the next (i.e. table[0] < table[1]), and the difference between them
+	 * should be SQUASHFS_METADATA_SIZE or less.  table[indexes - 1]
+	 * should be less than id_table_start, and again the difference
+	 * should be SQUASHFS_METADATA_SIZE or less
 	 */
-	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
+	for (n = 0; n < (indexes - 1); n++) {
+		start = le64_to_cpu(table[n]);
+		end = le64_to_cpu(table[n + 1]);
+
+		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+			kfree(table);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	start = le64_to_cpu(table[indexes - 1]);
+	if (start >= id_table_start || (id_table_start - start) > SQUASHFS_METADATA_SIZE) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 34c21ffb6df3..166e98806265 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -64,5 +64,6 @@ struct squashfs_sb_info {
 	unsigned int				inodes;
 	unsigned int				fragments;
 	int					xattr_ids;
+	unsigned int				ids;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d6c6593ec169..88cc94be1076 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -166,6 +166,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
 	msblk->inodes = le32_to_cpu(sblk->inodes);
 	msblk->fragments = le32_to_cpu(sblk->fragments);
+	msblk->ids = le16_to_cpu(sblk->no_ids);
 	flags = le16_to_cpu(sblk->flags);
 
 	TRACE("Found valid superblock on %pg\n", sb->s_bdev);
@@ -177,7 +178,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	TRACE("Block size %d\n", msblk->block_size);
 	TRACE("Number of inodes %d\n", msblk->inodes);
 	TRACE("Number of fragments %d\n", msblk->fragments);
-	TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+	TRACE("Number of ids %d\n", msblk->ids);
 	TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
 	TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
 	TRACE("sblk->fragment_table_start %llx\n",
@@ -236,8 +237,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
 allocate_id_index_table:
 	/* Allocate and read id index table */
 	msblk->id_table = squashfs_read_id_index_table(sb,
-		le64_to_cpu(sblk->id_table_start), next_table,
-		le16_to_cpu(sblk->no_ids));
+		le64_to_cpu(sblk->id_table_start), next_table, msblk->ids);
 	if (IS_ERR(msblk->id_table)) {
 		errorf(fc, "unable to read id index table");
 		err = PTR_ERR(msblk->id_table);
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 184129afd456..d8a270d3ac4c 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -17,8 +17,16 @@ extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
 static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
 		u64 start, u64 *xattr_table_start, int *xattr_ids)
 {
+	struct squashfs_xattr_id_table *id_table;
+
+	id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+	if (IS_ERR(id_table))
+		return (__le64 *) id_table;
+
+	*xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
+	kfree(id_table);
+
 	ERROR("Xattrs in filesystem, these will be ignored\n");
-	*xattr_table_start = start;
 	return ERR_PTR(-ENOTSUPP);
 }
 

From eabac19e40c095543def79cb6ffeb3a8588aaff4 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Tue, 9 Feb 2021 13:41:56 -0800
Subject: [PATCH 58/69] squashfs: add more sanity checks in inode lookup

Sysbot has reported an "slab-out-of-bounds read" error which has been
identified as being caused by a corrupted "ino_num" value read from the
inode.  This could be because the metadata block is uncompressed, or
because the "compression" bit has been corrupted (turning a compressed
block into an uncompressed block).

This patch adds additional sanity checks to detect this, and the
following corruption.

1. It checks against corruption of the inodes count.  This can either
   lead to a larger table to be read, or a smaller than expected
   table to be read.

   In the case of a too large inodes count, this would often have been
   trapped by the existing sanity checks, but this patch introduces
   a more exact check, which can identify too small values.

2. It checks the contents of the index table for corruption.

[phillip@squashfs.org.uk: fix checkpatch issue]
  Link: https://lkml.kernel.org/r/527909353.754618.1612769948607@webmail.123-reg.co.uk

Link: https://lkml.kernel.org/r/20210204130249.4495-4-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reported-by: syzbot+04419e3ff19d2970ea28@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/export.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index ae2c87bb0fbe..eb02072d28dd 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -41,12 +41,17 @@ static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
 	int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
-	u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+	u64 start;
 	__le64 ino;
 	int err;
 
 	TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
 
+	if (ino_num == 0 || (ino_num - 1) >= msblk->inodes)
+		return -EINVAL;
+
+	start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+
 	err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
 	if (err < 0)
 		return err;
@@ -111,7 +116,10 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
 		u64 lookup_table_start, u64 next_table, unsigned int inodes)
 {
 	unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+	unsigned int indexes = SQUASHFS_LOOKUP_BLOCKS(inodes);
+	int n;
 	__le64 *table;
+	u64 start, end;
 
 	TRACE("In read_inode_lookup_table, length %d\n", length);
 
@@ -121,20 +129,37 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
 	if (inodes == 0)
 		return ERR_PTR(-EINVAL);
 
-	/* length bytes should not extend into the next table - this check
-	 * also traps instances where lookup_table_start is incorrectly larger
-	 * than the next table start
+	/*
+	 * The computed size of the lookup table (length bytes) should exactly
+	 * match the table start and end points
 	 */
-	if (lookup_table_start + length > next_table)
+	if (length != (next_table - lookup_table_start))
 		return ERR_PTR(-EINVAL);
 
 	table = squashfs_read_table(sb, lookup_table_start, length);
+	if (IS_ERR(table))
+		return table;
 
 	/*
-	 * table[0] points to the first inode lookup table metadata block,
-	 * this should be less than lookup_table_start
+	 * table0], table[1], ... table[indexes - 1] store the locations
+	 * of the compressed inode lookup blocks.  Each entry should be
+	 * less than the next (i.e. table[0] < table[1]), and the difference
+	 * between them should be SQUASHFS_METADATA_SIZE or less.
+	 * table[indexes - 1] should  be less than lookup_table_start, and
+	 * again the difference should be SQUASHFS_METADATA_SIZE or less
 	 */
-	if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
+	for (n = 0; n < (indexes - 1); n++) {
+		start = le64_to_cpu(table[n]);
+		end = le64_to_cpu(table[n + 1]);
+
+		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+			kfree(table);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	start = le64_to_cpu(table[indexes - 1]);
+	if (start >= lookup_table_start || (lookup_table_start - start) > SQUASHFS_METADATA_SIZE) {
 		kfree(table);
 		return ERR_PTR(-EINVAL);
 	}

From 506220d2ba21791314af569211ffd8870b8208fa Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@squashfs.org.uk>
Date: Tue, 9 Feb 2021 13:42:00 -0800
Subject: [PATCH 59/69] squashfs: add more sanity checks in xattr id lookup

Sysbot has reported a warning where a kmalloc() attempt exceeds the
maximum limit.  This has been identified as corruption of the xattr_ids
count when reading the xattr id lookup table.

This patch adds a number of additional sanity checks to detect this
corruption and others.

1. It checks for a corrupted xattr index read from the inode.  This could
   be because the metadata block is uncompressed, or because the
   "compression" bit has been corrupted (turning a compressed block
   into an uncompressed block).  This would cause an out of bounds read.

2. It checks against corruption of the xattr_ids count.  This can either
   lead to the above kmalloc failure, or a smaller than expected
   table to be read.

3. It checks the contents of the index table for corruption.

[phillip@squashfs.org.uk: fix checkpatch issue]
  Link: https://lkml.kernel.org/r/270245655.754655.1612770082682@webmail.123-reg.co.uk

Link: https://lkml.kernel.org/r/20210204130249.4495-5-phillip@squashfs.org.uk
Signed-off-by: Phillip Lougher <phillip@squashfs.org.uk>
Reported-by: syzbot+2ccea6339d368360800d@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/xattr_id.c | 66 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index d99e08464554..ead66670b41a 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -31,10 +31,15 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	int block = SQUASHFS_XATTR_BLOCK(index);
 	int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
-	u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+	u64 start_block;
 	struct squashfs_xattr_id id;
 	int err;
 
+	if (index >= msblk->xattr_ids)
+		return -EINVAL;
+
+	start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+
 	err = squashfs_read_metadata(sb, &id, &start_block, &offset,
 							sizeof(id));
 	if (err < 0)
@@ -50,13 +55,17 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
 /*
  * Read uncompressed xattr id lookup table indexes from disk into memory
  */
-__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
 		u64 *xattr_table_start, int *xattr_ids)
 {
-	unsigned int len;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	unsigned int len, indexes;
 	struct squashfs_xattr_id_table *id_table;
+	__le64 *table;
+	u64 start, end;
+	int n;
 
-	id_table = squashfs_read_table(sb, start, sizeof(*id_table));
+	id_table = squashfs_read_table(sb, table_start, sizeof(*id_table));
 	if (IS_ERR(id_table))
 		return (__le64 *) id_table;
 
@@ -70,13 +79,52 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
 	if (*xattr_ids == 0)
 		return ERR_PTR(-EINVAL);
 
-	/* xattr_table should be less than start */
-	if (*xattr_table_start >= start)
+	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+	indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids);
+
+	/*
+	 * The computed size of the index table (len bytes) should exactly
+	 * match the table start and end points
+	 */
+	start = table_start + sizeof(*id_table);
+	end = msblk->bytes_used;
+
+	if (len != (end - start))
 		return ERR_PTR(-EINVAL);
 
-	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+	table = squashfs_read_table(sb, start, len);
+	if (IS_ERR(table))
+		return table;
 
-	TRACE("In read_xattr_index_table, length %d\n", len);
+	/* table[0], table[1], ... table[indexes - 1] store the locations
+	 * of the compressed xattr id blocks.  Each entry should be less than
+	 * the next (i.e. table[0] < table[1]), and the difference between them
+	 * should be SQUASHFS_METADATA_SIZE or less.  table[indexes - 1]
+	 * should be less than table_start, and again the difference
+	 * shouls be SQUASHFS_METADATA_SIZE or less.
+	 *
+	 * Finally xattr_table_start should be less than table[0].
+	 */
+	for (n = 0; n < (indexes - 1); n++) {
+		start = le64_to_cpu(table[n]);
+		end = le64_to_cpu(table[n + 1]);
 
-	return squashfs_read_table(sb, start + sizeof(*id_table), len);
+		if (start >= end || (end - start) > SQUASHFS_METADATA_SIZE) {
+			kfree(table);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	start = le64_to_cpu(table[indexes - 1]);
+	if (start >= table_start || (table_start - start) > SQUASHFS_METADATA_SIZE) {
+		kfree(table);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (*xattr_table_start >= le64_to_cpu(table[0])) {
+		kfree(table);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return table;
 }

From 1cc4cdb521f9689183474bc89eefc451ac44fa1c Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@google.com>
Date: Tue, 9 Feb 2021 13:42:03 -0800
Subject: [PATCH 60/69] kasan: fix stack traces dependency for HW_TAGS

Currently, whether the alloc/free stack traces collection is enabled by
default for hardware tag-based KASAN depends on CONFIG_DEBUG_KERNEL.
The intention for this dependency was to only enable collection on slow
debug kernels due to a significant perf and memory impact.

As it turns out, CONFIG_DEBUG_KERNEL is not considered a debug option
and is enabled on many productions kernels including Android and Ubuntu.
As the result, this dependency is pointless and only complicates the
code and documentation.

Having stack traces collection disabled by default would make the
hardware mode work differently to to the software ones, which is
confusing.

This change removes the dependency and enables stack traces collection
by default.

Looking into the future, this default might makes sense for production
kernels, assuming we implement a fast stack trace collection approach.

Link: https://lkml.kernel.org/r/6678d77ceffb71f1cff2cf61560e2ffe7bb6bfe9.1612808820.git.andreyknvl@google.com
Signed-off-by: Andrey Konovalov <andreyknvl@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Evgenii Stepanov <eugenis@google.com>
Cc: Branislav Rankov <Branislav.Rankov@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/kasan.rst | 3 +--
 mm/kasan/hw_tags.c                | 8 ++------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index 1651d961f06a..a248ac3941be 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -163,8 +163,7 @@ particular KASAN features.
 - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
 
 - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
-  traces collection (default: ``on`` for ``CONFIG_DEBUG_KERNEL=y``, otherwise
-  ``off``).
+  traces collection (default: ``on``).
 
 - ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
   report or also panic the kernel (default: ``report``).
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index e529428e7a11..d558799b25b3 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -134,12 +134,8 @@ void __init kasan_init_hw_tags(void)
 
 	switch (kasan_arg_stacktrace) {
 	case KASAN_ARG_STACKTRACE_DEFAULT:
-		/*
-		 * Default to enabling stack trace collection for
-		 * debug kernels.
-		 */
-		if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
-			static_branch_enable(&kasan_flag_stacktrace);
+		/* Default to enabling stack trace collection. */
+		static_branch_enable(&kasan_flag_stacktrace);
 		break;
 	case KASAN_ARG_STACKTRACE_OFF:
 		/* Do nothing, kasan_flag_stacktrace keeps its default value. */

From 793f49a87aae24e5bcf92ad98d764153fc936570 Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 9 Feb 2021 13:42:07 -0800
Subject: [PATCH 61/69] firmware_loader: align .builtin_fw to 8

arm64 references the start address of .builtin_fw (__start_builtin_fw)
with a pair of R_AARCH64_ADR_PREL_PG_HI21/R_AARCH64_LDST64_ABS_LO12_NC
relocations.  The compiler is allowed to emit the
R_AARCH64_LDST64_ABS_LO12_NC relocation because struct builtin_fw in
include/linux/firmware.h is 8-byte aligned.

The R_AARCH64_LDST64_ABS_LO12_NC relocation requires the address to be a
multiple of 8, which may not be the case if .builtin_fw is empty.
Unconditionally align .builtin_fw to fix the linker error.  32-bit
architectures could use ALIGN(4) but that would add unnecessary
complexity, so just use ALIGN(8).

Link: https://lkml.kernel.org/r/20201208054646.2913063-1-maskray@google.com
Link: https://github.com/ClangBuiltLinux/linux/issues/1204
Fixes: 5658c76 ("firmware: allow firmware files to be built into kernel image")
Signed-off-by: Fangrui Song <maskray@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index b2b3d81b1535..b97c628ad91f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -459,7 +459,7 @@
 	}								\
 									\
 	/* Built-in firmware blobs */					\
-	.builtin_fw        : AT(ADDR(.builtin_fw) - LOAD_OFFSET) {	\
+	.builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) ALIGN(8) {	\
 		__start_builtin_fw = .;					\
 		KEEP(*(.builtin_fw))					\
 		__end_builtin_fw = .;					\

From a30a29091b5a6d4c64b5fc77040720a65e2dd4e6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Feb 2021 13:42:10 -0800
Subject: [PATCH 62/69] mm/mremap: fix BUILD_BUG_ON() error in get_extent

clang can't evaluate this function argument at compile time when the
function is not inlined, which leads to a link time failure:

  ld.lld: error: undefined symbol: __compiletime_assert_414
  >>> referenced by mremap.c
  >>>               mremap.o:(get_extent) in archive mm/built-in.a

Mark the function as __always_inline to avoid it.

Link: https://lkml.kernel.org/r/20201230154104.522605-1-arnd@kernel.org
Fixes: 9ad9718bfa41 ("mm/mremap: calculate extent in one place")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: Brian Geffon <bgeffon@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mremap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index f554320281cc..aa63bfd3cad2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -336,8 +336,9 @@ enum pgt_entry {
  * valid. Else returns a smaller extent bounded by the end of the source and
  * destination pgt_entry.
  */
-static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr,
-			unsigned long old_end, unsigned long new_addr)
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+			unsigned long old_addr, unsigned long old_end,
+			unsigned long new_addr)
 {
 	unsigned long next, extent, mask, size;
 

From b85a7a8bb5736998b8a681937a9749b350c17988 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 9 Feb 2021 13:42:14 -0800
Subject: [PATCH 63/69] tmpfs: disallow CONFIG_TMPFS_INODE64 on s390

Currently there is an assumption in tmpfs that 64-bit architectures also
have a 64-bit ino_t.  This is not true on s390 which has a 32-bit ino_t.
With CONFIG_TMPFS_INODE64=y tmpfs mounts will get 64-bit inode numbers
and display "inode64" in the mount options, but passing the "inode64"
mount option will fail.  This leads to the following behavior:

  # mkdir mnt
  # mount -t tmpfs nodev mnt
  # mount -o remount,rw mnt
  mount: /home/ubuntu/mnt: mount point not mounted or bad option.

As mount sees "inode64" in the mount options and thus passes it in the
options for the remount.

So prevent CONFIG_TMPFS_INODE64 from being selected on s390.

Link: https://lkml.kernel.org/r/20210205230620.518245-1-seth.forshee@canonical.com
Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb")
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Hugh Dickins <hughd@google.com>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: <stable@vger.kernel.org>	[5.9+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index aa4c12282301..3347ec7bd837 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,7 +203,7 @@ config TMPFS_XATTR
 
 config TMPFS_INODE64
 	bool "Use 64-bit ino_t by default in tmpfs"
-	depends on TMPFS && 64BIT
+	depends on TMPFS && 64BIT && !S390
 	default n
 	help
 	  tmpfs has historically used only inode numbers as wide as an unsigned

From ad69c389ec110ea54f8b0c0884b255340ef1c736 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Tue, 9 Feb 2021 13:42:17 -0800
Subject: [PATCH 64/69] tmpfs: disallow CONFIG_TMPFS_INODE64 on alpha

As with s390, alpha is a 64-bit architecture with a 32-bit ino_t.  With
CONFIG_TMPFS_INODE64=y tmpfs mounts will get 64-bit inode numbers and
display "inode64" in the mount options, whereas passing "inode64" in the
mount options will fail.  This leads to erroneous behaviours such as
this:

  # mkdir mnt
  # mount -t tmpfs nodev mnt
  # mount -o remount,rw mnt
  mount: /home/ubuntu/mnt: mount point not mounted or bad option.

Prevent CONFIG_TMPFS_INODE64 from being selected on alpha.

Link: https://lkml.kernel.org/r/20210208215726.608197-1-seth.forshee@canonical.com
Fixes: ea3271f7196c ("tmpfs: support 64-bit inums per-sb")
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: <stable@vger.kernel.org>	[5.9+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 3347ec7bd837..da524c4d7b7e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,7 +203,7 @@ config TMPFS_XATTR
 
 config TMPFS_INODE64
 	bool "Use 64-bit ino_t by default in tmpfs"
-	depends on TMPFS && 64BIT && !S390
+	depends on TMPFS && 64BIT && !(S390 || ALPHA)
 	default n
 	help
 	  tmpfs has historically used only inode numbers as wide as an unsigned

From d52db800846f66d98a4e14c39cf88a06bcd9985f Mon Sep 17 00:00:00 2001
From: Rong Chen <rong.a.chen@intel.com>
Date: Tue, 9 Feb 2021 13:42:21 -0800
Subject: [PATCH 65/69] selftests/vm: rename file run_vmtests to run_vmtests.sh

Commit c2aa8afc36fa has renamed run_vmtests in Makefile, but the file
still uses the old name.

The kernel test robot reported the following issue:

  # selftests: vm: run_vmtests.sh
  # Warning: file run_vmtests.sh is missing!
  not ok 1 selftests: vm: run_vmtests.sh

Link: https://lkml.kernel.org/r/20210205085507.1479894-1-rong.a.chen@intel.com
Fixes: c2aa8afc36fa (selftests/vm: rename run_vmtests --> run_vmtests.sh)
Signed-off-by: Rong Chen <rong.a.chen@intel.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/vm/{run_vmtests => run_vmtests.sh} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tools/testing/selftests/vm/{run_vmtests => run_vmtests.sh} (100%)

diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests.sh
similarity index 100%
rename from tools/testing/selftests/vm/run_vmtests
rename to tools/testing/selftests/vm/run_vmtests.sh

From a0c2eb0a4387322ebc629c01f5adb2d957c343fe Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Date: Tue, 9 Feb 2021 13:42:24 -0800
Subject: [PATCH 66/69] MAINTAINERS: update Andrey Ryabinin's email address

Update my email, @virtuozzo.com will stop working shortly.

Link: https://lkml.kernel.org/r/20210204223904.3824-1-ryabinin.a.a@gmail.com
Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .mailmap    | 1 +
 MAINTAINERS | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.mailmap b/.mailmap
index d674968df008..7fdf87b24fe8 100644
--- a/.mailmap
+++ b/.mailmap
@@ -37,6 +37,7 @@ Andrew Murray <amurray@thegoodpenguin.co.uk> <amurray@embedded-bits.co.uk>
 Andrew Murray <amurray@thegoodpenguin.co.uk> <andrew.murray@arm.com>
 Andrew Vasquez <andrew.vasquez@qlogic.com>
 Andrey Ryabinin <ryabinin.a.a@gmail.com> <a.ryabinin@samsung.com>
+Andrey Ryabinin <ryabinin.a.a@gmail.com> <aryabinin@virtuozzo.com>
 Andy Adamson <andros@citi.umich.edu>
 Antoine Tenart <atenart@kernel.org> <antoine.tenart@bootlin.com>
 Antoine Tenart <atenart@kernel.org> <antoine.tenart@free-electrons.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 667d03852191..64c7169db617 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9559,7 +9559,7 @@ F:	Documentation/hwmon/k8temp.rst
 F:	drivers/hwmon/k8temp.c
 
 KASAN
-M:	Andrey Ryabinin <aryabinin@virtuozzo.com>
+M:	Andrey Ryabinin <ryabinin.a.a@gmail.com>
 R:	Alexander Potapenko <glider@google.com>
 R:	Dmitry Vyukov <dvyukov@google.com>
 L:	kasan-dev@googlegroups.com

From e82553c10b0899994153f9bf0af333c0a1550fd7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 9 Feb 2021 13:42:28 -0800
Subject: [PATCH 67/69] Revert "mm: memcontrol: avoid workload stalls when
 lowering memory.high"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 536d3bf261a2fc3b05b3e91e7eef7383443015cf, as it can
cause writers to memory.high to get stuck in the kernel forever,
performing page reclaim and consuming excessive amounts of CPU cycles.

Before the patch, a write to memory.high would first put the new limit
in place for the workload, and then reclaim the requested delta.  After
the patch, the kernel tries to reclaim the delta before putting the new
limit into place, in order to not overwhelm the workload with a sudden,
large excess over the limit.  However, if reclaim is actively racing
with new allocations from the uncurbed workload, it can keep the write()
working inside the kernel indefinitely.

This is causing problems in Facebook production.  A privileged
system-level daemon that adjusts memory.high for various workloads
running on a host can get unexpectedly stuck in the kernel and
essentially turn into a sort of involuntary kswapd for one of the
workloads.  We've observed that daemon busy-spin in a write() for
minutes at a time, neglecting its other duties on the system, and
expending privileged system resources on behalf of a workload.

To remedy this, we have first considered changing the reclaim logic to
break out after a couple of loops - whether the workload has converged
to the new limit or not - and bound the write() call this way.  However,
the root cause that inspired the sequence change in the first place has
been fixed through other means, and so a revert back to the proven
limit-setting sequence, also used by memory.max, is preferable.

The sequence was changed to avoid extreme latencies in the workload when
the limit was lowered: the sudden, large excess created by the limit
lowering would erroneously trigger the penalty sleeping code that is
meant to throttle excessive growth from below.  Allocating threads could
end up sleeping long after the write() had already reclaimed the delta
for which they were being punished.

However, erroneous throttling also caused problems in other scenarios at
around the same time.  This resulted in commit b3ff92916af3 ("mm, memcg:
reclaim more aggressively before high allocator throttling"), included
in the same release as the offending commit.  When allocating threads
now encounter large excess caused by a racing write() to memory.high,
instead of entering punitive sleeps, they will simply be tasked with
helping reclaim down the excess, and will be held no longer than it
takes to accomplish that.  This is in line with regular limit
enforcement - i.e.  if the workload allocates up against or over an
otherwise unchanged limit from below.

With the patch breaking userspace, and the root cause addressed by other
means already, revert it again.

Link: https://lkml.kernel.org/r/20210122184341.292461-1-hannes@cmpxchg.org
Fixes: 536d3bf261a2 ("mm: memcontrol: avoid workload stalls when lowering memory.high")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Tejun Heo <tj@kernel.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: <stable@vger.kernel.org>	[5.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2de77b5bcc2..913c2b9e5c72 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6271,6 +6271,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
+	page_counter_set_high(&memcg->memory, high);
+
 	for (;;) {
 		unsigned long nr_pages = page_counter_read(&memcg->memory);
 		unsigned long reclaimed;
@@ -6294,10 +6296,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 			break;
 	}
 
-	page_counter_set_high(&memcg->memory, high);
-
 	memcg_wb_domain_size_changed(memcg);
-
 	return nbytes;
 }
 

From 3286222fc609dea27bd16ac02c55d3f1c3190063 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 9 Feb 2021 13:42:32 -0800
Subject: [PATCH 68/69] mm, slub: better heuristic for number of cpus when
 calculating slab order

When creating a new kmem cache, SLUB determines how large the slab pages
will based on number of inputs, including the number of CPUs in the
system.  Larger slab pages mean that more objects can be allocated/free
from per-cpu slabs before accessing shared structures, but also
potentially more memory can be wasted due to low slab usage and
fragmentation.  The rough idea of using number of CPUs is that larger
systems will be more likely to benefit from reduced contention, and also
should have enough memory to spare.

Number of CPUs used to be determined as nr_cpu_ids, which is number of
possible cpus, but on some systems many will never be onlined, thus
commit 045ab8c9487b ("mm/slub: let number of online CPUs determine the
slub page order") changed it to nr_online_cpus().  However, for kmem
caches created early before CPUs are onlined, this may lead to
permamently low slab page sizes.

Vincent reports a regression [1] of hackbench on arm64 systems:

  "I'm facing significant performances regression on a large arm64
   server system (224 CPUs). Regressions is also present on small arm64
   system (8 CPUs) but in a far smaller order of magnitude

   On 224 CPUs system : 9 iterations of hackbench -l 16000 -g 16
   v5.11-rc4 : 9.135sec (+/- 0.45%)
   v5.11-rc4 + revert this patch: 3.173sec (+/- 0.48%)
   v5.10: 3.136sec (+/- 0.40%)"

Mel reports a regression [2] of hackbench on x86_64, with lockstat suggesting
page allocator contention:

  "i.e. the patch incurs a 7% to 32% performance penalty. This bisected
   cleanly yesterday when I was looking for the regression and then
   found the thread.

   Numerous caches change size. For example, kmalloc-512 goes from
   order-0 (vanilla) to order-2 with the revert.

   So mostly this is down to the number of times SLUB calls into the
   page allocator which only caches order-0 pages on a per-cpu basis"

Clearly num_online_cpus() doesn't work too early in bootup.  We could
change the order dynamically in a memory hotplug callback, but runtime
order changing for existing kmem caches has been already shown as
dangerous, and removed in 32a6f409b693 ("mm, slub: remove runtime
allocation order changes").

It could be resurrected in a safe manner with some effort, but to fix
the regression we need something simpler.

We could use num_present_cpus() that should be the number of physically
present CPUs even before they are onlined.  That would work for PowerPC
[3], which triggered the original commit, but that still doesn't work on
arm64 [4] as explained in [5].

So this patch tries to determine the best available value without
specific arch knowledge.

 - num_present_cpus() if the number is larger than 1, as that means the
   arch is likely setting it properly

 - nr_cpu_ids otherwise

This should fix the reported regressions while also keeping the effect
of 045ab8c9487b for PowerPC systems.  It's possible there are
configurations where num_present_cpus() is 1 during boot while
nr_cpu_ids is at the same time bloated, so these (if they exist) would
keep the large orders based on nr_cpu_ids as was before 045ab8c9487b.

[1] https://lore.kernel.org/linux-mm/CAKfTPtA_JgMf_+zdFbcb_V9rM7JBWNPjAz9irgwFj7Rou=xzZg@mail.gmail.com/
[2] https://lore.kernel.org/linux-mm/20210128134512.GF3592@techsingularity.net/
[3] https://lore.kernel.org/linux-mm/20210123051607.GC2587010@in.ibm.com/
[4] https://lore.kernel.org/linux-mm/CAKfTPtAjyVmS5VYvU6DBxg4-JEo5bdmWbngf-03YsY18cmWv_g@mail.gmail.com/
[5] https://lore.kernel.org/linux-mm/20210126230305.GD30941@willie-the-truck/

Link: https://lkml.kernel.org/r/20210208134108.22286-1-vbabka@suse.cz
Fixes: 045ab8c9487b ("mm/slub: let number of online CPUs determine the slub page order")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Reported-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jann Horn <jannh@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 7ecbbbe5bc0c..b22a4b101c84 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3423,6 +3423,7 @@ static inline int calculate_order(unsigned int size)
 	unsigned int order;
 	unsigned int min_objects;
 	unsigned int max_objects;
+	unsigned int nr_cpus;
 
 	/*
 	 * Attempt to find best configuration for a slab. This
@@ -3433,8 +3434,21 @@ static inline int calculate_order(unsigned int size)
 	 * we reduce the minimum objects required in a slab.
 	 */
 	min_objects = slub_min_objects;
-	if (!min_objects)
-		min_objects = 4 * (fls(num_online_cpus()) + 1);
+	if (!min_objects) {
+		/*
+		 * Some architectures will only update present cpus when
+		 * onlining them, so don't trust the number if it's just 1. But
+		 * we also don't want to use nr_cpu_ids always, as on some other
+		 * architectures, there can be many possible cpus, but never
+		 * onlined. Here we compromise between trying to avoid too high
+		 * order on systems that appear larger than they are, and too
+		 * low order on systems that appear smaller than they are.
+		 */
+		nr_cpus = num_present_cpus();
+		if (nr_cpus <= 1)
+			nr_cpus = nr_cpu_ids;
+		min_objects = 4 * (fls(nr_cpus) + 1);
+	}
 	max_objects = order_objects(slub_max_order, size);
 	min_objects = min(min_objects, max_objects);
 

From a35d8f016e0b68634035217d06d1c53863456b50 Mon Sep 17 00:00:00 2001
From: Joachim Henke <joachim.henke@t-systems.com>
Date: Tue, 9 Feb 2021 13:42:36 -0800
Subject: [PATCH 69/69] nilfs2: make splice write available again

Since 5.10, splice() or sendfile() to NILFS2 return EINVAL.  This was
caused by commit 36e2c7421f02 ("fs: don't allow splice read/write
without explicit ops").

This patch initializes the splice_write field in file_operations, like
most file systems do, to restore the functionality.

Link: https://lkml.kernel.org/r/1612784101-14353-1-git-send-email-konishi.ryusuke@gmail.com
Signed-off-by: Joachim Henke <joachim.henke@t-systems.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Cc: <stable@vger.kernel.org>	[5.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/file.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 64bc81363c6c..e1bd592ce700 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -141,6 +141,7 @@ const struct file_operations nilfs_file_operations = {
 	/* .release	= nilfs_release_file, */
 	.fsync		= nilfs_sync_file,
 	.splice_read	= generic_file_splice_read,
+	.splice_write   = iter_file_splice_write,
 };
 
 const struct inode_operations nilfs_file_inode_operations = {