From bd40b17ca49d7d110adf456e647701ce74de2241 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 31 Jan 2020 05:07:55 -0500
Subject: [PATCH 001/327] XArray: Fix xa_find_next for large multi-index
 entries

Coverity pointed out that xas_sibling() was shifting xa_offset without
promoting it to an unsigned long first, so the shift could cause an
overflow and we'd get the wrong answer.  The fix is obvious, and the
new test-case provokes UBSAN to report an error:
runtime error: shift exponent 60 is too large for 32-bit type 'int'

Fixes: 19c30f4dd092 ("XArray: Fix xa_find_after with multi-index entries")
Reported-by: Bjorn Helgaas <bhelgaas@google.com>
Reported-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@vger.kernel.org
---
 lib/test_xarray.c | 18 ++++++++++++++++++
 lib/xarray.c      |  3 ++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 55c14e8c8859..8c7d7a8468b8 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -12,6 +12,9 @@
 static unsigned int tests_run;
 static unsigned int tests_passed;
 
+static const unsigned int order_limit =
+		IS_ENABLED(CONFIG_XARRAY_MULTI) ? BITS_PER_LONG : 1;
+
 #ifndef XA_DEBUG
 # ifdef __KERNEL__
 void xa_dump(const struct xarray *xa) { }
@@ -959,6 +962,20 @@ static noinline void check_multi_find_2(struct xarray *xa)
 	}
 }
 
+static noinline void check_multi_find_3(struct xarray *xa)
+{
+	unsigned int order;
+
+	for (order = 5; order < order_limit; order++) {
+		unsigned long index = 1UL << (order - 5);
+
+		XA_BUG_ON(xa, !xa_empty(xa));
+		xa_store_order(xa, 0, order - 4, xa_mk_index(0), GFP_KERNEL);
+		XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT));
+		xa_erase_index(xa, 0);
+	}
+}
+
 static noinline void check_find_1(struct xarray *xa)
 {
 	unsigned long i, j, k;
@@ -1081,6 +1098,7 @@ static noinline void check_find(struct xarray *xa)
 	for (i = 2; i < 10; i++)
 		check_multi_find_1(xa, i);
 	check_multi_find_2(xa);
+	check_multi_find_3(xa);
 }
 
 /* See find_swap_entry() in mm/shmem.c */
diff --git a/lib/xarray.c b/lib/xarray.c
index 1d9fab7db8da..acd1fad2e862 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1839,7 +1839,8 @@ static bool xas_sibling(struct xa_state *xas)
 	if (!node)
 		return false;
 	mask = (XA_CHUNK_SIZE << node->shift) - 1;
-	return (xas->xa_index & mask) > (xas->xa_offset << node->shift);
+	return (xas->xa_index & mask) >
+		((unsigned long)xas->xa_offset << node->shift);
 }
 
 /**

From c36d451ad386b34f452fc3c8621ff14b9eaa31a6 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 31 Jan 2020 06:17:09 -0500
Subject: [PATCH 002/327] XArray: Fix xas_pause for large multi-index entries

Inspired by the recent Coverity report, I looked for other places where
the offset wasn't being converted to an unsigned long before being
shifted, and I found one in xas_pause() when the entry being paused is
of order >32.

Fixes: b803b42823d0 ("xarray: Add XArray iterators")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: stable@vger.kernel.org
---
 lib/test_xarray.c | 37 +++++++++++++++++++++++++++++++++++++
 lib/xarray.c      |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 8c7d7a8468b8..d4f97925dbd8 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1156,6 +1156,42 @@ static noinline void check_find_entry(struct xarray *xa)
 	XA_BUG_ON(xa, !xa_empty(xa));
 }
 
+static noinline void check_pause(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+	void *entry;
+	unsigned int order;
+	unsigned long index = 1;
+	unsigned int count = 0;
+
+	for (order = 0; order < order_limit; order++) {
+		XA_BUG_ON(xa, xa_store_order(xa, index, order,
+					xa_mk_index(index), GFP_KERNEL));
+		index += 1UL << order;
+	}
+
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		XA_BUG_ON(xa, entry != xa_mk_index(1UL << count));
+		count++;
+	}
+	rcu_read_unlock();
+	XA_BUG_ON(xa, count != order_limit);
+
+	count = 0;
+	xas_set(&xas, 0);
+	rcu_read_lock();
+	xas_for_each(&xas, entry, ULONG_MAX) {
+		XA_BUG_ON(xa, entry != xa_mk_index(1UL << count));
+		count++;
+		xas_pause(&xas);
+	}
+	rcu_read_unlock();
+	XA_BUG_ON(xa, count != order_limit);
+
+	xa_destroy(xa);
+}
+
 static noinline void check_move_tiny(struct xarray *xa)
 {
 	XA_STATE(xas, xa, 0);
@@ -1664,6 +1700,7 @@ static int xarray_checks(void)
 	check_xa_alloc();
 	check_find(&array);
 	check_find_entry(&array);
+	check_pause(&array);
 	check_account(&array);
 	check_destroy(&array);
 	check_move(&array);
diff --git a/lib/xarray.c b/lib/xarray.c
index acd1fad2e862..05324cf571f4 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -970,7 +970,7 @@ void xas_pause(struct xa_state *xas)
 
 	xas->xa_node = XAS_RESTART;
 	if (node) {
-		unsigned int offset = xas->xa_offset;
+		unsigned long offset = xas->xa_offset;
 		while (++offset < XA_CHUNK_SIZE) {
 			if (!xa_is_sibling(xa_entry(xas->xa, node, offset)))
 				break;

From 24a448b165253b6f2ab1e0bcdba9a733007681d6 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@mykernel.net>
Date: Fri, 6 Dec 2019 18:29:03 +0800
Subject: [PATCH 003/327] XArray: Fix incorrect comment in header file

256 means Retry entry and 257 means Zero entry,
so fix it.

Signed-off-by: Chengguang Xu <cgxu519@mykernel.net>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index f73e1775ded0..a491653d8882 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -32,8 +32,8 @@
  * The following internal entries have a special meaning:
  *
  * 0-62: Sibling entries
- * 256: Zero entry
- * 257: Retry entry
+ * 256: Retry entry
+ * 257: Zero entry
  *
  * Errors are also represented as internal entries, but use the negative
  * space (-4094 to -2).  They're never stored in the slots array; only

From 3a00e7c47c382b30524e78b36ab047c16b8fcfef Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linux.alibaba.com>
Date: Tue, 21 Jan 2020 16:34:05 +0800
Subject: [PATCH 004/327] ida: remove abandoned macros

3 IDA_ started macros aren't used from commit f32f004cddf8 ("ida: Convert
to XArray"). so better to remove them.

Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 lib/radix-tree.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index c8fa1d274530..2ee6ae3b0ade 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -55,14 +55,6 @@ struct kmem_cache *radix_tree_node_cachep;
 						RADIX_TREE_MAP_SHIFT))
 #define IDR_PRELOAD_SIZE	(IDR_MAX_PATH * 2 - 1)
 
-/*
- * The IDA is even shorter since it uses a bitmap at the last level.
- */
-#define IDA_INDEX_BITS		(8 * sizeof(int) - 1 - ilog2(IDA_BITMAP_BITS))
-#define IDA_MAX_PATH		(DIV_ROUND_UP(IDA_INDEX_BITS, \
-						RADIX_TREE_MAP_SHIFT))
-#define IDA_PRELOAD_SIZE	(IDA_MAX_PATH * 2 - 1)
-
 /*
  * Per-cpu pool of preloaded nodes
  */

From 8f11538ebe984e5434eeda4c7183d165cddb5936 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Jan 2020 10:14:09 -0500
Subject: [PATCH 005/327] do_add_mount(): lift lock_mount/unlock_mount into
 callers

preparation to finish_automount() fix (next commit)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 85b5f7bea82e..dcd015fafe01 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2697,45 +2697,32 @@ static int do_move_mount_old(struct path *path, const char *old_name)
 /*
  * add a mount into a namespace's mount tree
  */
-static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
+static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
+			struct path *path, int mnt_flags)
 {
-	struct mountpoint *mp;
-	struct mount *parent;
-	int err;
+	struct mount *parent = real_mount(path->mnt);
 
 	mnt_flags &= ~MNT_INTERNAL_FLAGS;
 
-	mp = lock_mount(path);
-	if (IS_ERR(mp))
-		return PTR_ERR(mp);
-
-	parent = real_mount(path->mnt);
-	err = -EINVAL;
 	if (unlikely(!check_mnt(parent))) {
 		/* that's acceptable only for automounts done in private ns */
 		if (!(mnt_flags & MNT_SHRINKABLE))
-			goto unlock;
+			return -EINVAL;
 		/* ... and for those we'd better have mountpoint still alive */
 		if (!parent->mnt_ns)
-			goto unlock;
+			return -EINVAL;
 	}
 
 	/* Refuse the same filesystem on the same mount point */
-	err = -EBUSY;
 	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
 	    path->mnt->mnt_root == path->dentry)
-		goto unlock;
+		return -EBUSY;
 
-	err = -EINVAL;
 	if (d_is_symlink(newmnt->mnt.mnt_root))
-		goto unlock;
+		return -EINVAL;
 
 	newmnt->mnt.mnt_flags = mnt_flags;
-	err = graft_tree(newmnt, parent, mp);
-
-unlock:
-	unlock_mount(mp);
-	return err;
+	return graft_tree(newmnt, parent, mp);
 }
 
 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
@@ -2748,6 +2735,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
 			   unsigned int mnt_flags)
 {
 	struct vfsmount *mnt;
+	struct mountpoint *mp;
 	struct super_block *sb = fc->root->d_sb;
 	int error;
 
@@ -2768,7 +2756,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
 
 	mnt_warn_timestamp_expiry(mountpoint, mnt);
 
-	error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags);
+	mp = lock_mount(mountpoint);
+	if (IS_ERR(mp)) {
+		mntput(mnt);
+		return PTR_ERR(mp);
+	}
+	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
+	unlock_mount(mp);
 	if (error < 0)
 		mntput(mnt);
 	return error;
@@ -2830,6 +2824,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 int finish_automount(struct vfsmount *m, struct path *path)
 {
 	struct mount *mnt = real_mount(m);
+	struct mountpoint *mp;
 	int err;
 	/* The new mount record should have at least 2 refs to prevent it being
 	 * expired before we get a chance to add it
@@ -2842,7 +2837,13 @@ int finish_automount(struct vfsmount *m, struct path *path)
 		goto fail;
 	}
 
-	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	mp = lock_mount(path);
+	if (IS_ERR(mp)) {
+		err = PTR_ERR(mp);
+		goto fail;
+	}
+	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	unlock_mount(mp);
 	if (!err)
 		return 0;
 fail:

From 034c7678dd2c05fb08e6fa3b0ac527ead8f1b11b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:34 -0800
Subject: [PATCH 006/327] selftests/resctrl: Add README for resctrl tests

resctrl tests will be implemented. README is added for the tool first.

Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/README | 53 ++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 tools/testing/selftests/resctrl/README

diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README
new file mode 100644
index 000000000000..6e5a0ffa18e8
--- /dev/null
+++ b/tools/testing/selftests/resctrl/README
@@ -0,0 +1,53 @@
+resctrl_tests - resctrl file system test suit
+
+Authors:
+	Fenghua Yu <fenghua.yu@intel.com>
+	Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+
+resctrl_tests tests various resctrl functionalities and interfaces including
+both software and hardware.
+
+Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth
+Allocation test on Intel RDT hardware. More tests will be added in the future.
+And the test suit can be extended to cover AMD QoS and ARM MPAM hardware
+as well.
+
+BUILD
+-----
+
+Run "make" to build executable file "resctrl_tests".
+
+RUN
+---
+
+To use resctrl_tests, root or sudoer privileges are required. This is because
+the test needs to mount resctrl file system and change contents in the file
+system.
+
+Executing the test without any parameter will run all supported tests:
+
+	sudo ./resctrl_tests
+
+OVERVIEW OF EXECUTION
+---------------------
+
+A test case has four stages:
+
+  - setup: mount resctrl file system, create group, setup schemata, move test
+    process pids to tasks, start benchmark.
+  - execute: let benchmark run
+  - verify: get resctrl data and verify the data with another source, e.g.
+    perf event.
+  - teardown: umount resctrl and clear temporary files.
+
+ARGUMENTS
+---------
+
+Parameter '-h' shows usage information.
+
+usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits]
+        -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf
+        -t test list: run tests specified in the test list, e.g. -t mbm, mba, cqm, cat
+        -n no_of_bits: run cache tests using specified no of bits in cache bit mask
+        -p cpu_no: specify CPU number to run the test. 1 is default
+        -h: help

From 591a6e8588fc1dbdc04355e8ad7b0be43d221c9c Mon Sep 17 00:00:00 2001
From: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Date: Thu, 16 Jan 2020 13:32:35 -0800
Subject: [PATCH 007/327] selftests/resctrl: Add basic resctrl file system
 operations and data

The basic resctrl file system operations and data are added for future
usage by resctrl selftest tool.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/Makefile    |  12 +
 tools/testing/selftests/resctrl/resctrl.h   |  50 +++
 tools/testing/selftests/resctrl/resctrlfs.c | 462 ++++++++++++++++++++
 3 files changed, 524 insertions(+)
 create mode 100644 tools/testing/selftests/resctrl/Makefile
 create mode 100644 tools/testing/selftests/resctrl/resctrl.h
 create mode 100644 tools/testing/selftests/resctrl/resctrlfs.c

diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile
new file mode 100644
index 000000000000..76bbd6d3e4a8
--- /dev/null
+++ b/tools/testing/selftests/resctrl/Makefile
@@ -0,0 +1,12 @@
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -g -Wall
+SRCS=$(wildcard *.c)
+OBJS=$(SRCS:.c=.o)
+
+$(OBJS): $(SRCS)
+	$(CC) $(CFLAGS) -c $(SRCS)
+
+.PHONY: clean
+
+clean:
+	$(RM) $(OBJS)
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
new file mode 100644
index 000000000000..ba98cd6efc64
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+#ifndef RESCTRL_H
+#define RESCTRL_H
+#include <stdio.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <dirent.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <asm/unistd.h>
+#include <linux/perf_event.h>
+
+#define RESCTRL_PATH		"/sys/fs/resctrl"
+#define PHYS_ID_PATH		"/sys/devices/system/cpu/cpu"
+
+#define PARENT_EXIT(err_msg)			\
+	do {					\
+		perror(err_msg);		\
+		kill(ppid, SIGKILL);		\
+		exit(EXIT_FAILURE);		\
+	} while (0)
+
+pid_t bm_pid, ppid;
+
+int remount_resctrlfs(bool mum_resctrlfs);
+int get_resource_id(int cpu_no, int *resource_id);
+int validate_bw_report_request(char *bw_report);
+bool validate_resctrl_feature_request(char *resctrl_val);
+char *fgrep(FILE *inf, const char *str);
+int taskset_benchmark(pid_t bm_pid, int cpu_no);
+void run_benchmark(int signum, siginfo_t *info, void *ucontext);
+int write_schemata(char *ctrlgrp, char *schemata, int cpu_no,
+		   char *resctrl_val);
+int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
+			    char *resctrl_val);
+int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
+		    int group_fd, unsigned long flags);
+int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush,
+		 int op, char *resctrl_va);
+
+#endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
new file mode 100644
index 000000000000..1b125f9d8e5d
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic resctrl file system operations
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+int tests_run;
+
+static int find_resctrl_mount(char *buffer)
+{
+	FILE *mounts;
+	char line[256], *fs, *mntpoint;
+
+	mounts = fopen("/proc/mounts", "r");
+	if (!mounts) {
+		perror("/proc/mounts");
+		return -ENXIO;
+	}
+	while (!feof(mounts)) {
+		if (!fgets(line, 256, mounts))
+			break;
+		fs = strtok(line, " \t");
+		if (!fs)
+			continue;
+		mntpoint = strtok(NULL, " \t");
+		if (!mntpoint)
+			continue;
+		fs = strtok(NULL, " \t");
+		if (!fs)
+			continue;
+		if (strcmp(fs, "resctrl"))
+			continue;
+
+		fclose(mounts);
+		if (buffer)
+			strncpy(buffer, mntpoint, 256);
+
+		return 0;
+	}
+
+	fclose(mounts);
+
+	return -ENOENT;
+}
+
+/*
+ * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl
+ * @mum_resctrlfs:	Should the resctrl FS be remounted?
+ *
+ * If not mounted, mount it.
+ * If mounted and mum_resctrlfs then remount resctrl FS.
+ * If mounted and !mum_resctrlfs then noop
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int remount_resctrlfs(bool mum_resctrlfs)
+{
+	char mountpoint[256];
+	int ret;
+
+	ret = find_resctrl_mount(mountpoint);
+	if (ret)
+		strcpy(mountpoint, RESCTRL_PATH);
+
+	if (!ret && mum_resctrlfs && umount(mountpoint)) {
+		printf("not ok unmounting \"%s\"\n", mountpoint);
+		perror("# umount");
+		tests_run++;
+	}
+
+	if (!ret && !mum_resctrlfs)
+		return 0;
+
+	ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL);
+	printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "",
+	       RESCTRL_PATH);
+	if (ret)
+		perror("# mount");
+
+	tests_run++;
+
+	return ret;
+}
+
+int umount_resctrlfs(void)
+{
+	if (umount(RESCTRL_PATH)) {
+		perror("# Unable to umount resctrl");
+
+		return errno;
+	}
+
+	return 0;
+}
+
+/*
+ * get_resource_id - Get socket number/l3 id for a specified CPU
+ * @cpu_no:	CPU number
+ * @resource_id: Socket number or l3_id
+ *
+ * Return: >= 0 on success, < 0 on failure.
+ */
+int get_resource_id(int cpu_no, int *resource_id)
+{
+	char phys_pkg_path[1024];
+	FILE *fp;
+
+	sprintf(phys_pkg_path, "%s%d/topology/physical_package_id",
+		PHYS_ID_PATH, cpu_no);
+	fp = fopen(phys_pkg_path, "r");
+	if (!fp) {
+		perror("Failed to open physical_package_id");
+
+		return -1;
+	}
+	if (fscanf(fp, "%d", resource_id) <= 0) {
+		perror("Could not get socket number or l3 id");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu
+ * @bm_pid:	PID that should be binded
+ * @cpu_no:	CPU number at which the PID would be binded
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int taskset_benchmark(pid_t bm_pid, int cpu_no)
+{
+	cpu_set_t my_set;
+
+	CPU_ZERO(&my_set);
+	CPU_SET(cpu_no, &my_set);
+
+	if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &my_set)) {
+		perror("Unable to taskset benchmark");
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * run_benchmark - Run a specified benchmark or fill_buf (default benchmark)
+ *		   in specified signal. Direct benchmark stdio to /dev/null.
+ * @signum:	signal number
+ * @info:	signal info
+ * @ucontext:	user context in signal handling
+ *
+ * Return: void
+ */
+void run_benchmark(int signum, siginfo_t *info, void *ucontext)
+{
+	unsigned long span;
+	int operation, ret;
+	char **benchmark_cmd;
+	FILE *fp;
+
+	benchmark_cmd = info->si_ptr;
+
+	/*
+	 * Direct stdio of child to /dev/null, so that only parent writes to
+	 * stdio (console)
+	 */
+	fp = freopen("/dev/null", "w", stdout);
+	if (!fp)
+		PARENT_EXIT("Unable to direct benchmark status to /dev/null");
+
+	if (strcmp(benchmark_cmd[0], "fill_buf") == 0) {
+		/* Execute default fill_buf benchmark */
+		span = strtoul(benchmark_cmd[1], NULL, 10);
+		operation = atoi(benchmark_cmd[4]);
+		if (run_fill_buf(span, 1, 1, operation, NULL))
+			fprintf(stderr, "Error in running fill buffer\n");
+	} else {
+		/* Execute specified benchmark */
+		ret = execvp(benchmark_cmd[0], benchmark_cmd);
+		if (ret)
+			perror("wrong\n");
+	}
+
+	fclose(stdout);
+	PARENT_EXIT("Unable to run specified benchmark");
+}
+
+/*
+ * create_grp - Create a group only if one doesn't exist
+ * @grp_name:	Name of the group
+ * @grp:	Full path and name of the group
+ * @parent_grp:	Full path and name of the parent group
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+static int create_grp(const char *grp_name, char *grp, const char *parent_grp)
+{
+	int found_grp = 0;
+	struct dirent *ep;
+	DIR *dp;
+
+	/* Check if requested grp exists or not */
+	dp = opendir(parent_grp);
+	if (dp) {
+		while ((ep = readdir(dp)) != NULL) {
+			if (strcmp(ep->d_name, grp_name) == 0)
+				found_grp = 1;
+		}
+		closedir(dp);
+	} else {
+		perror("Unable to open resctrl for group");
+
+		return -1;
+	}
+
+	/* Requested grp doesn't exist, hence create it */
+	if (found_grp == 0) {
+		if (mkdir(grp, 0) == -1) {
+			perror("Unable to create group");
+
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static int write_pid_to_tasks(char *tasks, pid_t pid)
+{
+	FILE *fp;
+
+	fp = fopen(tasks, "w");
+	if (!fp) {
+		perror("Failed to open tasks file");
+
+		return -1;
+	}
+	if (fprintf(fp, "%d\n", pid) < 0) {
+		perror("Failed to wr pid to tasks file");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * write_bm_pid_to_resctrl - Write a PID (i.e. benchmark) to resctrl FS
+ * @bm_pid:		PID that should be written
+ * @ctrlgrp:		Name of the control monitor group (con_mon grp)
+ * @mongrp:		Name of the monitor group (mon grp)
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ *
+ * If a con_mon grp is requested, create it and write pid to it, otherwise
+ * write pid to root con_mon grp.
+ * If a mon grp is requested, create it and write pid to it, otherwise
+ * pid is not written, this means that pid is in con_mon grp and hence
+ * should consult con_mon grp's mon_data directory for results.
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
+			    char *resctrl_val)
+{
+	char controlgroup[128], monitorgroup[512], monitorgroup_p[256];
+	char tasks[1024];
+	int ret = 0;
+
+	if (strlen(ctrlgrp))
+		sprintf(controlgroup, "%s/%s", RESCTRL_PATH, ctrlgrp);
+	else
+		sprintf(controlgroup, "%s", RESCTRL_PATH);
+
+	/* Create control and monitoring group and write pid into it */
+	ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH);
+	if (ret)
+		goto out;
+	sprintf(tasks, "%s/tasks", controlgroup);
+	ret = write_pid_to_tasks(tasks, bm_pid);
+	if (ret)
+		goto out;
+
+	/* Create mon grp and write pid into it for "mbm" test */
+	if ((strcmp(resctrl_val, "mbm") == 0)) {
+		if (mongrp) {
+			sprintf(monitorgroup_p, "%s/mon_groups", controlgroup);
+			sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp);
+			ret = create_grp(mongrp, monitorgroup, monitorgroup_p);
+			if (ret)
+				goto out;
+
+			sprintf(tasks, "%s/mon_groups/%s/tasks",
+				controlgroup, mongrp);
+			ret = write_pid_to_tasks(tasks, bm_pid);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	printf("%sok writing benchmark parameters to resctrl FS\n",
+	       ret ? "not " : "");
+	if (ret)
+		perror("# writing to resctrlfs");
+
+	tests_run++;
+
+	return ret;
+}
+
+/*
+ * write_schemata - Update schemata of a con_mon grp
+ * @ctrlgrp:		Name of the con_mon grp
+ * @schemata:		Schemata that should be updated to
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ *
+ * Update schemata of a con_mon grp *only* if requested resctrl feature is
+ * allocation type
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
+{
+	char controlgroup[1024], schema[1024], reason[64];
+	int resource_id, ret = 0;
+	FILE *fp;
+
+	if (strcmp(resctrl_val, "mba") != 0)
+		return -ENOENT;
+
+	if (!schemata) {
+		printf("# Skipping empty schemata update\n");
+
+		return -1;
+	}
+
+	if (get_resource_id(cpu_no, &resource_id) < 0) {
+		sprintf(reason, "Failed to get resource id");
+		ret = -1;
+
+		goto out;
+	}
+
+	if (strlen(ctrlgrp) != 0)
+		sprintf(controlgroup, "%s/%s/schemata", RESCTRL_PATH, ctrlgrp);
+	else
+		sprintf(controlgroup, "%s/schemata", RESCTRL_PATH);
+
+	sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata);
+
+	fp = fopen(controlgroup, "w");
+	if (!fp) {
+		sprintf(reason, "Failed to open control group");
+		ret = -1;
+
+		goto out;
+	}
+
+	if (fprintf(fp, "%s\n", schema) < 0) {
+		sprintf(reason, "Failed to write schemata in control group");
+		fclose(fp);
+		ret = -1;
+
+		goto out;
+	}
+	fclose(fp);
+
+out:
+	printf("%sok Write schema \"%s\" to resctrl FS%s%s\n",
+	       ret ? "not " : "", schema, ret ? " # " : "",
+	       ret ? reason : "");
+	tests_run++;
+
+	return ret;
+}
+
+char *fgrep(FILE *inf, const char *str)
+{
+	char line[256];
+	int slen = strlen(str);
+
+	while (!feof(inf)) {
+		if (!fgets(line, 256, inf))
+			break;
+		if (strncmp(line, str, slen))
+			continue;
+
+		return strdup(line);
+	}
+
+	return NULL;
+}
+
+/*
+ * validate_resctrl_feature_request - Check if requested feature is valid.
+ * @resctrl_val:	Requested feature
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+bool validate_resctrl_feature_request(char *resctrl_val)
+{
+	FILE *inf = fopen("/proc/cpuinfo", "r");
+	bool found = false;
+	char *res;
+
+	if (!inf)
+		return false;
+
+	res = fgrep(inf, "flags");
+
+	if (res) {
+		char *s = strchr(res, ':');
+
+		found = s && !strstr(s, resctrl_val);
+		free(res);
+	}
+	fclose(inf);
+
+	return found;
+}
+
+int validate_bw_report_request(char *bw_report)
+{
+	if (strcmp(bw_report, "reads") == 0)
+		return 0;
+	if (strcmp(bw_report, "writes") == 0)
+		return 0;
+	if (strcmp(bw_report, "nt-writes") == 0) {
+		strcpy(bw_report, "writes");
+		return 0;
+	}
+	if (strcmp(bw_report, "total") == 0)
+		return 0;
+
+	fprintf(stderr, "Requested iMC B/W report type unavailable\n");
+
+	return -1;
+}
+
+int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	int ret;
+
+	ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
+		      group_fd, flags);
+	return ret;
+}

From 1d3f08687d76c0a80be35bbab6d64b1a8f2730e8 Mon Sep 17 00:00:00 2001
From: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Date: Thu, 16 Jan 2020 13:32:36 -0800
Subject: [PATCH 008/327] selftests/resctrl: Read memory bandwidth from perf
 IMC counter and from resctrl file system

Total memory bandwidth can be monitored from perf IMC counter and from
resctrl file system. Later the two will be compared to verify the total
memory bandwidth read from resctrl is correct.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl_val.c | 117 ++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 tools/testing/selftests/resctrl/resctrl_val.c

diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
new file mode 100644
index 000000000000..0ca4c9252516
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory bandwidth monitoring and allocation library
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define UNCORE_IMC		"uncore_imc"
+#define READ_FILE_NAME		"events/cas_count_read"
+#define WRITE_FILE_NAME		"events/cas_count_write"
+#define DYN_PMU_PATH		"/sys/bus/event_source/devices"
+#define SCALE			0.00006103515625
+#define MAX_IMCS		20
+#define MAX_TOKENS		5
+#define READ			0
+#define WRITE			1
+#define CON_MON_MBM_LOCAL_BYTES_PATH				\
+	"%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define CON_MBM_LOCAL_BYTES_PATH		\
+	"%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define MON_MBM_LOCAL_BYTES_PATH		\
+	"%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define MBM_LOCAL_BYTES_PATH			\
+	"%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+struct membw_read_format {
+	__u64 value;         /* The value of the event */
+	__u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+	__u64 time_running;  /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+	__u64 id;            /* if PERF_FORMAT_ID */
+};
+
+struct imc_counter_config {
+	__u32 type;
+	__u64 event;
+	__u64 umask;
+	struct perf_event_attr pe;
+	struct membw_read_format return_value;
+	int fd;
+};
+
+static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
+
+void membw_initialize_perf_event_attr(int i, int j)
+{
+	memset(&imc_counters_config[i][j].pe, 0,
+	       sizeof(struct perf_event_attr));
+	imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type;
+	imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr);
+	imc_counters_config[i][j].pe.disabled = 1;
+	imc_counters_config[i][j].pe.inherit = 1;
+	imc_counters_config[i][j].pe.exclude_guest = 0;
+	imc_counters_config[i][j].pe.config =
+		imc_counters_config[i][j].umask << 8 |
+		imc_counters_config[i][j].event;
+	imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER;
+	imc_counters_config[i][j].pe.read_format =
+		PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+}
+
+void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
+{
+	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0);
+	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void membw_ioctl_perf_event_ioc_disable(int i, int j)
+{
+	ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0);
+}
+
+/*
+ * get_event_and_umask:	Parse config into event and umask
+ * @cas_count_cfg:	Config
+ * @count:		iMC number
+ * @op:			Operation (read/write)
+ */
+void get_event_and_umask(char *cas_count_cfg, int count, bool op)
+{
+	char *token[MAX_TOKENS];
+	int i = 0;
+
+	strcat(cas_count_cfg, ",");
+	token[0] = strtok(cas_count_cfg, "=,");
+
+	for (i = 1; i < MAX_TOKENS; i++)
+		token[i] = strtok(NULL, "=,");
+
+	for (i = 0; i < MAX_TOKENS; i++) {
+		if (!token[i])
+			break;
+		if (strcmp(token[i], "event") == 0) {
+			if (op == READ)
+				imc_counters_config[count][READ].event =
+				strtol(token[i + 1], NULL, 16);
+			else
+				imc_counters_config[count][WRITE].event =
+				strtol(token[i + 1], NULL, 16);
+		}
+		if (strcmp(token[i], "umask") == 0) {
+			if (op == READ)
+				imc_counters_config[count][READ].umask =
+				strtol(token[i + 1], NULL, 16);
+			else
+				imc_counters_config[count][WRITE].umask =
+				strtol(token[i + 1], NULL, 16);
+		}
+	}
+}

From 7f4d257e3a2a4864268f6f0da7c85ca4f083e643 Mon Sep 17 00:00:00 2001
From: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Date: Thu, 16 Jan 2020 13:32:37 -0800
Subject: [PATCH 009/327] selftests/resctrl: Add callback to start a benchmark

The callback starts a child process and puts the child pid in created
resctrl group with specified memory bandwidth in schemata. The child
starts running benchmark.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl.h     |  29 +
 tools/testing/selftests/resctrl/resctrl_val.c | 574 ++++++++++++++++++
 2 files changed, 603 insertions(+)

diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index ba98cd6efc64..fb42087c904d 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -3,6 +3,8 @@
 #ifndef RESCTRL_H
 #define RESCTRL_H
 #include <stdio.h>
+#include <stdarg.h>
+#include <math.h>
 #include <errno.h>
 #include <sched.h>
 #include <stdlib.h>
@@ -19,6 +21,7 @@
 #include <asm/unistd.h>
 #include <linux/perf_event.h>
 
+#define MB			(1024 * 1024)
 #define RESCTRL_PATH		"/sys/fs/resctrl"
 #define PHYS_ID_PATH		"/sys/devices/system/cpu/cpu"
 
@@ -29,10 +32,35 @@
 		exit(EXIT_FAILURE);		\
 	} while (0)
 
+/*
+ * resctrl_val_param:	resctrl test parameters
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ * @ctrlgrp:		Name of the control monitor group (con_mon grp)
+ * @mongrp:		Name of the monitor group (mon grp)
+ * @cpu_no:		CPU number to which the benchmark would be binded
+ * @span:		Memory bytes accessed in each benchmark iteration
+ * @mum_resctrlfs:	Should the resctrl FS be remounted?
+ * @filename:		Name of file to which the o/p should be written
+ * @bw_report:		Bandwidth report type (reads vs writes)
+ * @setup:		Call back function to setup test environment
+ */
+struct resctrl_val_param {
+	char		*resctrl_val;
+	char		ctrlgrp[64];
+	char		mongrp[64];
+	int		cpu_no;
+	unsigned long	span;
+	int		mum_resctrlfs;
+	char		filename[64];
+	char		*bw_report;
+	int		(*setup)(int num, ...);
+};
+
 pid_t bm_pid, ppid;
 
 int remount_resctrlfs(bool mum_resctrlfs);
 int get_resource_id(int cpu_no, int *resource_id);
+int umount_resctrlfs(void);
 int validate_bw_report_request(char *bw_report);
 bool validate_resctrl_feature_request(char *resctrl_val);
 char *fgrep(FILE *inf, const char *str);
@@ -46,5 +74,6 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 		    int group_fd, unsigned long flags);
 int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush,
 		 int op, char *resctrl_va);
+int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param);
 
 #endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 0ca4c9252516..0d1bd03ec4be 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -47,6 +47,8 @@ struct imc_counter_config {
 	int fd;
 };
 
+static char mbm_total_path[1024];
+static int imcs;
 static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
 
 void membw_initialize_perf_event_attr(int i, int j)
@@ -115,3 +117,575 @@ void get_event_and_umask(char *cas_count_cfg, int count, bool op)
 		}
 	}
 }
+
+static int open_perf_event(int i, int cpu_no, int j)
+{
+	imc_counters_config[i][j].fd =
+		perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1,
+				PERF_FLAG_FD_CLOEXEC);
+
+	if (imc_counters_config[i][j].fd == -1) {
+		fprintf(stderr, "Error opening leader %llx\n",
+			imc_counters_config[i][j].pe.config);
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Get type and config (read and write) of an iMC counter */
+static int read_from_imc_dir(char *imc_dir, int count)
+{
+	char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024];
+	FILE *fp;
+
+	/* Get type of iMC counter */
+	sprintf(imc_counter_type, "%s%s", imc_dir, "type");
+	fp = fopen(imc_counter_type, "r");
+	if (!fp) {
+		perror("Failed to open imc counter type file");
+
+		return -1;
+	}
+	if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) {
+		perror("Could not get imc type");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	imc_counters_config[count][WRITE].type =
+				imc_counters_config[count][READ].type;
+
+	/* Get read config */
+	sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME);
+	fp = fopen(imc_counter_cfg, "r");
+	if (!fp) {
+		perror("Failed to open imc config file");
+
+		return -1;
+	}
+	if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
+		perror("Could not get imc cas count read");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	get_event_and_umask(cas_count_cfg, count, READ);
+
+	/* Get write config */
+	sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME);
+	fp = fopen(imc_counter_cfg, "r");
+	if (!fp) {
+		perror("Failed to open imc config file");
+
+		return -1;
+	}
+	if  (fscanf(fp, "%s", cas_count_cfg) <= 0) {
+		perror("Could not get imc cas count write");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	get_event_and_umask(cas_count_cfg, count, WRITE);
+
+	return 0;
+}
+
+/*
+ * A system can have 'n' number of iMC (Integrated Memory Controller)
+ * counters, get that 'n'. For each iMC counter get it's type and config.
+ * Also, each counter has two configs, one for read and the other for write.
+ * A config again has two parts, event and umask.
+ * Enumerate all these details into an array of structures.
+ *
+ * Return: >= 0 on success. < 0 on failure.
+ */
+static int num_of_imcs(void)
+{
+	unsigned int count = 0;
+	char imc_dir[512];
+	struct dirent *ep;
+	int ret;
+	DIR *dp;
+
+	dp = opendir(DYN_PMU_PATH);
+	if (dp) {
+		while ((ep = readdir(dp))) {
+			if (strstr(ep->d_name, UNCORE_IMC)) {
+				sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH,
+					ep->d_name);
+				ret = read_from_imc_dir(imc_dir, count);
+				if (ret) {
+					closedir(dp);
+
+					return ret;
+				}
+				count++;
+			}
+		}
+		closedir(dp);
+		if (count == 0) {
+			perror("Unable find iMC counters!\n");
+
+			return -1;
+		}
+	} else {
+		perror("Unable to open PMU directory!\n");
+
+		return -1;
+	}
+
+	return count;
+}
+
+static int initialize_mem_bw_imc(void)
+{
+	int imc, j;
+
+	imcs = num_of_imcs();
+	if (imcs <= 0)
+		return imcs;
+
+	/* Initialize perf_event_attr structures for all iMC's */
+	for (imc = 0; imc < imcs; imc++) {
+		for (j = 0; j < 2; j++)
+			membw_initialize_perf_event_attr(imc, j);
+	}
+
+	return 0;
+}
+
+/*
+ * get_mem_bw_imc:	Memory band width as reported by iMC counters
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ * @bw_report:		Bandwidth report type (reads, writes)
+ *
+ * Memory B/W utilized by a process on a socket can be calculated using
+ * iMC counters. Perf events are used to read these counters.
+ *
+ * Return: >= 0 on success. < 0 on failure.
+ */
+static float get_mem_bw_imc(int cpu_no, char *bw_report)
+{
+	float reads, writes, of_mul_read, of_mul_write;
+	int imc, j, ret;
+
+	/* Start all iMC counters to log values (both read and write) */
+	reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1;
+	for (imc = 0; imc < imcs; imc++) {
+		for (j = 0; j < 2; j++) {
+			ret = open_perf_event(imc, cpu_no, j);
+			if (ret)
+				return -1;
+		}
+		for (j = 0; j < 2; j++)
+			membw_ioctl_perf_event_ioc_reset_enable(imc, j);
+	}
+
+	sleep(1);
+
+	/* Stop counters after a second to get results (both read and write) */
+	for (imc = 0; imc < imcs; imc++) {
+		for (j = 0; j < 2; j++)
+			membw_ioctl_perf_event_ioc_disable(imc, j);
+	}
+
+	/*
+	 * Get results which are stored in struct type imc_counter_config
+	 * Take over flow into consideration before calculating total b/w
+	 */
+	for (imc = 0; imc < imcs; imc++) {
+		struct imc_counter_config *r =
+			&imc_counters_config[imc][READ];
+		struct imc_counter_config *w =
+			&imc_counters_config[imc][WRITE];
+
+		if (read(r->fd, &r->return_value,
+			 sizeof(struct membw_read_format)) == -1) {
+			perror("Couldn't get read b/w through iMC");
+
+			return -1;
+		}
+
+		if (read(w->fd, &w->return_value,
+			 sizeof(struct membw_read_format)) == -1) {
+			perror("Couldn't get write bw through iMC");
+
+			return -1;
+		}
+
+		__u64 r_time_enabled = r->return_value.time_enabled;
+		__u64 r_time_running = r->return_value.time_running;
+
+		if (r_time_enabled != r_time_running)
+			of_mul_read = (float)r_time_enabled /
+					(float)r_time_running;
+
+		__u64 w_time_enabled = w->return_value.time_enabled;
+		__u64 w_time_running = w->return_value.time_running;
+
+		if (w_time_enabled != w_time_running)
+			of_mul_write = (float)w_time_enabled /
+					(float)w_time_running;
+		reads += r->return_value.value * of_mul_read * SCALE;
+		writes += w->return_value.value * of_mul_write * SCALE;
+	}
+
+	for (imc = 0; imc < imcs; imc++) {
+		close(imc_counters_config[imc][READ].fd);
+		close(imc_counters_config[imc][WRITE].fd);
+	}
+
+	if (strcmp(bw_report, "reads") == 0)
+		return reads;
+
+	if (strcmp(bw_report, "writes") == 0)
+		return writes;
+
+	return (reads + writes);
+}
+
+void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id)
+{
+	if (ctrlgrp && mongrp)
+		sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH,
+			RESCTRL_PATH, ctrlgrp, mongrp, resource_id);
+	else if (!ctrlgrp && mongrp)
+		sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
+			mongrp, resource_id);
+	else if (ctrlgrp && !mongrp)
+		sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
+			ctrlgrp, resource_id);
+	else if (!ctrlgrp && !mongrp)
+		sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
+			resource_id);
+}
+
+/*
+ * initialize_mem_bw_resctrl:	Appropriately populate "mbm_total_path"
+ * @ctrlgrp:			Name of the control monitor group (con_mon grp)
+ * @mongrp:			Name of the monitor group (mon grp)
+ * @cpu_no:			CPU number that the benchmark PID is binded to
+ * @resctrl_val:		Resctrl feature (Eg: mbm, mba.. etc)
+ */
+static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp,
+				      int cpu_no, char *resctrl_val)
+{
+	int resource_id;
+
+	if (get_resource_id(cpu_no, &resource_id) < 0) {
+		perror("Could not get resource_id");
+		return;
+	}
+
+	if (strcmp(resctrl_val, "mbm") == 0)
+		set_mbm_path(ctrlgrp, mongrp, resource_id);
+
+	if ((strcmp(resctrl_val, "mba") == 0)) {
+		if (ctrlgrp)
+			sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH,
+				RESCTRL_PATH, ctrlgrp, resource_id);
+		else
+			sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH,
+				RESCTRL_PATH, resource_id);
+	}
+}
+
+/*
+ * Get MBM Local bytes as reported by resctrl FS
+ * For MBM,
+ * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp
+ * 2. If only con_mon grp is given, then read from con_mon grp
+ * 3. If both are not given, then read from root con_mon grp
+ * For MBA,
+ * 1. If con_mon grp is given, then read from it
+ * 2. If con_mon grp is not given, then read from root con_mon grp
+ */
+static unsigned long get_mem_bw_resctrl(void)
+{
+	unsigned long mbm_total = 0;
+	FILE *fp;
+
+	fp = fopen(mbm_total_path, "r");
+	if (!fp) {
+		perror("Failed to open total bw file");
+
+		return -1;
+	}
+	if (fscanf(fp, "%lu", &mbm_total) <= 0) {
+		perror("Could not get mbm local bytes");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return mbm_total;
+}
+
+pid_t bm_pid, ppid;
+
+static void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
+{
+	kill(bm_pid, SIGKILL);
+	printf("Ending\n\n");
+
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * print_results_bw:	the memory bandwidth results are stored in a file
+ * @filename:		file that stores the results
+ * @bm_pid:		child pid that runs benchmark
+ * @bw_imc:		perf imc counter value
+ * @bw_resc:		memory bandwidth value
+ *
+ * Return:		0 on success. non-zero on failure.
+ */
+static int print_results_bw(char *filename,  int bm_pid, float bw_imc,
+			    unsigned long bw_resc)
+{
+	unsigned long diff = fabs(bw_imc - bw_resc);
+	FILE *fp;
+
+	if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) {
+		printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc);
+		printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff);
+	} else {
+		fp = fopen(filename, "a");
+		if (!fp) {
+			perror("Cannot open results file");
+
+			return errno;
+		}
+		if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n",
+			    bm_pid, bw_imc, bw_resc, diff) <= 0) {
+			fclose(fp);
+			perror("Could not log results.");
+
+			return errno;
+		}
+		fclose(fp);
+	}
+
+	return 0;
+}
+
+static int
+measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start)
+{
+	unsigned long bw_imc, bw_resc, bw_resc_end;
+	int ret;
+
+	/*
+	 * Measure memory bandwidth from resctrl and from
+	 * another source which is perf imc value or could
+	 * be something else if perf imc event is not available.
+	 * Compare the two values to validate resctrl value.
+	 * It takes 1sec to measure the data.
+	 */
+	bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report);
+	if (bw_imc <= 0)
+		return bw_imc;
+
+	bw_resc_end = get_mem_bw_resctrl();
+	if (bw_resc_end <= 0)
+		return bw_resc_end;
+
+	bw_resc = (bw_resc_end - *bw_resc_start) / MB;
+	ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
+	if (ret)
+		return ret;
+
+	*bw_resc_start = bw_resc_end;
+
+	return 0;
+}
+
+/*
+ * resctrl_val:	execute benchmark and measure memory bandwidth on
+ *			the benchmark
+ * @benchmark_cmd:	benchmark command and its arguments
+ * @param:		parameters passed to resctrl_val()
+ *
+ * Return:		0 on success. non-zero on failure.
+ */
+int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
+{
+	char *resctrl_val = param->resctrl_val;
+	unsigned long bw_resc_start = 0;
+	struct sigaction sigact;
+	int ret = 0, pipefd[2];
+	char pipe_message = 0;
+	union sigval value;
+
+	if (strcmp(param->filename, "") == 0)
+		sprintf(param->filename, "stdio");
+
+	if ((strcmp(resctrl_val, "mba")) == 0 ||
+	    (strcmp(resctrl_val, "mbm")) == 0) {
+		ret = validate_bw_report_request(param->bw_report);
+		if (ret)
+			return ret;
+	}
+
+	ret = remount_resctrlfs(param->mum_resctrlfs);
+	if (ret)
+		return ret;
+
+	/*
+	 * If benchmark wasn't successfully started by child, then child should
+	 * kill parent, so save parent's pid
+	 */
+	ppid = getpid();
+
+	if (pipe(pipefd)) {
+		perror("# Unable to create pipe");
+
+		return -1;
+	}
+
+	/*
+	 * Fork to start benchmark, save child's pid so that it can be killed
+	 * when needed
+	 */
+	bm_pid = fork();
+	if (bm_pid == -1) {
+		perror("# Unable to fork");
+
+		return -1;
+	}
+
+	if (bm_pid == 0) {
+		/*
+		 * Mask all signals except SIGUSR1, parent uses SIGUSR1 to
+		 * start benchmark
+		 */
+		sigfillset(&sigact.sa_mask);
+		sigdelset(&sigact.sa_mask, SIGUSR1);
+
+		sigact.sa_sigaction = run_benchmark;
+		sigact.sa_flags = SA_SIGINFO;
+
+		/* Register for "SIGUSR1" signal from parent */
+		if (sigaction(SIGUSR1, &sigact, NULL))
+			PARENT_EXIT("Can't register child for signal");
+
+		/* Tell parent that child is ready */
+		close(pipefd[0]);
+		pipe_message = 1;
+		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
+		    sizeof(pipe_message)) {
+			perror("# failed signaling parent process");
+			close(pipefd[1]);
+			return -1;
+		}
+		close(pipefd[1]);
+
+		/* Suspend child until delivery of "SIGUSR1" from parent */
+		sigsuspend(&sigact.sa_mask);
+
+		PARENT_EXIT("Child is done");
+	}
+
+	printf("# benchmark PID: %d\n", bm_pid);
+
+	/*
+	 * Register CTRL-C handler for parent, as it has to kill benchmark
+	 * before exiting
+	 */
+	sigact.sa_sigaction = ctrlc_handler;
+	sigemptyset(&sigact.sa_mask);
+	sigact.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGINT, &sigact, NULL) ||
+	    sigaction(SIGHUP, &sigact, NULL)) {
+		perror("# sigaction");
+		ret = errno;
+		goto out;
+	}
+
+	value.sival_ptr = benchmark_cmd;
+
+	/* Taskset benchmark to specified cpu */
+	ret = taskset_benchmark(bm_pid, param->cpu_no);
+	if (ret)
+		goto out;
+
+	/* Write benchmark to specified control&monitoring grp in resctrl FS */
+	ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
+				      resctrl_val);
+	if (ret)
+		goto out;
+
+	if ((strcmp(resctrl_val, "mbm") == 0) ||
+	    (strcmp(resctrl_val, "mba") == 0)) {
+		ret = initialize_mem_bw_imc();
+		if (ret)
+			goto out;
+
+		initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp,
+					  param->cpu_no, resctrl_val);
+	}
+
+	/* Parent waits for child to be ready. */
+	close(pipefd[1]);
+	while (pipe_message != 1) {
+		if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) <
+		    sizeof(pipe_message)) {
+			perror("# failed reading message from child process");
+			close(pipefd[0]);
+			goto out;
+		}
+	}
+	close(pipefd[0]);
+
+	/* Signal child to start benchmark */
+	if (sigqueue(bm_pid, SIGUSR1, value) == -1) {
+		perror("# sigqueue SIGUSR1 to child");
+		ret = errno;
+		goto out;
+	}
+
+	/* Give benchmark enough time to fully run */
+	sleep(1);
+
+	/* Test runs until the callback setup() tells the test to stop. */
+	while (1) {
+		if (strcmp(resctrl_val, "mbm") == 0) {
+			ret = param->setup(1, param);
+			if (ret) {
+				ret = 0;
+				break;
+			}
+
+			ret = measure_vals(param, &bw_resc_start);
+			if (ret)
+				break;
+		} else if ((strcmp(resctrl_val, "mba") == 0)) {
+			ret = param->setup(1, param);
+			if (ret) {
+				ret = 0;
+				break;
+			}
+
+			ret = measure_vals(param, &bw_resc_start);
+			if (ret)
+				break;
+		} else {
+			break;
+		}
+	}
+
+out:
+	kill(bm_pid, SIGKILL);
+	umount_resctrlfs();
+
+	return ret;
+}

From a2561b12fe392e0024e3187d325dbcbde7b99c04 Mon Sep 17 00:00:00 2001
From: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Date: Thu, 16 Jan 2020 13:32:38 -0800
Subject: [PATCH 010/327] selftests/resctrl: Add built in benchmark

Built-in benchmark fill_buf generates stressful memory bandwidth
and cache traffic.

Later it will be used as a default benchmark by various resctrl tests
such as MBA (Memory Bandwidth Allocation) and MBM (Memory Bandwidth
Monitoring) tests.

Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c | 207 +++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100644 tools/testing/selftests/resctrl/fill_buf.c

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
new file mode 100644
index 000000000000..b76bf6a4a0a5
--- /dev/null
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fill_buf benchmark
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <string.h>
+
+#include "resctrl.h"
+
+#define CL_SIZE			(64)
+#define PAGE_SIZE		(4 * 1024)
+#define MB			(1024 * 1024)
+
+static unsigned char *startptr;
+
+static void sb(void)
+{
+#if defined(__i386) || defined(__x86_64)
+	asm volatile("sfence\n\t"
+		     : : : "memory");
+#endif
+}
+
+static void ctrl_handler(int signo)
+{
+	free(startptr);
+	printf("\nEnding\n");
+	sb();
+	exit(EXIT_SUCCESS);
+}
+
+static void cl_flush(void *p)
+{
+#if defined(__i386) || defined(__x86_64)
+	asm volatile("clflush (%0)\n\t"
+		     : : "r"(p) : "memory");
+#endif
+}
+
+static void mem_flush(void *p, size_t s)
+{
+	char *cp = (char *)p;
+	size_t i = 0;
+
+	s = s / CL_SIZE; /* mem size in cache llines */
+
+	for (i = 0; i < s; i++)
+		cl_flush(&cp[i * CL_SIZE]);
+
+	sb();
+}
+
+static void *malloc_and_init_memory(size_t s)
+{
+	uint64_t *p64;
+	size_t s64;
+
+	void *p = memalign(PAGE_SIZE, s);
+
+	p64 = (uint64_t *)p;
+	s64 = s / sizeof(uint64_t);
+
+	while (s64 > 0) {
+		*p64 = (uint64_t)rand();
+		p64 += (CL_SIZE / sizeof(uint64_t));
+		s64 -= (CL_SIZE / sizeof(uint64_t));
+	}
+
+	return p;
+}
+
+static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr)
+{
+	unsigned char sum, *p;
+
+	sum = 0;
+	p = start_ptr;
+	while (p < end_ptr) {
+		sum += *p;
+		p += (CL_SIZE / 2);
+	}
+
+	return sum;
+}
+
+static
+void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr)
+{
+	unsigned char *p;
+
+	p = start_ptr;
+	while (p < end_ptr) {
+		*p = '1';
+		p += (CL_SIZE / 2);
+	}
+}
+
+static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr,
+			   char *resctrl_val)
+{
+	int ret = 0;
+	FILE *fp;
+
+	while (1)
+		ret = fill_one_span_read(start_ptr, end_ptr);
+
+	/* Consume read result so that reading memory is not optimized out. */
+	fp = fopen("/dev/null", "w");
+	if (!fp)
+		perror("Unable to write to /dev/null");
+	fprintf(fp, "Sum: %d ", ret);
+	fclose(fp);
+
+	return 0;
+}
+
+static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr,
+			    char *resctrl_val)
+{
+	while (1)
+		fill_one_span_write(start_ptr, end_ptr);
+
+	return 0;
+}
+
+static int
+fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush,
+	   int op, char *resctrl_val)
+{
+	unsigned char *start_ptr, *end_ptr;
+	unsigned long long i;
+	int ret;
+
+	if (malloc_and_init)
+		start_ptr = malloc_and_init_memory(buf_size);
+	else
+		start_ptr = malloc(buf_size);
+
+	if (!start_ptr)
+		return -1;
+
+	startptr = start_ptr;
+	end_ptr = start_ptr + buf_size;
+
+	/*
+	 * It's better to touch the memory once to avoid any compiler
+	 * optimizations
+	 */
+	if (!malloc_and_init) {
+		for (i = 0; i < buf_size; i++)
+			*start_ptr++ = (unsigned char)rand();
+	}
+
+	start_ptr = startptr;
+
+	/* Flush the memory before using to avoid "cache hot pages" effect */
+	if (memflush)
+		mem_flush(start_ptr, buf_size);
+
+	if (op == 0)
+		ret = fill_cache_read(start_ptr, end_ptr, resctrl_val);
+	else
+		ret = fill_cache_write(start_ptr, end_ptr, resctrl_val);
+
+	if (ret) {
+		printf("\n Errror in fill cache read/write...\n");
+		return -1;
+	}
+
+	free(startptr);
+
+	return 0;
+}
+
+int run_fill_buf(unsigned long span, int malloc_and_init_memory,
+		 int memflush, int op, char *resctrl_val)
+{
+	unsigned long long cache_size = span;
+	int ret;
+
+	/* set up ctrl-c handler */
+	if (signal(SIGINT, ctrl_handler) == SIG_ERR)
+		printf("Failed to catch SIGINT!\n");
+	if (signal(SIGHUP, ctrl_handler) == SIG_ERR)
+		printf("Failed to catch SIGHUP!\n");
+
+	ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op,
+			 resctrl_val);
+	if (ret) {
+		printf("\n Errror in fill cache\n");
+		return -1;
+	}
+
+	return 0;
+}

From ecdbb911f22d6cc5d422571dedf048b889d71417 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:39 -0800
Subject: [PATCH 011/327] selftests/resctrl: Add MBM test

MBM (Memory Bandwidth Monitoring) test is the first implemented selftest.
It starts a stressful memory bandwidth benchmark and assigns the
bandwidth pid in a resctrl monitoring group. Read and compare perf IMC
counter and MBM total bytes for the benchmark. The numbers should be
close enough to pass the test.

Default benchmark is built-in fill_buf. But users can specify their
own benchmark by option "-b".

We can add memory bandwidth monitoring for multiple processes in the
future.

Co-developed-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/Makefile      |   7 +-
 tools/testing/selftests/resctrl/mbm_test.c    | 145 ++++++++++++++++++
 tools/testing/selftests/resctrl/resctrl.h     |   6 +
 .../testing/selftests/resctrl/resctrl_tests.c | 132 ++++++++++++++++
 tools/testing/selftests/resctrl/resctrl_val.c |   2 +
 tools/testing/selftests/resctrl/resctrlfs.c   |  77 ++++++++++
 6 files changed, 368 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/resctrl/mbm_test.c
 create mode 100644 tools/testing/selftests/resctrl/resctrl_tests.c

diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile
index 76bbd6d3e4a8..d585cc1948cc 100644
--- a/tools/testing/selftests/resctrl/Makefile
+++ b/tools/testing/selftests/resctrl/Makefile
@@ -3,10 +3,15 @@ CFLAGS = -g -Wall
 SRCS=$(wildcard *.c)
 OBJS=$(SRCS:.c=.o)
 
+all: resctrl_tests
+
 $(OBJS): $(SRCS)
 	$(CC) $(CFLAGS) -c $(SRCS)
 
+resctrl_tests: $(OBJS)
+	$(CC) $(CFLAGS) -o $@ $^
+
 .PHONY: clean
 
 clean:
-	$(RM) $(OBJS)
+	$(RM) $(OBJS) resctrl_tests
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
new file mode 100644
index 000000000000..4700f7453f81
--- /dev/null
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Bandwidth Monitoring (MBM) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define RESULT_FILE_NAME	"result_mbm"
+#define MAX_DIFF		300
+#define NUM_OF_RUNS		5
+
+static void
+show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span)
+{
+	unsigned long avg_bw_imc = 0, avg_bw_resc = 0;
+	unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+	long avg_diff = 0;
+	int runs;
+
+	/*
+	 * Discard the first value which is inaccurate due to monitoring setup
+	 * transition phase.
+	 */
+	for (runs = 1; runs < NUM_OF_RUNS ; runs++) {
+		sum_bw_imc += bw_imc[runs];
+		sum_bw_resc += bw_resc[runs];
+	}
+
+	avg_bw_imc = sum_bw_imc / 4;
+	avg_bw_resc = sum_bw_resc / 4;
+	avg_diff = avg_bw_resc - avg_bw_imc;
+
+	printf("%sok MBM: diff within %d%%\n",
+	       labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF);
+	tests_run++;
+	printf("# avg_diff: %lu\n", labs(avg_diff));
+	printf("# Span (MB): %d\n", span);
+	printf("# avg_bw_imc: %lu\n", avg_bw_imc);
+	printf("# avg_bw_resc: %lu\n", avg_bw_resc);
+}
+
+static int check_results(int span)
+{
+	unsigned long bw_imc[NUM_OF_RUNS], bw_resc[NUM_OF_RUNS];
+	char temp[1024], *token_array[8];
+	char output[] = RESULT_FILE_NAME;
+	int runs;
+	FILE *fp;
+
+	printf("# Checking for pass/fail\n");
+
+	fp = fopen(output, "r");
+	if (!fp) {
+		perror(output);
+
+		return errno;
+	}
+
+	runs = 0;
+	while (fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int i = 0;
+
+		while (token) {
+			token_array[i++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		bw_resc[runs] = strtoul(token_array[5], NULL, 0);
+		bw_imc[runs] = strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+
+	show_bw_info(bw_imc, bw_resc, span);
+
+	fclose(fp);
+
+	return 0;
+}
+
+static int mbm_setup(int num, ...)
+{
+	struct resctrl_val_param *p;
+	static int num_of_runs;
+	va_list param;
+	int ret = 0;
+
+	/* Run NUM_OF_RUNS times */
+	if (num_of_runs++ >= NUM_OF_RUNS)
+		return -1;
+
+	va_start(param, num);
+	p = va_arg(param, struct resctrl_val_param *);
+	va_end(param);
+
+	/* Set up shemata with 100% allocation on the first run. */
+	if (num_of_runs == 0)
+		ret = write_schemata(p->ctrlgrp, "100", p->cpu_no,
+				     p->resctrl_val);
+
+	return ret;
+}
+
+void mbm_test_cleanup(void)
+{
+	remove(RESULT_FILE_NAME);
+}
+
+int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd)
+{
+	struct resctrl_val_param param = {
+		.resctrl_val	= "mbm",
+		.ctrlgrp	= "c1",
+		.mongrp		= "m1",
+		.span		= span,
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 1,
+		.filename	= RESULT_FILE_NAME,
+		.bw_report	=  bw_report,
+		.setup		= mbm_setup
+	};
+	int ret;
+
+	remove(RESULT_FILE_NAME);
+
+	if (!validate_resctrl_feature_request("mbm"))
+		return -1;
+
+	ret = resctrl_val(benchmark_cmd, &param);
+	if (ret)
+		return ret;
+
+	ret = check_results(span);
+	if (ret)
+		return ret;
+
+	mbm_test_cleanup();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index fb42087c904d..c92bc50c6751 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -57,7 +57,10 @@ struct resctrl_val_param {
 };
 
 pid_t bm_pid, ppid;
+int tests_run;
 
+bool check_resctrlfs_support(void);
+int filter_dmesg(void);
 int remount_resctrlfs(bool mum_resctrlfs);
 int get_resource_id(int cpu_no, int *resource_id);
 int umount_resctrlfs(void);
@@ -75,5 +78,8 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush,
 		 int op, char *resctrl_va);
 int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param);
+int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd);
+void tests_cleanup(void);
+void mbm_test_cleanup(void);
 
 #endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
new file mode 100644
index 000000000000..496c8030fe43
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resctrl tests
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define BENCHMARK_ARGS		64
+#define BENCHMARK_ARG_SIZE	64
+
+static void cmd_help(void)
+{
+	printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list]\n");
+	printf("\t-b benchmark_cmd [options]: run specified benchmark\n");
+	printf("\t default benchmark is builtin fill_buf\n");
+	printf("\t-t test list: run tests specified in the test list, ");
+	printf("e.g. -t mbm,mba\n");
+	printf("\t-h: help\n");
+}
+
+void tests_cleanup(void)
+{
+	mbm_test_cleanup();
+}
+
+int main(int argc, char **argv)
+{
+	int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind;
+	char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64];
+	char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE];
+	bool has_ben = false, mbm_test = true;
+	int ben_count;
+
+	for (i = 0; i < argc; i++) {
+		if (strcmp(argv[i], "-b") == 0) {
+			ben_ind = i + 1;
+			ben_count = argc - ben_ind;
+			argc_new = ben_ind - 1;
+			has_ben = true;
+			break;
+		}
+	}
+
+	while ((c = getopt(argc_new, argv, "ht:b:")) != -1) {
+		char *token;
+
+		switch (c) {
+		case 't':
+			token = strtok(optarg, ",");
+
+			mbm_test = false;
+			while (token) {
+				if (!strcmp(token, "mbm")) {
+					mbm_test = true;
+				} else {
+					printf("invalid argument\n");
+
+					return -1;
+				}
+				token = strtok(NULL, ":\t");
+			}
+			break;
+		case 'p':
+			cpu_no = atoi(optarg);
+			break;
+		case 'h':
+			cmd_help();
+
+			return 0;
+		default:
+			printf("invalid argument\n");
+
+			return -1;
+		}
+	}
+
+	printf("TAP version 13\n");
+
+	/*
+	 * Typically we need root privileges, because:
+	 * 1. We write to resctrl FS
+	 * 2. We execute perf commands
+	 */
+	if (geteuid() != 0)
+		printf("# WARNING: not running as root, tests may fail.\n");
+
+	if (has_ben) {
+		/* Extract benchmark command from command line. */
+		for (i = ben_ind; i < argc; i++) {
+			benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i];
+			sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]);
+		}
+		benchmark_cmd[ben_count] = NULL;
+	} else {
+		/* If no benchmark is given by "-b" argument, use fill_buf. */
+		for (i = 0; i < 6; i++)
+			benchmark_cmd[i] = benchmark_cmd_area[i];
+
+		strcpy(benchmark_cmd[0], "fill_buf");
+		sprintf(benchmark_cmd[1], "%d", span);
+		strcpy(benchmark_cmd[2], "1");
+		strcpy(benchmark_cmd[3], "1");
+		strcpy(benchmark_cmd[4], "0");
+		strcpy(benchmark_cmd[5], "");
+		benchmark_cmd[6] = NULL;
+	}
+
+	sprintf(bw_report, "reads");
+	sprintf(bm_type, "fill_buf");
+
+	check_resctrlfs_support();
+	filter_dmesg();
+
+	if (mbm_test) {
+		printf("# Starting MBM BW change ...\n");
+		if (!has_ben)
+			sprintf(benchmark_cmd[5], "%s", "mba");
+		res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd);
+		printf("%sok MBM: bw change\n", res ? "not " : "");
+		mbm_test_cleanup();
+		tests_run++;
+	}
+
+	printf("1..%d\n", tests_run);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 0d1bd03ec4be..4f7bd5a4d86e 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -435,6 +435,8 @@ pid_t bm_pid, ppid;
 static void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
 {
 	kill(bm_pid, SIGKILL);
+	umount_resctrlfs();
+	tests_cleanup();
 	printf("Ending\n\n");
 
 	exit(EXIT_SUCCESS);
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 1b125f9d8e5d..07c8394b427f 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -388,6 +388,41 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
 	return ret;
 }
 
+bool check_resctrlfs_support(void)
+{
+	FILE *inf = fopen("/proc/filesystems", "r");
+	DIR *dp;
+	char *res;
+	bool ret = false;
+
+	if (!inf)
+		return false;
+
+	res = fgrep(inf, "nodev\tresctrl\n");
+
+	if (res) {
+		ret = true;
+		free(res);
+	}
+
+	fclose(inf);
+
+	printf("%sok kernel supports resctrl filesystem\n", ret ? "" : "not ");
+	tests_run++;
+
+	dp = opendir(RESCTRL_PATH);
+	printf("%sok resctrl mountpoint \"%s\" exists\n",
+	       dp ? "" : "not ", RESCTRL_PATH);
+	if (dp)
+		closedir(dp);
+	tests_run++;
+
+	printf("# resctrl filesystem %s mounted\n",
+	       find_resctrl_mount(NULL) ? "not" : "is");
+
+	return ret;
+}
+
 char *fgrep(FILE *inf, const char *str)
 {
 	char line[256];
@@ -433,6 +468,48 @@ bool validate_resctrl_feature_request(char *resctrl_val)
 	return found;
 }
 
+int filter_dmesg(void)
+{
+	char line[1024];
+	FILE *fp;
+	int pipefds[2];
+	pid_t pid;
+	int ret;
+
+	ret = pipe(pipefds);
+	if (ret) {
+		perror("pipe");
+		return ret;
+	}
+	pid = fork();
+	if (pid == 0) {
+		close(pipefds[0]);
+		dup2(pipefds[1], STDOUT_FILENO);
+		execlp("dmesg", "dmesg", NULL);
+		perror("executing dmesg");
+		exit(1);
+	}
+	close(pipefds[1]);
+	fp = fdopen(pipefds[0], "r");
+	if (!fp) {
+		perror("fdopen(pipe)");
+		kill(pid, SIGTERM);
+
+		return -1;
+	}
+
+	while (fgets(line, 1024, fp)) {
+		if (strstr(line, "intel_rdt:"))
+			printf("# dmesg: %s", line);
+		if (strstr(line, "resctrl:"))
+			printf("# dmesg: %s", line);
+	}
+	fclose(fp);
+	waitpid(pid, NULL, 0);
+
+	return 0;
+}
+
 int validate_bw_report_request(char *bw_report)
 {
 	if (strcmp(bw_report, "reads") == 0)

From 01fee6b4d1f93247e806dddb0065b88317949085 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:40 -0800
Subject: [PATCH 012/327] selftests/resctrl: Add MBA test

MBA (Memory Bandwidth Allocation) test starts a stressful memory
bandwidth benchmark and allocates memory bandwidth from 100% down
to 10% for the benchmark process. For each allocation, compare
perf IMC counter and mbm total bytes from resctrl. The difference
between the two values should be within a threshold to pass the test.

Default benchmark is built-in fill_buf. But users can specify their
own benchmark by option "-b".

We can add memory bandwidth allocation for multiple processes in the
future.

Co-developed-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/mba_test.c    | 171 ++++++++++++++++++
 tools/testing/selftests/resctrl/resctrl.h     |   2 +
 .../testing/selftests/resctrl/resctrl_tests.c |  16 +-
 3 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/resctrl/mba_test.c

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
new file mode 100644
index 000000000000..7bf8eaa6204b
--- /dev/null
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Bandwidth Allocation (MBA) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define RESULT_FILE_NAME	"result_mba"
+#define NUM_OF_RUNS		5
+#define MAX_DIFF		300
+#define ALLOCATION_MAX		100
+#define ALLOCATION_MIN		10
+#define ALLOCATION_STEP		10
+
+/*
+ * Change schemata percentage from 100 to 10%. Write schemata to specified
+ * con_mon grp, mon_grp in resctrl FS.
+ * For each allocation, run 5 times in order to get average values.
+ */
+static int mba_setup(int num, ...)
+{
+	static int runs_per_allocation, allocation = 100;
+	struct resctrl_val_param *p;
+	char allocation_str[64];
+	va_list param;
+
+	va_start(param, num);
+	p = va_arg(param, struct resctrl_val_param *);
+	va_end(param);
+
+	if (runs_per_allocation >= NUM_OF_RUNS)
+		runs_per_allocation = 0;
+
+	/* Only set up schemata once every NUM_OF_RUNS of allocations */
+	if (runs_per_allocation++ != 0)
+		return 0;
+
+	if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX)
+		return -1;
+
+	sprintf(allocation_str, "%d", allocation);
+
+	write_schemata(p->ctrlgrp, allocation_str, p->cpu_no, p->resctrl_val);
+	allocation -= ALLOCATION_STEP;
+
+	return 0;
+}
+
+static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
+{
+	int allocation, runs;
+	bool failed = false;
+
+	printf("# Results are displayed in (MB)\n");
+	/* Memory bandwidth from 100% down to 10% */
+	for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP;
+	     allocation++) {
+		unsigned long avg_bw_imc, avg_bw_resc;
+		unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+		unsigned long avg_diff;
+
+		/*
+		 * The first run is discarded due to inaccurate value from
+		 * phase transition.
+		 */
+		for (runs = NUM_OF_RUNS * allocation + 1;
+		     runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) {
+			sum_bw_imc += bw_imc[runs];
+			sum_bw_resc += bw_resc[runs];
+		}
+
+		avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1);
+		avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1);
+		avg_diff = labs((long)(avg_bw_resc - avg_bw_imc));
+
+		printf("%sok MBA schemata percentage %u smaller than %d %%\n",
+		       avg_diff > MAX_DIFF ? "not " : "",
+		       ALLOCATION_MAX - ALLOCATION_STEP * allocation,
+		       MAX_DIFF);
+		tests_run++;
+		printf("# avg_diff: %lu\n", avg_diff);
+		printf("# avg_bw_imc: %lu\n", avg_bw_imc);
+		printf("# avg_bw_resc: %lu\n", avg_bw_resc);
+		if (avg_diff > MAX_DIFF)
+			failed = true;
+	}
+
+	printf("%sok schemata change using MBA%s\n", failed ? "not " : "",
+	       failed ? " # at least one test failed" : "");
+	tests_run++;
+}
+
+static int check_results(void)
+{
+	char *token_array[8], output[] = RESULT_FILE_NAME, temp[512];
+	unsigned long bw_imc[1024], bw_resc[1024];
+	int runs;
+	FILE *fp;
+
+	fp = fopen(output, "r");
+	if (!fp) {
+		perror(output);
+
+		return errno;
+	}
+
+	runs = 0;
+	while (fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Field 3 is perf imc value */
+		bw_imc[runs] = strtoul(token_array[3], NULL, 0);
+		/* Field 5 is resctrl value */
+		bw_resc[runs] = strtoul(token_array[5], NULL, 0);
+		runs++;
+	}
+
+	fclose(fp);
+
+	show_mba_info(bw_imc, bw_resc);
+
+	return 0;
+}
+
+void mba_test_cleanup(void)
+{
+	remove(RESULT_FILE_NAME);
+}
+
+int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd)
+{
+	struct resctrl_val_param param = {
+		.resctrl_val	= "mba",
+		.ctrlgrp	= "c1",
+		.mongrp		= "m1",
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 1,
+		.filename	= RESULT_FILE_NAME,
+		.bw_report	= bw_report,
+		.setup		= mba_setup
+	};
+	int ret;
+
+	remove(RESULT_FILE_NAME);
+
+	if (!validate_resctrl_feature_request("mba"))
+		return -1;
+
+	ret = resctrl_val(benchmark_cmd, &param);
+	if (ret)
+		return ret;
+
+	ret = check_results();
+	if (ret)
+		return ret;
+
+	mba_test_cleanup();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index c92bc50c6751..a6196ab6b59f 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -81,5 +81,7 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param);
 int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd);
 void tests_cleanup(void);
 void mbm_test_cleanup(void);
+int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd);
+void mba_test_cleanup(void);
 
 #endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 496c8030fe43..0ed953a97db2 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -26,6 +26,7 @@ static void cmd_help(void)
 void tests_cleanup(void)
 {
 	mbm_test_cleanup();
+	mba_test_cleanup();
 }
 
 int main(int argc, char **argv)
@@ -33,7 +34,7 @@ int main(int argc, char **argv)
 	int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind;
 	char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64];
 	char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE];
-	bool has_ben = false, mbm_test = true;
+	bool has_ben = false, mbm_test = true, mba_test = true;
 	int ben_count;
 
 	for (i = 0; i < argc; i++) {
@@ -54,9 +55,12 @@ int main(int argc, char **argv)
 			token = strtok(optarg, ",");
 
 			mbm_test = false;
+			mba_test = false;
 			while (token) {
 				if (!strcmp(token, "mbm")) {
 					mbm_test = true;
+				} else if (!strcmp(token, "mba")) {
+					mba_test = true;
 				} else {
 					printf("invalid argument\n");
 
@@ -126,6 +130,16 @@ int main(int argc, char **argv)
 		tests_run++;
 	}
 
+	if (mba_test) {
+		printf("# Starting MBA Schemata change ...\n");
+		if (!has_ben)
+			sprintf(benchmark_cmd[1], "%d", span);
+		res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd);
+		printf("%sok MBA: schemata change\n", res ? "not " : "");
+		mba_test_cleanup();
+		tests_run++;
+	}
+
 	printf("1..%d\n", tests_run);
 
 	return 0;

From 78941183d1b151317beb37b25690b7d87fe2596d Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:41 -0800
Subject: [PATCH 013/327] selftests/resctrl: Add Cache QoS Monitoring (CQM)
 selftest

Cache QoS Monitoring (CQM) selftest starts stressful cache benchmark
with specified size of memory to access the cache. Last Level cache
occupancy reported by CQM should be close to the size of the memory.

Co-developed-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cache.c       |  99 +++++++++
 tools/testing/selftests/resctrl/cqm_test.c    | 176 ++++++++++++++++
 tools/testing/selftests/resctrl/resctrl.h     |  16 ++
 .../testing/selftests/resctrl/resctrl_tests.c |  31 ++-
 tools/testing/selftests/resctrl/resctrl_val.c |  63 +++++-
 tools/testing/selftests/resctrl/resctrlfs.c   | 193 +++++++++++++++++-
 6 files changed, 558 insertions(+), 20 deletions(-)
 create mode 100644 tools/testing/selftests/resctrl/cache.c
 create mode 100644 tools/testing/selftests/resctrl/cqm_test.c

diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
new file mode 100644
index 000000000000..d7719219ab51
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cache.c
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdint.h>
+#include "resctrl.h"
+
+struct read_format {
+	__u64 nr;			/* The number of events */
+	struct {
+		__u64 value;		/* The value of the event */
+	} values[2];
+};
+
+char llc_occup_path[1024];
+
+/*
+ * Get LLC Occupancy as reported by RESCTRL FS
+ * For CQM,
+ * 1. If con_mon grp and mon grp given, then read from mon grp in
+ * con_mon grp
+ * 2. If only con_mon grp given, then read from con_mon grp
+ * 3. If both not given, then read from root con_mon grp
+ * For CAT,
+ * 1. If con_mon grp given, then read from it
+ * 2. If con_mon grp not given, then read from root con_mon grp
+ *
+ * Return: =0 on success.  <0 on failure.
+ */
+static int get_llc_occu_resctrl(unsigned long *llc_occupancy)
+{
+	FILE *fp;
+
+	fp = fopen(llc_occup_path, "r");
+	if (!fp) {
+		perror("Failed to open results file");
+
+		return errno;
+	}
+	if (fscanf(fp, "%lu", llc_occupancy) <= 0) {
+		perror("Could not get llc occupancy");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * print_results_cache:	the cache results are stored in a file
+ * @filename:		file that stores the results
+ * @bm_pid:		child pid that runs benchmark
+ * @llc_value:		perf miss value /
+ *			llc occupancy value reported by resctrl FS
+ *
+ * Return:		0 on success. non-zero on failure.
+ */
+static int print_results_cache(char *filename, int bm_pid,
+			       unsigned long llc_value)
+{
+	FILE *fp;
+
+	if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) {
+		printf("Pid: %d \t LLC_value: %lu\n", bm_pid,
+		       llc_value);
+	} else {
+		fp = fopen(filename, "a");
+		if (!fp) {
+			perror("Cannot open results file");
+
+			return errno;
+		}
+		fprintf(fp, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value);
+		fclose(fp);
+	}
+
+	return 0;
+}
+
+int measure_cache_vals(struct resctrl_val_param *param, int bm_pid)
+{
+	unsigned long llc_occu_resc = 0, llc_value = 0;
+	int ret;
+
+	/*
+	 * Measure llc occupancy from resctrl.
+	 */
+	if (!strcmp(param->resctrl_val, "cqm")) {
+		ret = get_llc_occu_resctrl(&llc_occu_resc);
+		if (ret < 0)
+			return ret;
+		llc_value = llc_occu_resc;
+	}
+	ret = print_results_cache(param->filename, bm_pid, llc_value);
+	if (ret)
+		return ret;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c
new file mode 100644
index 000000000000..c8756152bd61
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cqm_test.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cache Monitoring Technology (CQM) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+#include <unistd.h>
+
+#define RESULT_FILE_NAME	"result_cqm"
+#define NUM_OF_RUNS		5
+#define MAX_DIFF		2000000
+#define MAX_DIFF_PERCENT	15
+
+int count_of_bits;
+char cbm_mask[256];
+unsigned long long_mask;
+unsigned long cache_size;
+
+static int cqm_setup(int num, ...)
+{
+	struct resctrl_val_param *p;
+	va_list param;
+
+	va_start(param, num);
+	p = va_arg(param, struct resctrl_val_param *);
+	va_end(param);
+
+	/* Run NUM_OF_RUNS times */
+	if (p->num_of_runs >= NUM_OF_RUNS)
+		return -1;
+
+	p->num_of_runs++;
+
+	return 0;
+}
+
+static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits,
+			    unsigned long span)
+{
+	unsigned long avg_llc_occu_resc = 0;
+	float diff_percent;
+	long avg_diff = 0;
+	bool res;
+
+	avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1);
+	avg_diff = (long)abs(span - avg_llc_occu_resc);
+
+	diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100;
+
+	if ((abs((int)diff_percent) <= MAX_DIFF_PERCENT) ||
+	    (abs(avg_diff) <= MAX_DIFF))
+		res = true;
+	else
+		res = false;
+
+	printf("%sok CQM: diff within %d, %d\%%\n", res ? "" : "not",
+	       MAX_DIFF, (int)MAX_DIFF_PERCENT);
+
+	printf("# diff: %ld\n", avg_diff);
+	printf("# percent diff=%d\n", abs((int)diff_percent));
+	printf("# Results are displayed in (Bytes)\n");
+	printf("# Number of bits: %d\n", no_of_bits);
+	printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc);
+	printf("# llc_occu_exp (span): %lu\n", span);
+
+	tests_run++;
+}
+
+static int check_results(struct resctrl_val_param *param, int no_of_bits)
+{
+	char *token_array[8], temp[512];
+	unsigned long sum_llc_occu_resc = 0;
+	int runs = 0;
+	FILE *fp;
+
+	printf("# checking for pass/fail\n");
+	fp = fopen(param->filename, "r");
+	if (!fp) {
+		perror("# Error in opening file\n");
+
+		return errno;
+	}
+
+	while (fgets(temp, 1024, fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Field 3 is llc occ resc value */
+		if (runs > 0)
+			sum_llc_occu_resc += strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+	fclose(fp);
+	show_cache_info(sum_llc_occu_resc, no_of_bits, param->span);
+
+	return 0;
+}
+
+void cqm_test_cleanup(void)
+{
+	remove(RESULT_FILE_NAME);
+}
+
+int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd)
+{
+	int ret, mum_resctrlfs;
+
+	cache_size = 0;
+	mum_resctrlfs = 1;
+
+	ret = remount_resctrlfs(mum_resctrlfs);
+	if (ret)
+		return ret;
+
+	if (!validate_resctrl_feature_request("cqm"))
+		return -1;
+
+	ret = get_cbm_mask("L3");
+	if (ret)
+		return ret;
+
+	long_mask = strtoul(cbm_mask, NULL, 16);
+
+	ret = get_cache_size(cpu_no, "L3", &cache_size);
+	if (ret)
+		return ret;
+	printf("cache size :%lu\n", cache_size);
+
+	count_of_bits = count_bits(long_mask);
+
+	if (n < 1 || n > count_of_bits) {
+		printf("Invalid input value for numbr_of_bits n!\n");
+		printf("Please Enter value in range 1 to %d\n", count_of_bits);
+		return -1;
+	}
+
+	struct resctrl_val_param param = {
+		.resctrl_val	= "cqm",
+		.ctrlgrp	= "c1",
+		.mongrp		= "m1",
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 0,
+		.filename	= RESULT_FILE_NAME,
+		.mask		= ~(long_mask << n) & long_mask,
+		.span		= cache_size * n / count_of_bits,
+		.num_of_runs	= 0,
+		.setup		= cqm_setup,
+	};
+
+	if (strcmp(benchmark_cmd[0], "fill_buf") == 0)
+		sprintf(benchmark_cmd[1], "%lu", param.span);
+
+	remove(RESULT_FILE_NAME);
+
+	ret = resctrl_val(benchmark_cmd, &param);
+	if (ret)
+		return ret;
+
+	ret = check_results(&param, n);
+	if (ret)
+		return ret;
+
+	cqm_test_cleanup();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index a6196ab6b59f..d326c7e95862 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -18,12 +18,16 @@
 #include <sys/mount.h>
 #include <sys/types.h>
 #include <sys/wait.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/eventfd.h>
 #include <asm/unistd.h>
 #include <linux/perf_event.h>
 
 #define MB			(1024 * 1024)
 #define RESCTRL_PATH		"/sys/fs/resctrl"
 #define PHYS_ID_PATH		"/sys/devices/system/cpu/cpu"
+#define CBM_MASK_PATH		"/sys/fs/resctrl/info"
 
 #define PARENT_EXIT(err_msg)			\
 	do {					\
@@ -53,12 +57,16 @@ struct resctrl_val_param {
 	int		mum_resctrlfs;
 	char		filename[64];
 	char		*bw_report;
+	unsigned long	mask;
+	int		num_of_runs;
 	int		(*setup)(int num, ...);
 };
 
 pid_t bm_pid, ppid;
 int tests_run;
 
+char llc_occup_path[1024];
+
 bool check_resctrlfs_support(void);
 int filter_dmesg(void);
 int remount_resctrlfs(bool mum_resctrlfs);
@@ -83,5 +91,13 @@ void tests_cleanup(void);
 void mbm_test_cleanup(void);
 int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd);
 void mba_test_cleanup(void);
+int get_cbm_mask(char *cache_type);
+int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size);
+void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
+int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd);
+unsigned int count_bits(unsigned long n);
+void cqm_test_cleanup(void);
+int get_core_sibling(int cpu_no);
+int measure_cache_vals(struct resctrl_val_param *param, int bm_pid);
 
 #endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 0ed953a97db2..70179a61df3b 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -15,11 +15,13 @@
 
 static void cmd_help(void)
 {
-	printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list]\n");
-	printf("\t-b benchmark_cmd [options]: run specified benchmark\n");
+	printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n");
+	printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM");
 	printf("\t default benchmark is builtin fill_buf\n");
 	printf("\t-t test list: run tests specified in the test list, ");
-	printf("e.g. -t mbm,mba\n");
+	printf("e.g. -t mbm, mba, cqm\n");
+	printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n");
+	printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n");
 	printf("\t-h: help\n");
 }
 
@@ -27,15 +29,16 @@ void tests_cleanup(void)
 {
 	mbm_test_cleanup();
 	mba_test_cleanup();
+	cqm_test_cleanup();
 }
 
 int main(int argc, char **argv)
 {
-	int res, c, cpu_no = 1, span = 250, argc_new = argc, i, ben_ind;
+	bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true;
+	int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5;
 	char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64];
 	char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE];
-	bool has_ben = false, mbm_test = true, mba_test = true;
-	int ben_count;
+	int ben_ind, ben_count;
 
 	for (i = 0; i < argc; i++) {
 		if (strcmp(argv[i], "-b") == 0) {
@@ -56,11 +59,14 @@ int main(int argc, char **argv)
 
 			mbm_test = false;
 			mba_test = false;
+			cqm_test = false;
 			while (token) {
 				if (!strcmp(token, "mbm")) {
 					mbm_test = true;
 				} else if (!strcmp(token, "mba")) {
 					mba_test = true;
+				} else if (!strcmp(token, "cqm")) {
+					cqm_test = true;
 				} else {
 					printf("invalid argument\n");
 
@@ -72,6 +78,9 @@ int main(int argc, char **argv)
 		case 'p':
 			cpu_no = atoi(optarg);
 			break;
+		case 'n':
+			no_of_bits = atoi(optarg);
+			break;
 		case 'h':
 			cmd_help();
 
@@ -140,6 +149,16 @@ int main(int argc, char **argv)
 		tests_run++;
 	}
 
+	if (cqm_test) {
+		printf("# Starting CQM test ...\n");
+		if (!has_ben)
+			sprintf(benchmark_cmd[5], "%s", "cqm");
+		res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd);
+		printf("%sok CQM: test\n", res ? "not " : "");
+		cqm_test_cleanup();
+		tests_run++;
+	}
+
 	printf("1..%d\n", tests_run);
 
 	return 0;
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index 4f7bd5a4d86e..520fea3606d1 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -31,6 +31,18 @@
 #define MBM_LOCAL_BYTES_PATH			\
 	"%s/mon_data/mon_L3_%02d/mbm_local_bytes"
 
+#define CON_MON_LCC_OCCUP_PATH		\
+	"%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define CON_LCC_OCCUP_PATH		\
+	"%s/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define MON_LCC_OCCUP_PATH		\
+	"%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define LCC_OCCUP_PATH			\
+	"%s/mon_data/mon_L3_%02d/llc_occupancy"
+
 struct membw_read_format {
 	__u64 value;         /* The value of the event */
 	__u64 time_enabled;  /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
@@ -432,7 +444,7 @@ static unsigned long get_mem_bw_resctrl(void)
 
 pid_t bm_pid, ppid;
 
-static void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
+void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
 {
 	kill(bm_pid, SIGKILL);
 	umount_resctrlfs();
@@ -480,6 +492,42 @@ static int print_results_bw(char *filename,  int bm_pid, float bw_imc,
 	return 0;
 }
 
+static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num)
+{
+	if (strlen(ctrlgrp) && strlen(mongrp))
+		sprintf(llc_occup_path,	CON_MON_LCC_OCCUP_PATH,	RESCTRL_PATH,
+			ctrlgrp, mongrp, sock_num);
+	else if (!strlen(ctrlgrp) && strlen(mongrp))
+		sprintf(llc_occup_path,	MON_LCC_OCCUP_PATH, RESCTRL_PATH,
+			mongrp, sock_num);
+	else if (strlen(ctrlgrp) && !strlen(mongrp))
+		sprintf(llc_occup_path,	CON_LCC_OCCUP_PATH, RESCTRL_PATH,
+			ctrlgrp, sock_num);
+	else if (!strlen(ctrlgrp) && !strlen(mongrp))
+		sprintf(llc_occup_path, LCC_OCCUP_PATH,	RESCTRL_PATH, sock_num);
+}
+
+/*
+ * initialize_llc_occu_resctrl:	Appropriately populate "llc_occup_path"
+ * @ctrlgrp:			Name of the control monitor group (con_mon grp)
+ * @mongrp:			Name of the monitor group (mon grp)
+ * @cpu_no:			CPU number that the benchmark PID is binded to
+ * @resctrl_val:		Resctrl feature (Eg: cat, cqm.. etc)
+ */
+static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
+					int cpu_no, char *resctrl_val)
+{
+	int resource_id;
+
+	if (get_resource_id(cpu_no, &resource_id) < 0) {
+		perror("# Unable to resource_id");
+		return;
+	}
+
+	if (strcmp(resctrl_val, "cqm") == 0)
+		set_cqm_path(ctrlgrp, mongrp, resource_id);
+}
+
 static int
 measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start)
 {
@@ -634,7 +682,9 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
 
 		initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp,
 					  param->cpu_no, resctrl_val);
-	}
+	} else if (strcmp(resctrl_val, "cqm") == 0)
+		initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp,
+					    param->cpu_no, resctrl_val);
 
 	/* Parent waits for child to be ready. */
 	close(pipefd[1]);
@@ -660,7 +710,8 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
 
 	/* Test runs until the callback setup() tells the test to stop. */
 	while (1) {
-		if (strcmp(resctrl_val, "mbm") == 0) {
+		if ((strcmp(resctrl_val, "mbm") == 0) ||
+		    (strcmp(resctrl_val, "mba") == 0)) {
 			ret = param->setup(1, param);
 			if (ret) {
 				ret = 0;
@@ -670,14 +721,14 @@ int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
 			ret = measure_vals(param, &bw_resc_start);
 			if (ret)
 				break;
-		} else if ((strcmp(resctrl_val, "mba") == 0)) {
+		} else if (strcmp(resctrl_val, "cqm") == 0) {
 			ret = param->setup(1, param);
 			if (ret) {
 				ret = 0;
 				break;
 			}
-
-			ret = measure_vals(param, &bw_resc_start);
+			sleep(1);
+			ret = measure_cache_vals(param, bm_pid);
 			if (ret)
 				break;
 		} else {
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 07c8394b427f..9c050a6c1723 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -49,6 +49,8 @@ static int find_resctrl_mount(char *buffer)
 	return -ENOENT;
 }
 
+char cbm_mask[256];
+
 /*
  * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl
  * @mum_resctrlfs:	Should the resctrl FS be remounted?
@@ -130,6 +132,145 @@ int get_resource_id(int cpu_no, int *resource_id)
 	return 0;
 }
 
+/*
+ * get_cache_size - Get cache size for a specified CPU
+ * @cpu_no:	CPU number
+ * @cache_type:	Cache level L2/L3
+ * @cache_size:	pointer to cache_size
+ *
+ * Return: = 0 on success, < 0 on failure.
+ */
+int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size)
+{
+	char cache_path[1024], cache_str[64];
+	int length, i, cache_num;
+	FILE *fp;
+
+	if (!strcmp(cache_type, "L3")) {
+		cache_num = 3;
+	} else if (!strcmp(cache_type, "L2")) {
+		cache_num = 2;
+	} else {
+		perror("Invalid cache level");
+		return -1;
+	}
+
+	sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size",
+		cpu_no, cache_num);
+	fp = fopen(cache_path, "r");
+	if (!fp) {
+		perror("Failed to open cache size");
+
+		return -1;
+	}
+	if (fscanf(fp, "%s", cache_str) <= 0) {
+		perror("Could not get cache_size");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	length = (int)strlen(cache_str);
+
+	*cache_size = 0;
+
+	for (i = 0; i < length; i++) {
+		if ((cache_str[i] >= '0') && (cache_str[i] <= '9'))
+
+			*cache_size = *cache_size * 10 + (cache_str[i] - '0');
+
+		else if (cache_str[i] == 'K')
+
+			*cache_size = *cache_size * 1024;
+
+		else if (cache_str[i] == 'M')
+
+			*cache_size = *cache_size * 1024 * 1024;
+
+		else
+			break;
+	}
+
+	return 0;
+}
+
+#define CORE_SIBLINGS_PATH	"/sys/bus/cpu/devices/cpu"
+
+/*
+ * get_cbm_mask - Get cbm mask for given cache
+ * @cache_type:	Cache level L2/L3
+ *
+ * Mask is stored in cbm_mask which is global variable.
+ *
+ * Return: = 0 on success, < 0 on failure.
+ */
+int get_cbm_mask(char *cache_type)
+{
+	char cbm_mask_path[1024];
+	FILE *fp;
+
+	sprintf(cbm_mask_path, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type);
+
+	fp = fopen(cbm_mask_path, "r");
+	if (!fp) {
+		perror("Failed to open cache level");
+
+		return -1;
+	}
+	if (fscanf(fp, "%s", cbm_mask) <= 0) {
+		perror("Could not get max cbm_mask");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * get_core_sibling - Get sibling core id from the same socket for given CPU
+ * @cpu_no:	CPU number
+ *
+ * Return:	> 0 on success, < 0 on failure.
+ */
+int get_core_sibling(int cpu_no)
+{
+	char core_siblings_path[1024], cpu_list_str[64];
+	int sibling_cpu_no = -1;
+	FILE *fp;
+
+	sprintf(core_siblings_path, "%s%d/topology/core_siblings_list",
+		CORE_SIBLINGS_PATH, cpu_no);
+
+	fp = fopen(core_siblings_path, "r");
+	if (!fp) {
+		perror("Failed to open core siblings path");
+
+		return -1;
+	}
+	if (fscanf(fp, "%s", cpu_list_str) <= 0) {
+		perror("Could not get core_siblings list");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	char *token = strtok(cpu_list_str, "-,");
+
+	while (token) {
+		sibling_cpu_no = atoi(token);
+		/* Skipping core 0 as we don't want to run test on core 0 */
+		if (sibling_cpu_no != 0)
+			break;
+		token = strtok(NULL, "-,");
+	}
+
+	return sibling_cpu_no;
+}
+
 /*
  * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu
  * @bm_pid:	PID that should be binded
@@ -164,9 +305,10 @@ int taskset_benchmark(pid_t bm_pid, int cpu_no)
  */
 void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 {
-	unsigned long span;
-	int operation, ret;
+	int operation, ret, malloc_and_init_memory, memflush;
+	unsigned long span, buffer_span;
 	char **benchmark_cmd;
+	char resctrl_val[64];
 	FILE *fp;
 
 	benchmark_cmd = info->si_ptr;
@@ -182,8 +324,18 @@ void run_benchmark(int signum, siginfo_t *info, void *ucontext)
 	if (strcmp(benchmark_cmd[0], "fill_buf") == 0) {
 		/* Execute default fill_buf benchmark */
 		span = strtoul(benchmark_cmd[1], NULL, 10);
+		malloc_and_init_memory = atoi(benchmark_cmd[2]);
+		memflush =  atoi(benchmark_cmd[3]);
 		operation = atoi(benchmark_cmd[4]);
-		if (run_fill_buf(span, 1, 1, operation, NULL))
+		sprintf(resctrl_val, "%s", benchmark_cmd[5]);
+
+		if (strcmp(resctrl_val, "cqm") != 0)
+			buffer_span = span * MB;
+		else
+			buffer_span = span;
+
+		if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush,
+				 operation, resctrl_val))
 			fprintf(stderr, "Error in running fill buffer\n");
 	} else {
 		/* Execute specified benchmark */
@@ -210,6 +362,14 @@ static int create_grp(const char *grp_name, char *grp, const char *parent_grp)
 	struct dirent *ep;
 	DIR *dp;
 
+	/*
+	 * At this point, we are guaranteed to have resctrl FS mounted and if
+	 * length of grp_name == 0, it means, user wants to use root con_mon
+	 * grp, so do nothing
+	 */
+	if (strlen(grp_name) == 0)
+		return 0;
+
 	/* Check if requested grp exists or not */
 	dp = opendir(parent_grp);
 	if (dp) {
@@ -293,9 +453,10 @@ int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
 	if (ret)
 		goto out;
 
-	/* Create mon grp and write pid into it for "mbm" test */
-	if ((strcmp(resctrl_val, "mbm") == 0)) {
-		if (mongrp) {
+	/* Create mon grp and write pid into it for "mbm" and "cqm" test */
+	if ((strcmp(resctrl_val, "cqm") == 0) ||
+	    (strcmp(resctrl_val, "mbm") == 0)) {
+		if (strlen(mongrp)) {
 			sprintf(monitorgroup_p, "%s/mon_groups", controlgroup);
 			sprintf(monitorgroup, "%s/%s", monitorgroup_p, mongrp);
 			ret = create_grp(mongrp, monitorgroup, monitorgroup_p);
@@ -339,7 +500,8 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
 	int resource_id, ret = 0;
 	FILE *fp;
 
-	if (strcmp(resctrl_val, "mba") != 0)
+	if ((strcmp(resctrl_val, "mba") != 0) &&
+	    (strcmp(resctrl_val, "cqm") != 0))
 		return -ENOENT;
 
 	if (!schemata) {
@@ -360,7 +522,10 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
 	else
 		sprintf(controlgroup, "%s/schemata", RESCTRL_PATH);
 
-	sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata);
+	if (!strcmp(resctrl_val, "cqm"))
+		sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata);
+	if (strcmp(resctrl_val, "mba") == 0)
+		sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata);
 
 	fp = fopen(controlgroup, "w");
 	if (!fp) {
@@ -537,3 +702,15 @@ int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
 		      group_fd, flags);
 	return ret;
 }
+
+unsigned int count_bits(unsigned long n)
+{
+	unsigned int count = 0;
+
+	while (n) {
+		count += n & 1;
+		n >>= 1;
+	}
+
+	return count;
+}

From 790bf585b0eeec9aa0e680ba090142b98da7f948 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:42 -0800
Subject: [PATCH 014/327] selftests/resctrl: Add Cache Allocation Technology
 (CAT) selftest

Cache Allocation Technology (CAT) selftest allocates a portion of
last level cache and starts a benchmark to read each cache
line in this portion of cache. Measure the cache misses in perf and
the misses should be equal to the number of cache lines in this
portion of cache.

We don't use CQM to calculate cache usage because some CAT enabled
platforms don't have CQM.

Co-developed-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Co-developed-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cache.c       | 175 +++++++++++-
 tools/testing/selftests/resctrl/cat_test.c    | 250 ++++++++++++++++++
 tools/testing/selftests/resctrl/fill_buf.c    |  10 +-
 tools/testing/selftests/resctrl/resctrl.h     |   3 +
 .../testing/selftests/resctrl/resctrl_tests.c |  15 +-
 tools/testing/selftests/resctrl/resctrlfs.c   |   3 +-
 6 files changed, 451 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/resctrl/cat_test.c

diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
index d7719219ab51..38dbf4962e33 100644
--- a/tools/testing/selftests/resctrl/cache.c
+++ b/tools/testing/selftests/resctrl/cache.c
@@ -10,8 +10,105 @@ struct read_format {
 	} values[2];
 };
 
+static struct perf_event_attr pea_llc_miss;
+static struct read_format rf_cqm;
+static int fd_lm;
 char llc_occup_path[1024];
 
+static void initialize_perf_event_attr(void)
+{
+	pea_llc_miss.type = PERF_TYPE_HARDWARE;
+	pea_llc_miss.size = sizeof(struct perf_event_attr);
+	pea_llc_miss.read_format = PERF_FORMAT_GROUP;
+	pea_llc_miss.exclude_kernel = 1;
+	pea_llc_miss.exclude_hv = 1;
+	pea_llc_miss.exclude_idle = 1;
+	pea_llc_miss.exclude_callchain_kernel = 1;
+	pea_llc_miss.inherit = 1;
+	pea_llc_miss.exclude_guest = 1;
+	pea_llc_miss.disabled = 1;
+}
+
+static void ioctl_perf_event_ioc_reset_enable(void)
+{
+	ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0);
+	ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+static int perf_event_open_llc_miss(pid_t pid, int cpu_no)
+{
+	fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1,
+				PERF_FLAG_FD_CLOEXEC);
+	if (fd_lm == -1) {
+		perror("Error opening leader");
+		ctrlc_handler(0, NULL, NULL);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int initialize_llc_perf(void)
+{
+	memset(&pea_llc_miss, 0, sizeof(struct perf_event_attr));
+	memset(&rf_cqm, 0, sizeof(struct read_format));
+
+	/* Initialize perf_event_attr structures for HW_CACHE_MISSES */
+	initialize_perf_event_attr();
+
+	pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES;
+
+	rf_cqm.nr = 1;
+
+	return 0;
+}
+
+static int reset_enable_llc_perf(pid_t pid, int cpu_no)
+{
+	int ret = 0;
+
+	ret = perf_event_open_llc_miss(pid, cpu_no);
+	if (ret < 0)
+		return ret;
+
+	/* Start counters to log values */
+	ioctl_perf_event_ioc_reset_enable();
+
+	return 0;
+}
+
+/*
+ * get_llc_perf:	llc cache miss through perf events
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ *
+ * Perf events like HW_CACHE_MISSES could be used to validate number of
+ * cache lines allocated.
+ *
+ * Return: =0 on success.  <0 on failure.
+ */
+static int get_llc_perf(unsigned long *llc_perf_miss)
+{
+	__u64 total_misses;
+
+	/* Stop counters after one span to get miss rate */
+
+	ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0);
+
+	if (read(fd_lm, &rf_cqm, sizeof(struct read_format)) == -1) {
+		perror("Could not get llc misses through perf");
+
+		return -1;
+	}
+
+	total_misses = rf_cqm.values[0].value;
+
+	close(fd_lm);
+
+	*llc_perf_miss = total_misses;
+
+	return 0;
+}
+
 /*
  * Get LLC Occupancy as reported by RESCTRL FS
  * For CQM,
@@ -79,9 +176,19 @@ static int print_results_cache(char *filename, int bm_pid,
 
 int measure_cache_vals(struct resctrl_val_param *param, int bm_pid)
 {
-	unsigned long llc_occu_resc = 0, llc_value = 0;
+	unsigned long llc_perf_miss = 0, llc_occu_resc = 0, llc_value = 0;
 	int ret;
 
+	/*
+	 * Measure cache miss from perf.
+	 */
+	if (!strcmp(param->resctrl_val, "cat")) {
+		ret = get_llc_perf(&llc_perf_miss);
+		if (ret < 0)
+			return ret;
+		llc_value = llc_perf_miss;
+	}
+
 	/*
 	 * Measure llc occupancy from resctrl.
 	 */
@@ -97,3 +204,69 @@ int measure_cache_vals(struct resctrl_val_param *param, int bm_pid)
 
 	return 0;
 }
+
+/*
+ * cache_val:		execute benchmark and measure LLC occupancy resctrl
+ * and perf cache miss for the benchmark
+ * @param:		parameters passed to cache_val()
+ *
+ * Return:		0 on success. non-zero on failure.
+ */
+int cat_val(struct resctrl_val_param *param)
+{
+	int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0;
+	char *resctrl_val = param->resctrl_val;
+	pid_t bm_pid;
+
+	if (strcmp(param->filename, "") == 0)
+		sprintf(param->filename, "stdio");
+
+	bm_pid = getpid();
+
+	/* Taskset benchmark to specified cpu */
+	ret = taskset_benchmark(bm_pid, param->cpu_no);
+	if (ret)
+		return ret;
+
+	/* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/
+	ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
+				      resctrl_val);
+	if (ret)
+		return ret;
+
+	if ((strcmp(resctrl_val, "cat") == 0)) {
+		ret = initialize_llc_perf();
+		if (ret)
+			return ret;
+	}
+
+	/* Test runs until the callback setup() tells the test to stop. */
+	while (1) {
+		if (strcmp(resctrl_val, "cat") == 0) {
+			ret = param->setup(1, param);
+			if (ret) {
+				ret = 0;
+				break;
+			}
+			ret = reset_enable_llc_perf(bm_pid, param->cpu_no);
+			if (ret)
+				break;
+
+			if (run_fill_buf(param->span, malloc_and_init_memory,
+					 memflush, operation, resctrl_val)) {
+				fprintf(stderr, "Error-running fill buffer\n");
+				ret = -1;
+				break;
+			}
+
+			sleep(1);
+			ret = measure_cache_vals(param, bm_pid);
+			if (ret)
+				break;
+		} else {
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
new file mode 100644
index 000000000000..c194fcc42407
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cache Allocation Technology (CAT) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+#include <unistd.h>
+
+#define RESULT_FILE_NAME1	"result_cat1"
+#define RESULT_FILE_NAME2	"result_cat2"
+#define NUM_OF_RUNS		5
+#define MAX_DIFF_PERCENT	4
+#define MAX_DIFF		1000000
+
+int count_of_bits;
+char cbm_mask[256];
+unsigned long long_mask;
+unsigned long cache_size;
+
+/*
+ * Change schemata. Write schemata to specified
+ * con_mon grp, mon_grp in resctrl FS.
+ * Run 5 times in order to get average values.
+ */
+static int cat_setup(int num, ...)
+{
+	struct resctrl_val_param *p;
+	char schemata[64];
+	va_list param;
+	int ret = 0;
+
+	va_start(param, num);
+	p = va_arg(param, struct resctrl_val_param *);
+	va_end(param);
+
+	/* Run NUM_OF_RUNS times */
+	if (p->num_of_runs >= NUM_OF_RUNS)
+		return -1;
+
+	if (p->num_of_runs == 0) {
+		sprintf(schemata, "%lx", p->mask);
+		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
+				     p->resctrl_val);
+	}
+	p->num_of_runs++;
+
+	return ret;
+}
+
+static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits,
+			    unsigned long span)
+{
+	unsigned long allocated_cache_lines = span / 64;
+	unsigned long avg_llc_perf_miss = 0;
+	float diff_percent;
+
+	avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1);
+	diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) /
+				allocated_cache_lines * 100;
+
+	printf("%sok CAT: cache miss rate within %d%%\n",
+	       abs((int)diff_percent) > MAX_DIFF_PERCENT ? "not " : "",
+	       MAX_DIFF_PERCENT);
+	tests_run++;
+	printf("# Percent diff=%d\n", abs((int)diff_percent));
+	printf("# Number of bits: %d\n", no_of_bits);
+	printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss);
+	printf("# Allocated cache lines: %lu\n", allocated_cache_lines);
+}
+
+static int check_results(struct resctrl_val_param *param)
+{
+	char *token_array[8], temp[512];
+	unsigned long sum_llc_perf_miss = 0;
+	int runs = 0, no_of_bits = 0;
+	FILE *fp;
+
+	printf("# Checking for pass/fail\n");
+	fp = fopen(param->filename, "r");
+	if (!fp) {
+		perror("# Cannot open file");
+
+		return errno;
+	}
+
+	while (fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+		/*
+		 * Discard the first value which is inaccurate due to monitoring
+		 * setup transition phase.
+		 */
+		if (runs > 0)
+			sum_llc_perf_miss += strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+
+	fclose(fp);
+	no_of_bits = count_bits(param->mask);
+
+	show_cache_info(sum_llc_perf_miss, no_of_bits, param->span);
+
+	return 0;
+}
+
+void cat_test_cleanup(void)
+{
+	remove(RESULT_FILE_NAME1);
+	remove(RESULT_FILE_NAME2);
+}
+
+int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
+{
+	unsigned long l_mask, l_mask_1;
+	int ret, pipefd[2], sibling_cpu_no;
+	char pipe_message;
+	pid_t bm_pid;
+
+	cache_size = 0;
+
+	ret = remount_resctrlfs(true);
+	if (ret)
+		return ret;
+
+	if (!validate_resctrl_feature_request("cat"))
+		return -1;
+
+	/* Get default cbm mask for L3/L2 cache */
+	ret = get_cbm_mask(cache_type);
+	if (ret)
+		return ret;
+
+	long_mask = strtoul(cbm_mask, NULL, 16);
+
+	/* Get L3/L2 cache size */
+	ret = get_cache_size(cpu_no, cache_type, &cache_size);
+	if (ret)
+		return ret;
+	printf("cache size :%lu\n", cache_size);
+
+	/* Get max number of bits from default-cabm mask */
+	count_of_bits = count_bits(long_mask);
+
+	if (n < 1 || n > count_of_bits - 1) {
+		printf("Invalid input value for no_of_bits n!\n");
+		printf("Please Enter value in range 1 to %d\n",
+		       count_of_bits - 1);
+		return -1;
+	}
+
+	/* Get core id from same socket for running another thread */
+	sibling_cpu_no = get_core_sibling(cpu_no);
+	if (sibling_cpu_no < 0)
+		return -1;
+
+	struct resctrl_val_param param = {
+		.resctrl_val	= "cat",
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 0,
+		.setup		= cat_setup,
+	};
+
+	l_mask = long_mask >> n;
+	l_mask_1 = ~l_mask & long_mask;
+
+	/* Set param values for parent thread which will be allocated bitmask
+	 * with (max_bits - n) bits
+	 */
+	param.span = cache_size * (count_of_bits - n) / count_of_bits;
+	strcpy(param.ctrlgrp, "c2");
+	strcpy(param.mongrp, "m2");
+	strcpy(param.filename, RESULT_FILE_NAME2);
+	param.mask = l_mask;
+	param.num_of_runs = 0;
+
+	if (pipe(pipefd)) {
+		perror("# Unable to create pipe");
+		return errno;
+	}
+
+	bm_pid = fork();
+
+	/* Set param values for child thread which will be allocated bitmask
+	 * with n bits
+	 */
+	if (bm_pid == 0) {
+		param.mask = l_mask_1;
+		strcpy(param.ctrlgrp, "c1");
+		strcpy(param.mongrp, "m1");
+		param.span = cache_size * n / count_of_bits;
+		strcpy(param.filename, RESULT_FILE_NAME1);
+		param.num_of_runs = 0;
+		param.cpu_no = sibling_cpu_no;
+	}
+
+	remove(param.filename);
+
+	ret = cat_val(&param);
+	if (ret)
+		return ret;
+
+	ret = check_results(&param);
+	if (ret)
+		return ret;
+
+	if (bm_pid == 0) {
+		/* Tell parent that child is ready */
+		close(pipefd[0]);
+		pipe_message = 1;
+		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
+		    sizeof(pipe_message)) {
+			close(pipefd[1]);
+			perror("# failed signaling parent process");
+			return errno;
+		}
+
+		close(pipefd[1]);
+		while (1)
+			;
+	} else {
+		/* Parent waits for child to be ready. */
+		close(pipefd[1]);
+		pipe_message = 0;
+		while (pipe_message != 1) {
+			if (read(pipefd[0], &pipe_message,
+				 sizeof(pipe_message)) < sizeof(pipe_message)) {
+				perror("# failed reading from child process");
+				break;
+			}
+		}
+		close(pipefd[0]);
+		kill(bm_pid, SIGKILL);
+	}
+
+	cat_test_cleanup();
+	if (bm_pid)
+		umount_resctrlfs();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index b76bf6a4a0a5..84d2a8b9657a 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -113,8 +113,11 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr,
 	int ret = 0;
 	FILE *fp;
 
-	while (1)
+	while (1) {
 		ret = fill_one_span_read(start_ptr, end_ptr);
+		if (!strcmp(resctrl_val, "cat"))
+			break;
+	}
 
 	/* Consume read result so that reading memory is not optimized out. */
 	fp = fopen("/dev/null", "w");
@@ -129,8 +132,11 @@ static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr,
 static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr,
 			    char *resctrl_val)
 {
-	while (1)
+	while (1) {
 		fill_one_span_write(start_ptr, end_ptr);
+		if (!strcmp(resctrl_val, "cat"))
+			break;
+	}
 
 	return 0;
 }
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index d326c7e95862..e8399389f3ad 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -94,6 +94,9 @@ void mba_test_cleanup(void);
 int get_cbm_mask(char *cache_type);
 int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size);
 void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
+int cat_val(struct resctrl_val_param *param);
+void cat_test_cleanup(void);
+int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type);
 int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd);
 unsigned int count_bits(unsigned long n);
 void cqm_test_cleanup(void);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 70179a61df3b..7f75b0b01bb5 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -19,7 +19,7 @@ static void cmd_help(void)
 	printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM");
 	printf("\t default benchmark is builtin fill_buf\n");
 	printf("\t-t test list: run tests specified in the test list, ");
-	printf("e.g. -t mbm, mba, cqm\n");
+	printf("e.g. -t mbm, mba, cqm, cat\n");
 	printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n");
 	printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n");
 	printf("\t-h: help\n");
@@ -30,6 +30,7 @@ void tests_cleanup(void)
 	mbm_test_cleanup();
 	mba_test_cleanup();
 	cqm_test_cleanup();
+	cat_test_cleanup();
 }
 
 int main(int argc, char **argv)
@@ -39,6 +40,7 @@ int main(int argc, char **argv)
 	char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64];
 	char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE];
 	int ben_ind, ben_count;
+	bool cat_test = true;
 
 	for (i = 0; i < argc; i++) {
 		if (strcmp(argv[i], "-b") == 0) {
@@ -60,6 +62,7 @@ int main(int argc, char **argv)
 			mbm_test = false;
 			mba_test = false;
 			cqm_test = false;
+			cat_test = false;
 			while (token) {
 				if (!strcmp(token, "mbm")) {
 					mbm_test = true;
@@ -67,6 +70,8 @@ int main(int argc, char **argv)
 					mba_test = true;
 				} else if (!strcmp(token, "cqm")) {
 					cqm_test = true;
+				} else if (!strcmp(token, "cat")) {
+					cat_test = true;
 				} else {
 					printf("invalid argument\n");
 
@@ -159,6 +164,14 @@ int main(int argc, char **argv)
 		tests_run++;
 	}
 
+	if (cat_test) {
+		printf("# Starting CAT test ...\n");
+		res = cat_perf_miss_val(cpu_no, no_of_bits, "L3");
+		printf("%sok CAT: test\n", res ? "not " : "");
+		tests_run++;
+		cat_test_cleanup();
+	}
+
 	printf("1..%d\n", tests_run);
 
 	return 0;
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 9c050a6c1723..8772c8c4d5d0 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -501,6 +501,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
 	FILE *fp;
 
 	if ((strcmp(resctrl_val, "mba") != 0) &&
+	    (strcmp(resctrl_val, "cat") != 0) &&
 	    (strcmp(resctrl_val, "cqm") != 0))
 		return -ENOENT;
 
@@ -522,7 +523,7 @@ int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
 	else
 		sprintf(controlgroup, "%s/schemata", RESCTRL_PATH);
 
-	if (!strcmp(resctrl_val, "cqm"))
+	if (!strcmp(resctrl_val, "cat") || !strcmp(resctrl_val, "cqm"))
 		sprintf(schema, "%s%d%c%s", "L3:", resource_id, '=', schemata);
 	if (strcmp(resctrl_val, "mba") == 0)
 		sprintf(schema, "%s%d%c%s", "MB:", resource_id, '=', schemata);

From 53f74fbec9f069241aa6f4c4d908229c77ee83e2 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@amd.com>
Date: Thu, 16 Jan 2020 13:32:43 -0800
Subject: [PATCH 015/327] selftests/resctrl: Add vendor detection mechanism

RESCTRL feature is supported both on Intel and AMD now. Some features
are implemented differently. Add vendor detection mechanism. Use the vendor
check where there are differences.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrl.h     |  1 +
 .../testing/selftests/resctrl/resctrl_tests.c | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
index e8399389f3ad..39bf59c6b9c5 100644
--- a/tools/testing/selftests/resctrl/resctrl.h
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -66,6 +66,7 @@ pid_t bm_pid, ppid;
 int tests_run;
 
 char llc_occup_path[1024];
+bool is_amd;
 
 bool check_resctrlfs_support(void);
 int filter_dmesg(void);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 7f75b0b01bb5..884e918b1e97 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -13,6 +13,27 @@
 #define BENCHMARK_ARGS		64
 #define BENCHMARK_ARG_SIZE	64
 
+bool is_amd;
+
+void detect_amd(void)
+{
+	FILE *inf = fopen("/proc/cpuinfo", "r");
+	char *res;
+
+	if (!inf)
+		return;
+
+	res = fgrep(inf, "vendor_id");
+
+	if (res) {
+		char *s = strchr(res, ':');
+
+		is_amd = s && !strcmp(s, ": AuthenticAMD\n");
+		free(res);
+	}
+	fclose(inf);
+}
+
 static void cmd_help(void)
 {
 	printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits]\n");
@@ -107,6 +128,9 @@ int main(int argc, char **argv)
 	if (geteuid() != 0)
 		printf("# WARNING: not running as root, tests may fail.\n");
 
+	/* Detect AMD vendor */
+	detect_amd();
+
 	if (has_ben) {
 		/* Extract benchmark command from command line. */
 		for (i = ben_ind; i < argc; i++) {

From c0327e1d7c42adb31a9541de6812f740893177a1 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@amd.com>
Date: Thu, 16 Jan 2020 13:32:44 -0800
Subject: [PATCH 016/327] selftests/resctrl: Use cache index3 id for AMD
 schemata masks

AMD uses the cache l3 boundary for schemata masks. Update it accordigly.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/resctrlfs.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
index 8772c8c4d5d0..19c0ec4045a4 100644
--- a/tools/testing/selftests/resctrl/resctrlfs.c
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -113,8 +113,13 @@ int get_resource_id(int cpu_no, int *resource_id)
 	char phys_pkg_path[1024];
 	FILE *fp;
 
-	sprintf(phys_pkg_path, "%s%d/topology/physical_package_id",
-		PHYS_ID_PATH, cpu_no);
+	if (is_amd)
+		sprintf(phys_pkg_path, "%s%d/cache/index3/id",
+			PHYS_ID_PATH, cpu_no);
+	else
+		sprintf(phys_pkg_path, "%s%d/topology/physical_package_id",
+			PHYS_ID_PATH, cpu_no);
+
 	fp = fopen(phys_pkg_path, "r");
 	if (!fp) {
 		perror("Failed to open physical_package_id");

From 85f553d24ada5399cab730e3199a970e2cc78c82 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@amd.com>
Date: Thu, 16 Jan 2020 13:32:45 -0800
Subject: [PATCH 017/327] selftests/resctrl: Disable MBA and MBM tests for AMD

For now, disable MBA and MBM tests for AMD. Deciding test pass/fail
is not clear right now. We can enable when we have some clarity.

Signed-off-by: Babu Moger <babu.moger@amd.com>
Co-developed-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/cat_test.c      | 4 ++--
 tools/testing/selftests/resctrl/resctrl_tests.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
index c194fcc42407..5da43767b973 100644
--- a/tools/testing/selftests/resctrl/cat_test.c
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -64,8 +64,8 @@ static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits,
 				allocated_cache_lines * 100;
 
 	printf("%sok CAT: cache miss rate within %d%%\n",
-	       abs((int)diff_percent) > MAX_DIFF_PERCENT ? "not " : "",
-	       MAX_DIFF_PERCENT);
+	       !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ?
+	       "not " : "", MAX_DIFF_PERCENT);
 	tests_run++;
 	printf("# Percent diff=%d\n", abs((int)diff_percent));
 	printf("# Number of bits: %d\n", no_of_bits);
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
index 884e918b1e97..425cc85ac883 100644
--- a/tools/testing/selftests/resctrl/resctrl_tests.c
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -158,7 +158,7 @@ int main(int argc, char **argv)
 	check_resctrlfs_support();
 	filter_dmesg();
 
-	if (mbm_test) {
+	if (!is_amd && mbm_test) {
 		printf("# Starting MBM BW change ...\n");
 		if (!has_ben)
 			sprintf(benchmark_cmd[5], "%s", "mba");
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
 		tests_run++;
 	}
 
-	if (mba_test) {
+	if (!is_amd && mba_test) {
 		printf("# Starting MBA Schemata change ...\n");
 		if (!has_ben)
 			sprintf(benchmark_cmd[1], "%d", span);

From 3032e3a7c7e3ef4a9839ec99c60ffeed76830bf2 Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Thu, 16 Jan 2020 13:32:46 -0800
Subject: [PATCH 018/327] selftests/resctrl: Add the test in MAINTAINERS

The resctrl selftest will be maintained by RDT maintainers.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 38fe2f3f7b6f..b3eefdbc2edd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14073,6 +14073,7 @@ S:	Supported
 F:	arch/x86/kernel/cpu/resctrl/
 F:	arch/x86/include/asm/resctrl_sched.h
 F:	Documentation/x86/resctrl*
+F:	tools/testing/selftests/resctrl/
 
 READ-COPY UPDATE (RCU)
 M:	"Paul E. McKenney" <paulmck@kernel.org>

From 081ea5195a11c9f1eaa8393be603b75982f91b7d Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:47 +0200
Subject: [PATCH 019/327] RDMA/cma: Use a helper function to enqueue resolve
 work items

To avoid errors, with attaching ownership of work item and its cm_id
refcount which is decremented in work handler, tie them up in single
helper function. Also avoid code duplication.

Link: https://lore.kernel.org/r/20200126142652.104803-3-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 72f032160c4b..8f16ebb413c2 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2687,14 +2687,18 @@ static void cma_init_resolve_route_work(struct cma_work *work,
 	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
 }
 
-static void cma_init_resolve_addr_work(struct cma_work *work,
-				       struct rdma_id_private *id_priv)
+static void enqueue_resolve_addr_work(struct cma_work *work,
+				      struct rdma_id_private *id_priv)
 {
+	atomic_inc(&id_priv->refcount);
+
 	work->id = id_priv;
 	INIT_WORK(&work->work, cma_work_handler);
 	work->old_state = RDMA_CM_ADDR_QUERY;
 	work->new_state = RDMA_CM_ADDR_RESOLVED;
 	work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+
+	queue_work(cma_wq, &work->work);
 }
 
 static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
@@ -3148,9 +3152,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv)
 	rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
 	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
 
-	atomic_inc(&id_priv->refcount);
-	cma_init_resolve_addr_work(work, id_priv);
-	queue_work(cma_wq, &work->work);
+	enqueue_resolve_addr_work(work, id_priv);
 	return 0;
 err:
 	kfree(work);
@@ -3175,9 +3177,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
 	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
 		&(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
 
-	atomic_inc(&id_priv->refcount);
-	cma_init_resolve_addr_work(work, id_priv);
-	queue_work(cma_wq, &work->work);
+	enqueue_resolve_addr_work(work, id_priv);
 	return 0;
 err:
 	kfree(work);

From cc055dd3a71352759a6c7ecaee612eeaef93ef22 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:48 +0200
Subject: [PATCH 020/327] RDMA/cma: Use RDMA device port iterator

Use RDMA device port iterator to avoid open coding.

Link: https://lore.kernel.org/r/20200126142652.104803-4-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 8f16ebb413c2..34c62eae08d8 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -728,8 +728,8 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
 	struct cma_device *cma_dev;
 	enum ib_gid_type gid_type;
 	int ret = -ENODEV;
+	unsigned int port;
 	union ib_gid gid;
-	u8 port;
 
 	if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
 	    id_priv->id.ps == RDMA_PS_IPOIB)
@@ -753,7 +753,8 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
 	}
 
 	list_for_each_entry(cma_dev, &dev_list, list) {
-		for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+		rdma_for_each_port (cma_dev->device, port) {
+
 			if (listen_id_priv->cma_dev == cma_dev &&
 			    listen_id_priv->id.port_num == port)
 				continue;
@@ -786,8 +787,8 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 	struct cma_device *cma_dev, *cur_dev;
 	struct sockaddr_ib *addr;
 	union ib_gid gid, sgid, *dgid;
+	unsigned int p;
 	u16 pkey, index;
-	u8 p;
 	enum ib_port_state port_state;
 	int i;
 
@@ -798,7 +799,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 
 	mutex_lock(&lock);
 	list_for_each_entry(cur_dev, &dev_list, list) {
-		for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+		rdma_for_each_port (cur_dev->device, p) {
 			if (!rdma_cap_af_ib(cur_dev->device, p))
 				continue;
 
@@ -3029,9 +3030,9 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
 	struct cma_device *cma_dev, *cur_dev;
 	union ib_gid gid;
 	enum ib_port_state port_state;
+	unsigned int p;
 	u16 pkey;
 	int ret;
-	u8 p;
 
 	cma_dev = NULL;
 	mutex_lock(&lock);
@@ -3043,7 +3044,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
 		if (!cma_dev)
 			cma_dev = cur_dev;
 
-		for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+		rdma_for_each_port (cur_dev->device, p) {
 			if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) &&
 			    port_state == IB_PORT_ACTIVE) {
 				cma_dev = cur_dev;

From 5ff8c8fa44c2cb74f3066ec4a531265db69b86c5 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:49 +0200
Subject: [PATCH 021/327] RDMA/cma: Rename cma_device ref/deref helpers to to
 get/put

Helper functions which increment/decrement reference count of the
structure read better when they are named with the get/put suffix.

Hence, rename cma_ref/deref_dev() to cma_dev_get/put().

Link: https://lore.kernel.org/r/20200126142652.104803-5-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c          | 22 +++++++++++-----------
 drivers/infiniband/core/cma_configfs.c |  6 +++---
 drivers/infiniband/core/cma_priv.h     |  4 ++--
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 34c62eae08d8..7e16d1b001ff 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -247,11 +247,17 @@ enum {
 	CMA_OPTION_AFONLY,
 };
 
-void cma_ref_dev(struct cma_device *cma_dev)
+void cma_dev_get(struct cma_device *cma_dev)
 {
 	atomic_inc(&cma_dev->refcount);
 }
 
+void cma_dev_put(struct cma_device *cma_dev)
+{
+	if (atomic_dec_and_test(&cma_dev->refcount))
+		complete(&cma_dev->comp);
+}
+
 struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter	filter,
 					     void		*cookie)
 {
@@ -267,7 +273,7 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter	filter,
 		}
 
 	if (found_cma_dev)
-		cma_ref_dev(found_cma_dev);
+		cma_dev_get(found_cma_dev);
 	mutex_unlock(&lock);
 	return found_cma_dev;
 }
@@ -463,7 +469,7 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
 			       struct cma_device *cma_dev)
 {
-	cma_ref_dev(cma_dev);
+	cma_dev_get(cma_dev);
 	id_priv->cma_dev = cma_dev;
 	id_priv->id.device = cma_dev->device;
 	id_priv->id.route.addr.dev_addr.transport =
@@ -484,12 +490,6 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv,
 					  rdma_start_port(cma_dev->device)];
 }
 
-void cma_deref_dev(struct cma_device *cma_dev)
-{
-	if (atomic_dec_and_test(&cma_dev->refcount))
-		complete(&cma_dev->comp);
-}
-
 static inline void release_mc(struct kref *kref)
 {
 	struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
@@ -502,7 +502,7 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
 {
 	mutex_lock(&lock);
 	list_del(&id_priv->list);
-	cma_deref_dev(id_priv->cma_dev);
+	cma_dev_put(id_priv->cma_dev);
 	id_priv->cma_dev = NULL;
 	mutex_unlock(&lock);
 }
@@ -4728,7 +4728,7 @@ static void cma_process_remove(struct cma_device *cma_dev)
 	}
 	mutex_unlock(&lock);
 
-	cma_deref_dev(cma_dev);
+	cma_dev_put(cma_dev);
 	wait_for_completion(&cma_dev->comp);
 }
 
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
index 8b0b5ae22e4c..c672a4978bfd 100644
--- a/drivers/infiniband/core/cma_configfs.c
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -94,7 +94,7 @@ static int cma_configfs_params_get(struct config_item *item,
 
 static void cma_configfs_params_put(struct cma_device *cma_dev)
 {
-	cma_deref_dev(cma_dev);
+	cma_dev_put(cma_dev);
 }
 
 static ssize_t default_roce_mode_show(struct config_item *item,
@@ -312,12 +312,12 @@ static struct config_group *make_cma_dev(struct config_group *group,
 	configfs_add_default_group(&cma_dev_group->ports_group,
 			&cma_dev_group->device_group);
 
-	cma_deref_dev(cma_dev);
+	cma_dev_put(cma_dev);
 	return &cma_dev_group->device_group;
 
 fail:
 	if (cma_dev)
-		cma_deref_dev(cma_dev);
+		cma_dev_put(cma_dev);
 	kfree(cma_dev_group);
 	return ERR_PTR(err);
 }
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index ca7307277518..4e04c442ff86 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -111,8 +111,8 @@ static inline void cma_configfs_exit(void)
 }
 #endif
 
-void cma_ref_dev(struct cma_device *dev);
-void cma_deref_dev(struct cma_device *dev);
+void cma_dev_get(struct cma_device *dev);
+void cma_dev_put(struct cma_device *dev);
 typedef bool (*cma_device_filter)(struct ib_device *, void *);
 struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
 					     void *cookie);

From be439912e7c2e3e78ebd087932c165a83bdca6b5 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:50 +0200
Subject: [PATCH 022/327] RDMA/cma: Use refcount API to reflect refcount

Use the refcount variant to capture the reference counting of the cma
device structure.

Link: https://lore.kernel.org/r/20200126142652.104803-6-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 7e16d1b001ff..d43f7ce759f2 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -199,7 +199,7 @@ struct cma_device {
 	struct list_head	list;
 	struct ib_device	*device;
 	struct completion	comp;
-	atomic_t		refcount;
+	refcount_t refcount;
 	struct list_head	id_list;
 	enum ib_gid_type	*default_gid_type;
 	u8			*default_roce_tos;
@@ -249,12 +249,12 @@ enum {
 
 void cma_dev_get(struct cma_device *cma_dev)
 {
-	atomic_inc(&cma_dev->refcount);
+	refcount_inc(&cma_dev->refcount);
 }
 
 void cma_dev_put(struct cma_device *cma_dev)
 {
-	if (atomic_dec_and_test(&cma_dev->refcount))
+	if (refcount_dec_and_test(&cma_dev->refcount))
 		complete(&cma_dev->comp);
 }
 
@@ -754,7 +754,6 @@ static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
 
 	list_for_each_entry(cma_dev, &dev_list, list) {
 		rdma_for_each_port (cma_dev->device, port) {
-
 			if (listen_id_priv->cma_dev == cma_dev &&
 			    listen_id_priv->id.port_num == port)
 				continue;
@@ -4657,7 +4656,7 @@ static void cma_add_one(struct ib_device *device)
 	}
 
 	init_completion(&cma_dev->comp);
-	atomic_set(&cma_dev->refcount, 1);
+	refcount_set(&cma_dev->refcount, 1);
 	INIT_LIST_HEAD(&cma_dev->id_list);
 	ib_set_client_data(device, &cma_client, cma_dev);
 

From e368d23f57f6a08341d35c44255f2d8e7695152b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:51 +0200
Subject: [PATCH 023/327] RDMA/cma: Rename cma_device ref/deref helpers to to
 get/put

Helper functions which increment/decrement reference count of a
structure read better when they are named with the get/put suffix.

Hence, rename cma_ref/deref_id() to cma_id_get/put().  Also use
cma_get_id() wrapper to find the balancing put() calls.

Link: https://lore.kernel.org/r/20200126142652.104803-7-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 42 ++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index d43f7ce759f2..605afeed122f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -840,7 +840,12 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 	return 0;
 }
 
-static void cma_deref_id(struct rdma_id_private *id_priv)
+static void cma_id_get(struct rdma_id_private *id_priv)
+{
+	atomic_inc(&id_priv->refcount);
+}
+
+static void cma_id_put(struct rdma_id_private *id_priv)
 {
 	if (atomic_dec_and_test(&id_priv->refcount))
 		complete(&id_priv->comp);
@@ -1846,11 +1851,11 @@ void rdma_destroy_id(struct rdma_cm_id *id)
 	}
 
 	cma_release_port(id_priv);
-	cma_deref_id(id_priv);
+	cma_id_put(id_priv);
 	wait_for_completion(&id_priv->comp);
 
 	if (id_priv->internal_id)
-		cma_deref_id(id_priv->id.context);
+		cma_id_put(id_priv->id.context);
 
 	kfree(id_priv->id.route.path_rec);
 
@@ -2187,7 +2192,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
 	 * Protect against the user destroying conn_id from another thread
 	 * until we're done accessing it.
 	 */
-	atomic_inc(&conn_id->refcount);
+	cma_id_get(conn_id);
 	ret = cma_cm_event_handler(conn_id, &event);
 	if (ret)
 		goto err3;
@@ -2204,13 +2209,13 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id,
 	mutex_unlock(&lock);
 	mutex_unlock(&conn_id->handler_mutex);
 	mutex_unlock(&listen_id->handler_mutex);
-	cma_deref_id(conn_id);
+	cma_id_put(conn_id);
 	if (net_dev)
 		dev_put(net_dev);
 	return 0;
 
 err3:
-	cma_deref_id(conn_id);
+	cma_id_put(conn_id);
 	/* Destroy the CM ID by returning a non-zero value. */
 	conn_id->cm_id.ib = NULL;
 err2:
@@ -2391,7 +2396,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	 * Protect against the user destroying conn_id from another thread
 	 * until we're done accessing it.
 	 */
-	atomic_inc(&conn_id->refcount);
+	cma_id_get(conn_id);
 	ret = cma_cm_event_handler(conn_id, &event);
 	if (ret) {
 		/* User wants to destroy the CM ID */
@@ -2399,13 +2404,13 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 		cma_exch(conn_id, RDMA_CM_DESTROYING);
 		mutex_unlock(&conn_id->handler_mutex);
 		mutex_unlock(&listen_id->handler_mutex);
-		cma_deref_id(conn_id);
+		cma_id_put(conn_id);
 		rdma_destroy_id(&conn_id->id);
 		return ret;
 	}
 
 	mutex_unlock(&conn_id->handler_mutex);
-	cma_deref_id(conn_id);
+	cma_id_put(conn_id);
 
 out:
 	mutex_unlock(&listen_id->handler_mutex);
@@ -2492,7 +2497,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
 
 	_cma_attach_to_dev(dev_id_priv, cma_dev);
 	list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
-	atomic_inc(&id_priv->refcount);
+	cma_id_get(id_priv);
 	dev_id_priv->internal_id = 1;
 	dev_id_priv->afonly = id_priv->afonly;
 	dev_id_priv->tos_set = id_priv->tos_set;
@@ -2647,7 +2652,7 @@ static void cma_work_handler(struct work_struct *_work)
 	}
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	cma_deref_id(id_priv);
+	cma_id_put(id_priv);
 	if (destroy)
 		rdma_destroy_id(&id_priv->id);
 	kfree(work);
@@ -2671,7 +2676,7 @@ static void cma_ndev_work_handler(struct work_struct *_work)
 
 out:
 	mutex_unlock(&id_priv->handler_mutex);
-	cma_deref_id(id_priv);
+	cma_id_put(id_priv);
 	if (destroy)
 		rdma_destroy_id(&id_priv->id);
 	kfree(work);
@@ -2690,7 +2695,8 @@ static void cma_init_resolve_route_work(struct cma_work *work,
 static void enqueue_resolve_addr_work(struct cma_work *work,
 				      struct rdma_id_private *id_priv)
 {
-	atomic_inc(&id_priv->refcount);
+	/* Balances with cma_id_put() in cma_work_handler */
+	cma_id_get(id_priv);
 
 	work->id = id_priv;
 	INIT_WORK(&work->work, cma_work_handler);
@@ -2986,7 +2992,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
 		return -EINVAL;
 
-	atomic_inc(&id_priv->refcount);
+	cma_id_get(id_priv);
 	if (rdma_cap_ib_sa(id->device, id->port_num))
 		ret = cma_resolve_ib_route(id_priv, timeout_ms);
 	else if (rdma_protocol_roce(id->device, id->port_num))
@@ -3002,7 +3008,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 	return 0;
 err:
 	cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
-	cma_deref_id(id_priv);
+	cma_id_put(id_priv);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_resolve_route);
@@ -4581,7 +4587,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id
 		INIT_WORK(&work->work, cma_ndev_work_handler);
 		work->id = id_priv;
 		work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
-		atomic_inc(&id_priv->refcount);
+		cma_id_get(id_priv);
 		queue_work(cma_wq, &work->work);
 	}
 
@@ -4715,11 +4721,11 @@ static void cma_process_remove(struct cma_device *cma_dev)
 
 		list_del(&id_priv->listen_list);
 		list_del_init(&id_priv->list);
-		atomic_inc(&id_priv->refcount);
+		cma_id_get(id_priv);
 		mutex_unlock(&lock);
 
 		ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
-		cma_deref_id(id_priv);
+		cma_id_put(id_priv);
 		if (ret)
 			rdma_destroy_id(&id_priv->id);
 

From 43fb5892cdfaa3bbe170aade07d4a38086636cca Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Sun, 26 Jan 2020 16:26:52 +0200
Subject: [PATCH 024/327] RDMA/cma: Use refcount API to reflect refcount

Use a refcount_t for atomics being used as a refcount.

Link: https://lore.kernel.org/r/20200126142652.104803-8-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c      | 6 +++---
 drivers/infiniband/core/cma_priv.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 605afeed122f..5165158a7aaa 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -842,12 +842,12 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 
 static void cma_id_get(struct rdma_id_private *id_priv)
 {
-	atomic_inc(&id_priv->refcount);
+	refcount_inc(&id_priv->refcount);
 }
 
 static void cma_id_put(struct rdma_id_private *id_priv)
 {
-	if (atomic_dec_and_test(&id_priv->refcount))
+	if (refcount_dec_and_test(&id_priv->refcount))
 		complete(&id_priv->comp);
 }
 
@@ -875,7 +875,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net,
 	spin_lock_init(&id_priv->lock);
 	mutex_init(&id_priv->qp_mutex);
 	init_completion(&id_priv->comp);
-	atomic_set(&id_priv->refcount, 1);
+	refcount_set(&id_priv->refcount, 1);
 	mutex_init(&id_priv->handler_mutex);
 	INIT_LIST_HEAD(&id_priv->listen_list);
 	INIT_LIST_HEAD(&id_priv->mc_list);
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 4e04c442ff86..5edcf44a9307 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -66,7 +66,7 @@ struct rdma_id_private {
 	struct mutex		qp_mutex;
 
 	struct completion	comp;
-	atomic_t		refcount;
+	refcount_t refcount;
 	struct mutex		handler_mutex;
 
 	int			backlog;

From b14c95bee83537aff423cece4ae6078759bc6f34 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Sun, 26 Jan 2020 22:55:04 +0800
Subject: [PATCH 025/327] RDMA/hns: Cleanups of magic numbers

Some magic numbers are hard to understand, so replace them with macros or
add some comments for them.

Link: https://lore.kernel.org/r/20200126145504.9700-1-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  2 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 10 ++++-----
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 23 ++++++++++++---------
 drivers/infiniband/hw/hns/hns_roce_srq.c    |  3 ++-
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index a7c4ff975c28..3b3d6fee1eca 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -881,7 +881,7 @@ struct hns_roce_caps {
 	u32		cqc_timer_ba_pg_sz;
 	u32		cqc_timer_buf_pg_sz;
 	u32		cqc_timer_hop_num;
-	u32		cqe_ba_pg_sz;
+	u32             cqe_ba_pg_sz;	/* page_size = 4K*(2^cqe_ba_pg_sz) */
 	u32		cqe_buf_pg_sz;
 	u32		cqe_hop_num;
 	u32		srqwqe_ba_pg_sz;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 12c4cd8e9378..b86687809407 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1999,7 +1999,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == 0x21) {
+	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
 		ret = hns_roce_query_pf_timer_resource(hr_dev);
 		if (ret) {
 			dev_err(hr_dev->dev,
@@ -2016,7 +2016,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == 0x21) {
+	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
 		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
 		if (ret) {
 			dev_err(hr_dev->dev,
@@ -2298,7 +2298,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
-	if (hr_dev->pci_dev->revision == 0x21)
+	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B)
 		hns_roce_function_clear(hr_dev);
 
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
@@ -2757,7 +2757,7 @@ static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
 
 static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
 {
-	*hr_cq->set_ci_db = cons_index & 0xffffff;
+	*hr_cq->set_ci_db = cons_index & V2_CQ_DB_PARAMETER_CONS_IDX_M;
 }
 
 static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
@@ -4475,7 +4475,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
 		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
-	if (hr_dev->pci_dev->revision == 0x21 && is_udp)
+	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B && is_udp)
 		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
 	else
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 3257ad11be48..e13f16c59115 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -309,7 +309,7 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 
 		max_cnt = max(1U, cap->max_recv_sge);
 		hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt);
-		if (hr_dev->caps.max_rq_sg <= 2)
+		if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE)
 			hr_qp->rq.wqe_shift =
 					ilog2(hr_dev->caps.max_rq_desc_sz);
 		else
@@ -370,16 +370,17 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 	hr_qp->sq.wqe_shift = ucmd->log_sq_stride;
 
 	max_cnt = max(1U, cap->max_send_sge);
-	if (hr_dev->caps.max_sq_sg <= 2)
+	if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
 		hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt);
 	else
 		hr_qp->sq.max_gs = max_cnt;
 
-	if (hr_qp->sq.max_gs > 2)
+	if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE)
 		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
 							(hr_qp->sq.max_gs - 2));
 
-	if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) {
+	if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE &&
+	    hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) {
 		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
 			dev_err(hr_dev->dev,
 				"The extended sge cnt error! sge_cnt=%d\n",
@@ -392,7 +393,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 	ex_sge_num = hr_qp->sge.sge_cnt;
 
 	/* Get buf size, SQ and RQ  are aligned to page_szie */
-	if (hr_dev->caps.max_sq_sg <= 2) {
+	if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
 		hr_qp->buff_size = round_up((hr_qp->rq.wqe_cnt <<
 					     hr_qp->rq.wqe_shift), PAGE_SIZE) +
 				   round_up((hr_qp->sq.wqe_cnt <<
@@ -528,13 +529,15 @@ static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
 	}
 
 	/* ud sqwqe's sge use extend sge */
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+	if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 &&
+	    hr_qp->ibqp.qp_type == IB_QPT_GSI) {
 		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
 				     hr_qp->sq.max_gs);
 		hr_qp->sge.sge_shift = 4;
 	}
 
-	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
+	if (hr_qp->sq.max_gs > 2 &&
+	    hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) {
 		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
 			dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n",
 				hr_qp->sge.sge_cnt);
@@ -577,7 +580,7 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 
 	/* Get data_seg numbers */
 	max_cnt = max(1U, cap->max_send_sge);
-	if (hr_dev->caps.max_sq_sg <= 2)
+	if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
 		hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt);
 	else
 		hr_qp->sq.max_gs = max_cnt;
@@ -593,7 +596,7 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 	hr_qp->sq.offset = 0;
 	size = round_up(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, page_size);
 
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) {
+	if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 && hr_qp->sge.sge_cnt) {
 		hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift),
 					 (u32)hr_qp->sge.sge_cnt);
 		hr_qp->sge.offset = size;
@@ -1078,7 +1081,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
 
 		/* when hw version is v1, the sqpn is allocated */
-		if (hr_dev->caps.max_sq_sg <= 2)
+		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
 			hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS +
 					     hr_dev->iboe.phy_port[hr_qp->port];
 		else
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index c6d5f06f9cde..5b3dd1a337d4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -381,7 +381,8 @@ int hns_roce_create_srq(struct ib_srq *ib_srq,
 	srq->wqe_cnt = roundup_pow_of_two(init_attr->attr.max_wr + 1);
 	srq->max_gs = init_attr->attr.max_sge;
 
-	srq_desc_size = roundup_pow_of_two(max(16, 16 * srq->max_gs));
+	srq_desc_size = roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE,
+					HNS_ROCE_SGE_SIZE * srq->max_gs));
 
 	srq->wqe_shift = ilog2(srq_desc_size);
 

From d7e2d3432ae7795e5b77a77039e904ed4a769f39 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Sun, 26 Jan 2020 22:58:35 +0800
Subject: [PATCH 026/327] RDMA/hns: Optimize eqe buffer allocation flow

The eqe has a private multi-hop addressing implementation, but there is
already a set of interfaces in the hns driver that can achieve this.

So, simplify the eqe buffer allocation process by using the mtr interface
and remove large amount of repeated logic.

Link: https://lore.kernel.org/r/20200126145835.11368-1-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  10 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 481 +++++---------------
 2 files changed, 108 insertions(+), 383 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 3b3d6fee1eca..4db818e98af5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -762,14 +762,8 @@ struct hns_roce_eq {
 	int				eqe_ba_pg_sz;
 	int				eqe_buf_pg_sz;
 	int				hop_num;
-	u64				*bt_l0;	/* Base address table for L0 */
-	u64				**bt_l1; /* Base address table for L1 */
-	u64				**buf;
-	dma_addr_t			l0_dma;
-	dma_addr_t			*l1_dma;
-	dma_addr_t			*buf_dma;
-	u32				l0_last_num; /* L0 last chunk num */
-	u32				l1_last_num; /* L1 last chunk num */
+	struct hns_roce_mtr		mtr;
+	struct hns_roce_buf		buf;
 	int				eq_max_cnt;
 	int				eq_period;
 	int				shift;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index b86687809407..29a5ee0cf07e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5326,44 +5326,24 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
 	hns_roce_write64(hr_dev, doorbell, eq->doorbell);
 }
 
-static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
+static inline void *get_eqe_buf(struct hns_roce_eq *eq, unsigned long offset)
 {
 	u32 buf_chk_sz;
-	unsigned long off;
 
 	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) +
-		off % buf_chk_sz);
-}
-
-static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
-
-	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
-		return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) +
-			off % buf_chk_sz);
+	if (eq->buf.nbufs == 1)
+		return eq->buf.direct.buf + offset % buf_chk_sz;
 	else
-		return (struct hns_roce_aeqe *)((u8 *)
-			(eq->buf[off / buf_chk_sz]) + off % buf_chk_sz);
+		return eq->buf.page_list[offset / buf_chk_sz].buf +
+		       offset % buf_chk_sz;
 }
 
 static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
 {
 	struct hns_roce_aeqe *aeqe;
 
-	if (!eq->hop_num)
-		aeqe = get_aeqe_v2(eq, eq->cons_index);
-	else
-		aeqe = mhop_get_aeqe(eq, eq->cons_index);
-
+	aeqe = get_eqe_buf(eq, (eq->cons_index & (eq->entries - 1)) *
+			   HNS_ROCE_AEQ_ENTRY_SIZE);
 	return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^
 		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
 }
@@ -5456,44 +5436,12 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev,
 	return aeqe_found;
 }
 
-static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) +
-		off % buf_chk_sz);
-}
-
-static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
-
-	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
-		return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) +
-			off % buf_chk_sz);
-	else
-		return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off /
-			buf_chk_sz]) + off % buf_chk_sz);
-}
-
 static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
 {
 	struct hns_roce_ceqe *ceqe;
 
-	if (!eq->hop_num)
-		ceqe = get_ceqe_v2(eq, eq->cons_index);
-	else
-		ceqe = mhop_get_ceqe(eq, eq->cons_index);
-
+	ceqe = get_eqe_buf(eq, (eq->cons_index & (eq->entries - 1)) *
+			   HNS_ROCE_CEQ_ENTRY_SIZE);
 	return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^
 		(!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
 }
@@ -5653,90 +5601,11 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn)
 		dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn);
 }
 
-static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_eq *eq)
+static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
 {
-	struct device *dev = hr_dev->dev;
-	u64 idx;
-	u64 size;
-	u32 buf_chk_sz;
-	u32 bt_chk_sz;
-	u32 mhop_num;
-	int eqe_alloc;
-	int i = 0;
-	int j = 0;
-
-	mhop_num = hr_dev->caps.eqe_hop_num;
-	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
-	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
-
-	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
-		dma_free_coherent(dev, (unsigned int)(eq->entries *
-				  eq->eqe_size), eq->bt_l0, eq->l0_dma);
-		return;
-	}
-
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	if (mhop_num == 1) {
-		for (i = 0; i < eq->l0_last_num; i++) {
-			if (i == eq->l0_last_num - 1) {
-				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
-				size = (eq->entries - eqe_alloc) * eq->eqe_size;
-				dma_free_coherent(dev, size, eq->buf[i],
-						  eq->buf_dma[i]);
-				break;
-			}
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
-					  eq->buf_dma[i]);
-		}
-	} else if (mhop_num == 2) {
-		for (i = 0; i < eq->l0_last_num; i++) {
-			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-					  eq->l1_dma[i]);
-
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				idx = i * (bt_chk_sz / BA_BYTE_LEN) + j;
-				if ((i == eq->l0_last_num - 1)
-				     && j == eq->l1_last_num - 1) {
-					eqe_alloc = (buf_chk_sz / eq->eqe_size)
-						    * idx;
-					size = (eq->entries - eqe_alloc)
-						* eq->eqe_size;
-					dma_free_coherent(dev, size,
-							  eq->buf[idx],
-							  eq->buf_dma[idx]);
-					break;
-				}
-				dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
-						  eq->buf_dma[idx]);
-			}
-		}
-	}
-	kfree(eq->buf_dma);
-	kfree(eq->buf);
-	kfree(eq->l1_dma);
-	kfree(eq->bt_l1);
-	eq->buf_dma = NULL;
-	eq->buf = NULL;
-	eq->l1_dma = NULL;
-	eq->bt_l1 = NULL;
-}
-
-static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev,
-				struct hns_roce_eq *eq)
-{
-	u32 buf_chk_sz;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	if (hr_dev->caps.eqe_hop_num) {
-		hns_roce_mhop_free_eq(hr_dev, eq);
-		return;
-	}
-
-	dma_free_coherent(hr_dev->dev, buf_chk_sz, eq->buf_list->buf,
-			  eq->buf_list->map);
-	kfree(eq->buf_list);
+	if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0)
+		hns_roce_mtr_cleanup(hr_dev, &eq->mtr);
+	hns_roce_buf_free(hr_dev, eq->buf.size, &eq->buf);
 }
 
 static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
@@ -5744,6 +5613,8 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
 				void *mb_buf)
 {
 	struct hns_roce_eq_context *eqc;
+	u64 ba[MTT_MIN_COUNT] = { 0 };
+	int count;
 
 	eqc = mb_buf;
 	memset(eqc, 0, sizeof(struct hns_roce_eq_context));
@@ -5759,10 +5630,23 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
 	eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz;
 	eq->shift = ilog2((unsigned int)eq->entries);
 
-	if (!eq->hop_num)
-		eq->eqe_ba = eq->buf_list->map;
-	else
-		eq->eqe_ba = eq->l0_dma;
+	/* if not muti-hop, eqe buffer only use one trunk */
+	if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0) {
+		eq->eqe_ba = eq->buf.direct.map;
+		eq->cur_eqe_ba = eq->eqe_ba;
+		if (eq->buf.npages > 1)
+			eq->nxt_eqe_ba = eq->eqe_ba + (1 << eq->eqe_buf_pg_sz);
+		else
+			eq->nxt_eqe_ba = eq->eqe_ba;
+	} else {
+		count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, ba,
+					  MTT_MIN_COUNT, &eq->eqe_ba);
+		eq->cur_eqe_ba = ba[0];
+		if (count > 1)
+			eq->nxt_eqe_ba = ba[1];
+		else
+			eq->nxt_eqe_ba = ba[0];
+	}
 
 	/* set eqc state */
 	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQ_ST_M, HNS_ROCE_EQC_EQ_ST_S,
@@ -5860,220 +5744,97 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
 		       HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44);
 }
 
-static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_eq *eq)
+static int map_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
+		      u32 page_shift)
 {
-	struct device *dev = hr_dev->dev;
-	int eq_alloc_done = 0;
-	int eq_buf_cnt = 0;
-	int eqe_alloc;
-	u32 buf_chk_sz;
-	u32 bt_chk_sz;
-	u32 mhop_num;
-	u64 size;
-	u64 idx;
+	struct hns_roce_buf_region region = {};
+	dma_addr_t *buf_list = NULL;
 	int ba_num;
-	int bt_num;
-	int record_i;
-	int record_j;
-	int i = 0;
-	int j = 0;
-
-	mhop_num = hr_dev->caps.eqe_hop_num;
-	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
-	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
+	int ret;
 
 	ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size),
-			      buf_chk_sz);
-	bt_num = DIV_ROUND_UP(ba_num, bt_chk_sz / BA_BYTE_LEN);
+			      1 << page_shift);
+	hns_roce_init_buf_region(&region, hr_dev->caps.eqe_hop_num, 0, ba_num);
 
-	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
-		if (eq->entries > buf_chk_sz / eq->eqe_size) {
-			dev_err(dev, "eq entries %d is larger than buf_pg_sz!",
-				eq->entries);
-			return -EINVAL;
-		}
-		eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size,
-					       &(eq->l0_dma), GFP_KERNEL);
-		if (!eq->bt_l0)
-			return -ENOMEM;
-
-		eq->cur_eqe_ba = eq->l0_dma;
-		eq->nxt_eqe_ba = 0;
-
-		return 0;
+	/* alloc a tmp list for storing eq buf address */
+	ret = hns_roce_alloc_buf_list(&region, &buf_list, 1);
+	if (ret) {
+		dev_err(hr_dev->dev, "alloc eq buf_list error\n");
+		return ret;
 	}
 
-	eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL);
-	if (!eq->buf_dma)
-		return -ENOMEM;
-	eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL);
-	if (!eq->buf)
-		goto err_kcalloc_buf;
-
-	if (mhop_num == 2) {
-		eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL);
-		if (!eq->l1_dma)
-			goto err_kcalloc_l1_dma;
-
-		eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL);
-		if (!eq->bt_l1)
-			goto err_kcalloc_bt_l1;
+	ba_num = hns_roce_get_kmem_bufs(hr_dev, buf_list, region.count,
+					region.offset, &eq->buf);
+	if (ba_num != region.count) {
+		dev_err(hr_dev->dev, "get eqe buf err,expect %d,ret %d.\n",
+			region.count, ba_num);
+		ret = -ENOBUFS;
+		goto done;
 	}
 
-	/* alloc L0 BT */
-	eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL);
-	if (!eq->bt_l0)
-		goto err_dma_alloc_l0;
+	hns_roce_mtr_init(&eq->mtr, PAGE_SHIFT + hr_dev->caps.eqe_ba_pg_sz,
+			  page_shift);
+	ret = hns_roce_mtr_attach(hr_dev, &eq->mtr, &buf_list, &region, 1);
+	if (ret)
+		dev_err(hr_dev->dev, "mtr attatch error for eqe\n");
 
-	if (mhop_num == 1) {
-		if (ba_num > (bt_chk_sz / BA_BYTE_LEN))
-			dev_err(dev, "ba_num %d is too large for 1 hop\n",
-				ba_num);
+	goto done;
 
-		/* alloc buf */
-		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
-			if (eq_buf_cnt + 1 < ba_num) {
-				size = buf_chk_sz;
-			} else {
-				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
-				size = (eq->entries - eqe_alloc) * eq->eqe_size;
-			}
-			eq->buf[i] = dma_alloc_coherent(dev, size,
-							&(eq->buf_dma[i]),
-							GFP_KERNEL);
-			if (!eq->buf[i])
-				goto err_dma_alloc_buf;
+	hns_roce_mtr_cleanup(hr_dev, &eq->mtr);
+done:
+	hns_roce_free_buf_list(&buf_list, 1);
 
-			*(eq->bt_l0 + i) = eq->buf_dma[i];
+	return ret;
+}
 
-			eq_buf_cnt++;
-			if (eq_buf_cnt >= ba_num)
-				break;
-		}
-		eq->cur_eqe_ba = eq->buf_dma[0];
-		if (ba_num > 1)
-			eq->nxt_eqe_ba = eq->buf_dma[1];
+static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
+{
+	struct hns_roce_buf *buf = &eq->buf;
+	bool is_mhop = false;
+	u32 page_shift;
+	u32 mhop_num;
+	u32 max_size;
+	int ret;
 
-	} else if (mhop_num == 2) {
-		/* alloc L1 BT and buf */
-		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
-			eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz,
-							  &(eq->l1_dma[i]),
-							  GFP_KERNEL);
-			if (!eq->bt_l1[i])
-				goto err_dma_alloc_l1;
-			*(eq->bt_l0 + i) = eq->l1_dma[i];
-
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-				if (eq_buf_cnt + 1 < ba_num) {
-					size = buf_chk_sz;
-				} else {
-					eqe_alloc = (buf_chk_sz / eq->eqe_size)
-						    * idx;
-					size = (eq->entries - eqe_alloc)
-						* eq->eqe_size;
-				}
-				eq->buf[idx] = dma_alloc_coherent(dev, size,
-								  &(eq->buf_dma[idx]),
-								  GFP_KERNEL);
-				if (!eq->buf[idx])
-					goto err_dma_alloc_buf;
-
-				*(eq->bt_l1[i] + j) = eq->buf_dma[idx];
-
-				eq_buf_cnt++;
-				if (eq_buf_cnt >= ba_num) {
-					eq_alloc_done = 1;
-					break;
-				}
-			}
-
-			if (eq_alloc_done)
-				break;
-		}
-		eq->cur_eqe_ba = eq->buf_dma[0];
-		if (ba_num > 1)
-			eq->nxt_eqe_ba = eq->buf_dma[1];
+	page_shift = PAGE_SHIFT + hr_dev->caps.eqe_buf_pg_sz;
+	mhop_num = hr_dev->caps.eqe_hop_num;
+	if (!mhop_num) {
+		max_size = 1 << page_shift;
+		buf->size = max_size;
+	} else if (mhop_num == HNS_ROCE_HOP_NUM_0) {
+		max_size = eq->entries * eq->eqe_size;
+		buf->size = max_size;
+	} else {
+		max_size = 1 << page_shift;
+		buf->size = PAGE_ALIGN(eq->entries * eq->eqe_size);
+		is_mhop = true;
 	}
 
-	eq->l0_last_num = i + 1;
-	if (mhop_num == 2)
-		eq->l1_last_num = j + 1;
+	ret = hns_roce_buf_alloc(hr_dev, buf->size, max_size, buf, page_shift);
+	if (ret) {
+		dev_err(hr_dev->dev, "alloc eq buf error\n");
+		return ret;
+	}
+
+	if (is_mhop) {
+		ret = map_eq_buf(hr_dev, eq, page_shift);
+		if (ret) {
+			dev_err(hr_dev->dev, "map roce buf error\n");
+			goto err_alloc;
+		}
+	}
 
 	return 0;
-
-err_dma_alloc_l1:
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	eq->bt_l0 = NULL;
-	eq->l0_dma = 0;
-	for (i -= 1; i >= 0; i--) {
-		dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-				  eq->l1_dma[i]);
-
-		for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-			idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
-					  eq->buf_dma[idx]);
-		}
-	}
-	goto err_dma_alloc_l0;
-
-err_dma_alloc_buf:
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	eq->bt_l0 = NULL;
-	eq->l0_dma = 0;
-
-	if (mhop_num == 1)
-		for (i -= 1; i >= 0; i--)
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
-					  eq->buf_dma[i]);
-	else if (mhop_num == 2) {
-		record_i = i;
-		record_j = j;
-		for (; i >= 0; i--) {
-			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-					  eq->l1_dma[i]);
-
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				if (i == record_i && j >= record_j)
-					break;
-
-				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-				dma_free_coherent(dev, buf_chk_sz,
-						  eq->buf[idx],
-						  eq->buf_dma[idx]);
-			}
-		}
-	}
-
-err_dma_alloc_l0:
-	kfree(eq->bt_l1);
-	eq->bt_l1 = NULL;
-
-err_kcalloc_bt_l1:
-	kfree(eq->l1_dma);
-	eq->l1_dma = NULL;
-
-err_kcalloc_l1_dma:
-	kfree(eq->buf);
-	eq->buf = NULL;
-
-err_kcalloc_buf:
-	kfree(eq->buf_dma);
-	eq->buf_dma = NULL;
-
-	return -ENOMEM;
+err_alloc:
+	hns_roce_buf_free(hr_dev, buf->size, buf);
+	return ret;
 }
 
 static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
 				 struct hns_roce_eq *eq,
 				 unsigned int eq_cmd)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_cmd_mailbox *mailbox;
-	u32 buf_chk_sz = 0;
 	int ret;
 
 	/* Allocate mailbox memory */
@@ -6081,38 +5842,17 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
 
-	if (!hr_dev->caps.eqe_hop_num) {
-		buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
-
-		eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list),
-				       GFP_KERNEL);
-		if (!eq->buf_list) {
-			ret = -ENOMEM;
-			goto free_cmd_mbox;
-		}
-
-		eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz,
-						       &(eq->buf_list->map),
-						       GFP_KERNEL);
-		if (!eq->buf_list->buf) {
-			ret = -ENOMEM;
-			goto err_alloc_buf;
-		}
-
-	} else {
-		ret = hns_roce_mhop_alloc_eq(hr_dev, eq);
-		if (ret) {
-			ret = -ENOMEM;
-			goto free_cmd_mbox;
-		}
+	ret = alloc_eq_buf(hr_dev, eq);
+	if (ret) {
+		ret = -ENOMEM;
+		goto free_cmd_mbox;
 	}
-
 	hns_roce_config_eqc(hr_dev, eq, mailbox->buf);
 
 	ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0,
 				eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS);
 	if (ret) {
-		dev_err(dev, "[mailbox cmd] create eqc failed.\n");
+		dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n");
 		goto err_cmd_mbox;
 	}
 
@@ -6121,16 +5861,7 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
 	return 0;
 
 err_cmd_mbox:
-	if (!hr_dev->caps.eqe_hop_num)
-		dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf,
-				  eq->buf_list->map);
-	else {
-		hns_roce_mhop_free_eq(hr_dev, eq);
-		goto free_cmd_mbox;
-	}
-
-err_alloc_buf:
-	kfree(eq->buf_list);
+	free_eq_buf(hr_dev, eq);
 
 free_cmd_mbox:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
@@ -6310,7 +6041,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 
 err_create_eq_fail:
 	for (i -= 1; i >= 0; i--)
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
+		free_eq_buf(hr_dev, &eq_table->eq[i]);
 	kfree(eq_table->eq);
 
 	return ret;
@@ -6332,7 +6063,7 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev)
 	for (i = 0; i < eq_num; i++) {
 		hns_roce_v2_destroy_eqc(hr_dev, i);
 
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
+		free_eq_buf(hr_dev, &eq_table->eq[i]);
 	}
 
 	kfree(eq_table->eq);

From 9a4b24108d92ffaa886e37923088bd806b988948 Mon Sep 17 00:00:00 2001
From: Shiraz Saleem <shiraz.saleem@intel.com>
Date: Tue, 4 Feb 2020 16:38:40 -0600
Subject: [PATCH 027/327] i40iw: Do an RCU lookup in i40iw_add_ipv4_addr

The in_dev_for_each_ifa_rtnl() iterator in i40iw_add_ipv4_addr requires
that the rtnl lock be held. But the rtnl_trylock/unlock scheme in this
function does not guarantee it.

Replace the rtnl locking with an RCU lookup using
in_dev_for_each_ifa_rcu()

Fixes: 8e06af711bf2 ("i40iw: add main, hdr, status")
Link: https://lore.kernel.org/r/20200204223840.2151-1-shiraz.saleem@intel.com
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/i40iw/i40iw_main.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 238614370927..84e1b52af15e 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1212,22 +1212,19 @@ static void i40iw_add_ipv4_addr(struct i40iw_device *iwdev)
 {
 	struct net_device *dev;
 	struct in_device *idev;
-	bool got_lock = true;
 	u32 ip_addr;
 
-	if (!rtnl_trylock())
-		got_lock = false;
-
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((((rdma_vlan_dev_vlan_id(dev) < 0xFFFF) &&
 		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
-		    (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+		    (dev == iwdev->netdev)) && (READ_ONCE(dev->flags) & IFF_UP)) {
 			const struct in_ifaddr *ifa;
 
-			idev = in_dev_get(dev);
+			idev = __in_dev_get_rcu(dev);
 			if (!idev)
 				continue;
-			in_dev_for_each_ifa_rtnl(ifa, idev) {
+			in_dev_for_each_ifa_rcu(ifa, idev) {
 				i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
 					    "IP=%pI4, vlan_id=%d, MAC=%pM\n", &ifa->ifa_address,
 					     rdma_vlan_dev_vlan_id(dev), dev->dev_addr);
@@ -1239,12 +1236,9 @@ static void i40iw_add_ipv4_addr(struct i40iw_device *iwdev)
 						       true,
 						       I40IW_ARP_ADD);
 			}
-
-			in_dev_put(idev);
 		}
 	}
-	if (got_lock)
-		rtnl_unlock();
+	rcu_read_unlock();
 }
 
 /**

From beb205dd67aaa4315dedf5c40b47c6e9dee5a469 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalheib1@gmail.com>
Date: Wed, 5 Feb 2020 10:13:54 +0200
Subject: [PATCH 028/327] RDMA/siw: Fix setting active_mtu attribute

Make sure to set the active_mtu attribute to avoid report the following
invalid value:

$ ibv_devinfo -d siw0 | grep active_mtu
			active_mtu:		invalid MTU (0)

Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
Link: https://lore.kernel.org/r/20200205081354.30438-1-kamalheib1@gmail.com
Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
Reviewed-by: Gal Pressman <galpress@amazon.com>
Reviewed-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_verbs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 07e30138aaa1..73485d0da907 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -168,12 +168,12 @@ int siw_query_port(struct ib_device *base_dev, u8 port,
 
 	memset(attr, 0, sizeof(*attr));
 
-	attr->active_mtu = attr->max_mtu;
 	attr->active_speed = 2;
 	attr->active_width = 2;
 	attr->gid_tbl_len = 1;
 	attr->max_msg_sz = -1;
 	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
 	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
 		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
 	attr->pkey_tbl_len = 1;

From ca750d4a9c426854e221ca404fbefeab9800134e Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Wed, 12 Feb 2020 09:26:27 +0200
Subject: [PATCH 029/327] RDMA/ucma: Mask QPN to be 24 bits according to IBTA

IBTA declares QPN as 24bits, mask input to ensure that kernel
doesn't get higher bits and ensure by adding WANR_ONCE() that
other CM users do the same.

Link: https://lore.kernel.org/r/20200212072635.682689-2-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c   | 3 +++
 drivers/infiniband/core/ucma.c | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 68cc1b2d6824..33c0d9e7bb66 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -2188,6 +2188,9 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id,
 	cm_id_priv->initiator_depth = param->initiator_depth;
 	cm_id_priv->responder_resources = param->responder_resources;
 	cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg));
+	WARN_ONCE(param->qp_num & 0xFF000000,
+		  "IBTA declares QPN to be 24 bits, but it is 0x%X\n",
+		  param->qp_num);
 	cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
 
 out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 0274e9b704be..57e68491a2fd 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1045,7 +1045,7 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id,
 	dst->retry_count = src->retry_count;
 	dst->rnr_retry_count = src->rnr_retry_count;
 	dst->srq = src->srq;
-	dst->qp_num = src->qp_num;
+	dst->qp_num = src->qp_num & 0xFFFFFF;
 	dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
 }
 

From f03d9fadfe13a78ee28fec320d43f7b37574adcb Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Wed, 12 Feb 2020 09:35:59 +0200
Subject: [PATCH 030/327] RDMA/core: Add weak ordering dma attr to dma mapping

For memory regions registered with IB_ACCESS_RELAXED_ORDERING will be dma
mapped with the DMA_ATTR_WEAK_ORDERING.

This will allow reads and writes to the mapping to be weakly ordered, such
change can enhance performance on some supporting architectures.

Link: https://lore.kernel.org/r/20200212073559.684139-1-leon@kernel.org
Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/umem.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 06b6125b5ae1..82455a1392f1 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -197,6 +197,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 	unsigned long lock_limit;
 	unsigned long new_pinned;
 	unsigned long cur_base;
+	unsigned long dma_attr = 0;
 	struct mm_struct *mm;
 	unsigned long npages;
 	int ret;
@@ -278,10 +279,12 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 
 	sg_mark_end(sg);
 
-	umem->nmap = ib_dma_map_sg(device,
-				   umem->sg_head.sgl,
-				   umem->sg_nents,
-				   DMA_BIDIRECTIONAL);
+	if (access & IB_ACCESS_RELAXED_ORDERING)
+		dma_attr |= DMA_ATTR_WEAK_ORDERING;
+
+	umem->nmap =
+		ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
+				    DMA_BIDIRECTIONAL, dma_attr);
 
 	if (!umem->nmap) {
 		ret = -ENOMEM;

From ffd541d45726341c1830ff595fd7352b6d1cfbcd Mon Sep 17 00:00:00 2001
From: Yixian Liu <liuyixian@huawei.com>
Date: Thu, 6 Feb 2020 17:56:44 +0800
Subject: [PATCH 031/327] RDMA/hns: Add the workqueue framework for flush cqe
 handler

HiP08 RoCE hardware lacks ability(a known hardware problem) to flush
outstanding WQEs if QP state gets into errored mode for some reason.  To
overcome this hardware problem and as a workaround, when QP is detected to
be in errored state during various legs like post send, post receive etc
[1], flush needs to be performed from the driver.

The earlier patch[1] sent to solve the hardware limitation explained in
the cover-letter had a bug in the software flushing leg. It acquired mutex
while modifying QP state to errored state and while conveying it to the
hardware using the mailbox. This caused leg to sleep while holding
spin-lock and caused crash.

Suggested Solution:
we have proposed to defer the flushing of the QP in the Errored state
using the workqueue to get around with the limitation of our hardware.

This patch adds the framework of the workqueue and the flush handler
function.

[1] https://patchwork.kernel.org/patch/10534271/

Link: https://lore.kernel.org/r/1580983005-13899-2-git-send-email-liuyixian@huawei.com
Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Reviewed-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h | 20 ++++++-----
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  |  3 +-
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 37 +++++++++++++++++++++
 3 files changed, 49 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 4db818e98af5..067e3ac6fcfa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -641,6 +641,15 @@ struct hns_roce_rinl_buf {
 	u32			 wqe_cnt;
 };
 
+struct hns_roce_work {
+	struct hns_roce_dev *hr_dev;
+	struct work_struct work;
+	u32 qpn;
+	u32 cqn;
+	int event_type;
+	int sub_type;
+};
+
 struct hns_roce_qp {
 	struct ib_qp		ibqp;
 	struct hns_roce_buf	hr_buf;
@@ -684,6 +693,7 @@ struct hns_roce_qp {
 	struct hns_roce_sge	sge;
 	u32			next_sge;
 
+	struct hns_roce_work	flush_work;
 	struct hns_roce_rinl_buf rq_inl_buf;
 	struct list_head	node;		/* all qps are on a list */
 	struct list_head	rq_node;	/* all recv qps are on a list */
@@ -900,15 +910,6 @@ struct hns_roce_caps {
 	u16		default_ceq_arm_st;
 };
 
-struct hns_roce_work {
-	struct hns_roce_dev *hr_dev;
-	struct work_struct work;
-	u32 qpn;
-	u32 cqn;
-	int event_type;
-	int sub_type;
-};
-
 struct hns_roce_dfx_hw {
 	int (*query_cqc_info)(struct hns_roce_dev *hr_dev, u32 cqn,
 			      int *buffer);
@@ -1231,6 +1232,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
 				 struct ib_udata *udata);
 int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		       int attr_mask, struct ib_udata *udata);
+void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
 void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n);
 void *get_send_wqe(struct hns_roce_qp *hr_qp, int n);
 void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 29a5ee0cf07e..f69541acb085 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -6023,8 +6023,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 		goto err_request_irq_fail;
 	}
 
-	hr_dev->irq_workq =
-		create_singlethread_workqueue("hns_roce_irq_workqueue");
+	hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "Create irq workqueue failed!\n");
 		ret = -ENOMEM;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index e13f16c59115..aeb91a75cae5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -43,6 +43,43 @@
 
 #define SQP_NUM				(2 * HNS_ROCE_MAX_PORTS)
 
+static void flush_work_handle(struct work_struct *work)
+{
+	struct hns_roce_work *flush_work = container_of(work,
+					struct hns_roce_work, work);
+	struct hns_roce_qp *hr_qp = container_of(flush_work,
+					struct hns_roce_qp, flush_work);
+	struct device *dev = flush_work->hr_dev->dev;
+	struct ib_qp_attr attr;
+	int attr_mask;
+	int ret;
+
+	attr_mask = IB_QP_STATE;
+	attr.qp_state = IB_QPS_ERR;
+
+	ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
+	if (ret)
+		dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
+			ret);
+
+	/*
+	 * make sure we signal QP destroy leg that flush QP was completed
+	 * so that it can safely proceed ahead now and destroy QP
+	 */
+	if (atomic_dec_and_test(&hr_qp->refcount))
+		complete(&hr_qp->free);
+}
+
+void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_work *flush_work = &hr_qp->flush_work;
+
+	flush_work->hr_dev = hr_dev;
+	INIT_WORK(&flush_work->work, flush_work_handle);
+	atomic_inc(&hr_qp->refcount);
+	queue_work(hr_dev->irq_workq, &flush_work->work);
+}
+
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 {
 	struct device *dev = hr_dev->dev;

From b53742865e9f09cbba4d9daa161760ec23f36aa4 Mon Sep 17 00:00:00 2001
From: Yixian Liu <liuyixian@huawei.com>
Date: Thu, 6 Feb 2020 17:56:45 +0800
Subject: [PATCH 032/327] RDMA/hns: Delayed flush cqe process with workqueue

HiP08 RoCE hardware lacks ability(a known hardware problem) to flush
outstanding WQEs if QP state gets into errored mode for some reason.  To
overcome this hardware problem and as a workaround, when QP is detected to
be in errored state during various legs like post send, post receive
etc[1], flush needs to be performed from the driver.

The earlier patch[1] sent to solve the hardware limitation explained in
the cover-letter had a bug in the software flushing leg. It acquired mutex
while modifying QP state to errored state and while conveying it to the
hardware using the mailbox. This caused leg to sleep while holding
spin-lock and caused crash.

Suggested Solution: we have proposed to defer the flushing of the QP in
the Errored state using the workqueue to get around with the limitation of
our hardware.

This patch specifically adds the calls to the flush handler from where
parts of the code like post_send/post_recv etc. when the QP state gets
into the errored mode.

[1] https://patchwork.kernel.org/patch/10534271/

Link: https://lore.kernel.org/r/1580983005-13899-3-git-send-email-liuyixian@huawei.com
Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Reviewed-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  6 ++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 99 +++++++++++----------
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 11 ++-
 3 files changed, 66 insertions(+), 50 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 067e3ac6fcfa..d0a83926dc8f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -641,6 +641,10 @@ struct hns_roce_rinl_buf {
 	u32			 wqe_cnt;
 };
 
+enum {
+	HNS_ROCE_FLUSH_FLAG = 0,
+};
+
 struct hns_roce_work {
 	struct hns_roce_dev *hr_dev;
 	struct work_struct work;
@@ -693,6 +697,8 @@ struct hns_roce_qp {
 	struct hns_roce_sge	sge;
 	u32			next_sge;
 
+	/* 0: flush needed, 1: unneeded */
+	unsigned long		flush_flag;
 	struct hns_roce_work	flush_work;
 	struct hns_roce_rinl_buf rq_inl_buf;
 	struct list_head	node;		/* all qps are on a list */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index f69541acb085..aa9d1792e0cd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -220,11 +220,6 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	return 0;
 }
 
-static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr,
-				 int attr_mask, enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state);
-
 static int check_send_valid(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp)
 {
@@ -261,7 +256,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_v2_db sq_db;
-	struct ib_qp_attr attr;
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
@@ -269,7 +263,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	int valid_num_sge;
 	void *wqe = NULL;
 	bool loopback;
-	int attr_mask;
 	u32 tmp_len;
 	u32 hr_op;
 	u8 *smac;
@@ -607,18 +600,19 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 
 		qp->next_sge = sge_idx;
 
-		if (qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&qp->ibqp, &attr, attr_mask,
-						    qp->state, IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&qp->sq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
-		}
+		/*
+		 * Hip08 hardware cannot flush the WQEs in SQ if the QP state
+		 * gets into errored mode. Hence, as a workaround to this
+		 * hardware limitation, driver needs to assist in flushing. But
+		 * the flushing operation uses mailbox to convey the QP state to
+		 * the hardware and which can sleep due to the mutex protection
+		 * around the mailbox calls. Hence, use the deferred flush for
+		 * now.
+		 */
+		if (qp->state == IB_QPS_ERR)
+			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
+					      &qp->flush_flag))
+				init_flush_work(hr_dev, qp);
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
@@ -646,10 +640,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct hns_roce_rinl_sge *sge_list;
 	struct device *dev = hr_dev->dev;
-	struct ib_qp_attr attr;
 	unsigned long flags;
 	void *wqe = NULL;
-	int attr_mask;
 	u32 wqe_idx;
 	int nreq;
 	int ret;
@@ -719,19 +711,19 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 
 		*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
 
-		if (hr_qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr,
-						    attr_mask, hr_qp->state,
-						    IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
-		}
+		/*
+		 * Hip08 hardware cannot flush the WQEs in RQ if the QP state
+		 * gets into errored mode. Hence, as a workaround to this
+		 * hardware limitation, driver needs to assist in flushing. But
+		 * the flushing operation uses mailbox to convey the QP state to
+		 * the hardware and which can sleep due to the mutex protection
+		 * around the mailbox calls. Hence, use the deferred flush for
+		 * now.
+		 */
+		if (hr_qp->state == IB_QPS_ERR)
+			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
+					      &hr_qp->flush_flag))
+				init_flush_work(hr_dev, hr_qp);
 	}
 	spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
 
@@ -3013,13 +3005,11 @@ static int hns_roce_v2_sw_poll_cq(struct hns_roce_cq *hr_cq, int num_entries,
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 	struct hns_roce_srq *srq = NULL;
-	struct hns_roce_dev *hr_dev;
 	struct hns_roce_v2_cqe *cqe;
 	struct hns_roce_qp *hr_qp;
 	struct hns_roce_wq *wq;
-	struct ib_qp_attr attr;
-	int attr_mask;
 	int is_send;
 	u16 wqe_ctr;
 	u32 opcode;
@@ -3043,7 +3033,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				V2_CQE_BYTE_16_LCL_QPN_S);
 
 	if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) {
-		hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 		hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
 		if (unlikely(!hr_qp)) {
 			dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n",
@@ -3053,6 +3042,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		*cur_qp = hr_qp;
 	}
 
+	hr_qp = *cur_qp;
 	wc->qp = &(*cur_qp)->ibqp;
 	wc->vendor_err = 0;
 
@@ -3137,14 +3127,24 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		break;
 	}
 
-	/* flush cqe if wc status is error, excluding flush error */
-	if ((wc->status != IB_WC_SUCCESS) &&
-	    (wc->status != IB_WC_WR_FLUSH_ERR)) {
-		attr_mask = IB_QP_STATE;
-		attr.qp_state = IB_QPS_ERR;
-		return hns_roce_v2_modify_qp(&(*cur_qp)->ibqp,
-					     &attr, attr_mask,
-					     (*cur_qp)->state, IB_QPS_ERR);
+	/*
+	 * Hip08 hardware cannot flush the WQEs in SQ/RQ if the QP state gets
+	 * into errored mode. Hence, as a workaround to this hardware
+	 * limitation, driver needs to assist in flushing. But the flushing
+	 * operation uses mailbox to convey the QP state to the hardware and
+	 * which can sleep due to the mutex protection around the mailbox calls.
+	 * Hence, use the deferred flush for now. Once wc error detected, the
+	 * flushing operation is needed.
+	 */
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR) {
+		dev_err(hr_dev->dev, "error cqe status is: 0x%x\n",
+			status & HNS_ROCE_V2_CQE_STATUS_MASK);
+
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag))
+			init_flush_work(hr_dev, hr_qp);
+
+		return 0;
 	}
 
 	if (wc->status == IB_WC_WR_FLUSH_ERR)
@@ -4735,6 +4735,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	struct hns_roce_v2_qp_context *context = ctx;
 	struct hns_roce_v2_qp_context *qpc_mask = ctx + 1;
 	struct device *dev = hr_dev->dev;
+	unsigned long sq_flag = 0;
+	unsigned long rq_flag = 0;
 	int ret;
 
 	/*
@@ -4752,6 +4754,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
 	/* When QP state is err, SQ and RQ WQE should be flushed */
 	if (new_state == IB_QPS_ERR) {
+		spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
+		spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
+		hr_qp->state = IB_QPS_ERR;
 		roce_set_field(context->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
@@ -4769,6 +4774,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
 		}
+		spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
+		spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
 	}
 
 	/* Configure the optional fields */
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index aeb91a75cae5..c52e1b00f30d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -57,10 +57,12 @@ static void flush_work_handle(struct work_struct *work)
 	attr_mask = IB_QP_STATE;
 	attr.qp_state = IB_QPS_ERR;
 
-	ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
-	if (ret)
-		dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
-			ret);
+	if (test_and_clear_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag)) {
+		ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
+		if (ret)
+			dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
+				ret);
+	}
 
 	/*
 	 * make sure we signal QP destroy leg that flush QP was completed
@@ -764,6 +766,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	spin_lock_init(&hr_qp->rq.lock);
 
 	hr_qp->state = IB_QPS_RESET;
+	hr_qp->flush_flag = 0;
 
 	hr_qp->ibqp.qp_type = init_attr->qp_type;
 

From b72bfc965eb5d3475acabb038a1f9f6034c4658d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 13 Feb 2020 15:19:11 -0400
Subject: [PATCH 033/327] RDMA/core: Get rid of ib_create_qp_user

This function accepts a udata but does nothing with it, and is never
passed a !NULL udata. Rename it to ib_create_qp which was the only caller
and remove the udata.

Link: https://lore.kernel.org/r/20200213191911.GA9898@ziepe.ca
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/verbs.c | 22 +++++++++++++++-------
 include/rdma/ib_verbs.h         | 31 ++-----------------------------
 2 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3ebae3b65c28..13ff4c8bfe5a 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1127,8 +1127,7 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
 EXPORT_SYMBOL(ib_open_qp);
 
 static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
-					struct ib_qp_init_attr *qp_init_attr,
-					struct ib_udata *udata)
+					struct ib_qp_init_attr *qp_init_attr)
 {
 	struct ib_qp *real_qp = qp;
 
@@ -1150,9 +1149,18 @@ static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
 	return qp;
 }
 
-struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
-				struct ib_qp_init_attr *qp_init_attr,
-				struct ib_udata *udata)
+/**
+ * ib_create_qp - Creates a kernel QP associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ *   QP.  If QP creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created QP.
+ *
+ * NOTE: for user qp use ib_create_qp_user with valid udata!
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
 {
 	struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
 	struct ib_qp *qp;
@@ -1197,7 +1205,7 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
 
 	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
 		struct ib_qp *xrc_qp =
-			create_xrc_qp_user(qp, qp_init_attr, udata);
+			create_xrc_qp_user(qp, qp_init_attr);
 
 		if (IS_ERR(xrc_qp)) {
 			ret = PTR_ERR(xrc_qp);
@@ -1253,7 +1261,7 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
 	return ERR_PTR(ret);
 
 }
-EXPORT_SYMBOL(ib_create_qp_user);
+EXPORT_SYMBOL(ib_create_qp);
 
 static const struct {
 	int			valid;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1f779fad3a1e..5f3a04ead9f5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3627,35 +3627,8 @@ static inline int ib_post_srq_recv(struct ib_srq *srq,
 					      bad_recv_wr ? : &dummy);
 }
 
-/**
- * ib_create_qp_user - Creates a QP associated with the specified protection
- *   domain.
- * @pd: The protection domain associated with the QP.
- * @qp_init_attr: A list of initial attributes required to create the
- *   QP.  If QP creation succeeds, then the attributes are updated to
- *   the actual capabilities of the created QP.
- * @udata: Valid user data or NULL for kernel objects
- */
-struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
-				struct ib_qp_init_attr *qp_init_attr,
-				struct ib_udata *udata);
-
-/**
- * ib_create_qp - Creates a kernel QP associated with the specified protection
- *   domain.
- * @pd: The protection domain associated with the QP.
- * @qp_init_attr: A list of initial attributes required to create the
- *   QP.  If QP creation succeeds, then the attributes are updated to
- *   the actual capabilities of the created QP.
- * @udata: Valid user data or NULL for kernel objects
- *
- * NOTE: for user qp use ib_create_qp_user with valid udata!
- */
-static inline struct ib_qp *ib_create_qp(struct ib_pd *pd,
-					 struct ib_qp_init_attr *qp_init_attr)
-{
-	return ib_create_qp_user(pd, qp_init_attr, NULL);
-}
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr);
 
 /**
  * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.

From 167b95ec887029c5fe0fb0f601fb209323f498fb Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 18 Feb 2020 19:17:00 +0000
Subject: [PATCH 034/327] RDMA/ucma: Use refcount_t for the ctx->ref

Don't use an atomic as a refcount.

Link: https://lore.kernel.org/r/20200218191657.GA29724@ziepe.ca
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/ucma.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 57e68491a2fd..66ad29c672fc 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -85,7 +85,7 @@ struct ucma_file {
 struct ucma_context {
 	u32			id;
 	struct completion	comp;
-	atomic_t		ref;
+	refcount_t		ref;
 	int			events_reported;
 	int			backlog;
 
@@ -152,7 +152,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 		if (ctx->closing)
 			ctx = ERR_PTR(-EIO);
 		else
-			atomic_inc(&ctx->ref);
+			refcount_inc(&ctx->ref);
 	}
 	xa_unlock(&ctx_table);
 	return ctx;
@@ -160,7 +160,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
 
 static void ucma_put_ctx(struct ucma_context *ctx)
 {
-	if (atomic_dec_and_test(&ctx->ref))
+	if (refcount_dec_and_test(&ctx->ref))
 		complete(&ctx->comp);
 }
 
@@ -212,7 +212,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 		return NULL;
 
 	INIT_WORK(&ctx->close_work, ucma_close_id);
-	atomic_set(&ctx->ref, 1);
+	refcount_set(&ctx->ref, 1);
 	init_completion(&ctx->comp);
 	INIT_LIST_HEAD(&ctx->mc_list);
 	ctx->file = file;
@@ -1502,7 +1502,7 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
 		mc = ERR_PTR(-ENOENT);
 	else if (mc->ctx->file != file)
 		mc = ERR_PTR(-EINVAL);
-	else if (!atomic_inc_not_zero(&mc->ctx->ref))
+	else if (!refcount_inc_not_zero(&mc->ctx->ref))
 		mc = ERR_PTR(-ENXIO);
 	else
 		__xa_erase(&multicast_table, mc->id);

From 779820c2e1e9251ddfdce5dd43b0bba30cd22271 Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Wed, 19 Feb 2020 02:19:53 -0800
Subject: [PATCH 035/327] RDMA/core: Add helper function to retrieve driver gid
 context from gid attr

Adding a helper function to retrieve the driver gid context from the gid
attr.

Link: https://lore.kernel.org/r/1582107594-5180-2-git-send-email-selvin.xavier@broadcom.com
Suggested-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cache.c | 17 +++++++++++++++++
 include/rdma/ib_cache.h         |  1 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 17bfedd24cc3..9bbdace3c809 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -972,6 +972,23 @@ int rdma_query_gid(struct ib_device *device, u8 port_num,
 }
 EXPORT_SYMBOL(rdma_query_gid);
 
+/**
+ * rdma_read_gid_hw_context - Read the HW GID context from GID attribute
+ * @attr:		Potinter to the GID attribute
+ *
+ * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding
+ * to the SGID attr. Callers are required to already be holding the reference
+ * to an existing GID entry.
+ *
+ * Returns the HW GID context
+ *
+ */
+void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr)
+{
+	return container_of(attr, struct ib_gid_table_entry, attr)->context;
+}
+EXPORT_SYMBOL(rdma_read_gid_hw_context);
+
 /**
  * rdma_find_gid - Returns SGID attributes if the matching GID is found.
  * @device: The device to query.
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index 870b5e6c06db..e06d13388ae7 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -39,6 +39,7 @@
 
 int rdma_query_gid(struct ib_device *device, u8 port_num, int index,
 		   union ib_gid *gid);
+void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr);
 const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
 					const union ib_gid *gid,
 					enum ib_gid_type gid_type,

From 0a01623b74d41d91a595cbeb29e1a03648aec087 Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Wed, 19 Feb 2020 02:19:54 -0800
Subject: [PATCH 036/327] RDMA/bnxt_re: Use rdma_read_gid_hw_context to
 retrieve HW gid index

bnxt_re HW maintains a GID table with only a single entry for the two
duplicate GID entries (v1 and v2). Driver needs to map stack gid index to
the HW table gid index.  Use the new API rdma_read_gid_hw_context () to
retrieve the HW GID context to get the HW table index.

Link: https://lore.kernel.org/r/1582107594-5180-3-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 52b6a4d85460..18579e8d630c 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -639,6 +639,7 @@ int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
 	struct bnxt_re_dev *rdev = pd->rdev;
 	const struct ib_gid_attr *sgid_attr;
+	struct bnxt_re_gid_ctx *ctx;
 	struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
 	u8 nw_type;
 	int rc;
@@ -654,19 +655,18 @@ int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
 	/* Supply the configuration for the HW */
 	memcpy(ah->qplib_ah.dgid.data, grh->dgid.raw,
 	       sizeof(union ib_gid));
-	/*
-	 * If RoCE V2 is enabled, stack will have two entries for
-	 * each GID entry. Avoiding this duplicte entry in HW. Dividing
-	 * the GID index by 2 for RoCE V2
+	sgid_attr = grh->sgid_attr;
+	/* Get the HW context of the GID. The reference
+	 * of GID table entry is already taken by the caller.
 	 */
-	ah->qplib_ah.sgid_index = grh->sgid_index / 2;
+	ctx = rdma_read_gid_hw_context(sgid_attr);
+	ah->qplib_ah.sgid_index = ctx->idx;
 	ah->qplib_ah.host_sgid_index = grh->sgid_index;
 	ah->qplib_ah.traffic_class = grh->traffic_class;
 	ah->qplib_ah.flow_label = grh->flow_label;
 	ah->qplib_ah.hop_limit = grh->hop_limit;
 	ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr);
 
-	sgid_attr = grh->sgid_attr;
 	/* Get network header type for this GID */
 	nw_type = rdma_gid_attr_network_type(sgid_attr);
 	ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type);
@@ -1593,6 +1593,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		const struct ib_global_route *grh =
 			rdma_ah_read_grh(&qp_attr->ah_attr);
 		const struct ib_gid_attr *sgid_attr;
+		struct bnxt_re_gid_ctx *ctx;
 
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_DGID |
 				     CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL |
@@ -1604,11 +1605,12 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		memcpy(qp->qplib_qp.ah.dgid.data, grh->dgid.raw,
 		       sizeof(qp->qplib_qp.ah.dgid.data));
 		qp->qplib_qp.ah.flow_label = grh->flow_label;
-		/* If RoCE V2 is enabled, stack will have two entries for
-		 * each GID entry. Avoiding this duplicte entry in HW. Dividing
-		 * the GID index by 2 for RoCE V2
+		sgid_attr = grh->sgid_attr;
+		/* Get the HW context of the GID. The reference
+		 * of GID table entry is already taken by the caller.
 		 */
-		qp->qplib_qp.ah.sgid_index = grh->sgid_index / 2;
+		ctx = rdma_read_gid_hw_context(sgid_attr);
+		qp->qplib_qp.ah.sgid_index = ctx->idx;
 		qp->qplib_qp.ah.host_sgid_index = grh->sgid_index;
 		qp->qplib_qp.ah.hop_limit = grh->hop_limit;
 		qp->qplib_qp.ah.traffic_class = grh->traffic_class;
@@ -1616,7 +1618,6 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		ether_addr_copy(qp->qplib_qp.ah.dmac,
 				qp_attr->ah_attr.roce.dmac);
 
-		sgid_attr = qp_attr->ah_attr.grh.sgid_attr;
 		rc = rdma_read_gid_l2_fields(sgid_attr, NULL,
 					     &qp->qplib_qp.smac[0]);
 		if (rc)

From fb3063d31995cc4cf1d47a406bb61d6fb1b1d58d Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Mon, 17 Feb 2020 12:57:14 -0800
Subject: [PATCH 037/327] RDMA/rxe: Fix configuration of atomic queue pair
 attributes

From the comment above the definition of the roundup_pow_of_two() macro:

     The result is undefined when n == 0.

Hence only pass positive values to roundup_pow_of_two(). This patch fixes
the following UBSAN complaint:

  UBSAN: Undefined behaviour in ./include/linux/log2.h:57:13
  shift exponent 64 is too large for 64-bit type 'long unsigned int'
  Call Trace:
   dump_stack+0xa5/0xe6
   ubsan_epilogue+0x9/0x26
   __ubsan_handle_shift_out_of_bounds.cold+0x4c/0xf9
   rxe_qp_from_attr.cold+0x37/0x5d [rdma_rxe]
   rxe_modify_qp+0x59/0x70 [rdma_rxe]
   _ib_modify_qp+0x5aa/0x7c0 [ib_core]
   ib_modify_qp+0x3b/0x50 [ib_core]
   cma_modify_qp_rtr+0x234/0x260 [rdma_cm]
   __rdma_accept+0x1a7/0x650 [rdma_cm]
   nvmet_rdma_cm_handler+0x1286/0x14cd [nvmet_rdma]
   cma_cm_event_handler+0x6b/0x330 [rdma_cm]
   cma_ib_req_handler+0xe60/0x22d0 [rdma_cm]
   cm_process_work+0x30/0x140 [ib_cm]
   cm_req_handler+0x11f4/0x1cd0 [ib_cm]
   cm_work_handler+0xb8/0x344e [ib_cm]
   process_one_work+0x569/0xb60
   worker_thread+0x7a/0x5d0
   kthread+0x1e6/0x210
   ret_from_fork+0x24/0x30

Link: https://lore.kernel.org/r/20200217205714.26937-1-bvanassche@acm.org
Fixes: 8700e3e7c485 ("Soft RoCE driver")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rxe/rxe_qp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index ec21f616ac98..6c11c3aeeca6 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -590,15 +590,16 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
 	int err;
 
 	if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
-		int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic);
+		int max_rd_atomic = attr->max_rd_atomic ?
+			roundup_pow_of_two(attr->max_rd_atomic) : 0;
 
 		qp->attr.max_rd_atomic = max_rd_atomic;
 		atomic_set(&qp->req.rd_atomic, max_rd_atomic);
 	}
 
 	if (mask & IB_QP_MAX_DEST_RD_ATOMIC) {
-		int max_dest_rd_atomic =
-			__roundup_pow_of_two(attr->max_dest_rd_atomic);
+		int max_dest_rd_atomic = attr->max_dest_rd_atomic ?
+			roundup_pow_of_two(attr->max_dest_rd_atomic) : 0;
 
 		qp->attr.max_dest_rd_atomic = max_dest_rd_atomic;
 

From 0afa5ca82212247456f9de1468b595a111fee633 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 19 Feb 2020 17:17:34 -0600
Subject: [PATCH 038/327] proc: Rename in proc_inode rename sysctl_inodes
 sibling_inodes

I about to need and use the same functionality for pid based
inodes and there is no point in adding a second field when
this field is already here and serving the same purporse.

Just give the field a generic name so it is clear that
it is no longer sysctl specific.

Also for good measure initialize sibling_inodes when
proc_inode is initialized.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/inode.c       | 1 +
 fs/proc/internal.h    | 2 +-
 fs/proc/proc_sysctl.c | 8 ++++----
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6da18316d209..bdae442d5262 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -68,6 +68,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei->pde = NULL;
 	ei->sysctl = NULL;
 	ei->sysctl_entry = NULL;
+	INIT_HLIST_NODE(&ei->sibling_inodes);
 	ei->ns_ops = NULL;
 	return &ei->vfs_inode;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 41587276798e..366cd3aa690b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -91,7 +91,7 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
-	struct hlist_node sysctl_inodes;
+	struct hlist_node sibling_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
 } __randomize_layout;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index c75bb4632ed1..42fbb7f3c587 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -279,9 +279,9 @@ static void proc_sys_prune_dcache(struct ctl_table_header *head)
 		node = hlist_first_rcu(&head->inodes);
 		if (!node)
 			break;
-		ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
+		ei = hlist_entry(node, struct proc_inode, sibling_inodes);
 		spin_lock(&sysctl_lock);
-		hlist_del_init_rcu(&ei->sysctl_inodes);
+		hlist_del_init_rcu(&ei->sibling_inodes);
 		spin_unlock(&sysctl_lock);
 
 		inode = &ei->vfs_inode;
@@ -483,7 +483,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	}
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
-	hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
+	hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes);
 	head->count++;
 	spin_unlock(&sysctl_lock);
 
@@ -514,7 +514,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
 {
 	spin_lock(&sysctl_lock);
-	hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
+	hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes);
 	if (!--head->count)
 		kfree_rcu(head, rcu);
 	spin_unlock(&sysctl_lock);

From 26dbc60f385ff9cff475ea2a3bad02e80fd6fa43 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 20 Feb 2020 08:34:44 -0600
Subject: [PATCH 039/327] proc: Generalize proc_sys_prune_dcache into
 proc_prune_siblings_dcache

This prepares the way for allowing the pid part of proc to use this
dcache pruning code as well.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/inode.c       | 38 ++++++++++++++++++++++++++++++++++++++
 fs/proc/internal.h    |  1 +
 fs/proc/proc_sysctl.c | 35 +----------------------------------
 3 files changed, 40 insertions(+), 34 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bdae442d5262..74ce4a8d05eb 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -103,6 +103,44 @@ void __init proc_init_kmemcache(void)
 	BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
 }
 
+void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+{
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct hlist_node *node;
+	struct super_block *sb;
+
+	rcu_read_lock();
+	for (;;) {
+		node = hlist_first_rcu(inodes);
+		if (!node)
+			break;
+		ei = hlist_entry(node, struct proc_inode, sibling_inodes);
+		spin_lock(lock);
+		hlist_del_init_rcu(&ei->sibling_inodes);
+		spin_unlock(lock);
+
+		inode = &ei->vfs_inode;
+		sb = inode->i_sb;
+		if (!atomic_inc_not_zero(&sb->s_active))
+			continue;
+		inode = igrab(inode);
+		rcu_read_unlock();
+		if (unlikely(!inode)) {
+			deactivate_super(sb);
+			rcu_read_lock();
+			continue;
+		}
+
+		d_prune_aliases(inode);
+		iput(inode);
+		deactivate_super(sb);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
 static int proc_show_options(struct seq_file *seq, struct dentry *root)
 {
 	struct super_block *sb = root->d_sb;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 366cd3aa690b..ba9a991824a5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -210,6 +210,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
 extern const struct super_operations proc_sops;
 
 void proc_init_kmemcache(void);
+void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
 void set_proc_pid_nlink(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 42fbb7f3c587..5da9d7f7ae34 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -269,40 +269,7 @@ static void unuse_table(struct ctl_table_header *p)
 
 static void proc_sys_prune_dcache(struct ctl_table_header *head)
 {
-	struct inode *inode;
-	struct proc_inode *ei;
-	struct hlist_node *node;
-	struct super_block *sb;
-
-	rcu_read_lock();
-	for (;;) {
-		node = hlist_first_rcu(&head->inodes);
-		if (!node)
-			break;
-		ei = hlist_entry(node, struct proc_inode, sibling_inodes);
-		spin_lock(&sysctl_lock);
-		hlist_del_init_rcu(&ei->sibling_inodes);
-		spin_unlock(&sysctl_lock);
-
-		inode = &ei->vfs_inode;
-		sb = inode->i_sb;
-		if (!atomic_inc_not_zero(&sb->s_active))
-			continue;
-		inode = igrab(inode);
-		rcu_read_unlock();
-		if (unlikely(!inode)) {
-			deactivate_super(sb);
-			rcu_read_lock();
-			continue;
-		}
-
-		d_prune_aliases(inode);
-		iput(inode);
-		deactivate_super(sb);
-
-		rcu_read_lock();
-	}
-	rcu_read_unlock();
+	proc_prune_siblings_dcache(&head->inodes, &sysctl_lock);
 }
 
 /* called under sysctl_lock, will reacquire if has to wait */

From 8d8d2b76ac3545cf6c0e9c6aa61700db98d2496a Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 14 Feb 2020 00:33:38 +0000
Subject: [PATCH 040/327] RDMA/hns: fix spelling mistake: "attatch" -> "attach"

There is a spelling mistake in a dev_err error message. Fix it.

Link: https://lore.kernel.org/r/20200214003338.6573-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index aa9d1792e0cd..66c1cfeba768 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5783,7 +5783,7 @@ static int map_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
 			  page_shift);
 	ret = hns_roce_mtr_attach(hr_dev, &eq->mtr, &buf_list, &region, 1);
 	if (ret)
-		dev_err(hr_dev->dev, "mtr attatch error for eqe\n");
+		dev_err(hr_dev->dev, "mtr attach error for eqe\n");
 
 	goto done;
 

From 52c5e9e7497b728b53a84cbd5873c4b707d10d55 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Thu, 20 Feb 2020 09:34:31 +0800
Subject: [PATCH 041/327] RDMA/hns: Initialize all fields of doorbells to zero

Prevent uninitialized fields when new fields are added, and make code look
simpler.

Link: https://lore.kernel.org/r/1582162471-50361-1-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 9 ++-------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +----
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index c6e66586e533..89dac44b3cef 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -69,7 +69,7 @@ static int hns_roce_v1_post_send(struct ib_qp *ibqp,
 	struct hns_roce_wqe_data_seg *dseg = NULL;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_sq_db sq_db;
+	struct hns_roce_sq_db sq_db = {};
 	int ps_opcode = 0, i = 0;
 	unsigned long flags = 0;
 	void *wqe = NULL;
@@ -318,8 +318,6 @@ static int hns_roce_v1_post_send(struct ib_qp *ibqp,
 		/* Memory barrier */
 		wmb();
 
-		sq_db.u32_4 = 0;
-		sq_db.u32_8 = 0;
 		roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SQ_HEAD_M,
 			       SQ_DOORBELL_U32_4_SQ_HEAD_S,
 			      (qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)));
@@ -351,7 +349,7 @@ static int hns_roce_v1_post_recv(struct ib_qp *ibqp,
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_rq_db rq_db;
+	struct hns_roce_rq_db rq_db = {};
 	__le32 doorbell[2] = {0};
 	unsigned long flags = 0;
 	unsigned int wqe_idx;
@@ -418,9 +416,6 @@ static int hns_roce_v1_post_recv(struct ib_qp *ibqp,
 				   ROCEE_QP1C_CFG3_0_REG +
 				   QP1C_CFGN_OFFSET * hr_qp->phy_port, reg_val);
 		} else {
-			rq_db.u32_4 = 0;
-			rq_db.u32_8 = 0;
-
 			roce_set_field(rq_db.u32_4, RQ_DOORBELL_U32_4_RQ_HEAD_M,
 				       RQ_DOORBELL_U32_4_RQ_HEAD_S,
 				       hr_qp->rq.head);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 66c1cfeba768..dee1cc8ffb42 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -255,7 +255,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_v2_db sq_db;
+	struct hns_roce_v2_db sq_db = {};
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
@@ -583,9 +583,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 		/* Memory barrier */
 		wmb();
 
-		sq_db.byte_4 = 0;
-		sq_db.parameter = 0;
-
 		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
 			       V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
 		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,

From 5b361328ca649534d721e4eae20c96ccbe702ce7 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 12 Feb 2020 19:04:25 -0600
Subject: [PATCH 042/327] RDMA: Replace zero-length array with flexible-array
 member

The current codebase makes use of the zero-length array language
extension to the C90 standard, but the preferred mechanism to declare
variable-length types such as these ones is a flexible array member[1][2],
introduced in C99:

struct foo {
        int stuff;
        struct boo array[];
};

By making use of the mechanism above, we will get a compiler warning
in case the flexible array does not occur last in the structure, which
will help us prevent some kind of undefined behavior bugs from being
inadvertently introduced[3] to the codebase from now on.

Also, notice that, dynamic memory allocations won't be affected by
this change:

"Flexible array members have incomplete type, and so the sizeof operator
may not be applied. As a quirk of the original implementation of
zero-length arrays, sizeof evaluates to zero."[1]

This issue was found with the help of Coccinelle.

[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://github.com/KSPP/linux/issues/21
[3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour")

Link: https://lore.kernel.org/r/20200213010425.GA13068@embeddedor.com
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> # added a few more
---
 drivers/infiniband/core/cache.c                |  2 +-
 drivers/infiniband/core/cm.c                   |  4 ++--
 drivers/infiniband/core/mad_priv.h             |  4 ++--
 drivers/infiniband/core/multicast.c            |  2 +-
 drivers/infiniband/core/sa_query.c             |  2 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h         |  4 ++--
 drivers/infiniband/hw/cxgb4/t4fw_ri_api.h      |  8 ++++----
 drivers/infiniband/hw/hfi1/mad.c               |  4 ++--
 drivers/infiniband/hw/hfi1/mad.h               |  2 +-
 drivers/infiniband/hw/hfi1/pio.h               |  4 ++--
 drivers/infiniband/hw/hfi1/sdma.c              |  2 +-
 drivers/infiniband/hw/hfi1/sdma.h              |  4 ++--
 drivers/infiniband/hw/hfi1/user_exp_rcv.h      |  2 +-
 drivers/infiniband/hw/i40iw/i40iw_cm.h         |  4 ++--
 drivers/infiniband/hw/mthca/mthca_memfree.c    |  2 +-
 drivers/infiniband/hw/mthca/mthca_memfree.h    |  2 +-
 drivers/infiniband/hw/usnic/usnic_uiom.h       |  2 +-
 drivers/infiniband/sw/rxe/rxe_queue.h          |  2 +-
 .../infiniband/ulp/opa_vnic/opa_vnic_encap.h   |  4 ++--
 drivers/infiniband/ulp/srp/ib_srp.h            |  2 +-
 include/rdma/ib_fmr_pool.h                     |  2 +-
 include/rdma/ib_verbs.h                        | 18 +++++++++---------
 include/rdma/opa_vnic.h                        |  2 +-
 include/rdma/rdmavt_mr.h                       |  2 +-
 include/rdma/rdmavt_qp.h                       |  2 +-
 25 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9bbdace3c809..717b798cddad 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -46,7 +46,7 @@
 
 struct ib_pkey_cache {
 	int             table_len;
-	u16             table[0];
+	u16             table[];
 };
 
 struct ib_update_work {
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 33c0d9e7bb66..aec6867f0ed2 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -197,7 +197,7 @@ struct cm_device {
 	struct ib_device *ib_device;
 	u8 ack_delay;
 	int going_down;
-	struct cm_port *port[0];
+	struct cm_port *port[];
 };
 
 struct cm_av {
@@ -216,7 +216,7 @@ struct cm_work {
 	__be32 local_id;			/* Established / timewait */
 	__be32 remote_id;
 	struct ib_cm_event cm_event;
-	struct sa_path_rec path[0];
+	struct sa_path_rec path[];
 };
 
 struct cm_timewait_info {
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 956b3a7dfed7..403d8673a2f9 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -79,13 +79,13 @@ struct ib_mad_private {
 	struct ib_mad_private_header header;
 	size_t mad_size;
 	struct ib_grh grh;
-	u8 mad[0];
+	u8 mad[];
 } __packed;
 
 struct ib_rmpp_segment {
 	struct list_head list;
 	u32 num;
-	u8 data[0];
+	u8 data[];
 };
 
 struct ib_mad_agent_private {
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index cd338ddc4a39..9c2d8b7f1af9 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -71,7 +71,7 @@ struct mcast_device {
 	struct ib_event_handler	event_handler;
 	int			start_port;
 	int			end_port;
-	struct mcast_port	port[0];
+	struct mcast_port	port[];
 };
 
 enum mcast_state {
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 30d4c126a2db..74e0058fcf9e 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -101,7 +101,7 @@ struct ib_sa_port {
 struct ib_sa_device {
 	int                     start_port, end_port;
 	struct ib_event_handler event_handler;
-	struct ib_sa_port port[0];
+	struct ib_sa_port port[];
 };
 
 struct ib_sa_query {
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f8d49a..e8e11bd95e42 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -707,7 +707,7 @@ struct mpa_message {
 	u8 flags;
 	u8 revision;
 	__be16 private_data_size;
-	u8 private_data[0];
+	u8 private_data[];
 };
 
 struct mpa_v2_conn_params {
@@ -719,7 +719,7 @@ struct terminate_message {
 	u8 layer_etype;
 	u8 ecode;
 	__be16 hdrct_rsvd;
-	u8 len_hdrs[0];
+	u8 len_hdrs[];
 };
 
 #define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
index cbdb300a4794..a2f5e29ef226 100644
--- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
+++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
@@ -123,7 +123,7 @@ struct fw_ri_dsgl {
 	__be32	len0;
 	__be64	addr0;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_dsge_pair sge[0];
+	struct fw_ri_dsge_pair sge[];
 #endif
 };
 
@@ -139,7 +139,7 @@ struct fw_ri_isgl {
 	__be16	nsge;
 	__be32	r2;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_sge sge[0];
+	struct fw_ri_sge sge[];
 #endif
 };
 
@@ -149,7 +149,7 @@ struct fw_ri_immd {
 	__be16	r2;
 	__be32	immdlen;
 #ifndef C99_NOT_SUPPORTED
-	__u8	data[0];
+	__u8	data[];
 #endif
 };
 
@@ -321,7 +321,7 @@ struct fw_ri_res_wr {
 	__be32 len16_pkd;
 	__u64  cookie;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_res res[0];
+	struct fw_ri_res res[];
 #endif
 };
 
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index a51bcd2b4391..7073f237a949 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -2381,7 +2381,7 @@ struct opa_port_status_rsp {
 		__be64 port_vl_rcv_bubble;
 		__be64 port_vl_mark_fecn;
 		__be64 port_vl_xmit_discards;
-	} vls[0]; /* real array size defined by # bits set in vl_select_mask */
+	} vls[]; /* real array size defined by # bits set in vl_select_mask */
 };
 
 enum counter_selects {
@@ -2423,7 +2423,7 @@ struct opa_aggregate {
 	__be16 attr_id;
 	__be16 err_reqlength;	/* 1 bit, 8 res, 7 bit */
 	__be32 attr_mod;
-	u8 data[0];
+	u8 data[];
 };
 
 #define MSK_LLI 0x000000f0
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
index 2f48e6953629..889e63d3f2cc 100644
--- a/drivers/infiniband/hw/hfi1/mad.h
+++ b/drivers/infiniband/hw/hfi1/mad.h
@@ -165,7 +165,7 @@ struct opa_mad_notice_attr {
 		} __packed ntc_2048;
 
 	};
-	u8	class_data[0];
+	u8	class_data[];
 };
 
 #define IB_VLARB_LOWPRI_0_31    1
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index c9a58b642bdd..0102262343c0 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -243,7 +243,7 @@ struct sc_config_sizes {
  */
 struct pio_map_elem {
 	u32 mask;
-	struct send_context *ksc[0];
+	struct send_context *ksc[];
 };
 
 /*
@@ -263,7 +263,7 @@ struct pio_vl_map {
 	u32 mask;
 	u8 actual_vls;
 	u8 vls;
-	struct pio_map_elem *map[0];
+	struct pio_map_elem *map[];
 };
 
 int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index a51525647ac8..c93ea021cf49 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -833,7 +833,7 @@ struct sdma_engine *sdma_select_engine_sc(
 struct sdma_rht_map_elem {
 	u32 mask;
 	u8 ctr;
-	struct sdma_engine *sde[0];
+	struct sdma_engine *sde[];
 };
 
 struct sdma_rht_node {
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 1e2e40f79cb2..7a851191f987 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -1002,7 +1002,7 @@ void sdma_engine_interrupt(struct sdma_engine *sde, u64 status);
  */
 struct sdma_map_elem {
 	u32 mask;
-	struct sdma_engine *sde[0];
+	struct sdma_engine *sde[];
 };
 
 /**
@@ -1024,7 +1024,7 @@ struct sdma_vl_map {
 	u32 mask;
 	u8 actual_vls;
 	u8 vls;
-	struct sdma_map_elem *map[0];
+	struct sdma_map_elem *map[];
 };
 
 int sdma_map_init(
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 6257eee083a1..332abb446861 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -73,7 +73,7 @@ struct tid_rb_node {
 	dma_addr_t dma_addr;
 	bool freed;
 	unsigned int npages;
-	struct page *pages[0];
+	struct page *pages[];
 };
 
 static inline int num_user_pages(unsigned long addr,
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h
index 66dc1ba03389..6e43e4d730f4 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h
@@ -85,7 +85,7 @@ struct ietf_mpa_v1 {
 	u8 flags;
 	u8 rev;
 	__be16 priv_data_len;
-	u8 priv_data[0];
+	u8 priv_data[];
 };
 
 #define ietf_mpa_req_resp_frame ietf_mpa_frame
@@ -101,7 +101,7 @@ struct ietf_mpa_v2 {
 	u8 rev;
 	__be16 priv_data_len;
 	struct ietf_rtr_msg rtr_msg;
-	u8 priv_data[0];
+	u8 priv_data[];
 };
 
 struct i40iw_cm_node;
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 78a48aea3faf..fa808582b08b 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -58,7 +58,7 @@ struct mthca_user_db_table {
 		u64                uvirt;
 		struct scatterlist mem;
 		int                refcount;
-	}                page[0];
+	} page[];
 };
 
 static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
index da9b8f9b884f..f9a2e65e2ff5 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.h
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -68,7 +68,7 @@ struct mthca_icm_table {
 	int               lowmem;
 	int               coherent;
 	struct mutex      mutex;
-	struct mthca_icm *icm[0];
+	struct mthca_icm *icm[];
 };
 
 struct mthca_icm_iter {
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 70be49b1ca05..7ec8991ace67 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -77,7 +77,7 @@ struct usnic_uiom_reg {
 struct usnic_uiom_chunk {
 	struct list_head		list;
 	int				nents;
-	struct scatterlist		page_list[0];
+	struct scatterlist		page_list[];
 };
 
 struct usnic_uiom_pd *usnic_uiom_alloc_pd(void);
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
index acd0a925481c..8ef17d617022 100644
--- a/drivers/infiniband/sw/rxe/rxe_queue.h
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -63,7 +63,7 @@ struct rxe_queue_buf {
 	__u32			pad_2[31];
 	__u32			consumer_index;
 	__u32			pad_3[31];
-	__u8			data[0];
+	__u8			data[];
 };
 
 struct rxe_queue {
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
index 4480092c68e0..0b3570dc606d 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
@@ -258,7 +258,7 @@ struct opa_veswport_mactable {
 	__be16                              offset;
 	__be16                              num_entries;
 	__be32                              mac_tbl_digest;
-	struct opa_veswport_mactable_entry  tbl_entries[0];
+	struct opa_veswport_mactable_entry  tbl_entries[];
 } __packed;
 
 /**
@@ -440,7 +440,7 @@ struct opa_veswport_iface_macs {
 	__be16 num_macs_in_msg;
 	__be16 tot_macs_in_lst;
 	__be16 gen_count;
-	struct opa_vnic_iface_mac_entry entry[0];
+	struct opa_vnic_iface_mac_entry entry[];
 } __packed;
 
 /**
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 5359ece561ca..6fabcc2faf1f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -309,7 +309,7 @@ struct srp_fr_pool {
 	int			max_page_list_len;
 	spinlock_t		lock;
 	struct list_head	free_list;
-	struct srp_fr_desc	desc[0];
+	struct srp_fr_desc	desc[];
 };
 
 /**
diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h
index f8982e4e9702..2fd9bfb6d648 100644
--- a/include/rdma/ib_fmr_pool.h
+++ b/include/rdma/ib_fmr_pool.h
@@ -73,7 +73,7 @@ struct ib_pool_fmr {
 	int                 remap_count;
 	u64                 io_virtual_address;
 	int                 page_list_len;
-	u64                 page_list[0];
+	u64                 page_list[];
 };
 
 struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 5f3a04ead9f5..bbc5cfb57cd2 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1876,7 +1876,7 @@ struct ib_flow_eth_filter {
 	__be16	ether_type;
 	__be16	vlan_tag;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_eth {
@@ -1890,7 +1890,7 @@ struct ib_flow_ib_filter {
 	__be16 dlid;
 	__u8   sl;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_ib {
@@ -1915,7 +1915,7 @@ struct ib_flow_ipv4_filter {
 	u8	ttl;
 	u8	flags;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_ipv4 {
@@ -1933,7 +1933,7 @@ struct ib_flow_ipv6_filter {
 	u8	traffic_class;
 	u8	hop_limit;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_ipv6 {
@@ -1947,7 +1947,7 @@ struct ib_flow_tcp_udp_filter {
 	__be16	dst_port;
 	__be16	src_port;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_tcp_udp {
@@ -1959,7 +1959,7 @@ struct ib_flow_spec_tcp_udp {
 
 struct ib_flow_tunnel_filter {
 	__be32	tunnel_id;
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 /* ib_flow_spec_tunnel describes the Vxlan tunnel
@@ -1976,7 +1976,7 @@ struct ib_flow_esp_filter {
 	__be32	spi;
 	__be32  seq;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_esp {
@@ -1991,7 +1991,7 @@ struct ib_flow_gre_filter {
 	__be16 protocol;
 	__be32 key;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_gre {
@@ -2004,7 +2004,7 @@ struct ib_flow_spec_gre {
 struct ib_flow_mpls_filter {
 	__be32 tag;
 	/* Must be last */
-	u8	real_sz[0];
+	u8	real_sz[];
 };
 
 struct ib_flow_spec_mpls {
diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h
index 0c07a70bd7f6..e90b149fe92a 100644
--- a/include/rdma/opa_vnic.h
+++ b/include/rdma/opa_vnic.h
@@ -75,7 +75,7 @@
 struct opa_vnic_rdma_netdev {
 	struct rdma_netdev rn;  /* keep this first */
 	/* followed by device private data */
-	char *dev_priv[0];
+	char *dev_priv[];
 };
 
 static inline void *opa_vnic_priv(const struct net_device *dev)
diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h
index 72a3856d4057..ce6c888f7fe7 100644
--- a/include/rdma/rdmavt_mr.h
+++ b/include/rdma/rdmavt_mr.h
@@ -85,7 +85,7 @@ struct rvt_mregion {
 	u8  lkey_published;     /* in global table */
 	struct percpu_ref refcount;
 	struct completion comp; /* complete when refcount goes to zero */
-	struct rvt_segarray *map[0];    /* the segments */
+	struct rvt_segarray *map[];    /* the segments */
 };
 
 #define RVT_MAX_LKEY_TABLE_BITS 23
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 0d5c70e2d8ab..5fc10108703a 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -191,7 +191,7 @@ struct rvt_swqe {
 	u32 ssn;                /* send sequence number */
 	u32 length;             /* total length of data in sg_list */
 	void *priv;             /* driver dependent field */
-	struct rvt_sge sg_list[0];
+	struct rvt_sge sg_list[];
 };
 
 /**

From 080f6276fccfb3923691e57c1b44a627eabd1a25 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 21 Feb 2020 08:33:57 -0600
Subject: [PATCH 043/327] proc: In proc_prune_siblings_dcache cache an aquired
 super block

Because there are likely to be several sysctls in a row on the
same superblock cache the super_block after the count has
been raised and don't deactivate it until we are processing
another super_block.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/inode.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 74ce4a8d05eb..fa2dc732cd77 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -108,10 +108,11 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
 	struct inode *inode;
 	struct proc_inode *ei;
 	struct hlist_node *node;
-	struct super_block *sb;
+	struct super_block *old_sb = NULL;
 
 	rcu_read_lock();
 	for (;;) {
+		struct super_block *sb;
 		node = hlist_first_rcu(inodes);
 		if (!node)
 			break;
@@ -122,23 +123,28 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
 
 		inode = &ei->vfs_inode;
 		sb = inode->i_sb;
-		if (!atomic_inc_not_zero(&sb->s_active))
+		if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active))
 			continue;
 		inode = igrab(inode);
 		rcu_read_unlock();
+		if (sb != old_sb) {
+			if (old_sb)
+				deactivate_super(old_sb);
+			old_sb = sb;
+		}
 		if (unlikely(!inode)) {
-			deactivate_super(sb);
 			rcu_read_lock();
 			continue;
 		}
 
 		d_prune_aliases(inode);
 		iput(inode);
-		deactivate_super(sb);
 
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
+	if (old_sb)
+		deactivate_super(old_sb);
 }
 
 static int proc_show_options(struct seq_file *seq, struct dentry *root)

From 8dae419f9ec730c1984ea7395067a2534780ada1 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:10:58 -0500
Subject: [PATCH 044/327] RDMA/bnxt_re: Refactor queue pair creation code

Restructuring the bnxt_re_create_qp function. Listing below the major
changes:
 - Monolithic central part of create_qp where attributes are initialized
   is now enclosed in one function and this new function has few more
   sub-functions.
 - Top level qp limit checking code moved to a function.
 - GSI QP creation and GSI Shadow qp creation code is handled in a sub
   function.

Link: https://lore.kernel.org/r/1581786665-23705-2-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h  |  13 +-
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 663 +++++++++++++++--------
 drivers/infiniband/hw/bnxt_re/main.c     |   3 +-
 3 files changed, 445 insertions(+), 234 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 725b2350e349..c2805384e832 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -104,6 +104,14 @@ struct bnxt_re_sqp_entries {
 	struct bnxt_re_qp *qp1_qp;
 };
 
+#define BNXT_RE_MAX_GSI_SQP_ENTRIES	1024
+struct bnxt_re_gsi_context {
+	struct	bnxt_re_qp *gsi_qp;
+	struct	bnxt_re_qp *gsi_sqp;
+	struct	bnxt_re_ah *gsi_sah;
+	struct	bnxt_re_sqp_entries *sqp_tbl;
+};
+
 #define BNXT_RE_MIN_MSIX		2
 #define BNXT_RE_MAX_MSIX		9
 #define BNXT_RE_AEQ_IDX			0
@@ -165,10 +173,7 @@ struct bnxt_re_dev {
 	u16				cosq[2];
 
 	/* QP for for handling QP1 packets */
-	u32				sqp_id;
-	struct bnxt_re_qp		*qp1_sqp;
-	struct bnxt_re_ah		*sqp_ah;
-	struct bnxt_re_sqp_entries sqp_tbl[1024];
+	struct bnxt_re_gsi_context	gsi_ctx;
 	atomic_t nq_alloc_cnt;
 	u32 is_virtfn;
 	u32 num_vfs;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 18579e8d630c..c37f0b1df475 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -312,7 +312,7 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context)
 		 */
 		if (ctx->idx == 0 &&
 		    rdma_link_local_addr((struct in6_addr *)gid_to_del) &&
-		    ctx->refcnt == 1 && rdev->qp1_sqp) {
+		    ctx->refcnt == 1 && rdev->gsi_ctx.gsi_sqp) {
 			dev_dbg(rdev_to_dev(rdev),
 				"Trying to delete GID0 while QP1 is alive\n");
 			return -EFAULT;
@@ -742,6 +742,49 @@ void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp,
 	spin_unlock_irqrestore(&qp->scq->cq_lock, flags);
 }
 
+static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
+{
+	struct bnxt_re_qp *gsi_sqp;
+	struct bnxt_re_ah *gsi_sah;
+	struct bnxt_re_dev *rdev;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	gsi_sqp = rdev->gsi_ctx.gsi_sqp;
+	gsi_sah = rdev->gsi_ctx.gsi_sah;
+
+	/* remove from active qp list */
+	mutex_lock(&rdev->qp_lock);
+	list_del(&gsi_sqp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
+	dev_dbg(rdev_to_dev(rdev), "Destroy the shadow AH\n");
+	bnxt_qplib_destroy_ah(&rdev->qplib_res,
+			      &gsi_sah->qplib_ah,
+			      true);
+	bnxt_qplib_clean_qp(&qp->qplib_qp);
+
+	dev_dbg(rdev_to_dev(rdev), "Destroy the shadow QP\n");
+	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &gsi_sqp->qplib_qp);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "Destroy Shadow QP failed");
+		goto fail;
+	}
+	bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
+
+	kfree(rdev->gsi_ctx.sqp_tbl);
+	kfree(gsi_sah);
+	kfree(gsi_sqp);
+	rdev->gsi_ctx.gsi_sqp = NULL;
+	rdev->gsi_ctx.gsi_sah = NULL;
+	rdev->gsi_ctx.sqp_tbl = NULL;
+
+	return 0;
+fail:
+	return rc;
+}
+
 /* Queue Pairs */
 int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
@@ -750,7 +793,13 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 	unsigned int flags;
 	int rc;
 
+	mutex_lock(&rdev->qp_lock);
+	list_del(&qp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
 	bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
+
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
 		dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP");
@@ -765,40 +814,19 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 
 	bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
 
-	if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
-		bnxt_qplib_destroy_ah(&rdev->qplib_res, &rdev->sqp_ah->qplib_ah,
-				      false);
-
-		bnxt_qplib_clean_qp(&qp->qplib_qp);
-		rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
-					   &rdev->qp1_sqp->qplib_qp);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to destroy Shadow QP");
-			return rc;
-		}
-		bnxt_qplib_free_qp_res(&rdev->qplib_res,
-				       &rdev->qp1_sqp->qplib_qp);
-		mutex_lock(&rdev->qp_lock);
-		list_del(&rdev->qp1_sqp->list);
-		atomic_dec(&rdev->qp_count);
-		mutex_unlock(&rdev->qp_lock);
-
-		kfree(rdev->sqp_ah);
-		kfree(rdev->qp1_sqp);
-		rdev->qp1_sqp = NULL;
-		rdev->sqp_ah = NULL;
+	if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) {
+		rc = bnxt_re_destroy_gsi_sqp(qp);
+		if (rc)
+			goto sh_fail;
 	}
 
 	ib_umem_release(qp->rumem);
 	ib_umem_release(qp->sumem);
 
-	mutex_lock(&rdev->qp_lock);
-	list_del(&qp->list);
-	atomic_dec(&rdev->qp_count);
-	mutex_unlock(&rdev->qp_lock);
 	kfree(qp);
 	return 0;
+sh_fail:
+	return rc;
 }
 
 static u8 __from_ib_qp_type(enum ib_qp_type type)
@@ -967,8 +995,6 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp
 	if (rc)
 		goto fail;
 
-	rdev->sqp_id = qp->qplib_qp.id;
-
 	spin_lock_init(&qp->sq_lock);
 	INIT_LIST_HEAD(&qp->list);
 	mutex_lock(&rdev->qp_lock);
@@ -981,6 +1007,315 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp
 	return NULL;
 }
 
+static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp,
+				struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	int entries;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	if (init_attr->srq) {
+		struct bnxt_re_srq *srq;
+
+		srq = container_of(init_attr->srq, struct bnxt_re_srq, ib_srq);
+		if (!srq) {
+			dev_err(rdev_to_dev(rdev), "SRQ not found");
+			return -EINVAL;
+		}
+		qplqp->srq = &srq->qplib_srq;
+		qplqp->rq.max_wqe = 0;
+	} else {
+		/* Allocate 1 more than what's provided so posting max doesn't
+		 * mean empty.
+		 */
+		entries = roundup_pow_of_two(init_attr->cap.max_recv_wr + 1);
+		qplqp->rq.max_wqe = min_t(u32, entries,
+					  dev_attr->max_qp_wqes + 1);
+
+		qplqp->rq.q_full_delta = qplqp->rq.max_wqe -
+					 init_attr->cap.max_recv_wr;
+		qplqp->rq.max_sge = init_attr->cap.max_recv_sge;
+		if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
+			qplqp->rq.max_sge = dev_attr->max_qp_sges;
+	}
+
+	return 0;
+}
+
+static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	qplqp->rq.max_sge = dev_attr->max_qp_sges;
+	if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
+		qplqp->rq.max_sge = dev_attr->max_qp_sges;
+}
+
+static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
+				 struct ib_qp_init_attr *init_attr,
+				 struct ib_udata *udata)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	int entries;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	qplqp->sq.max_sge = init_attr->cap.max_send_sge;
+	if (qplqp->sq.max_sge > dev_attr->max_qp_sges)
+		qplqp->sq.max_sge = dev_attr->max_qp_sges;
+	/*
+	 * Change the SQ depth if user has requested minimum using
+	 * configfs. Only supported for kernel consumers
+	 */
+	entries = init_attr->cap.max_send_wr;
+	/* Allocate 128 + 1 more than what's provided */
+	entries = roundup_pow_of_two(entries + BNXT_QPLIB_RESERVED_QP_WRS + 1);
+	qplqp->sq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes +
+			BNXT_QPLIB_RESERVED_QP_WRS + 1);
+	qplqp->sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
+	/*
+	 * Reserving one slot for Phantom WQE. Application can
+	 * post one extra entry in this case. But allowing this to avoid
+	 * unexpected Queue full condition
+	 */
+	qplqp->sq.q_full_delta -= 1;
+}
+
+static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp,
+				       struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	int entries;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	entries = roundup_pow_of_two(init_attr->cap.max_send_wr + 1);
+	qplqp->sq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1);
+	qplqp->sq.q_full_delta = qplqp->sq.max_wqe -
+				 init_attr->cap.max_send_wr;
+	qplqp->sq.max_sge++; /* Need one extra sge to put UD header */
+	if (qplqp->sq.max_sge > dev_attr->max_qp_sges)
+		qplqp->sq.max_sge = dev_attr->max_qp_sges;
+}
+
+static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
+				struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_chip_ctx *chip_ctx;
+	int qptype;
+
+	chip_ctx = &rdev->chip_ctx;
+
+	qptype = __from_ib_qp_type(init_attr->qp_type);
+	if (qptype == IB_QPT_MAX) {
+		dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
+			qptype);
+		qptype = -EINVAL;
+		goto out;
+	}
+
+	if (bnxt_qplib_is_chip_gen_p5(chip_ctx) &&
+	    init_attr->qp_type == IB_QPT_GSI)
+		qptype = CMDQ_CREATE_QP_TYPE_GSI;
+out:
+	return qptype;
+}
+
+static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_cq *cq;
+	int rc = 0, qptype;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	/* Setup misc params */
+	ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr);
+	qplqp->pd = &pd->qplib_pd;
+	qplqp->qp_handle = (u64)qplqp;
+	qplqp->max_inline_data = init_attr->cap.max_inline_data;
+	qplqp->sig_type = ((init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ?
+			    true : false);
+	qptype = bnxt_re_init_qp_type(rdev, init_attr);
+	if (qptype < 0) {
+		rc = qptype;
+		goto out;
+	}
+	qplqp->type = (u8)qptype;
+
+	if (init_attr->qp_type == IB_QPT_RC) {
+		qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom;
+		qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
+	}
+	qplqp->mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
+	qplqp->dpi = &rdev->dpi_privileged; /* Doorbell page */
+	if (init_attr->create_flags)
+		dev_dbg(rdev_to_dev(rdev),
+			"QP create flags 0x%x not supported",
+			init_attr->create_flags);
+
+	/* Setup CQs */
+	if (init_attr->send_cq) {
+		cq = container_of(init_attr->send_cq, struct bnxt_re_cq, ib_cq);
+		if (!cq) {
+			dev_err(rdev_to_dev(rdev), "Send CQ not found");
+			rc = -EINVAL;
+			goto out;
+		}
+		qplqp->scq = &cq->qplib_cq;
+		qp->scq = cq;
+	}
+
+	if (init_attr->recv_cq) {
+		cq = container_of(init_attr->recv_cq, struct bnxt_re_cq, ib_cq);
+		if (!cq) {
+			dev_err(rdev_to_dev(rdev), "Receive CQ not found");
+			rc = -EINVAL;
+			goto out;
+		}
+		qplqp->rcq = &cq->qplib_cq;
+		qp->rcq = cq;
+	}
+
+	/* Setup RQ/SRQ */
+	rc = bnxt_re_init_rq_attr(qp, init_attr);
+	if (rc)
+		goto out;
+	if (init_attr->qp_type == IB_QPT_GSI)
+		bnxt_re_adjust_gsi_rq_attr(qp);
+
+	/* Setup SQ */
+	bnxt_re_init_sq_attr(qp, init_attr, udata);
+	if (init_attr->qp_type == IB_QPT_GSI)
+		bnxt_re_adjust_gsi_sq_attr(qp, init_attr);
+
+	if (udata) /* This will update DPI and qp_handle */
+		rc = bnxt_re_init_user_qp(rdev, pd, qp, udata);
+out:
+	return rc;
+}
+
+static int bnxt_re_create_shadow_gsi(struct bnxt_re_qp *qp,
+				     struct bnxt_re_pd *pd)
+{
+	struct bnxt_re_sqp_entries *sqp_tbl = NULL;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_qp *sqp;
+	struct bnxt_re_ah *sah;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	/* Create a shadow QP to handle the QP1 traffic */
+	sqp_tbl = kzalloc(sizeof(*sqp_tbl) * BNXT_RE_MAX_GSI_SQP_ENTRIES,
+			  GFP_KERNEL);
+	if (!sqp_tbl)
+		return -ENOMEM;
+	rdev->gsi_ctx.sqp_tbl = sqp_tbl;
+
+	sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res, &qp->qplib_qp);
+	if (!sqp) {
+		rc = -ENODEV;
+		dev_err(rdev_to_dev(rdev),
+			"Failed to create Shadow QP for QP1");
+		goto out;
+	}
+	rdev->gsi_ctx.gsi_sqp = sqp;
+
+	sqp->rcq = qp->rcq;
+	sqp->scq = qp->scq;
+	sah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res,
+					  &qp->qplib_qp);
+	if (!sah) {
+		bnxt_qplib_destroy_qp(&rdev->qplib_res,
+				      &sqp->qplib_qp);
+		rc = -ENODEV;
+		dev_err(rdev_to_dev(rdev),
+			"Failed to create AH entry for ShadowQP");
+		goto out;
+	}
+	rdev->gsi_ctx.gsi_sah = sah;
+
+	return 0;
+out:
+	kfree(sqp_tbl);
+	return rc;
+}
+
+static int bnxt_re_create_gsi_qp(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
+				 struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_qp *qplqp;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	qplqp->rq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
+	qplqp->sq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2;
+
+	rc = bnxt_qplib_create_qp1(&rdev->qplib_res, qplqp);
+	if (rc) {
+		dev_err(rdev_to_dev(rdev), "create HW QP1 failed!");
+		goto out;
+	}
+
+	rc = bnxt_re_create_shadow_gsi(qp, pd);
+out:
+	return rc;
+}
+
+static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct bnxt_qplib_dev_attr *dev_attr)
+{
+	bool rc = true;
+
+	if (init_attr->cap.max_send_wr > dev_attr->max_qp_wqes ||
+	    init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes ||
+	    init_attr->cap.max_send_sge > dev_attr->max_qp_sges ||
+	    init_attr->cap.max_recv_sge > dev_attr->max_qp_sges ||
+	    init_attr->cap.max_inline_data > dev_attr->max_inline_data) {
+		dev_err(rdev_to_dev(rdev),
+			"Create QP failed - max exceeded! 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x",
+			init_attr->cap.max_send_wr, dev_attr->max_qp_wqes,
+			init_attr->cap.max_recv_wr, dev_attr->max_qp_wqes,
+			init_attr->cap.max_send_sge, dev_attr->max_qp_sges,
+			init_attr->cap.max_recv_sge, dev_attr->max_qp_sges,
+			init_attr->cap.max_inline_data,
+			dev_attr->max_inline_data);
+		rc = false;
+	}
+	return rc;
+}
+
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 				struct ib_qp_init_attr *qp_init_attr,
 				struct ib_udata *udata)
@@ -989,197 +1324,60 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 	struct bnxt_re_dev *rdev = pd->rdev;
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_qp *qp;
-	struct bnxt_re_cq *cq;
-	struct bnxt_re_srq *srq;
-	int rc, entries;
+	int rc;
 
-	if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) ||
-	    (qp_init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes) ||
-	    (qp_init_attr->cap.max_send_sge > dev_attr->max_qp_sges) ||
-	    (qp_init_attr->cap.max_recv_sge > dev_attr->max_qp_sges) ||
-	    (qp_init_attr->cap.max_inline_data > dev_attr->max_inline_data))
-		return ERR_PTR(-EINVAL);
+	rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr);
+	if (!rc) {
+		rc = -EINVAL;
+		goto exit;
+	}
 
 	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-	if (!qp)
-		return ERR_PTR(-ENOMEM);
-
+	if (!qp) {
+		rc = -ENOMEM;
+		goto exit;
+	}
 	qp->rdev = rdev;
-	ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr);
-	qp->qplib_qp.pd = &pd->qplib_pd;
-	qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp);
-	qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type);
-
-	if (qp_init_attr->qp_type == IB_QPT_GSI &&
-	    bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
-		qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI;
-	if (qp->qplib_qp.type == IB_QPT_MAX) {
-		dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
-			qp->qplib_qp.type);
-		rc = -EINVAL;
+	rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata);
+	if (rc)
 		goto fail;
-	}
-
-	qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data;
-	qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
-				  IB_SIGNAL_ALL_WR) ? true : false);
-
-	qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge;
-	if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
-		qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
-
-	if (qp_init_attr->send_cq) {
-		cq = container_of(qp_init_attr->send_cq, struct bnxt_re_cq,
-				  ib_cq);
-		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Send CQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.scq = &cq->qplib_cq;
-		qp->scq = cq;
-	}
-
-	if (qp_init_attr->recv_cq) {
-		cq = container_of(qp_init_attr->recv_cq, struct bnxt_re_cq,
-				  ib_cq);
-		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Receive CQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.rcq = &cq->qplib_cq;
-		qp->rcq = cq;
-	}
-
-	if (qp_init_attr->srq) {
-		srq = container_of(qp_init_attr->srq, struct bnxt_re_srq,
-				   ib_srq);
-		if (!srq) {
-			dev_err(rdev_to_dev(rdev), "SRQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.srq = &srq->qplib_srq;
-		qp->qplib_qp.rq.max_wqe = 0;
-	} else {
-		/* Allocate 1 more than what's provided so posting max doesn't
-		 * mean empty
-		 */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_recv_wr + 1);
-		qp->qplib_qp.rq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes + 1);
-
-		qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
-						qp_init_attr->cap.max_recv_wr;
-
-		qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge;
-		if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-	}
-
-	qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
 
 	if (qp_init_attr->qp_type == IB_QPT_GSI &&
 	    !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
-		/* Allocate 1 more than what's provided */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
-		qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes + 1);
-		qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
-						qp_init_attr->cap.max_send_wr;
-		qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-		if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-		qp->qplib_qp.sq.max_sge++;
-		if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
-
-		qp->qplib_qp.rq_hdr_buf_size =
-					BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
-
-		qp->qplib_qp.sq_hdr_buf_size =
-					BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2;
-		qp->qplib_qp.dpi = &rdev->dpi_privileged;
-		rc = bnxt_qplib_create_qp1(&rdev->qplib_res, &qp->qplib_qp);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to create HW QP1");
+		rc = bnxt_re_create_gsi_qp(qp, pd, qp_init_attr);
+		if (rc == -ENODEV)
+			goto qp_destroy;
+		if (rc)
 			goto fail;
-		}
-		/* Create a shadow QP to handle the QP1 traffic */
-		rdev->qp1_sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res,
-							 &qp->qplib_qp);
-		if (!rdev->qp1_sqp) {
-			rc = -EINVAL;
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create Shadow QP for QP1");
-			goto qp_destroy;
-		}
-		rdev->sqp_ah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res,
-							   &qp->qplib_qp);
-		if (!rdev->sqp_ah) {
-			bnxt_qplib_destroy_qp(&rdev->qplib_res,
-					      &rdev->qp1_sqp->qplib_qp);
-			rc = -EINVAL;
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create AH entry for ShadowQP");
-			goto qp_destroy;
-		}
-
 	} else {
-		/* Allocate 128 + 1 more than what's provided */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr +
-					     BNXT_QPLIB_RESERVED_QP_WRS + 1);
-		qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes +
-						BNXT_QPLIB_RESERVED_QP_WRS + 1);
-		qp->qplib_qp.sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
-
-		/*
-		 * Reserving one slot for Phantom WQE. Application can
-		 * post one extra entry in this case. But allowing this to avoid
-		 * unexpected Queue full condition
-		 */
-
-		qp->qplib_qp.sq.q_full_delta -= 1;
-
-		qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom;
-		qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
-		if (udata) {
-			rc = bnxt_re_init_user_qp(rdev, pd, qp, udata);
-			if (rc)
-				goto fail;
-		} else {
-			qp->qplib_qp.dpi = &rdev->dpi_privileged;
-		}
-
 		rc = bnxt_qplib_create_qp(&rdev->qplib_res, &qp->qplib_qp);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev), "Failed to create HW QP");
 			goto free_umem;
 		}
+		if (udata) {
+			struct bnxt_re_qp_resp resp;
+
+			resp.qpid = qp->qplib_qp.id;
+			resp.rsvd = 0;
+			rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+			if (rc) {
+				dev_err(rdev_to_dev(rdev), "Failed to copy QP udata");
+				goto qp_destroy;
+			}
+		}
 	}
 
 	qp->ib_qp.qp_num = qp->qplib_qp.id;
+	if (qp_init_attr->qp_type == IB_QPT_GSI)
+		rdev->gsi_ctx.gsi_qp = qp;
 	spin_lock_init(&qp->sq_lock);
 	spin_lock_init(&qp->rq_lock);
-
-	if (udata) {
-		struct bnxt_re_qp_resp resp;
-
-		resp.qpid = qp->ib_qp.qp_num;
-		resp.rsvd = 0;
-		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
-		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to copy QP udata");
-			goto qp_destroy;
-		}
-	}
 	INIT_LIST_HEAD(&qp->list);
 	mutex_lock(&rdev->qp_lock);
 	list_add_tail(&qp->list, &rdev->qp_list);
-	atomic_inc(&rdev->qp_count);
 	mutex_unlock(&rdev->qp_lock);
+	atomic_inc(&rdev->qp_count);
 
 	return &qp->ib_qp;
 qp_destroy:
@@ -1189,6 +1387,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 	ib_umem_release(qp->sumem);
 fail:
 	kfree(qp);
+exit:
 	return ERR_PTR(rc);
 }
 
@@ -1487,7 +1686,7 @@ static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev,
 				    struct bnxt_re_qp *qp1_qp,
 				    int qp_attr_mask)
 {
-	struct bnxt_re_qp *qp = rdev->qp1_sqp;
+	struct bnxt_re_qp *qp = rdev->gsi_ctx.gsi_sqp;
 	int rc = 0;
 
 	if (qp_attr_mask & IB_QP_STATE) {
@@ -1752,7 +1951,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		dev_err(rdev_to_dev(rdev), "Failed to modify HW QP");
 		return rc;
 	}
-	if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp)
+	if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp)
 		rc = bnxt_re_modify_shadow_qp(rdev, qp, qp_attr_mask);
 	return rc;
 }
@@ -1996,9 +2195,12 @@ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp,
 					    struct bnxt_qplib_swqe *wqe,
 					    int payload_size)
 {
-	struct bnxt_qplib_sge ref, sge;
-	u32 rq_prod_index;
 	struct bnxt_re_sqp_entries *sqp_entry;
+	struct bnxt_qplib_sge ref, sge;
+	struct bnxt_re_dev *rdev;
+	u32 rq_prod_index;
+
+	rdev = qp->rdev;
 
 	rq_prod_index = bnxt_qplib_get_rq_prod_index(&qp->qplib_qp);
 
@@ -2013,7 +2215,7 @@ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp,
 	ref.lkey = wqe->sg_list[0].lkey;
 	ref.size = wqe->sg_list[0].size;
 
-	sqp_entry = &qp->rdev->sqp_tbl[rq_prod_index];
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[rq_prod_index];
 
 	/* SGE 1 */
 	wqe->sg_list[0].addr = sge.addr;
@@ -2833,12 +3035,13 @@ static bool bnxt_re_is_loopback_packet(struct bnxt_re_dev *rdev,
 	return rc;
 }
 
-static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp,
+static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp,
 					 struct bnxt_qplib_cqe *cqe)
 {
-	struct bnxt_re_dev *rdev = qp1_qp->rdev;
+	struct bnxt_re_dev *rdev = gsi_qp->rdev;
 	struct bnxt_re_sqp_entries *sqp_entry = NULL;
-	struct bnxt_re_qp *qp = rdev->qp1_sqp;
+	struct bnxt_re_qp *gsi_sqp = rdev->gsi_ctx.gsi_sqp;
+	struct bnxt_re_ah *gsi_sah;
 	struct ib_send_wr *swr;
 	struct ib_ud_wr udwr;
 	struct ib_recv_wr rwr;
@@ -2861,19 +3064,19 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp,
 	swr = &udwr.wr;
 	tbl_idx = cqe->wr_id;
 
-	rq_hdr_buf = qp1_qp->qplib_qp.rq_hdr_buf +
-			(tbl_idx * qp1_qp->qplib_qp.rq_hdr_buf_size);
-	rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp1_qp->qplib_qp,
+	rq_hdr_buf = gsi_qp->qplib_qp.rq_hdr_buf +
+			(tbl_idx * gsi_qp->qplib_qp.rq_hdr_buf_size);
+	rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&gsi_qp->qplib_qp,
 							  tbl_idx);
 
 	/* Shadow QP header buffer */
-	shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp->qplib_qp,
+	shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&gsi_qp->qplib_qp,
 							    tbl_idx);
-	sqp_entry = &rdev->sqp_tbl[tbl_idx];
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[tbl_idx];
 
 	/* Store this cqe */
 	memcpy(&sqp_entry->cqe, cqe, sizeof(struct bnxt_qplib_cqe));
-	sqp_entry->qp1_qp = qp1_qp;
+	sqp_entry->qp1_qp = gsi_qp;
 
 	/* Find packet type from the cqe */
 
@@ -2927,7 +3130,7 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp,
 	rwr.wr_id = tbl_idx;
 	rwr.next = NULL;
 
-	rc = bnxt_re_post_recv_shadow_qp(rdev, qp, &rwr);
+	rc = bnxt_re_post_recv_shadow_qp(rdev, gsi_sqp, &rwr);
 	if (rc) {
 		dev_err(rdev_to_dev(rdev),
 			"Failed to post Rx buffers to shadow QP");
@@ -2939,13 +3142,13 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp,
 	swr->wr_id = tbl_idx;
 	swr->opcode = IB_WR_SEND;
 	swr->next = NULL;
-
-	udwr.ah = &rdev->sqp_ah->ib_ah;
-	udwr.remote_qpn = rdev->qp1_sqp->qplib_qp.id;
-	udwr.remote_qkey = rdev->qp1_sqp->qplib_qp.qkey;
+	gsi_sah = rdev->gsi_ctx.gsi_sah;
+	udwr.ah = &gsi_sah->ib_ah;
+	udwr.remote_qpn = gsi_sqp->qplib_qp.id;
+	udwr.remote_qkey = gsi_sqp->qplib_qp.qkey;
 
 	/* post data received  in the send queue */
-	rc = bnxt_re_post_send_shadow_qp(rdev, qp, swr);
+	rc = bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr);
 
 	return 0;
 }
@@ -2999,12 +3202,12 @@ static void bnxt_re_process_res_rc_wc(struct ib_wc *wc,
 		wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 }
 
-static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp,
+static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp,
 					     struct ib_wc *wc,
 					     struct bnxt_qplib_cqe *cqe)
 {
-	struct bnxt_re_dev *rdev = qp->rdev;
-	struct bnxt_re_qp *qp1_qp = NULL;
+	struct bnxt_re_dev *rdev = gsi_sqp->rdev;
+	struct bnxt_re_qp *gsi_qp = NULL;
 	struct bnxt_qplib_cqe *orig_cqe = NULL;
 	struct bnxt_re_sqp_entries *sqp_entry = NULL;
 	int nw_type;
@@ -3014,13 +3217,13 @@ static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp,
 
 	tbl_idx = cqe->wr_id;
 
-	sqp_entry = &rdev->sqp_tbl[tbl_idx];
-	qp1_qp = sqp_entry->qp1_qp;
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[tbl_idx];
+	gsi_qp = sqp_entry->qp1_qp;
 	orig_cqe = &sqp_entry->cqe;
 
 	wc->wr_id = sqp_entry->wrid;
 	wc->byte_len = orig_cqe->length;
-	wc->qp = &qp1_qp->ib_qp;
+	wc->qp = &gsi_qp->ib_qp;
 
 	wc->ex.imm_data = orig_cqe->immdata;
 	wc->src_qp = orig_cqe->src_qp;
@@ -3099,7 +3302,7 @@ static int send_phantom_wqe(struct bnxt_re_qp *qp)
 int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 {
 	struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
-	struct bnxt_re_qp *qp;
+	struct bnxt_re_qp *qp, *sh_qp;
 	struct bnxt_qplib_cqe *cqe;
 	int i, ncqe, budget;
 	struct bnxt_qplib_q *sq;
@@ -3163,8 +3366,9 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 
 			switch (cqe->opcode) {
 			case CQ_BASE_CQE_TYPE_REQ:
-				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
-				    qp->rdev->qp1_sqp->qplib_qp.id) {
+				sh_qp = qp->rdev->gsi_ctx.gsi_sqp;
+				if (sh_qp &&
+				    qp->qplib_qp.id == sh_qp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
 					 */
@@ -3190,7 +3394,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 				 * stored in the table
 				 */
 				tbl_idx = cqe->wr_id;
-				sqp_entry = &cq->rdev->sqp_tbl[tbl_idx];
+				sqp_entry = &cq->rdev->gsi_ctx.sqp_tbl[tbl_idx];
 				wc->wr_id = sqp_entry->wrid;
 				bnxt_re_process_res_rawqp1_wc(wc, cqe);
 				break;
@@ -3198,8 +3402,9 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 				bnxt_re_process_res_rc_wc(wc, cqe);
 				break;
 			case CQ_BASE_CQE_TYPE_RES_UD:
-				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
-				    qp->rdev->qp1_sqp->qplib_qp.id) {
+				sh_qp = qp->rdev->gsi_ctx.gsi_sqp;
+				if (sh_qp &&
+				    qp->qplib_qp.id == sh_qp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
 					 */
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 793c97251588..82a5350ff6d7 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1125,7 +1125,8 @@ static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir,
 static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev,
 					struct bnxt_re_qp *qp)
 {
-	return (qp->ib_qp.qp_type == IB_QPT_GSI) || (qp == rdev->qp1_sqp);
+	return (qp->ib_qp.qp_type == IB_QPT_GSI) ||
+	       (qp == rdev->gsi_ctx.gsi_sqp);
 }
 
 static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev)

From 0cfb329db988804124423b311a2845e56914e3ca Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:10:59 -0500
Subject: [PATCH 045/327] RDMA/bnxt_re: Replace chip context structure with
 pointer

The chip_ctx member in bnxt_re_dev structure is now a pointer to struct
bnxt_qplib_chip_ctx. Since the member type has changed there are changes
in rest of the code wherever dev->chip_ctx is used.

Link: https://lore.kernel.org/r/1581786665-23705-3-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h  |  2 +-
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 13 ++++----
 drivers/infiniband/hw/bnxt_re/main.c     | 40 ++++++++++++++++--------
 drivers/infiniband/hw/bnxt_re/qplib_fp.c |  2 +-
 4 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index c2805384e832..86274f4c8225 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -133,7 +133,7 @@ struct bnxt_re_dev {
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;
-	struct bnxt_qplib_chip_ctx	chip_ctx;
+	struct bnxt_qplib_chip_ctx	*chip_ctx;
 	struct bnxt_en_dev		*en_dev;
 	struct bnxt_msix_entry		msix_entries[BNXT_RE_MAX_MSIX];
 	int				num_msix;
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index c37f0b1df475..a17c9cf91118 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -859,7 +859,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 	bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
 	/* Consider mapping PSN search memory only for RC QPs. */
 	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
-		psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+		psn_sz = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
 					sizeof(struct sq_psn_search_ext) :
 					sizeof(struct sq_psn_search);
 		bytes += (qplib_qp->sq.max_wqe * psn_sz);
@@ -1060,6 +1060,7 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp)
 	qplqp->rq.max_sge = dev_attr->max_qp_sges;
 	if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
 		qplqp->rq.max_sge = dev_attr->max_qp_sges;
+	qplqp->rq.max_sge = 6;
 }
 
 static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
@@ -1123,7 +1124,7 @@ static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
 	struct bnxt_qplib_chip_ctx *chip_ctx;
 	int qptype;
 
-	chip_ctx = &rdev->chip_ctx;
+	chip_ctx = rdev->chip_ctx;
 
 	qptype = __from_ib_qp_type(init_attr->qp_type);
 	if (qptype == IB_QPT_MAX) {
@@ -1343,7 +1344,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 		goto fail;
 
 	if (qp_init_attr->qp_type == IB_QPT_GSI &&
-	    !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
+	    !(bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx))) {
 		rc = bnxt_re_create_gsi_qp(qp, pd, qp_init_attr);
 		if (rc == -ENODEV)
 			goto qp_destroy;
@@ -3820,10 +3821,10 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 	spin_lock_init(&uctx->sh_lock);
 
 	resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
-	chip_met_rev_num = rdev->chip_ctx.chip_num;
-	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+	chip_met_rev_num = rdev->chip_ctx->chip_num;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx->chip_rev & 0xFF) <<
 			     BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
-	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+	chip_met_rev_num |= ((u32)rdev->chip_ctx->chip_metal & 0xFF) <<
 			     BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
 	resp.chip_id0 = chip_met_rev_num;
 	/* Future extension of chip info */
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 82a5350ff6d7..390daebea4c3 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -82,22 +82,35 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
 
 static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_qplib_chip_ctx *chip_ctx;
+
+	if (!rdev->chip_ctx)
+		return;
+	chip_ctx = rdev->chip_ctx;
+	rdev->chip_ctx = NULL;
 	rdev->rcfw.res = NULL;
 	rdev->qplib_res.cctx = NULL;
+	kfree(chip_ctx);
 }
 
 static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_qplib_chip_ctx *chip_ctx;
 	struct bnxt_en_dev *en_dev;
 	struct bnxt *bp;
 
 	en_dev = rdev->en_dev;
 	bp = netdev_priv(en_dev->net);
 
-	rdev->chip_ctx.chip_num = bp->chip_num;
+	chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
+	if (!chip_ctx)
+		return -ENOMEM;
+	chip_ctx->chip_num = bp->chip_num;
+
+	rdev->chip_ctx = chip_ctx;
 	/* rest members to follow eventually */
 
-	rdev->qplib_res.cctx = &rdev->chip_ctx;
+	rdev->qplib_res.cctx = rdev->chip_ctx;
 	rdev->rcfw.res = &rdev->qplib_res;
 
 	return 0;
@@ -136,7 +149,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev)
 	ctx->srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT,
 				attr->max_srq);
 	ctx->cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, attr->max_cq);
-	if (!bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx))
 		for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
 			rdev->qplib_ctx.tqm_count[i] =
 			rdev->dev_attr.tqm_alloc_reqs[i];
@@ -185,7 +198,7 @@ static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
 	memset(&rdev->qplib_ctx.vf_res, 0, sizeof(struct bnxt_qplib_vf_res));
 	bnxt_re_limit_pf_res(rdev);
 
-	num_vfs =  bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+	num_vfs =  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
 			BNXT_RE_GEN_P5_MAX_VF : rdev->num_vfs;
 	if (num_vfs)
 		bnxt_re_limit_vf_res(&rdev->qplib_ctx, num_vfs);
@@ -208,7 +221,7 @@ static void bnxt_re_sriov_config(void *p, int num_vfs)
 		return;
 
 	rdev->num_vfs = num_vfs;
-	if (!bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx)) {
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
 		bnxt_re_set_resource_limits(rdev);
 		bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw,
 					      &rdev->qplib_ctx);
@@ -916,7 +929,7 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq,
 #define BNXT_RE_GEN_P5_VF_NQ_DB		0x4000
 static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx)
 {
-	return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
+	return bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
 		(rdev->is_virtfn ? BNXT_RE_GEN_P5_VF_NQ_DB :
 				   BNXT_RE_GEN_P5_PF_NQ_DB) :
 				   rdev->msix_entries[indx].db_offset;
@@ -967,7 +980,7 @@ static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
 	int i;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
 		rdev->nq[i].res = NULL;
 		bnxt_qplib_free_nq(&rdev->nq[i]);
@@ -1025,7 +1038,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 				i, rc);
 			goto free_nq;
 		}
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
 		pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
 		rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
@@ -1044,7 +1057,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 	return 0;
 free_nq:
 	for (i = num_vec_created; i >= 0; i--) {
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
 	}
@@ -1324,7 +1337,7 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 		bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
 		bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 	}
@@ -1405,7 +1418,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
 		goto fail;
 	}
-	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+
+	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 	pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
 	pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
 	ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
@@ -1434,7 +1448,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	bnxt_re_set_resource_limits(rdev);
 
 	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
-				  bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx));
+				  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx));
 	if (rc) {
 		pr_err("Failed to allocate QPLIB context: %#x\n", rc);
 		goto disable_rcfw;
@@ -1504,7 +1518,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 disable_rcfw:
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
-	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 020f70e6865e..ffe861060fbf 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -2426,7 +2426,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq,
 	}
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
-	cqe->length = (u32)le16_to_cpu(hwcqe->length);
+	cqe->length = le16_to_cpu(hwcqe->length) & CQ_RES_UD_LENGTH_MASK;
 	cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata);
 	cqe->invrkey = le32_to_cpu(hwcqe->imm_data);
 	cqe->flags = le16_to_cpu(hwcqe->flags);

From 0c4dcd602817502bb3dced7a834a13ef717d65a4 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:00 -0500
Subject: [PATCH 046/327] RDMA/bnxt_re: Refactor hardware queue memory
 allocation

At top level there are three major data structure addition.  viz
bnxt_qplib_hwq_attr, bnxt_qplib_sg_info and bnxt_qplib_tqm_ctx

Intorduction of first data structure reduces the arguments list to
bnxt_re_alloc_init_hwq() function. There are changes all over the driver
code to incorporate this new structure. The caller needs to fill the
attribute data structure and pass to this function.

The second data structure is to pass memory region description
viz. sghead, page_size and page_shift. There are changes all over the
driver code to initialize bnxt_re_sg_info data structure. The new data
structure helps to reduce the argument list of __alloc_pbl() function
call.

Till now the TQM rings related members were not collected under any
specific data-structure making it hard to manage. The third data
sctructure bnxt_qplib_tqm_ctx is added to refactor the TQM queue
allocation and initialization.

Link: https://lore.kernel.org/r/1581786665-23705-4-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c   |  26 +-
 drivers/infiniband/hw/bnxt_re/main.c       |  20 +-
 drivers/infiniband/hw/bnxt_re/qplib_fp.c   | 169 ++++----
 drivers/infiniband/hw/bnxt_re/qplib_fp.h   |   2 +-
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c |  55 +--
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h |   5 +-
 drivers/infiniband/hw/bnxt_re/qplib_res.c  | 468 +++++++++++++--------
 drivers/infiniband/hw/bnxt_re/qplib_res.h  |  60 ++-
 drivers/infiniband/hw/bnxt_re/qplib_sp.c   |  48 ++-
 9 files changed, 523 insertions(+), 330 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index a17c9cf91118..4368aacdd482 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -871,9 +871,11 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 		return PTR_ERR(umem);
 
 	qp->sumem = umem;
-	qplib_qp->sq.sg_info.sglist = umem->sg_head.sgl;
+	qplib_qp->sq.sg_info.sghead = umem->sg_head.sgl;
 	qplib_qp->sq.sg_info.npages = ib_umem_num_pages(umem);
 	qplib_qp->sq.sg_info.nmap = umem->nmap;
+	qplib_qp->sq.sg_info.pgsize = PAGE_SIZE;
+	qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT;
 	qplib_qp->qp_handle = ureq.qp_handle;
 
 	if (!qp->qplib_qp.srq) {
@@ -884,9 +886,11 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 		if (IS_ERR(umem))
 			goto rqfail;
 		qp->rumem = umem;
-		qplib_qp->rq.sg_info.sglist = umem->sg_head.sgl;
+		qplib_qp->rq.sg_info.sghead = umem->sg_head.sgl;
 		qplib_qp->rq.sg_info.npages = ib_umem_num_pages(umem);
 		qplib_qp->rq.sg_info.nmap = umem->nmap;
+		qplib_qp->rq.sg_info.pgsize = PAGE_SIZE;
+		qplib_qp->rq.sg_info.pgshft = PAGE_SHIFT;
 	}
 
 	qplib_qp->dpi = &cntx->dpi;
@@ -976,6 +980,8 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp
 	qp->qplib_qp.sq.max_sge = 2;
 	/* Q full delta can be 1 since it is internal QP */
 	qp->qplib_qp.sq.q_full_delta = 1;
+	qp->qplib_qp.sq.sg_info.pgsize = PAGE_SIZE;
+	qp->qplib_qp.sq.sg_info.pgshft = PAGE_SHIFT;
 
 	qp->qplib_qp.scq = qp1_qp->scq;
 	qp->qplib_qp.rcq = qp1_qp->rcq;
@@ -984,6 +990,8 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp
 	qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge;
 	/* Q full delta can be 1 since it is internal QP */
 	qp->qplib_qp.rq.q_full_delta = 1;
+	qp->qplib_qp.rq.sg_info.pgsize = PAGE_SIZE;
+	qp->qplib_qp.rq.sg_info.pgshft = PAGE_SHIFT;
 
 	qp->qplib_qp.mtu = qp1_qp->mtu;
 
@@ -1043,6 +1051,8 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp,
 		if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
 			qplqp->rq.max_sge = dev_attr->max_qp_sges;
 	}
+	qplqp->rq.sg_info.pgsize = PAGE_SIZE;
+	qplqp->rq.sg_info.pgshft = PAGE_SHIFT;
 
 	return 0;
 }
@@ -1095,6 +1105,8 @@ static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
 	 * unexpected Queue full condition
 	 */
 	qplqp->sq.q_full_delta -= 1;
+	qplqp->sq.sg_info.pgsize = PAGE_SIZE;
+	qplqp->sq.sg_info.pgshft = PAGE_SHIFT;
 }
 
 static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp,
@@ -1511,9 +1523,11 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
 		return PTR_ERR(umem);
 
 	srq->umem = umem;
-	qplib_srq->sg_info.sglist = umem->sg_head.sgl;
+	qplib_srq->sg_info.sghead = umem->sg_head.sgl;
 	qplib_srq->sg_info.npages = ib_umem_num_pages(umem);
 	qplib_srq->sg_info.nmap = umem->nmap;
+	qplib_srq->sg_info.pgsize = PAGE_SIZE;
+	qplib_srq->sg_info.pgshft = PAGE_SHIFT;
 	qplib_srq->srq_handle = ureq.srq_handle;
 	qplib_srq->dpi = &cntx->dpi;
 
@@ -2368,7 +2382,7 @@ static int bnxt_re_build_reg_wqe(const struct ib_reg_wr *wr,
 	wqe->frmr.pbl_dma_ptr = qplib_frpl->hwq.pbl_dma_ptr[0];
 	wqe->frmr.page_list = mr->pages;
 	wqe->frmr.page_list_len = mr->npages;
-	wqe->frmr.levels = qplib_frpl->hwq.level + 1;
+	wqe->frmr.levels = qplib_frpl->hwq.level;
 	wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR;
 
 	/* Need unconditional fence for reg_mr
@@ -2742,6 +2756,8 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	if (entries > dev_attr->max_cq_wqes + 1)
 		entries = dev_attr->max_cq_wqes + 1;
 
+	cq->qplib_cq.sg_info.pgsize = PAGE_SIZE;
+	cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT;
 	if (udata) {
 		struct bnxt_re_cq_req req;
 		struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
@@ -2758,7 +2774,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 			rc = PTR_ERR(cq->umem);
 			goto fail;
 		}
-		cq->qplib_cq.sg_info.sglist = cq->umem->sg_head.sgl;
+		cq->qplib_cq.sg_info.sghead = cq->umem->sg_head.sgl;
 		cq->qplib_cq.sg_info.npages = ib_umem_num_pages(cq->umem);
 		cq->qplib_cq.sg_info.nmap = cq->umem->nmap;
 		cq->qplib_cq.dpi = &uctx->dpi;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 390daebea4c3..a966c68dbc59 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -90,6 +90,8 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 	rdev->chip_ctx = NULL;
 	rdev->rcfw.res = NULL;
 	rdev->qplib_res.cctx = NULL;
+	rdev->qplib_res.pdev = NULL;
+	rdev->qplib_res.netdev = NULL;
 	kfree(chip_ctx);
 }
 
@@ -151,7 +153,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev)
 	ctx->cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, attr->max_cq);
 	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx))
 		for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
-			rdev->qplib_ctx.tqm_count[i] =
+			rdev->qplib_ctx.tqm_ctx.qcount[i] =
 			rdev->dev_attr.tqm_alloc_reqs[i];
 }
 
@@ -982,8 +984,8 @@ static void bnxt_re_free_nq_res(struct bnxt_re_dev *rdev)
 	for (i = 0; i < rdev->num_msix - 1; i++) {
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
-		rdev->nq[i].res = NULL;
 		bnxt_qplib_free_nq(&rdev->nq[i]);
+		rdev->nq[i].res = NULL;
 	}
 }
 
@@ -1032,7 +1034,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 		rdev->nq[i].res = &rdev->qplib_res;
 		rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
 			BNXT_RE_MAX_SRQC_COUNT + 2;
-		rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]);
+		rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
 				i, rc);
@@ -1056,7 +1058,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 	}
 	return 0;
 free_nq:
-	for (i = num_vec_created; i >= 0; i--) {
+	for (i = num_vec_created - 1; i >= 0; i--) {
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
@@ -1335,7 +1337,7 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 			dev_warn(rdev_to_dev(rdev),
 				 "Failed to deinitialize RCFW: %#x", rc);
 		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
-		bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
+		bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
@@ -1411,7 +1413,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	/* Establish RCFW Communication Channel to initialize the context
 	 * memory for the function and all child VFs
 	 */
-	rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+	rc = bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw,
 					   &rdev->qplib_ctx,
 					   BNXT_RE_MAX_QPC_COUNT);
 	if (rc) {
@@ -1432,7 +1434,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	}
 	db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
 	vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector;
-	rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+	rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw,
 					    vid, db_offt, rdev->is_virtfn,
 					    &bnxt_re_aeq_handler);
 	if (rc) {
@@ -1447,7 +1449,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 
 	bnxt_re_set_resource_limits(rdev);
 
-	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
+	rc = bnxt_qplib_alloc_ctx(&rdev->qplib_res, &rdev->qplib_ctx, 0,
 				  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx));
 	if (rc) {
 		pr_err("Failed to allocate QPLIB context: %#x\n", rc);
@@ -1514,7 +1516,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 free_sctx:
 	bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 free_ctx:
-	bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
+	bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 disable_rcfw:
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index ffe861060fbf..e7fe86e61fbb 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -464,26 +464,33 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq)
 {
 	if (nq->hwq.max_elements) {
-		bnxt_qplib_free_hwq(nq->pdev, &nq->hwq);
+		bnxt_qplib_free_hwq(nq->res, &nq->hwq);
 		nq->hwq.max_elements = 0;
 	}
 }
 
-int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
+int bnxt_qplib_alloc_nq(struct bnxt_qplib_res *res, struct bnxt_qplib_nq *nq)
 {
-	u8 hwq_type;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 
-	nq->pdev = pdev;
+	nq->pdev = res->pdev;
+	nq->res = res;
 	if (!nq->hwq.max_elements ||
 	    nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
 		nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
-	hwq_type = bnxt_qplib_get_hwq_type(nq->res);
-	if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL,
-				      &nq->hwq.max_elements,
-				      BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
-				      PAGE_SIZE, hwq_type))
-		return -ENOMEM;
 
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.depth = nq->hwq.max_elements;
+	hwq_attr.stride = sizeof(struct nq_base);
+	hwq_attr.type = bnxt_qplib_get_hwq_type(nq->res);
+	if (bnxt_qplib_alloc_init_hwq(&nq->hwq, &hwq_attr)) {
+		dev_err(&nq->pdev->dev, "FP NQ allocation failed");
+		return -ENOMEM;
+	}
 	nq->budget = 8;
 	return 0;
 }
@@ -526,24 +533,26 @@ void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
 	kfree(srq->swq);
 	if (rc)
 		return;
-	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	bnxt_qplib_free_hwq(res, &srq->hwq);
 }
 
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 			  struct bnxt_qplib_srq *srq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_srq req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct creq_create_srq_resp resp;
+	struct cmdq_create_srq req;
 	struct bnxt_qplib_pbl *pbl;
 	u16 cmd_flags = 0;
 	int rc, idx;
 
-	srq->hwq.max_elements = srq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, &srq->sg_info,
-				       &srq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &srq->sg_info;
+	hwq_attr.depth = srq->max_wqe;
+	hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE;
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -602,7 +611,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 
 	return 0;
 fail:
-	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	bnxt_qplib_free_hwq(res, &srq->hwq);
 	kfree(srq->swq);
 exit:
 	return rc;
@@ -721,15 +730,16 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
 /* QP */
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_qp1 req;
-	struct creq_create_qp1_resp resp;
-	struct bnxt_qplib_pbl *pbl;
 	struct bnxt_qplib_q *sq = &qp->sq;
 	struct bnxt_qplib_q *rq = &qp->rq;
-	int rc;
+	struct creq_create_qp1_resp resp;
+	struct cmdq_create_qp1 req;
+	struct bnxt_qplib_pbl *pbl;
 	u16 cmd_flags = 0;
 	u32 qp_flags = 0;
+	int rc;
 
 	RCFW_CMD_PREP(req, CREATE_QP1, cmd_flags);
 
@@ -739,11 +749,12 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 	req.qp_handle = cpu_to_le64(qp->qp_handle);
 
 	/* SQ */
-	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL,
-				       &sq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sq->sg_info;
+	hwq_attr.depth = sq->max_wqe;
+	hwq_attr.stride = BNXT_QPLIB_MAX_SQE_ENTRY_SIZE;
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -778,11 +789,12 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 	/* RQ */
 	if (rq->max_wqe) {
-		rq->hwq.max_elements = qp->rq.max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL,
-					       &rq->hwq.max_elements,
-					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-					       PAGE_SIZE, HWQ_TYPE_QUEUE);
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &rq->sg_info;
+		hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE;
+		hwq_attr.depth = qp->rq.max_wqe;
+		hwq_attr.type = HWQ_TYPE_QUEUE;
+		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
 		if (rc)
 			goto fail_sq;
 
@@ -848,10 +860,10 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 fail:
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
 fail_rq:
-	bnxt_qplib_free_hwq(res->pdev, &rq->hwq);
+	bnxt_qplib_free_hwq(res, &rq->hwq);
 	kfree(rq->swq);
 fail_sq:
-	bnxt_qplib_free_hwq(res->pdev, &sq->hwq);
+	bnxt_qplib_free_hwq(res, &sq->hwq);
 	kfree(sq->swq);
 exit:
 	return rc;
@@ -860,7 +872,9 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	unsigned long int psn_search, poff = 0;
+	struct bnxt_qplib_sg_info sginfo = {};
 	struct sq_psn_search **psn_search_ptr;
 	struct bnxt_qplib_q *sq = &qp->sq;
 	struct bnxt_qplib_q *rq = &qp->rq;
@@ -887,12 +901,15 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 			 sizeof(struct sq_psn_search_ext) :
 			 sizeof(struct sq_psn_search);
 	}
-	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, &sq->sg_info,
-				       &sq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE,
-				       psn_sz,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sq->sg_info;
+	hwq_attr.stride = BNXT_QPLIB_MAX_SQE_ENTRY_SIZE;
+	hwq_attr.depth = sq->max_wqe;
+	hwq_attr.aux_stride = psn_sz;
+	hwq_attr.aux_depth = hwq_attr.depth;
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -956,12 +973,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 	/* RQ */
 	if (rq->max_wqe) {
-		rq->hwq.max_elements = rq->max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq,
-					       &rq->sg_info,
-					       &rq->hwq.max_elements,
-					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-					       PAGE_SIZE, HWQ_TYPE_QUEUE);
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &rq->sg_info;
+		hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE;
+		hwq_attr.depth = rq->max_wqe;
+		hwq_attr.aux_stride = 0;
+		hwq_attr.aux_depth = 0;
+		hwq_attr.type = HWQ_TYPE_QUEUE;
+		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
 		if (rc)
 			goto fail_sq;
 
@@ -1029,10 +1048,17 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 		req_size = xrrq->max_elements *
 			   BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
-					       &xrrq->max_elements,
-					       BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE,
-					       0, req_size, HWQ_TYPE_CTX);
+		sginfo.pgsize = req_size;
+		sginfo.pgshft = PAGE_SHIFT;
+
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &sginfo;
+		hwq_attr.depth = xrrq->max_elements;
+		hwq_attr.stride = BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE;
+		hwq_attr.aux_stride = 0;
+		hwq_attr.aux_depth = 0;
+		hwq_attr.type = HWQ_TYPE_CTX;
+		rc = bnxt_qplib_alloc_init_hwq(xrrq, &hwq_attr);
 		if (rc)
 			goto fail_buf_free;
 		pbl = &xrrq->pbl[PBL_LVL_0];
@@ -1044,11 +1070,10 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 		req_size = xrrq->max_elements *
 			   BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
-
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
-					       &xrrq->max_elements,
-					       BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE,
-					       0, req_size, HWQ_TYPE_CTX);
+		sginfo.pgsize = req_size;
+		hwq_attr.depth =  xrrq->max_elements;
+		hwq_attr.stride = BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE;
+		rc = bnxt_qplib_alloc_init_hwq(xrrq, &hwq_attr);
 		if (rc)
 			goto fail_orrq;
 
@@ -1074,17 +1099,17 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 fail:
 	if (qp->irrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->irrq);
+		bnxt_qplib_free_hwq(res, &qp->irrq);
 fail_orrq:
 	if (qp->orrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->orrq);
+		bnxt_qplib_free_hwq(res, &qp->orrq);
 fail_buf_free:
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
 fail_rq:
-	bnxt_qplib_free_hwq(res->pdev, &rq->hwq);
+	bnxt_qplib_free_hwq(res, &rq->hwq);
 	kfree(rq->swq);
 fail_sq:
-	bnxt_qplib_free_hwq(res->pdev, &sq->hwq);
+	bnxt_qplib_free_hwq(res, &sq->hwq);
 	kfree(sq->swq);
 exit:
 	return rc;
@@ -1440,16 +1465,16 @@ void bnxt_qplib_free_qp_res(struct bnxt_qplib_res *res,
 			    struct bnxt_qplib_qp *qp)
 {
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
-	bnxt_qplib_free_hwq(res->pdev, &qp->sq.hwq);
+	bnxt_qplib_free_hwq(res, &qp->sq.hwq);
 	kfree(qp->sq.swq);
 
-	bnxt_qplib_free_hwq(res->pdev, &qp->rq.hwq);
+	bnxt_qplib_free_hwq(res, &qp->rq.hwq);
 	kfree(qp->rq.swq);
 
 	if (qp->irrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->irrq);
+		bnxt_qplib_free_hwq(res, &qp->irrq);
 	if (qp->orrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->orrq);
+		bnxt_qplib_free_hwq(res, &qp->orrq);
 
 }
 
@@ -1927,17 +1952,19 @@ static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
 int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_cq req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct creq_create_cq_resp resp;
+	struct cmdq_create_cq req;
 	struct bnxt_qplib_pbl *pbl;
 	u16 cmd_flags = 0;
 	int rc;
 
-	cq->hwq.max_elements = cq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, &cq->sg_info,
-				       &cq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.depth = cq->max_wqe;
+	hwq_attr.stride = sizeof(struct cq_base);
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	hwq_attr.sginfo = &cq->sg_info;
+	rc = bnxt_qplib_alloc_init_hwq(&cq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -1988,7 +2015,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	return 0;
 
 fail:
-	bnxt_qplib_free_hwq(res->pdev, &cq->hwq);
+	bnxt_qplib_free_hwq(res, &cq->hwq);
 exit:
 	return rc;
 }
@@ -2008,7 +2035,7 @@ int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 					  (void *)&resp, NULL, 0);
 	if (rc)
 		return rc;
-	bnxt_qplib_free_hwq(res->pdev, &cq->hwq);
+	bnxt_qplib_free_hwq(res, &cq->hwq);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 99e0a13cbefa..d3f080c18b27 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -550,7 +550,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq);
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type);
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq);
-int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq);
+int bnxt_qplib_alloc_nq(struct bnxt_qplib_res *res, struct bnxt_qplib_nq *nq);
 void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp);
 void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp,
 				 unsigned long *flags);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 1291b12287a5..fada81c300ba 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -520,9 +520,10 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 	level = ctx->tim_tbl.level;
 	req.tim_pg_size_tim_lvl = (level << CMDQ_INITIALIZE_FW_TIM_LVL_SFT) |
 				  __get_pbl_pg_idx(&ctx->tim_tbl.pbl[level]);
-	level = ctx->tqm_pde_level;
-	req.tqm_pg_size_tqm_lvl = (level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) |
-				  __get_pbl_pg_idx(&ctx->tqm_pde.pbl[level]);
+	level = ctx->tqm_ctx.pde.level;
+	req.tqm_pg_size_tqm_lvl =
+		(level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) |
+		 __get_pbl_pg_idx(&ctx->tqm_ctx.pde.pbl[level]);
 
 	req.qpc_page_dir =
 		cpu_to_le64(ctx->qpc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
@@ -535,7 +536,7 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 	req.tim_page_dir =
 		cpu_to_le64(ctx->tim_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
 	req.tqm_page_dir =
-		cpu_to_le64(ctx->tqm_pde.pbl[PBL_LVL_0].pg_map_arr[0]);
+		cpu_to_le64(ctx->tqm_ctx.pde.pbl[PBL_LVL_0].pg_map_arr[0]);
 
 	req.number_of_qp = cpu_to_le32(ctx->qpc_tbl.max_elements);
 	req.number_of_mrw = cpu_to_le32(ctx->mrw_tbl.max_elements);
@@ -563,25 +564,32 @@ void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
 	kfree(rcfw->qp_tbl);
 	kfree(rcfw->crsqe_tbl);
-	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq);
-	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq);
 	rcfw->pdev = NULL;
 }
 
-int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
+int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 				  struct bnxt_qplib_rcfw *rcfw,
 				  struct bnxt_qplib_ctx *ctx,
 				  int qp_tbl_sz)
 {
-	u8 hwq_type;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 
-	rcfw->pdev = pdev;
-	rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
-	hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
-	if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL,
-				      &rcfw->creq.max_elements,
-				      BNXT_QPLIB_CREQE_UNITS,
-				      0, PAGE_SIZE, hwq_type)) {
+	rcfw->pdev = res->pdev;
+	rcfw->res = res;
+
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.res = rcfw->res;
+	hwq_attr.depth = BNXT_QPLIB_CREQE_MAX_CNT;
+	hwq_attr.stride = BNXT_QPLIB_CREQE_UNITS;
+	hwq_attr.type = bnxt_qplib_get_hwq_type(res);
+
+	if (bnxt_qplib_alloc_init_hwq(&rcfw->creq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CREQ allocation failed\n");
 		goto fail;
@@ -591,13 +599,11 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 	else
 		rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192;
 
-	rcfw->cmdq.max_elements = rcfw->cmdq_depth;
-	if (bnxt_qplib_alloc_init_hwq
-			(rcfw->pdev, &rcfw->cmdq, NULL,
-			 &rcfw->cmdq.max_elements,
-			 BNXT_QPLIB_CMDQE_UNITS, 0,
-			 bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth),
-			 HWQ_TYPE_CTX)) {
+	sginfo.pgsize = bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth);
+	hwq_attr.depth = rcfw->cmdq_depth;
+	hwq_attr.stride = BNXT_QPLIB_CMDQE_UNITS;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	if (bnxt_qplib_alloc_init_hwq(&rcfw->cmdq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CMDQ allocation failed\n");
 		goto fail;
@@ -690,8 +696,7 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 	return 0;
 }
 
-int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
-				   struct bnxt_qplib_rcfw *rcfw,
+int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
 				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
@@ -699,10 +704,12 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 {
 	resource_size_t res_base;
 	struct cmdq_init init;
+	struct pci_dev *pdev;
 	u16 bmap_size;
 	int rc;
 
 	/* General */
+	pdev = rcfw->pdev;
 	rcfw->seq_num = 0;
 	set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
 	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index dfeadc192e17..ab1531c7e27f 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -268,7 +268,7 @@ struct bnxt_qplib_rcfw {
 };
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
-int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
+int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 				  struct bnxt_qplib_rcfw *rcfw,
 				  struct bnxt_qplib_ctx *ctx,
 				  int qp_tbl_sz);
@@ -276,8 +276,7 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill);
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init);
-int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
-				   struct bnxt_qplib_rcfw *rcfw,
+int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
 				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 60ea1b924b67..4346b95963cf 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -55,9 +55,10 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
 				      struct bnxt_qplib_stats *stats);
 
 /* PBL */
-static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
+static void __free_pbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pbl *pbl,
 		       bool is_umem)
 {
+	struct pci_dev *pdev = res->pdev;
 	int i;
 
 	if (!is_umem) {
@@ -74,35 +75,57 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 			pbl->pg_arr[i] = NULL;
 		}
 	}
-	kfree(pbl->pg_arr);
+	vfree(pbl->pg_arr);
 	pbl->pg_arr = NULL;
-	kfree(pbl->pg_map_arr);
+	vfree(pbl->pg_map_arr);
 	pbl->pg_map_arr = NULL;
 	pbl->pg_count = 0;
 	pbl->pg_size = 0;
 }
 
-static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
-		       struct scatterlist *sghead, u32 pages,
-		       u32 nmaps, u32 pg_size)
+static void bnxt_qplib_fill_user_dma_pages(struct bnxt_qplib_pbl *pbl,
+					   struct bnxt_qplib_sg_info *sginfo)
 {
+	struct scatterlist *sghead = sginfo->sghead;
 	struct sg_dma_page_iter sg_iter;
+	int i = 0;
+
+	for_each_sg_dma_page(sghead, &sg_iter, sginfo->nmap, 0) {
+		pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
+		pbl->pg_arr[i] = NULL;
+		pbl->pg_count++;
+		i++;
+	}
+}
+
+static int __alloc_pbl(struct bnxt_qplib_res *res,
+		       struct bnxt_qplib_pbl *pbl,
+		       struct bnxt_qplib_sg_info *sginfo)
+{
+	struct pci_dev *pdev = res->pdev;
+	struct scatterlist *sghead;
 	bool is_umem = false;
+	u32 pages, pg_size;
 	int i;
 
+	if (sginfo->nopte)
+		return 0;
+	pages = sginfo->npages;
+	pg_size = sginfo->pgsize;
+	sghead = sginfo->sghead;
 	/* page ptr arrays */
-	pbl->pg_arr = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+	pbl->pg_arr = vmalloc(pages * sizeof(void *));
 	if (!pbl->pg_arr)
 		return -ENOMEM;
 
-	pbl->pg_map_arr = kcalloc(pages, sizeof(dma_addr_t), GFP_KERNEL);
+	pbl->pg_map_arr = vmalloc(pages * sizeof(dma_addr_t));
 	if (!pbl->pg_map_arr) {
-		kfree(pbl->pg_arr);
+		vfree(pbl->pg_arr);
 		pbl->pg_arr = NULL;
 		return -ENOMEM;
 	}
 	pbl->pg_count = 0;
-	pbl->pg_size = pg_size;
+	pbl->pg_size = sginfo->pgsize;
 
 	if (!sghead) {
 		for (i = 0; i < pages; i++) {
@@ -115,25 +138,19 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 			pbl->pg_count++;
 		}
 	} else {
-		i = 0;
 		is_umem = true;
-		for_each_sg_dma_page(sghead, &sg_iter, nmaps, 0) {
-			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
-			pbl->pg_arr[i] = NULL;
-			pbl->pg_count++;
-			i++;
-		}
+		bnxt_qplib_fill_user_dma_pages(pbl, sginfo);
 	}
 
 	return 0;
-
 fail:
-	__free_pbl(pdev, pbl, is_umem);
+	__free_pbl(res, pbl, is_umem);
 	return -ENOMEM;
 }
 
 /* HWQ */
-void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
+void bnxt_qplib_free_hwq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_hwq *hwq)
 {
 	int i;
 
@@ -144,9 +161,9 @@ void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
 
 	for (i = 0; i < hwq->level + 1; i++) {
 		if (i == hwq->level)
-			__free_pbl(pdev, &hwq->pbl[i], hwq->is_user);
+			__free_pbl(res, &hwq->pbl[i], hwq->is_user);
 		else
-			__free_pbl(pdev, &hwq->pbl[i], false);
+			__free_pbl(res, &hwq->pbl[i], false);
 	}
 
 	hwq->level = PBL_LVL_MAX;
@@ -158,79 +175,113 @@ void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
 }
 
 /* All HWQs are power of 2 in size */
-int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct bnxt_qplib_sg_info *sg_info,
-			      u32 *elements, u32 element_size, u32 aux,
-			      u32 pg_size, enum bnxt_qplib_hwq_type hwq_type)
+
+int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq,
+			      struct bnxt_qplib_hwq_attr *hwq_attr)
 {
-	u32 pages, maps, slots, size, aux_pages = 0, aux_size = 0;
+	u32 npages, aux_slots, pg_size, aux_pages = 0, aux_size = 0;
+	struct bnxt_qplib_sg_info sginfo = {};
+	u32 depth, stride, npbl, npde;
 	dma_addr_t *src_phys_ptr, **dst_virt_ptr;
 	struct scatterlist *sghead = NULL;
-	int i, rc;
+	struct bnxt_qplib_res *res;
+	struct pci_dev *pdev;
+	int i, rc, lvl;
 
+	res = hwq_attr->res;
+	pdev = res->pdev;
+	sghead = hwq_attr->sginfo->sghead;
+	pg_size = hwq_attr->sginfo->pgsize;
 	hwq->level = PBL_LVL_MAX;
 
-	slots = roundup_pow_of_two(*elements);
-	if (aux) {
-		aux_size = roundup_pow_of_two(aux);
-		aux_pages = (slots * aux_size) / pg_size;
-		if ((slots * aux_size) % pg_size)
+	depth = roundup_pow_of_two(hwq_attr->depth);
+	stride = roundup_pow_of_two(hwq_attr->stride);
+	if (hwq_attr->aux_depth) {
+		aux_slots = hwq_attr->aux_depth;
+		aux_size = roundup_pow_of_two(hwq_attr->aux_stride);
+		aux_pages = (aux_slots * aux_size) / pg_size;
+		if ((aux_slots * aux_size) % pg_size)
 			aux_pages++;
 	}
-	size = roundup_pow_of_two(element_size);
-
-	if (sg_info)
-		sghead = sg_info->sglist;
 
 	if (!sghead) {
 		hwq->is_user = false;
-		pages = (slots * size) / pg_size + aux_pages;
-		if ((slots * size) % pg_size)
-			pages++;
-		if (!pages)
+		npages = (depth * stride) / pg_size + aux_pages;
+		if ((depth * stride) % pg_size)
+			npages++;
+		if (!npages)
 			return -EINVAL;
-		maps = 0;
+		hwq_attr->sginfo->npages = npages;
 	} else {
 		hwq->is_user = true;
-		pages = sg_info->npages;
-		maps = sg_info->nmap;
+		npages = hwq_attr->sginfo->npages;
+		npages = (npages * PAGE_SIZE) /
+			  BIT_ULL(hwq_attr->sginfo->pgshft);
+		if ((hwq_attr->sginfo->npages * PAGE_SIZE) %
+		     BIT_ULL(hwq_attr->sginfo->pgshft))
+			if (!npages)
+				npages++;
 	}
 
-	/* Alloc the 1st memory block; can be a PDL/PTL/PBL */
-	if (sghead && (pages == MAX_PBL_LVL_0_PGS))
-		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead,
-				 pages, maps, pg_size);
-	else
-		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL,
-				 1, 0, pg_size);
-	if (rc)
-		goto fail;
+	if (npages == MAX_PBL_LVL_0_PGS) {
+		/* This request is Level 0, map PTE */
+		rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], hwq_attr->sginfo);
+		if (rc)
+			goto fail;
+		hwq->level = PBL_LVL_0;
+	}
 
-	hwq->level = PBL_LVL_0;
-
-	if (pages > MAX_PBL_LVL_0_PGS) {
-		if (pages > MAX_PBL_LVL_1_PGS) {
+	if (npages > MAX_PBL_LVL_0_PGS) {
+		if (npages > MAX_PBL_LVL_1_PGS) {
+			u32 flag = (hwq_attr->type == HWQ_TYPE_L2_CMPL) ?
+				    0 : PTU_PTE_VALID;
 			/* 2 levels of indirection */
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL,
-					 MAX_PBL_LVL_1_PGS_FOR_LVL_2,
-					 0, pg_size);
+			npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+			if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+				npbl++;
+			npde = npbl >> MAX_PDL_LVL_SHIFT;
+			if (npbl % BIT(MAX_PDL_LVL_SHIFT))
+				npde++;
+			/* Alloc PDE pages */
+			sginfo.pgsize = npde * pg_size;
+			sginfo.npages = 1;
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo);
+
+			/* Alloc PBL pages */
+			sginfo.npages = npbl;
+			sginfo.pgsize = PAGE_SIZE;
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_1], &sginfo);
 			if (rc)
 				goto fail;
-			/* Fill in lvl0 PBL */
+			/* Fill PDL with PBL page pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr;
-			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++)
-				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
-					src_phys_ptr[i] | PTU_PDE_VALID;
-			hwq->level = PBL_LVL_1;
-
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead,
-					 pages, maps, pg_size);
+			if (hwq_attr->type == HWQ_TYPE_MR) {
+			/* For MR it is expected that we supply only 1 contigous
+			 * page i.e only 1 entry in the PDL that will contain
+			 * all the PBLs for the user supplied memory region
+			 */
+				for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count;
+				     i++)
+					dst_virt_ptr[0][i] = src_phys_ptr[i] |
+						flag;
+			} else {
+				for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count;
+				     i++)
+					dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+						src_phys_ptr[i] |
+						PTU_PDE_VALID;
+			}
+			/* Alloc or init PTEs */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_2],
+					 hwq_attr->sginfo);
 			if (rc)
 				goto fail;
-
-			/* Fill in lvl1 PBL */
+			hwq->level = PBL_LVL_2;
+			if (hwq_attr->sginfo->nopte)
+				goto done;
+			/* Fill PBLs with PTE pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_1].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_2].pg_map_arr;
@@ -238,7 +289,7 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
 				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
 					src_phys_ptr[i] | PTU_PTE_VALID;
 			}
-			if (hwq_type == HWQ_TYPE_QUEUE) {
+			if (hwq_attr->type == HWQ_TYPE_QUEUE) {
 				/* Find the last pg of the size */
 				i = hwq->pbl[PBL_LVL_2].pg_count;
 				dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
@@ -248,25 +299,36 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
 						    [PTR_IDX(i - 2)] |=
 						    PTU_PTE_NEXT_TO_LAST;
 			}
-			hwq->level = PBL_LVL_2;
-		} else {
-			u32 flag = hwq_type == HWQ_TYPE_L2_CMPL ? 0 :
-						PTU_PTE_VALID;
+		} else { /* pages < 512 npbl = 1, npde = 0 */
+			u32 flag = (hwq_attr->type == HWQ_TYPE_L2_CMPL) ?
+				    0 : PTU_PTE_VALID;
 
 			/* 1 level of indirection */
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead,
-					 pages, maps, pg_size);
+			npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+			if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+				npbl++;
+			sginfo.npages = npbl;
+			sginfo.pgsize = PAGE_SIZE;
+			/* Alloc PBL page */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo);
 			if (rc)
 				goto fail;
-			/* Fill in lvl0 PBL */
+			/* Alloc or init  PTEs */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_1],
+					 hwq_attr->sginfo);
+			if (rc)
+				goto fail;
+			hwq->level = PBL_LVL_1;
+			if (hwq_attr->sginfo->nopte)
+				goto done;
+			/* Fill PBL with PTE pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr;
-			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) {
+			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++)
 				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
 					src_phys_ptr[i] | flag;
-			}
-			if (hwq_type == HWQ_TYPE_QUEUE) {
+			if (hwq_attr->type == HWQ_TYPE_QUEUE) {
 				/* Find the last pg of the size */
 				i = hwq->pbl[PBL_LVL_1].pg_count;
 				dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
@@ -276,42 +338,141 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
 						    [PTR_IDX(i - 2)] |=
 						    PTU_PTE_NEXT_TO_LAST;
 			}
-			hwq->level = PBL_LVL_1;
 		}
 	}
-	hwq->pdev = pdev;
-	spin_lock_init(&hwq->lock);
+done:
 	hwq->prod = 0;
 	hwq->cons = 0;
-	*elements = hwq->max_elements = slots;
-	hwq->element_size = size;
-
+	hwq->pdev = pdev;
+	hwq->depth = hwq_attr->depth;
+	hwq->max_elements = depth;
+	hwq->element_size = stride;
 	/* For direct access to the elements */
-	hwq->pbl_ptr = hwq->pbl[hwq->level].pg_arr;
-	hwq->pbl_dma_ptr = hwq->pbl[hwq->level].pg_map_arr;
+	lvl = hwq->level;
+	if (hwq_attr->sginfo->nopte && hwq->level)
+		lvl = hwq->level - 1;
+	hwq->pbl_ptr = hwq->pbl[lvl].pg_arr;
+	hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr;
+	spin_lock_init(&hwq->lock);
 
 	return 0;
-
 fail:
-	bnxt_qplib_free_hwq(pdev, hwq);
+	bnxt_qplib_free_hwq(res, hwq);
 	return -ENOMEM;
 }
 
 /* Context Tables */
-void bnxt_qplib_free_ctx(struct pci_dev *pdev,
+void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx)
 {
 	int i;
 
-	bnxt_qplib_free_hwq(pdev, &ctx->qpc_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->mrw_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->srqc_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->cq_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->tim_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->qpc_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->mrw_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->srqc_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->cq_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->tim_tbl);
 	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
-		bnxt_qplib_free_hwq(pdev, &ctx->tqm_tbl[i]);
-	bnxt_qplib_free_hwq(pdev, &ctx->tqm_pde);
-	bnxt_qplib_free_stats_ctx(pdev, &ctx->stats);
+		bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.qtbl[i]);
+	/* restore original pde level before destroy */
+	ctx->tqm_ctx.pde.level = ctx->tqm_ctx.pde_level;
+	bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.pde);
+	bnxt_qplib_free_stats_ctx(res->pdev, &ctx->stats);
+}
+
+static int bnxt_qplib_alloc_tqm_rings(struct bnxt_qplib_res *res,
+				      struct bnxt_qplib_ctx *ctx)
+{
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
+	struct bnxt_qplib_tqm_ctx *tqmctx;
+	int rc = 0;
+	int i;
+
+	tqmctx = &ctx->tqm_ctx;
+
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.res = res;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	hwq_attr.depth = 512;
+	hwq_attr.stride = sizeof(u64);
+	/* Alloc pdl buffer */
+	rc = bnxt_qplib_alloc_init_hwq(&tqmctx->pde, &hwq_attr);
+	if (rc)
+		goto out;
+	/* Save original pdl level */
+	tqmctx->pde_level = tqmctx->pde.level;
+
+	hwq_attr.stride = 1;
+	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) {
+		if (!tqmctx->qcount[i])
+			continue;
+		hwq_attr.depth = ctx->qpc_count * tqmctx->qcount[i];
+		rc = bnxt_qplib_alloc_init_hwq(&tqmctx->qtbl[i], &hwq_attr);
+		if (rc)
+			goto out;
+	}
+out:
+	return rc;
+}
+
+static void bnxt_qplib_map_tqm_pgtbl(struct bnxt_qplib_tqm_ctx *ctx)
+{
+	struct bnxt_qplib_hwq *tbl;
+	dma_addr_t *dma_ptr;
+	__le64 **pbl_ptr, *ptr;
+	int i, j, k;
+	int fnz_idx = -1;
+	int pg_count;
+
+	pbl_ptr = (__le64 **)ctx->pde.pbl_ptr;
+
+	for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ;
+	     i++, j += MAX_TQM_ALLOC_BLK_SIZE) {
+		tbl = &ctx->qtbl[i];
+		if (!tbl->max_elements)
+			continue;
+		if (fnz_idx == -1)
+			fnz_idx = i; /* first non-zero index */
+		switch (tbl->level) {
+		case PBL_LVL_2:
+			pg_count = tbl->pbl[PBL_LVL_1].pg_count;
+			for (k = 0; k < pg_count; k++) {
+				ptr = &pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)];
+				dma_ptr = &tbl->pbl[PBL_LVL_1].pg_map_arr[k];
+				*ptr = cpu_to_le64(*dma_ptr | PTU_PTE_VALID);
+			}
+			break;
+		case PBL_LVL_1:
+		case PBL_LVL_0:
+		default:
+			ptr = &pbl_ptr[PTR_PG(j)][PTR_IDX(j)];
+			*ptr = cpu_to_le64(tbl->pbl[PBL_LVL_0].pg_map_arr[0] |
+					   PTU_PTE_VALID);
+			break;
+		}
+	}
+	if (fnz_idx == -1)
+		fnz_idx = 0;
+	/* update pde level as per page table programming */
+	ctx->pde.level = (ctx->qtbl[fnz_idx].level == PBL_LVL_2) ? PBL_LVL_2 :
+			  ctx->qtbl[fnz_idx].level + 1;
+}
+
+static int bnxt_qplib_setup_tqm_rings(struct bnxt_qplib_res *res,
+				      struct bnxt_qplib_ctx *ctx)
+{
+	int rc = 0;
+
+	rc = bnxt_qplib_alloc_tqm_rings(res, ctx);
+	if (rc)
+		goto fail;
+
+	bnxt_qplib_map_tqm_pgtbl(&ctx->tqm_ctx);
+fail:
+	return rc;
 }
 
 /*
@@ -335,120 +496,72 @@ void bnxt_qplib_free_ctx(struct pci_dev *pdev,
  * Returns:
  *     0 if success, else -ERRORS
  */
-int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
+int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5)
 {
-	int i, j, k, rc = 0;
-	int fnz_idx = -1;
-	__le64 **pbl_ptr;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
+	int rc = 0;
 
 	if (virt_fn || is_p5)
 		goto stats_alloc;
 
 	/* QPC Tables */
-	ctx->qpc_tbl.max_elements = ctx->qpc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL,
-				       &ctx->qpc_tbl.max_elements,
-				       BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.sginfo = &sginfo;
+
+	hwq_attr.res = res;
+	hwq_attr.depth = ctx->qpc_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->qpc_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* MRW Tables */
-	ctx->mrw_tbl.max_elements = ctx->mrw_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL,
-				       &ctx->mrw_tbl.max_elements,
-				       BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->mrw_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->mrw_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* SRQ Tables */
-	ctx->srqc_tbl.max_elements = ctx->srqc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL,
-				       &ctx->srqc_tbl.max_elements,
-				       BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->srqc_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->srqc_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* CQ Tables */
-	ctx->cq_tbl.max_elements = ctx->cq_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL,
-				       &ctx->cq_tbl.max_elements,
-				       BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->cq_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->cq_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* TQM Buffer */
-	ctx->tqm_pde.max_elements = 512;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL,
-				       &ctx->tqm_pde.max_elements, sizeof(u64),
-				       0, PAGE_SIZE, HWQ_TYPE_CTX);
+	rc = bnxt_qplib_setup_tqm_rings(res, ctx);
 	if (rc)
 		goto fail;
-
-	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) {
-		if (!ctx->tqm_count[i])
-			continue;
-		ctx->tqm_tbl[i].max_elements = ctx->qpc_count *
-					       ctx->tqm_count[i];
-		rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL,
-					       &ctx->tqm_tbl[i].max_elements, 1,
-					       0, PAGE_SIZE, HWQ_TYPE_CTX);
-		if (rc)
-			goto fail;
-	}
-	pbl_ptr = (__le64 **)ctx->tqm_pde.pbl_ptr;
-	for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ;
-	     i++, j += MAX_TQM_ALLOC_BLK_SIZE) {
-		if (!ctx->tqm_tbl[i].max_elements)
-			continue;
-		if (fnz_idx == -1)
-			fnz_idx = i;
-		switch (ctx->tqm_tbl[i].level) {
-		case PBL_LVL_2:
-			for (k = 0; k < ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_count;
-			     k++)
-				pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)] =
-				  cpu_to_le64(
-				    ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_map_arr[k]
-				    | PTU_PTE_VALID);
-			break;
-		case PBL_LVL_1:
-		case PBL_LVL_0:
-		default:
-			pbl_ptr[PTR_PG(j)][PTR_IDX(j)] = cpu_to_le64(
-				ctx->tqm_tbl[i].pbl[PBL_LVL_0].pg_map_arr[0] |
-				PTU_PTE_VALID);
-			break;
-		}
-	}
-	if (fnz_idx == -1)
-		fnz_idx = 0;
-	ctx->tqm_pde_level = ctx->tqm_tbl[fnz_idx].level == PBL_LVL_2 ?
-			     PBL_LVL_2 : ctx->tqm_tbl[fnz_idx].level + 1;
-
 	/* TIM Buffer */
 	ctx->tim_tbl.max_elements = ctx->qpc_count * 16;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL,
-				       &ctx->tim_tbl.max_elements, 1,
-				       0, PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->qpc_count * 16;
+	hwq_attr.stride = 1;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->tim_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
-
 stats_alloc:
 	/* Stats */
-	rc = bnxt_qplib_alloc_stats_ctx(pdev, &ctx->stats);
+	rc = bnxt_qplib_alloc_stats_ctx(res->pdev, &ctx->stats);
 	if (rc)
 		goto fail;
 
 	return 0;
 
 fail:
-	bnxt_qplib_free_ctx(pdev, ctx);
+	bnxt_qplib_free_ctx(res, ctx);
 	return rc;
 }
 
@@ -808,9 +921,6 @@ void bnxt_qplib_free_res(struct bnxt_qplib_res *res)
 	bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl);
 	bnxt_qplib_free_pd_tbl(&res->pd_tbl);
 	bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl);
-
-	res->netdev = NULL;
-	res->pdev = NULL;
 }
 
 int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index aaa76d792185..fe8a6dd7aeb1 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -55,7 +55,8 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero;
 enum bnxt_qplib_hwq_type {
 	HWQ_TYPE_CTX,
 	HWQ_TYPE_QUEUE,
-	HWQ_TYPE_L2_CMPL
+	HWQ_TYPE_L2_CMPL,
+	HWQ_TYPE_MR
 };
 
 #define MAX_PBL_LVL_0_PGS		1
@@ -63,6 +64,7 @@ enum bnxt_qplib_hwq_type {
 #define MAX_PBL_LVL_1_PGS_SHIFT		9
 #define MAX_PBL_LVL_1_PGS_FOR_LVL_2	256
 #define MAX_PBL_LVL_2_PGS		(256 * 512)
+#define MAX_PDL_LVL_SHIFT               9
 
 enum bnxt_qplib_pbl_lvl {
 	PBL_LVL_0,
@@ -85,17 +87,37 @@ struct bnxt_qplib_pbl {
 	dma_addr_t			*pg_map_arr;
 };
 
+struct bnxt_qplib_sg_info {
+	struct scatterlist		*sghead;
+	u32				nmap;
+	u32				npages;
+	u32				pgshft;
+	u32				pgsize;
+	bool				nopte;
+};
+
+struct bnxt_qplib_hwq_attr {
+	struct bnxt_qplib_res		*res;
+	struct bnxt_qplib_sg_info	*sginfo;
+	enum bnxt_qplib_hwq_type	type;
+	u32				depth;
+	u32				stride;
+	u32				aux_stride;
+	u32				aux_depth;
+};
+
 struct bnxt_qplib_hwq {
 	struct pci_dev			*pdev;
 	/* lock to protect qplib_hwq */
 	spinlock_t			lock;
-	struct bnxt_qplib_pbl		pbl[PBL_LVL_MAX];
+	struct bnxt_qplib_pbl		pbl[PBL_LVL_MAX + 1];
 	enum bnxt_qplib_pbl_lvl		level;		/* 0, 1, or 2 */
 	/* ptr for easy access to the PBL entries */
 	void				**pbl_ptr;
 	/* ptr for easy access to the dma_addr */
 	dma_addr_t			*pbl_dma_ptr;
 	u32				max_elements;
+	u32				depth;
 	u16				element_size;	/* Size of each entry */
 
 	u32				prod;		/* raw */
@@ -159,6 +181,15 @@ struct bnxt_qplib_vf_res {
 #define BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE	64
 #define BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE	128
 
+#define MAX_TQM_ALLOC_REQ               48
+#define MAX_TQM_ALLOC_BLK_SIZE          8
+struct bnxt_qplib_tqm_ctx {
+	struct bnxt_qplib_hwq           pde;
+	u8                              pde_level; /* Original level */
+	struct bnxt_qplib_hwq           qtbl[MAX_TQM_ALLOC_REQ];
+	u8                              qcount[MAX_TQM_ALLOC_REQ];
+};
+
 struct bnxt_qplib_ctx {
 	u32				qpc_count;
 	struct bnxt_qplib_hwq		qpc_tbl;
@@ -169,12 +200,7 @@ struct bnxt_qplib_ctx {
 	u32				cq_count;
 	struct bnxt_qplib_hwq		cq_tbl;
 	struct bnxt_qplib_hwq		tim_tbl;
-#define MAX_TQM_ALLOC_REQ		48
-#define MAX_TQM_ALLOC_BLK_SIZE		8
-	u8				tqm_count[MAX_TQM_ALLOC_REQ];
-	struct bnxt_qplib_hwq		tqm_pde;
-	u32				tqm_pde_level;
-	struct bnxt_qplib_hwq		tqm_tbl[MAX_TQM_ALLOC_REQ];
+	struct bnxt_qplib_tqm_ctx	tqm_ctx;
 	struct bnxt_qplib_stats		stats;
 	struct bnxt_qplib_vf_res	vf_res;
 	u64				hwrm_intf_ver;
@@ -223,11 +249,6 @@ static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx)
 	       RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
 }
 
-struct bnxt_qplib_sg_info {
-	struct scatterlist		*sglist;
-	u32				nmap;
-	u32				npages;
-};
 
 #define to_bnxt_qplib(ptr, type, member)	\
 	container_of(ptr, type, member)
@@ -235,11 +256,10 @@ struct bnxt_qplib_sg_info {
 struct bnxt_qplib_pd;
 struct bnxt_qplib_dev_attr;
 
-void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq);
-int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct bnxt_qplib_sg_info *sg_info, u32 *elements,
-			      u32 elements_per_page, u32 aux, u32 pg_size,
-			      enum bnxt_qplib_hwq_type hwq_type);
+void bnxt_qplib_free_hwq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_hwq *hwq);
+int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq,
+			      struct bnxt_qplib_hwq_attr *hwq_attr);
 void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid);
 int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl,
 			struct bnxt_qplib_pd *pd);
@@ -258,9 +278,9 @@ void bnxt_qplib_free_res(struct bnxt_qplib_res *res);
 int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev,
 			 struct net_device *netdev,
 			 struct bnxt_qplib_dev_attr *dev_attr);
-void bnxt_qplib_free_ctx(struct pci_dev *pdev,
+void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx);
-int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
+int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5);
 #endif /* __BNXT_QPLIB_RES_H__ */
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 40296b97d21e..66954ff6a2f2 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -585,7 +585,7 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw)
 
 	/* Free the qplib's MRW memory */
 	if (mrw->hwq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+		bnxt_qplib_free_hwq(res, &mrw->hwq);
 
 	return 0;
 }
@@ -646,7 +646,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
 	if (mrw->hwq.max_elements) {
 		mrw->va = 0;
 		mrw->total_size = 0;
-		bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+		bnxt_qplib_free_hwq(res, &mrw->hwq);
 	}
 
 	return 0;
@@ -656,10 +656,12 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 		      u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_register_mr req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 	struct creq_register_mr_resp resp;
-	u16 cmd_flags = 0, level;
+	struct cmdq_register_mr req;
 	int pg_ptrs, pages, i, rc;
+	u16 cmd_flags = 0, level;
 	dma_addr_t **pbl_ptr;
 	u32 pg_size;
 
@@ -674,20 +676,23 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 
 		if (pages > MAX_PBL_LVL_1_PGS) {
 			dev_err(&res->pdev->dev,
-				"SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n",
+				"SP: Reg MR: pages requested (0x%x) exceeded max (0x%x)\n",
 				pages, MAX_PBL_LVL_1_PGS);
 			return -ENOMEM;
 		}
 		/* Free the hwq if it already exist, must be a rereg */
 		if (mr->hwq.max_elements)
-			bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
-
-		mr->hwq.max_elements = pages;
+			bnxt_qplib_free_hwq(res, &mr->hwq);
 		/* Use system PAGE_SIZE */
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL,
-					       &mr->hwq.max_elements,
-					       PAGE_SIZE, 0, PAGE_SIZE,
-					       HWQ_TYPE_CTX);
+		hwq_attr.res = res;
+		hwq_attr.depth = pages;
+		hwq_attr.stride = PAGE_SIZE;
+		hwq_attr.type = HWQ_TYPE_MR;
+		hwq_attr.sginfo = &sginfo;
+		hwq_attr.sginfo->npages = pages;
+		hwq_attr.sginfo->pgsize = PAGE_SIZE;
+		hwq_attr.sginfo->pgshft = PAGE_SHIFT;
+		rc = bnxt_qplib_alloc_init_hwq(&mr->hwq, &hwq_attr);
 		if (rc) {
 			dev_err(&res->pdev->dev,
 				"SP: Reg MR memory allocation failed\n");
@@ -734,7 +739,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 
 fail:
 	if (mr->hwq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
+		bnxt_qplib_free_hwq(res, &mr->hwq);
 	return rc;
 }
 
@@ -742,6 +747,8 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
 					struct bnxt_qplib_frpl *frpl,
 					int max_pg_ptrs)
 {
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 	int pg_ptrs, pages, rc;
 
 	/* Re-calculate the max to fit the HWQ allocation model */
@@ -753,10 +760,15 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
 	if (pages > MAX_PBL_LVL_1_PGS)
 		return -ENOMEM;
 
-	frpl->hwq.max_elements = pages;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL,
-				       &frpl->hwq.max_elements, PAGE_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.nopte = true;
+
+	hwq_attr.res = res;
+	hwq_attr.depth = pg_ptrs;
+	hwq_attr.stride = PAGE_SIZE;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	rc = bnxt_qplib_alloc_init_hwq(&frpl->hwq, &hwq_attr);
 	if (!rc)
 		frpl->max_pg_ptrs = pg_ptrs;
 
@@ -766,7 +778,7 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
 int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
 				       struct bnxt_qplib_frpl *frpl)
 {
-	bnxt_qplib_free_hwq(res->pdev, &frpl->hwq);
+	bnxt_qplib_free_hwq(res, &frpl->hwq);
 	return 0;
 }
 

From b08fe048a69d8cb4c497b91c965fff754d3369f9 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:01 -0500
Subject: [PATCH 047/327] RDMA/bnxt_re: Refactor net ring allocation function

Introducing a new attribute structure to reduce the long list of arguments
passed in bnxt_re_net_ring_alloc() function.

The caller of bnxt_re_net_ring_alloc should fill in the list of attributes
in bnxt_re_ring_attr structure and then pass the pointer to the function.

Link: https://lore.kernel.org/r/1581786665-23705-5-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h |  9 ++++
 drivers/infiniband/hw/bnxt_re/main.c    | 64 ++++++++++++++-----------
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 86274f4c8225..c736e8254e75 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -89,6 +89,15 @@
 
 #define BNXT_RE_DEFAULT_ACK_DELAY	16
 
+struct bnxt_re_ring_attr {
+	dma_addr_t	*dma_arr;
+	int		pages;
+	int		type;
+	u32		depth;
+	u32		lrid; /* Logical ring id */
+	u8		mode;
+};
+
 struct bnxt_re_work {
 	struct work_struct	work;
 	unsigned long		event;
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index a966c68dbc59..f013bf687384 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -427,9 +427,9 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
 	return rc;
 }
 
-static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
-				  int pages, int type, u32 ring_mask,
-				  u32 map_index, u16 *fw_ring_id)
+static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev,
+				  struct bnxt_re_ring_attr *ring_attr,
+				  u16 *fw_ring_id)
 {
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
 	struct hwrm_ring_alloc_input req = {0};
@@ -443,18 +443,18 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
 	memset(&fw_msg, 0, sizeof(fw_msg));
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1);
 	req.enables = 0;
-	req.page_tbl_addr =  cpu_to_le64(dma_arr[0]);
-	if (pages > 1) {
+	req.page_tbl_addr =  cpu_to_le64(ring_attr->dma_arr[0]);
+	if (ring_attr->pages > 1) {
 		/* Page size is in log2 units */
 		req.page_size = BNXT_PAGE_SHIFT;
 		req.page_tbl_depth = 1;
 	}
 	req.fbo = 0;
 	/* Association of ring index with doorbell index and MSIX number */
-	req.logical_id = cpu_to_le16(map_index);
-	req.length = cpu_to_le32(ring_mask + 1);
-	req.ring_type = type;
-	req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+	req.logical_id = cpu_to_le16(ring_attr->lrid);
+	req.length = cpu_to_le32(ring_attr->depth + 1);
+	req.ring_type = ring_attr->type;
+	req.int_mode = ring_attr->mode;
 	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
@@ -1006,10 +1006,10 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev)
 
 static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_re_ring_attr rattr = {};
+	struct bnxt_qplib_ctx *qplib_ctx;
 	int num_vec_created = 0;
-	dma_addr_t *pg_map;
 	int rc = 0, i;
-	int pages;
 	u8 type;
 
 	/* Configure and allocate resources for qplib */
@@ -1030,10 +1030,13 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 	if (rc)
 		goto dealloc_res;
 
+	qplib_ctx = &rdev->qplib_ctx;
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		rdev->nq[i].res = &rdev->qplib_res;
-		rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
-			BNXT_RE_MAX_SRQC_COUNT + 2;
+		struct bnxt_qplib_nq *nq;
+
+		nq = &rdev->nq[i];
+		nq->hwq.max_elements = (qplib_ctx->cq_count +
+					qplib_ctx->srqc_count + 2);
 		rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
@@ -1041,12 +1044,13 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 			goto free_nq;
 		}
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
-		pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
-		pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
-		rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
-					    BNXT_QPLIB_NQE_MAX_CNT - 1,
-					    rdev->msix_entries[i + 1].ring_idx,
-					    &rdev->nq[i].ring_id);
+		rattr.dma_arr = nq->hwq.pbl[PBL_LVL_0].pg_map_arr;
+		rattr.pages = nq->hwq.pbl[rdev->nq[i].hwq.level].pg_count;
+		rattr.type = type;
+		rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+		rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1;
+		rattr.lrid = rdev->msix_entries[i + 1].ring_idx;
+		rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id);
 		if (rc) {
 			dev_err(rdev_to_dev(rdev),
 				"Failed to allocate NQ fw id with rc = 0x%x",
@@ -1371,10 +1375,10 @@ static void bnxt_re_worker(struct work_struct *work)
 
 static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 {
-	dma_addr_t *pg_map;
-	u32 db_offt, ridx;
-	int pages, vid;
+	struct bnxt_re_ring_attr rattr;
+	u32 db_offt;
 	bool locked;
+	int vid;
 	u8 type;
 	int rc;
 
@@ -1383,6 +1387,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	locked = true;
 
 	/* Registered a new RoCE device instance to netdev */
+	memset(&rattr, 0, sizeof(rattr));
 	rc = bnxt_re_register_netdev(rdev);
 	if (rc) {
 		rtnl_unlock();
@@ -1422,12 +1427,13 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	}
 
 	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
-	pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
-	pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
-	ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
-	rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
-				    BNXT_QPLIB_CREQE_MAX_CNT - 1,
-				    ridx, &rdev->rcfw.creq_ring_id);
+	rattr.dma_arr = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
+	rattr.pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
+	rattr.type = type;
+	rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+	rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1;
+	rattr.lrid = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
+	rc = bnxt_re_net_ring_alloc(rdev, &rattr, &rdev->rcfw.creq_ring_id);
 	if (rc) {
 		pr_err("Failed to allocate CREQ: %#x\n", rc);
 		goto free_rcfw;

From cee0c7bba4869170fd471758053406784eba35a5 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:02 -0500
Subject: [PATCH 048/327] RDMA/bnxt_re: Refactor command queue management code

Refactoring the command queue (rcfw) management code. A new data-structure
is introduced to describe the bar register.  each object which deals with
mmio space should have a descriptor structure. This structure specifically
hold DB register information.  Thus, slow path creq structure now hold a
bar register descriptor.

Further cleanup the rcfw structure to introduce the command queue context
and command response event queue context structures. Rest of the rcfw
related code has been touched to incorporate these three structures.

Link: https://lore.kernel.org/r/1581786665-23705-6-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Naresh Kumar PBS <nareshkumar.pbs@broadcom.com>
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c       |  12 +-
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 427 +++++++++++++--------
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h |  80 ++--
 drivers/infiniband/hw/bnxt_re/qplib_res.h  |   7 +
 4 files changed, 317 insertions(+), 209 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index f013bf687384..65f106f84c33 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1344,7 +1344,7 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 		bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
-		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
+		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
 		bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
@@ -1375,6 +1375,7 @@ static void bnxt_re_worker(struct work_struct *work)
 
 static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_qplib_creq_ctx *creq;
 	struct bnxt_re_ring_attr rattr;
 	u32 db_offt;
 	bool locked;
@@ -1427,13 +1428,14 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	}
 
 	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
-	rattr.dma_arr = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
-	rattr.pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
+	creq = &rdev->rcfw.creq;
+	rattr.dma_arr = creq->hwq.pbl[PBL_LVL_0].pg_map_arr;
+	rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count;
 	rattr.type = type;
 	rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
 	rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1;
 	rattr.lrid = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
-	rc = bnxt_re_net_ring_alloc(rdev, &rattr, &rdev->rcfw.creq_ring_id);
+	rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
 	if (rc) {
 		pr_err("Failed to allocate CREQ: %#x\n", rc);
 		goto free_rcfw;
@@ -1527,7 +1529,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
 	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
-	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
+	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index fada81c300ba..119113ecfb64 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -55,12 +55,14 @@ static void bnxt_qplib_service_creq(unsigned long data);
 /* Hardware communication channel */
 static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 {
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	u16 cbit;
 	int rc;
 
+	cmdq = &rcfw->cmdq;
 	cbit = cookie % rcfw->cmdq_depth;
-	rc = wait_event_timeout(rcfw->waitq,
-				!test_bit(cbit, rcfw->cmdq_bitmap),
+	rc = wait_event_timeout(cmdq->waitq,
+				!test_bit(cbit, cmdq->cmdq_bitmap),
 				msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS));
 	return rc ? 0 : -ETIMEDOUT;
 };
@@ -68,15 +70,17 @@ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 {
 	u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	u16 cbit;
 
+	cmdq = &rcfw->cmdq;
 	cbit = cookie % rcfw->cmdq_depth;
-	if (!test_bit(cbit, rcfw->cmdq_bitmap))
+	if (!test_bit(cbit, cmdq->cmdq_bitmap))
 		goto done;
 	do {
 		mdelay(1); /* 1m sec */
 		bnxt_qplib_service_creq((unsigned long)rcfw);
-	} while (test_bit(cbit, rcfw->cmdq_bitmap) && --count);
+	} while (test_bit(cbit, cmdq->cmdq_bitmap) && --count);
 done:
 	return count ? 0 : -ETIMEDOUT;
 };
@@ -84,56 +88,61 @@ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 			  struct creq_base *resp, void *sb, u8 is_block)
 {
-	struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr;
-	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
+	struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq;
+	struct bnxt_qplib_cmdqe *cmdqe, **hwq_ptr;
+	struct bnxt_qplib_hwq *hwq = &cmdq->hwq;
+	struct bnxt_qplib_crsqe *crsqe;
 	u32 cmdq_depth = rcfw->cmdq_depth;
-	struct bnxt_qplib_crsq *crsqe;
 	u32 sw_prod, cmdq_prod;
+	struct pci_dev *pdev;
 	unsigned long flags;
 	u32 size, opcode;
 	u16 cookie, cbit;
+	int pg, idx;
 	u8 *preq;
 
+	pdev = rcfw->pdev;
+
 	opcode = req->opcode;
-	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
+	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
 	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
 	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
 	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
-		dev_err(&rcfw->pdev->dev,
+		dev_err(&pdev->dev,
 			"RCFW not initialized, reject opcode 0x%x\n", opcode);
 		return -EINVAL;
 	}
 
-	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
+	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
 	    opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
-		dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n");
+		dev_err(&pdev->dev, "RCFW already initialized!\n");
 		return -EINVAL;
 	}
 
-	if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags))
+	if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags))
 		return -ETIMEDOUT;
 
 	/* Cmdq are in 16-byte units, each request can consume 1 or more
 	 * cmdqe
 	 */
-	spin_lock_irqsave(&cmdq->lock, flags);
-	if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) {
-		dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n");
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+	spin_lock_irqsave(&hwq->lock, flags);
+	if (req->cmd_size >= HWQ_FREE_SLOTS(hwq)) {
+		dev_err(&pdev->dev, "RCFW: CMDQ is full!\n");
+		spin_unlock_irqrestore(&hwq->lock, flags);
 		return -EAGAIN;
 	}
 
 
-	cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE;
+	cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE;
 	cbit = cookie % rcfw->cmdq_depth;
 	if (is_block)
 		cookie |= RCFW_CMD_IS_BLOCKING;
 
-	set_bit(cbit, rcfw->cmdq_bitmap);
+	set_bit(cbit, cmdq->cmdq_bitmap);
 	req->cookie = cpu_to_le16(cookie);
 	crsqe = &rcfw->crsqe_tbl[cbit];
 	if (crsqe->resp) {
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+		spin_unlock_irqrestore(&hwq->lock, flags);
 		return -EBUSY;
 	}
 
@@ -155,15 +164,18 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 				  BNXT_QPLIB_CMDQE_UNITS;
 	}
 
-	cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr;
+	hwq_ptr = (struct bnxt_qplib_cmdqe **)hwq->pbl_ptr;
 	preq = (u8 *)req;
 	do {
+		pg = 0;
+		idx = 0;
+
 		/* Locate the next cmdq slot */
-		sw_prod = HWQ_CMP(cmdq->prod, cmdq);
-		cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]
+		sw_prod = HWQ_CMP(hwq->prod, hwq);
+		cmdqe = &hwq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]
 				[get_cmdq_idx(sw_prod, cmdq_depth)];
 		if (!cmdqe) {
-			dev_err(&rcfw->pdev->dev,
+			dev_err(&pdev->dev,
 				"RCFW request failed with no cmdqe!\n");
 			goto done;
 		}
@@ -172,31 +184,27 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 		memcpy(cmdqe, preq, min_t(u32, size, sizeof(*cmdqe)));
 		preq += min_t(u32, size, sizeof(*cmdqe));
 		size -= min_t(u32, size, sizeof(*cmdqe));
-		cmdq->prod++;
-		rcfw->seq_num++;
+		hwq->prod++;
 	} while (size > 0);
+	cmdq->seq_num++;
 
-	rcfw->seq_num++;
-
-	cmdq_prod = cmdq->prod;
-	if (test_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags)) {
+	cmdq_prod = hwq->prod;
+	if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
 		/* The very first doorbell write
 		 * is required to set this flag
 		 * which prompts the FW to reset
 		 * its internal pointers
 		 */
 		cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
-		clear_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
+		clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
 	}
 
 	/* ring CMDQ DB */
 	wmb();
-	writel(cmdq_prod, rcfw->cmdq_bar_reg_iomem +
-	       rcfw->cmdq_bar_reg_prod_off);
-	writel(RCFW_CMDQ_TRIG_VAL, rcfw->cmdq_bar_reg_iomem +
-	       rcfw->cmdq_bar_reg_trig_off);
+	writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+	writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
 done:
-	spin_unlock_irqrestore(&cmdq->lock, flags);
+	spin_unlock_irqrestore(&hwq->lock, flags);
 	/* Return the CREQ response pointer */
 	return 0;
 }
@@ -236,7 +244,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 		/* timed out */
 		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",
 			cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
-		set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags);
+		set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags);
 		return rc;
 	}
 
@@ -253,6 +261,8 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw,
 static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw,
 					 struct creq_func_event *func_event)
 {
+	int rc;
+
 	switch (func_event->event) {
 	case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
 		break;
@@ -286,37 +296,41 @@ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw,
 	default:
 		return -EINVAL;
 	}
-	return 0;
+
+	rc = rcfw->creq.aeq_handler(rcfw, (void *)func_event, NULL);
+	return rc;
 }
 
 static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 				       struct creq_qp_event *qp_event)
 {
-	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
 	struct creq_qp_error_notification *err_event;
-	struct bnxt_qplib_crsq *crsqe;
-	unsigned long flags;
+	struct bnxt_qplib_hwq *hwq = &rcfw->cmdq.hwq;
+	struct bnxt_qplib_crsqe *crsqe;
 	struct bnxt_qplib_qp *qp;
 	u16 cbit, blocked = 0;
-	u16 cookie;
+	struct pci_dev *pdev;
+	unsigned long flags;
 	__le16  mcookie;
+	u16 cookie;
+	int rc = 0;
 	u32 qp_id;
 
+	pdev = rcfw->pdev;
 	switch (qp_event->event) {
 	case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
 		err_event = (struct creq_qp_error_notification *)qp_event;
 		qp_id = le32_to_cpu(err_event->xid);
 		qp = rcfw->qp_tbl[qp_id].qp_handle;
-		dev_dbg(&rcfw->pdev->dev,
-			"Received QP error notification\n");
-		dev_dbg(&rcfw->pdev->dev,
+		dev_dbg(&pdev->dev, "Received QP error notification\n");
+		dev_dbg(&pdev->dev,
 			"qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
 			qp_id, err_event->req_err_state_reason,
 			err_event->res_err_state_reason);
 		if (!qp)
 			break;
 		bnxt_qplib_mark_qp_error(qp);
-		rcfw->aeq_handler(rcfw, qp_event, qp);
+		rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp);
 		break;
 	default:
 		/*
@@ -328,7 +342,7 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 		 *
 		 */
 
-		spin_lock_irqsave_nested(&cmdq->lock, flags,
+		spin_lock_irqsave_nested(&hwq->lock, flags,
 					 SINGLE_DEPTH_NESTING);
 		cookie = le16_to_cpu(qp_event->cookie);
 		mcookie = qp_event->cookie;
@@ -342,23 +356,23 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 			crsqe->resp = NULL;
 		} else {
 			if (crsqe->resp && crsqe->resp->cookie)
-				dev_err(&rcfw->pdev->dev,
+				dev_err(&pdev->dev,
 					"CMD %s cookie sent=%#x, recd=%#x\n",
 					crsqe->resp ? "mismatch" : "collision",
 					crsqe->resp ? crsqe->resp->cookie : 0,
 					mcookie);
 		}
-		if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
-			dev_warn(&rcfw->pdev->dev,
+		if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap))
+			dev_warn(&pdev->dev,
 				 "CMD bit %d was not requested\n", cbit);
-		cmdq->cons += crsqe->req_size;
+		hwq->cons += crsqe->req_size;
 		crsqe->req_size = 0;
 
 		if (!blocked)
-			wake_up(&rcfw->waitq);
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+			wake_up(&rcfw->cmdq.waitq);
+		spin_unlock_irqrestore(&hwq->lock, flags);
 	}
-	return 0;
+	return rc;
 }
 
 /* SP - CREQ Completion handlers */
@@ -366,20 +380,21 @@ static void bnxt_qplib_service_creq(unsigned long data)
 {
 	struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
 	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
-	struct bnxt_qplib_hwq *creq = &rcfw->creq;
+	struct bnxt_qplib_creq_ctx *creq = &rcfw->creq;
 	u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
-	struct creq_base *creqe, **creq_ptr;
+	struct bnxt_qplib_hwq *hwq = &creq->hwq;
+	struct creq_base *creqe, **hwq_ptr;
 	u32 sw_cons, raw_cons;
 	unsigned long flags;
 
 	/* Service the CREQ until budget is over */
-	spin_lock_irqsave(&creq->lock, flags);
-	raw_cons = creq->cons;
+	spin_lock_irqsave(&hwq->lock, flags);
+	raw_cons = hwq->cons;
 	while (budget > 0) {
-		sw_cons = HWQ_CMP(raw_cons, creq);
-		creq_ptr = (struct creq_base **)creq->pbl_ptr;
-		creqe = &creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)];
-		if (!CREQ_CMP_VALID(creqe, raw_cons, creq->max_elements))
+		sw_cons = HWQ_CMP(raw_cons, hwq);
+		hwq_ptr = (struct creq_base **)hwq->pbl_ptr;
+		creqe = &hwq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)];
+		if (!CREQ_CMP_VALID(creqe, raw_cons, hwq->max_elements))
 			break;
 		/* The valid test of the entry must be done first before
 		 * reading any further.
@@ -391,12 +406,12 @@ static void bnxt_qplib_service_creq(unsigned long data)
 		case CREQ_BASE_TYPE_QP_EVENT:
 			bnxt_qplib_process_qp_event
 				(rcfw, (struct creq_qp_event *)creqe);
-			rcfw->creq_qp_event_processed++;
+			creq->stats.creq_qp_event_processed++;
 			break;
 		case CREQ_BASE_TYPE_FUNC_EVENT:
 			if (!bnxt_qplib_process_func_event
 			    (rcfw, (struct creq_func_event *)creqe))
-				rcfw->creq_func_event_processed++;
+				creq->stats.creq_func_event_processed++;
 			else
 				dev_warn(&rcfw->pdev->dev,
 					 "aeqe:%#x Not handled\n", type);
@@ -412,28 +427,31 @@ static void bnxt_qplib_service_creq(unsigned long data)
 		budget--;
 	}
 
-	if (creq->cons != raw_cons) {
-		creq->cons = raw_cons;
-		bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
-					      raw_cons, creq->max_elements,
-					      rcfw->creq_ring_id, gen_p5);
+	if (hwq->cons != raw_cons) {
+		hwq->cons = raw_cons;
+		bnxt_qplib_ring_creq_db_rearm(creq->creq_db.db,
+					      raw_cons, hwq->max_elements,
+					      creq->ring_id, gen_p5);
 	}
-	spin_unlock_irqrestore(&creq->lock, flags);
+	spin_unlock_irqrestore(&hwq->lock, flags);
 }
 
 static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance)
 {
 	struct bnxt_qplib_rcfw *rcfw = dev_instance;
-	struct bnxt_qplib_hwq *creq = &rcfw->creq;
+	struct bnxt_qplib_creq_ctx *creq;
 	struct creq_base **creq_ptr;
+	struct bnxt_qplib_hwq *hwq;
 	u32 sw_cons;
 
+	creq = &rcfw->creq;
+	hwq = &creq->hwq;
 	/* Prefetch the CREQ element */
-	sw_cons = HWQ_CMP(creq->cons, creq);
-	creq_ptr = (struct creq_base **)rcfw->creq.pbl_ptr;
+	sw_cons = HWQ_CMP(hwq->cons, hwq);
+	creq_ptr = (struct creq_base **)creq->hwq.pbl_ptr;
 	prefetch(&creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]);
 
-	tasklet_schedule(&rcfw->worker);
+	tasklet_schedule(&creq->creq_tasklet);
 
 	return IRQ_HANDLED;
 }
@@ -452,7 +470,7 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw)
 	if (rc)
 		return rc;
 
-	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
+	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
 	return 0;
 }
 
@@ -556,16 +574,17 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 					  NULL, 0);
 	if (rc)
 		return rc;
-	set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
+	set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
 	return 0;
 }
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
+	kfree(rcfw->cmdq.cmdq_bitmap);
 	kfree(rcfw->qp_tbl);
 	kfree(rcfw->crsqe_tbl);
-	bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq);
-	bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq);
 	rcfw->pdev = NULL;
 }
 
@@ -576,8 +595,13 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 {
 	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct bnxt_qplib_sg_info sginfo = {};
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
+	u32 bmap_size = 0;
 
 	rcfw->pdev = res->pdev;
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
 	rcfw->res = res;
 
 	sginfo.pgsize = PAGE_SIZE;
@@ -589,7 +613,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 	hwq_attr.stride = BNXT_QPLIB_CREQE_UNITS;
 	hwq_attr.type = bnxt_qplib_get_hwq_type(res);
 
-	if (bnxt_qplib_alloc_init_hwq(&rcfw->creq, &hwq_attr)) {
+	if (bnxt_qplib_alloc_init_hwq(&creq->hwq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CREQ allocation failed\n");
 		goto fail;
@@ -603,17 +627,24 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 	hwq_attr.depth = rcfw->cmdq_depth;
 	hwq_attr.stride = BNXT_QPLIB_CMDQE_UNITS;
 	hwq_attr.type = HWQ_TYPE_CTX;
-	if (bnxt_qplib_alloc_init_hwq(&rcfw->cmdq, &hwq_attr)) {
+	if (bnxt_qplib_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CMDQ allocation failed\n");
 		goto fail;
 	}
 
-	rcfw->crsqe_tbl = kcalloc(rcfw->cmdq.max_elements,
+	rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements,
 				  sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
 	if (!rcfw->crsqe_tbl)
 		goto fail;
 
+	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
+	cmdq->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
+	if (!cmdq->cmdq_bitmap)
+		goto fail;
+
+	cmdq->bmap_size = bmap_size;
+
 	rcfw->qp_tbl_size = qp_tbl_sz;
 	rcfw->qp_tbl = kcalloc(qp_tbl_sz, sizeof(struct bnxt_qplib_qp_node),
 			       GFP_KERNEL);
@@ -630,137 +661,201 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
 {
 	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+	struct bnxt_qplib_creq_ctx *creq;
 
-	tasklet_disable(&rcfw->worker);
+	creq = &rcfw->creq;
+	tasklet_disable(&creq->creq_tasklet);
 	/* Mask h/w interrupts */
-	bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
-				rcfw->creq.max_elements, rcfw->creq_ring_id,
+	bnxt_qplib_ring_creq_db(creq->creq_db.db, creq->hwq.cons,
+				creq->hwq.max_elements, creq->ring_id,
 				gen_p5);
 	/* Sync with last running IRQ-handler */
-	synchronize_irq(rcfw->vector);
+	synchronize_irq(creq->msix_vec);
 	if (kill)
-		tasklet_kill(&rcfw->worker);
+		tasklet_kill(&creq->creq_tasklet);
 
-	if (rcfw->requested) {
-		free_irq(rcfw->vector, rcfw);
-		rcfw->requested = false;
+	if (creq->requested) {
+		free_irq(creq->msix_vec, rcfw);
+		creq->requested = false;
 	}
 }
 
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	unsigned long indx;
 
+	creq = &rcfw->creq;
+	cmdq = &rcfw->cmdq;
+	/* Make sure the HW channel is stopped! */
 	bnxt_qplib_rcfw_stop_irq(rcfw, true);
 
-	iounmap(rcfw->cmdq_bar_reg_iomem);
-	iounmap(rcfw->creq_bar_reg_iomem);
+	iounmap(cmdq->cmdq_mbox.reg.bar_reg);
+	iounmap(creq->creq_db.reg.bar_reg);
 
-	indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size);
-	if (indx != rcfw->bmap_size)
+	indx = find_first_bit(cmdq->cmdq_bitmap, cmdq->bmap_size);
+	if (indx != cmdq->bmap_size)
 		dev_err(&rcfw->pdev->dev,
 			"disabling RCFW with pending cmd-bit %lx\n", indx);
-	kfree(rcfw->cmdq_bitmap);
-	rcfw->bmap_size = 0;
 
-	rcfw->cmdq_bar_reg_iomem = NULL;
-	rcfw->creq_bar_reg_iomem = NULL;
-	rcfw->aeq_handler = NULL;
-	rcfw->vector = 0;
+	cmdq->cmdq_mbox.reg.bar_reg = NULL;
+	creq->creq_db.reg.bar_reg = NULL;
+	creq->aeq_handler = NULL;
+	creq->msix_vec = 0;
 }
 
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init)
 {
 	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+	struct bnxt_qplib_creq_ctx *creq;
 	int rc;
 
-	if (rcfw->requested)
+	creq = &rcfw->creq;
+
+	if (creq->requested)
 		return -EFAULT;
 
-	rcfw->vector = msix_vector;
+	creq->msix_vec = msix_vector;
 	if (need_init)
-		tasklet_init(&rcfw->worker,
+		tasklet_init(&creq->creq_tasklet,
 			     bnxt_qplib_service_creq, (unsigned long)rcfw);
 	else
-		tasklet_enable(&rcfw->worker);
-	rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0,
+		tasklet_enable(&creq->creq_tasklet);
+	rc = request_irq(creq->msix_vec, bnxt_qplib_creq_irq, 0,
 			 "bnxt_qplib_creq", rcfw);
 	if (rc)
 		return rc;
-	rcfw->requested = true;
-	bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
-				      rcfw->creq.cons, rcfw->creq.max_elements,
-				      rcfw->creq_ring_id, gen_p5);
+	creq->requested = true;
+	bnxt_qplib_ring_creq_db_rearm(creq->creq_db.db,
+				      creq->hwq.cons, creq->hwq.max_elements,
+				      creq->ring_id, gen_p5);
 
 	return 0;
 }
 
+static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf)
+{
+	struct bnxt_qplib_cmdq_mbox *mbox;
+	resource_size_t bar_reg;
+	struct pci_dev *pdev;
+	u16 prod_offt;
+	int rc = 0;
+
+	pdev = rcfw->pdev;
+	mbox = &rcfw->cmdq.cmdq_mbox;
+
+	mbox->reg.bar_id = RCFW_COMM_PCI_BAR_REGION;
+	mbox->reg.len = RCFW_COMM_SIZE;
+	mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id);
+	if (!mbox->reg.bar_base) {
+		dev_err(&pdev->dev,
+			"QPLIB: CMDQ BAR region %d resc start is 0!\n",
+			mbox->reg.bar_id);
+		return -ENOMEM;
+	}
+
+	bar_reg = mbox->reg.bar_base + RCFW_COMM_BASE_OFFSET;
+	mbox->reg.len = RCFW_COMM_SIZE;
+	mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len);
+	if (!mbox->reg.bar_reg) {
+		dev_err(&pdev->dev,
+			"QPLIB: CMDQ BAR region %d mapping failed\n",
+			mbox->reg.bar_id);
+		return -ENOMEM;
+	}
+
+	prod_offt = is_vf ? RCFW_VF_COMM_PROD_OFFSET :
+			    RCFW_PF_COMM_PROD_OFFSET;
+	mbox->prod = (void  __iomem *)(mbox->reg.bar_reg + prod_offt);
+	mbox->db = (void __iomem *)(mbox->reg.bar_reg + RCFW_COMM_TRIG_OFFSET);
+	return rc;
+}
+
+static int bnxt_qplib_map_creq_db(struct bnxt_qplib_rcfw *rcfw, u32 reg_offt)
+{
+	struct bnxt_qplib_creq_db *creq_db;
+	resource_size_t bar_reg;
+	struct pci_dev *pdev;
+
+	pdev = rcfw->pdev;
+	creq_db = &rcfw->creq.creq_db;
+
+	creq_db->reg.bar_id = RCFW_COMM_CONS_PCI_BAR_REGION;
+	creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
+	if (!creq_db->reg.bar_id)
+		dev_err(&pdev->dev,
+			"QPLIB: CREQ BAR region %d resc start is 0!",
+			creq_db->reg.bar_id);
+
+	bar_reg = creq_db->reg.bar_base + reg_offt;
+	/* Unconditionally map 8 bytes to support 57500 series */
+	creq_db->reg.len = 8;
+	creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len);
+	if (!creq_db->reg.bar_reg) {
+		dev_err(&pdev->dev,
+			"QPLIB: CREQ BAR region %d mapping failed",
+			creq_db->reg.bar_id);
+		return -ENOMEM;
+	}
+	creq_db->db = creq_db->reg.bar_reg;
+	return 0;
+}
+
+static void bnxt_qplib_start_rcfw(struct bnxt_qplib_rcfw *rcfw)
+{
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_qplib_cmdq_mbox *mbox;
+	struct cmdq_init init = {0};
+
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
+	mbox = &cmdq->cmdq_mbox;
+
+	init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[PBL_LVL_0].pg_map_arr[0]);
+	init.cmdq_size_cmdq_lvl =
+			cpu_to_le16(((rcfw->cmdq_depth <<
+				      CMDQ_INIT_CMDQ_SIZE_SFT) &
+				    CMDQ_INIT_CMDQ_SIZE_MASK) |
+				    ((cmdq->hwq.level <<
+				      CMDQ_INIT_CMDQ_LVL_SFT) &
+				    CMDQ_INIT_CMDQ_LVL_MASK));
+	init.creq_ring_id = cpu_to_le16(creq->ring_id);
+	/* Write to the Bono mailbox register */
+	__iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
+}
+
 int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
-				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
-						      void *, void *))
+				   aeq_handler_t aeq_handler)
 {
-	resource_size_t res_base;
-	struct cmdq_init init;
-	struct pci_dev *pdev;
-	u16 bmap_size;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
 	int rc;
 
-	/* General */
-	pdev = rcfw->pdev;
-	rcfw->seq_num = 0;
-	set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
-	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
-	rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
-	if (!rcfw->cmdq_bitmap)
-		return -ENOMEM;
-	rcfw->bmap_size = bmap_size;
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
 
-	/* CMDQ */
-	rcfw->cmdq_bar_reg = RCFW_COMM_PCI_BAR_REGION;
-	res_base = pci_resource_start(pdev, rcfw->cmdq_bar_reg);
-	if (!res_base)
-		return -ENOMEM;
+	/* Clear to defaults */
 
-	rcfw->cmdq_bar_reg_iomem = ioremap(res_base +
-					      RCFW_COMM_BASE_OFFSET,
-					      RCFW_COMM_SIZE);
-	if (!rcfw->cmdq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n",
-			rcfw->cmdq_bar_reg);
-		return -ENOMEM;
-	}
+	cmdq->seq_num = 0;
+	set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+	init_waitqueue_head(&cmdq->waitq);
 
-	rcfw->cmdq_bar_reg_prod_off = virt_fn ? RCFW_VF_COMM_PROD_OFFSET :
-					RCFW_PF_COMM_PROD_OFFSET;
+	creq->stats.creq_qp_event_processed = 0;
+	creq->stats.creq_func_event_processed = 0;
+	creq->aeq_handler = aeq_handler;
 
-	rcfw->cmdq_bar_reg_trig_off = RCFW_COMM_TRIG_OFFSET;
+	rc = bnxt_qplib_map_cmdq_mbox(rcfw, virt_fn);
+	if (rc)
+		return rc;
 
-	/* CREQ */
-	rcfw->creq_bar_reg = RCFW_COMM_CONS_PCI_BAR_REGION;
-	res_base = pci_resource_start(pdev, rcfw->creq_bar_reg);
-	if (!res_base)
-		dev_err(&rcfw->pdev->dev,
-			"CREQ BAR region %d resc start is 0!\n",
-			rcfw->creq_bar_reg);
-	/* Unconditionally map 8 bytes to support 57500 series */
-	rcfw->creq_bar_reg_iomem = ioremap(res_base + cp_bar_reg_off,
-						   8);
-	if (!rcfw->creq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
-			rcfw->creq_bar_reg);
-		iounmap(rcfw->cmdq_bar_reg_iomem);
-		rcfw->cmdq_bar_reg_iomem = NULL;
-		return -ENOMEM;
-	}
-	rcfw->creq_qp_event_processed = 0;
-	rcfw->creq_func_event_processed = 0;
-
-	if (aeq_handler)
-		rcfw->aeq_handler = aeq_handler;
-	init_waitqueue_head(&rcfw->waitq);
+	rc = bnxt_qplib_map_creq_db(rcfw, cp_bar_reg_off);
+	if (rc)
+		return rc;
 
 	rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true);
 	if (rc) {
@@ -770,16 +865,8 @@ int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 		return rc;
 	}
 
-	init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]);
-	init.cmdq_size_cmdq_lvl = cpu_to_le16(
-		((rcfw->cmdq_depth << CMDQ_INIT_CMDQ_SIZE_SFT) &
-		 CMDQ_INIT_CMDQ_SIZE_MASK) |
-		((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) &
-		 CMDQ_INIT_CMDQ_LVL_MASK));
-	init.creq_ring_id = cpu_to_le16(rcfw->creq_ring_id);
+	bnxt_qplib_start_rcfw(rcfw);
 
-	/* Write to the Bono mailbox register */
-	__iowrite32_copy(rcfw->cmdq_bar_reg_iomem, &init, sizeof(init) / 4);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index ab1531c7e27f..1aff6d458ac5 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -206,8 +206,9 @@ static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons,
 #define CREQ_ENTRY_POLL_BUDGET		0x100
 
 /* HWQ */
+typedef int (*aeq_handler_t)(struct bnxt_qplib_rcfw *, void *, void *);
 
-struct bnxt_qplib_crsq {
+struct bnxt_qplib_crsqe {
 	struct creq_qp_event	*resp;
 	u32			req_size;
 };
@@ -225,41 +226,53 @@ struct bnxt_qplib_qp_node {
 
 #define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF
 
+#define FIRMWARE_INITIALIZED_FLAG	(0)
+#define FIRMWARE_FIRST_FLAG		(31)
+#define FIRMWARE_TIMED_OUT		(3)
+struct bnxt_qplib_cmdq_mbox {
+	struct bnxt_qplib_reg_desc	reg;
+	void __iomem			*prod;
+	void __iomem			*db;
+};
+
+struct bnxt_qplib_cmdq_ctx {
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_cmdq_mbox	cmdq_mbox;
+	wait_queue_head_t		waitq;
+	unsigned long			flags;
+	unsigned long			*cmdq_bitmap;
+	u32				bmap_size;
+	u32				seq_num;
+};
+
+struct bnxt_qplib_creq_db {
+	struct bnxt_qplib_reg_desc	reg;
+	void __iomem			*db;
+};
+
+struct bnxt_qplib_creq_stat {
+	u64	creq_qp_event_processed;
+	u64	creq_func_event_processed;
+};
+
+struct bnxt_qplib_creq_ctx {
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_creq_db	creq_db;
+	struct bnxt_qplib_creq_stat	stats;
+	struct tasklet_struct		creq_tasklet;
+	aeq_handler_t			aeq_handler;
+	u16				ring_id;
+	int				msix_vec;
+	bool				requested; /*irq handler installed */
+};
+
 /* RCFW Communication Channels */
 struct bnxt_qplib_rcfw {
 	struct pci_dev		*pdev;
 	struct bnxt_qplib_res	*res;
-	int			vector;
-	struct tasklet_struct	worker;
-	bool			requested;
-	unsigned long		*cmdq_bitmap;
-	u32			bmap_size;
-	unsigned long		flags;
-#define FIRMWARE_INITIALIZED_FLAG	0
-#define FIRMWARE_FIRST_FLAG		31
-#define FIRMWARE_TIMED_OUT		3
-	wait_queue_head_t	waitq;
-	int			(*aeq_handler)(struct bnxt_qplib_rcfw *,
-					       void *, void *);
-	u32			seq_num;
-
-	/* Bar region info */
-	void __iomem		*cmdq_bar_reg_iomem;
-	u16			cmdq_bar_reg;
-	u16			cmdq_bar_reg_prod_off;
-	u16			cmdq_bar_reg_trig_off;
-	u16			creq_ring_id;
-	u16			creq_bar_reg;
-	void __iomem		*creq_bar_reg_iomem;
-
-	/* Cmd-Resp and Async Event notification queue */
-	struct bnxt_qplib_hwq	creq;
-	u64			creq_qp_event_processed;
-	u64			creq_func_event_processed;
-
-	/* Actual Cmd and Resp Queues */
-	struct bnxt_qplib_hwq	cmdq;
-	struct bnxt_qplib_crsq	*crsqe_tbl;
+	struct bnxt_qplib_cmdq_ctx	cmdq;
+	struct bnxt_qplib_creq_ctx	creq;
+	struct bnxt_qplib_crsqe		*crsqe_tbl;
 	int qp_tbl_size;
 	struct bnxt_qplib_qp_node *qp_tbl;
 	u64 oos_prev;
@@ -279,8 +292,7 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
-				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
-						      void *aeqe, void *obj));
+				   aeq_handler_t aeq_handler);
 
 struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
 				struct bnxt_qplib_rcfw *rcfw,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index fe8a6dd7aeb1..5fa278e744eb 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -80,6 +80,13 @@ enum bnxt_qplib_pbl_lvl {
 #define ROCE_PG_SIZE_8M		(8 * 1024 * 1024)
 #define ROCE_PG_SIZE_1G		(1024 * 1024 * 1024)
 
+struct bnxt_qplib_reg_desc {
+	u8		bar_id;
+	resource_size_t	bar_base;
+	void __iomem	*bar_reg;
+	size_t		len;
+};
+
 struct bnxt_qplib_pbl {
 	u32				pg_count;
 	u32				pg_size;

From 9555352bacfdfc68a972d5bf8a08849183b9e607 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:03 -0500
Subject: [PATCH 049/327] RDMA/bnxt_re: Refactor notification queue management
 code

Cleaning up the notification queue data structures and management
code. The CQ and SRQ event handlers have been type defined instead of
in-place declaration. NQ doorbell register descriptor has been added in
base NQ structure.  The nq->vector has been renamed to nq->msix_vec.

Link: https://lore.kernel.org/r/1581786665-23705-7-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_fp.c | 114 +++++++++++++----------
 drivers/infiniband/hw/bnxt_re/qplib_fp.h |  52 +++++------
 2 files changed, 93 insertions(+), 73 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index e7fe86e61fbb..33272e58363c 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -236,16 +236,16 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
 static void bnxt_qplib_service_nq(unsigned long data)
 {
 	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
+	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
 	struct nq_base *nqe, **nq_ptr;
 	struct bnxt_qplib_cq *cq;
 	int num_cqne_processed = 0;
 	int num_srqne_processed = 0;
-	u32 sw_cons, raw_cons;
-	u16 type;
 	int budget = nq->budget;
+	u32 sw_cons, raw_cons;
 	uintptr_t q_handle;
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
+	u16 type;
 
 	/* Service the NQ until empty */
 	raw_cons = hwq->cons;
@@ -314,7 +314,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 	}
 	if (hwq->cons != raw_cons) {
 		hwq->cons = raw_cons;
-		bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons,
+		bnxt_qplib_ring_nq_db_rearm(nq->nq_db.db, hwq->cons,
 					    hwq->max_elements, nq->ring_id,
 					    gen_p5);
 	}
@@ -333,7 +333,7 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
 	prefetch(&nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]);
 
 	/* Fan out to CPU affinitized kthreads? */
-	tasklet_schedule(&nq->worker);
+	tasklet_schedule(&nq->nq_tasklet);
 
 	return IRQ_HANDLED;
 }
@@ -341,17 +341,17 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
 void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
 {
 	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
-	tasklet_disable(&nq->worker);
+	tasklet_disable(&nq->nq_tasklet);
 	/* Mask h/w interrupt */
-	bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons,
+	bnxt_qplib_ring_nq_db(nq->nq_db.db, nq->hwq.cons,
 			      nq->hwq.max_elements, nq->ring_id, gen_p5);
 	/* Sync with last running IRQ handler */
-	synchronize_irq(nq->vector);
+	synchronize_irq(nq->msix_vec);
 	if (kill)
-		tasklet_kill(&nq->worker);
+		tasklet_kill(&nq->nq_tasklet);
 	if (nq->requested) {
-		irq_set_affinity_hint(nq->vector, NULL);
-		free_irq(nq->vector, nq);
+		irq_set_affinity_hint(nq->msix_vec, NULL);
+		free_irq(nq->msix_vec, nq);
 		nq->requested = false;
 	}
 }
@@ -364,16 +364,17 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 	}
 
 	/* Make sure the HW is stopped! */
-	if (nq->requested)
-		bnxt_qplib_nq_stop_irq(nq, true);
+	bnxt_qplib_nq_stop_irq(nq, true);
 
-	if (nq->bar_reg_iomem)
-		iounmap(nq->bar_reg_iomem);
-	nq->bar_reg_iomem = NULL;
+	if (nq->nq_db.reg.bar_reg) {
+		iounmap(nq->nq_db.reg.bar_reg);
+		nq->nq_db.reg.bar_reg = NULL;
+		nq->nq_db.db = NULL;
+	}
 
 	nq->cqn_handler = NULL;
 	nq->srqn_handler = NULL;
-	nq->vector = 0;
+	nq->msix_vec = 0;
 }
 
 int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
@@ -385,68 +386,87 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 	if (nq->requested)
 		return -EFAULT;
 
-	nq->vector = msix_vector;
+	nq->msix_vec = msix_vector;
 	if (need_init)
-		tasklet_init(&nq->worker, bnxt_qplib_service_nq,
+		tasklet_init(&nq->nq_tasklet, bnxt_qplib_service_nq,
 			     (unsigned long)nq);
 	else
-		tasklet_enable(&nq->worker);
+		tasklet_enable(&nq->nq_tasklet);
 
 	snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx);
-	rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq);
+	rc = request_irq(nq->msix_vec, bnxt_qplib_nq_irq, 0, nq->name, nq);
 	if (rc)
 		return rc;
 
 	cpumask_clear(&nq->mask);
 	cpumask_set_cpu(nq_indx, &nq->mask);
-	rc = irq_set_affinity_hint(nq->vector, &nq->mask);
+	rc = irq_set_affinity_hint(nq->msix_vec, &nq->mask);
 	if (rc) {
 		dev_warn(&nq->pdev->dev,
 			 "set affinity failed; vector: %d nq_idx: %d\n",
-			 nq->vector, nq_indx);
+			 nq->msix_vec, nq_indx);
 	}
 	nq->requested = true;
-	bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons,
+	bnxt_qplib_ring_nq_db_rearm(nq->nq_db.db, nq->hwq.cons,
 				    nq->hwq.max_elements, nq->ring_id, gen_p5);
 
 	return rc;
 }
 
+static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq,  u32 reg_offt)
+{
+	resource_size_t reg_base;
+	struct bnxt_qplib_nq_db *nq_db;
+	struct pci_dev *pdev;
+	int rc = 0;
+
+	pdev = nq->pdev;
+	nq_db = &nq->nq_db;
+
+	nq_db->reg.bar_id = NQ_CONS_PCI_BAR_REGION;
+	nq_db->reg.bar_base = pci_resource_start(pdev, nq_db->reg.bar_id);
+	if (!nq_db->reg.bar_base) {
+		dev_err(&pdev->dev, "QPLIB: NQ BAR region %d resc start is 0!",
+			nq_db->reg.bar_id);
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	reg_base = nq_db->reg.bar_base + reg_offt;
+	/* Unconditionally map 8 bytes to support 57500 series */
+	nq_db->reg.len = 8;
+	nq_db->reg.bar_reg = ioremap(reg_base, nq_db->reg.len);
+	if (!nq_db->reg.bar_reg) {
+		dev_err(&pdev->dev, "QPLIB: NQ BAR region %d mapping failed",
+			nq_db->reg.bar_id);
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	nq_db->db = nq_db->reg.bar_reg;
+fail:
+	return rc;
+}
+
 int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 			 int nq_idx, int msix_vector, int bar_reg_offset,
-			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
-					    struct bnxt_qplib_cq *),
-			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     struct bnxt_qplib_srq *,
-					     u8 event))
+			 cqn_handler_t cqn_handler,
+			 srqn_handler_t srqn_handler)
 {
-	resource_size_t nq_base;
 	int rc = -1;
 
-	if (cqn_handler)
-		nq->cqn_handler = cqn_handler;
-
-	if (srqn_handler)
-		nq->srqn_handler = srqn_handler;
+	nq->pdev = pdev;
+	nq->cqn_handler = cqn_handler;
+	nq->srqn_handler = srqn_handler;
 
 	/* Have a task to schedule CQ notifiers in post send case */
 	nq->cqn_wq  = create_singlethread_workqueue("bnxt_qplib_nq");
 	if (!nq->cqn_wq)
 		return -ENOMEM;
 
-	nq->bar_reg = NQ_CONS_PCI_BAR_REGION;
-	nq->bar_reg_off = bar_reg_offset;
-	nq_base = pci_resource_start(pdev, nq->bar_reg);
-	if (!nq_base) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_map_nq_db(nq, bar_reg_offset);
+	if (rc)
 		goto fail;
-	}
-	/* Unconditionally map 8 bytes to support 57500 series */
-	nq->bar_reg_iomem = ioremap(nq_base + nq->bar_reg_off, 8);
-	if (!nq->bar_reg_iomem) {
-		rc = -ENOMEM;
-		goto fail;
-	}
 
 	rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true);
 	if (rc) {
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index d3f080c18b27..765e5d272963 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -470,29 +470,32 @@ static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
 		writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
 }
 
+struct bnxt_qplib_nq_db {
+	struct bnxt_qplib_reg_desc	reg;
+	void __iomem			*db;
+};
+
+typedef int (*cqn_handler_t)(struct bnxt_qplib_nq *nq,
+		struct bnxt_qplib_cq *cq);
+typedef int (*srqn_handler_t)(struct bnxt_qplib_nq *nq,
+		struct bnxt_qplib_srq *srq, u8 event);
+
 struct bnxt_qplib_nq {
-	struct pci_dev		*pdev;
-	struct bnxt_qplib_res	*res;
+	struct pci_dev			*pdev;
+	struct bnxt_qplib_res		*res;
+	char				name[32];
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_nq_db		nq_db;
+	u16				ring_id;
+	int				msix_vec;
+	cpumask_t			mask;
+	struct tasklet_struct		nq_tasklet;
+	bool				requested;
+	int				budget;
 
-	int			vector;
-	cpumask_t		mask;
-	int			budget;
-	bool			requested;
-	struct tasklet_struct	worker;
-	struct bnxt_qplib_hwq	hwq;
-
-	u16			bar_reg;
-	u32			bar_reg_off;
-	u16			ring_id;
-	void __iomem		*bar_reg_iomem;
-
-	int			(*cqn_handler)(struct bnxt_qplib_nq *nq,
-					       struct bnxt_qplib_cq *cq);
-	int			(*srqn_handler)(struct bnxt_qplib_nq *nq,
-						struct bnxt_qplib_srq *srq,
-						u8 event);
-	struct workqueue_struct	*cqn_wq;
-	char			name[32];
+	cqn_handler_t			cqn_handler;
+	srqn_handler_t			srqn_handler;
+	struct workqueue_struct		*cqn_wq;
 };
 
 struct bnxt_qplib_nq_work {
@@ -507,11 +510,8 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 			    int msix_vector, bool need_init);
 int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 			 int nq_idx, int msix_vector, int bar_reg_offset,
-			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
-					    struct bnxt_qplib_cq *cq),
-			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     struct bnxt_qplib_srq *srq,
-					     u8 event));
+			 cqn_handler_t cqn_handler,
+			 srqn_handler_t srq_handler);
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 			  struct bnxt_qplib_srq *srq);
 int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,

From 6f53196bc5e7fd3c05337f24977cacb08e0f9753 Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:04 -0500
Subject: [PATCH 050/327] RDMA/bnxt_re: Refactor doorbell management functions

Moving all the fast path doorbell functions at one place under
qplib_res.h. To pass doorbell record information a new structure
bnxt_qplib_db_info has been introduced.  Every roce object holds an
instance of this structure and doorbell information is initialized during
resource creation.

When DB is rung only the current queue index is read from hardware ring
and rest of the data is taken from pre-initialized dbinfo structure.

Link: https://lore.kernel.org/r/1581786665-23705-8-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_fp.c   | 143 +++++++--------------
 drivers/infiniband/hw/bnxt_re/qplib_fp.h   |  44 +------
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c |  21 ++-
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.h |   2 +-
 drivers/infiniband/hw/bnxt_re/qplib_res.h  |  78 +++++++++++
 5 files changed, 141 insertions(+), 147 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 33272e58363c..2ccf1c3708d1 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -53,9 +53,7 @@
 #include "qplib_sp.h"
 #include "qplib_fp.h"
 
-static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq);
 static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp);
-static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type);
 
 static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp)
 {
@@ -236,7 +234,6 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
 static void bnxt_qplib_service_nq(unsigned long data)
 {
 	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
 	struct nq_base *nqe, **nq_ptr;
 	struct bnxt_qplib_cq *cq;
@@ -272,7 +269,8 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
 						     << 32;
 			cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle;
-			bnxt_qplib_arm_cq_enable(cq);
+			bnxt_qplib_armen_db(&cq->dbinfo,
+					    DBC_DBC_TYPE_CQ_ARMENA);
 			spin_lock_bh(&cq->compl_lock);
 			atomic_set(&cq->arm_state, 0);
 			if (!nq->cqn_handler(nq, (cq)))
@@ -285,14 +283,16 @@ static void bnxt_qplib_service_nq(unsigned long data)
 		}
 		case NQ_BASE_TYPE_SRQ_EVENT:
 		{
+			struct bnxt_qplib_srq *srq;
 			struct nq_srq_event *nqsrqe =
 						(struct nq_srq_event *)nqe;
 
 			q_handle = le32_to_cpu(nqsrqe->srq_handle_low);
 			q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
 				     << 32;
-			bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
-					   DBC_DBC_TYPE_SRQ_ARMENA);
+			srq = (struct bnxt_qplib_srq *)q_handle;
+			bnxt_qplib_armen_db(&srq->dbinfo,
+					    DBC_DBC_TYPE_SRQ_ARMENA);
 			if (!nq->srqn_handler(nq,
 					      (struct bnxt_qplib_srq *)q_handle,
 					      nqsrqe->event))
@@ -314,9 +314,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 	}
 	if (hwq->cons != raw_cons) {
 		hwq->cons = raw_cons;
-		bnxt_qplib_ring_nq_db_rearm(nq->nq_db.db, hwq->cons,
-					    hwq->max_elements, nq->ring_id,
-					    gen_p5);
+		bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true);
 	}
 }
 
@@ -340,11 +338,9 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
 
 void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	tasklet_disable(&nq->nq_tasklet);
 	/* Mask h/w interrupt */
-	bnxt_qplib_ring_nq_db(nq->nq_db.db, nq->hwq.cons,
-			      nq->hwq.max_elements, nq->ring_id, gen_p5);
+	bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, false);
 	/* Sync with last running IRQ handler */
 	synchronize_irq(nq->msix_vec);
 	if (kill)
@@ -369,7 +365,6 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 	if (nq->nq_db.reg.bar_reg) {
 		iounmap(nq->nq_db.reg.bar_reg);
 		nq->nq_db.reg.bar_reg = NULL;
-		nq->nq_db.db = NULL;
 	}
 
 	nq->cqn_handler = NULL;
@@ -380,7 +375,6 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 			    int msix_vector, bool need_init)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	int rc;
 
 	if (nq->requested)
@@ -407,8 +401,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 			 nq->msix_vec, nq_indx);
 	}
 	nq->requested = true;
-	bnxt_qplib_ring_nq_db_rearm(nq->nq_db.db, nq->hwq.cons,
-				    nq->hwq.max_elements, nq->ring_id, gen_p5);
+	bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true);
 
 	return rc;
 }
@@ -443,7 +436,9 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq,  u32 reg_offt)
 		goto fail;
 	}
 
-	nq_db->db = nq_db->reg.bar_reg;
+	nq_db->dbinfo.db = nq_db->reg.bar_reg;
+	nq_db->dbinfo.hwq = &nq->hwq;
+	nq_db->dbinfo.xid = nq->ring_id;
 fail:
 	return rc;
 }
@@ -516,24 +511,6 @@ int bnxt_qplib_alloc_nq(struct bnxt_qplib_res *res, struct bnxt_qplib_nq *nq)
 }
 
 /* SRQ */
-static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
-{
-	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
-	void __iomem *db;
-	u32 sw_prod;
-	u64 val = 0;
-
-	/* Ring DB */
-	sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ?
-		   srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq);
-	db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base :
-						     srq->dpi->dbr;
-	val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
-	val <<= 32;
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	writeq(val, db);
-}
-
 void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
 			   struct bnxt_qplib_srq *srq)
 {
@@ -624,9 +601,12 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 	srq->swq[srq->last_idx].next_idx = -1;
 
 	srq->id = le32_to_cpu(resp.xid);
-	srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
+	srq->dbinfo.hwq = &srq->hwq;
+	srq->dbinfo.xid = srq->id;
+	srq->dbinfo.db = srq->dpi->dbr;
+	srq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem;
 	if (srq->threshold)
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA);
+		bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA);
 	srq->arm_req = false;
 
 	return 0;
@@ -650,7 +630,7 @@ int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	if (count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
+		bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
 	} else {
 		/* Deferred arming */
 		srq->arm_req = true;
@@ -738,10 +718,10 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	spin_unlock(&srq_hwq->lock);
 	/* Ring DB */
-	bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ);
+	bnxt_qplib_ring_prod_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ);
 	if (srq->arm_req == true && count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
+		bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
 	}
 done:
 	return rc;
@@ -872,6 +852,15 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
+	qp->cctx = res->cctx;
+	sq->dbinfo.hwq = &sq->hwq;
+	sq->dbinfo.xid = qp->id;
+	sq->dbinfo.db = qp->dpi->dbr;
+	if (rq->max_wqe) {
+		rq->dbinfo.hwq = &rq->hwq;
+		rq->dbinfo.xid = qp->id;
+		rq->dbinfo.db = qp->dpi->dbr;
+	}
 	rcfw->qp_tbl[qp->id].qp_id = qp->id;
 	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
 
@@ -1109,9 +1098,17 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
-	qp->cctx = res->cctx;
 	INIT_LIST_HEAD(&qp->sq_flush);
 	INIT_LIST_HEAD(&qp->rq_flush);
+	qp->cctx = res->cctx;
+	sq->dbinfo.hwq = &sq->hwq;
+	sq->dbinfo.xid = qp->id;
+	sq->dbinfo.db = qp->dpi->dbr;
+	if (rq->max_wqe) {
+		rq->dbinfo.hwq = &rq->hwq;
+		rq->dbinfo.xid = qp->id;
+		rq->dbinfo.db = qp->dpi->dbr;
+	}
 	rcfw->qp_tbl[qp->id].qp_id = qp->id;
 	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
 
@@ -1551,16 +1548,8 @@ void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp,
 void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *sq = &qp->sq;
-	u32 sw_prod;
-	u64 val = 0;
 
-	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_SQ);
-	val <<= 32;
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* Flush all the WQE writes to HW */
-	writeq(val, qp->dpi->dbr);
+	bnxt_qplib_ring_prod_db(&sq->dbinfo, DBC_DBC_TYPE_SQ);
 }
 
 int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
@@ -1852,16 +1841,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *rq = &qp->rq;
-	u32 sw_prod;
-	u64 val = 0;
 
-	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_RQ);
-	val <<= 32;
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* Flush the writes to HW Rx WQE before the ringing Rx DB */
-	writeq(val, qp->dpi->dbr);
+	bnxt_qplib_ring_prod_db(&rq->dbinfo, DBC_DBC_TYPE_RQ);
 }
 
 int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
@@ -1941,34 +1922,6 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 }
 
 /* CQ */
-
-/* Spinlock must be held */
-static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq)
-{
-	u64 val = 0;
-
-	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_CQ_ARMENA;
-	val <<= 32;
-	/* Flush memory writes before enabling the CQ */
-	writeq(val, cq->dbr_base);
-}
-
-static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
-{
-	struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
-	u32 sw_cons;
-	u64 val = 0;
-
-	/* Ring DB */
-	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
-	val <<= 32;
-	sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq);
-	val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* flush memory writes before arming the CQ */
-	writeq(val, cq->dpi->dbr);
-}
-
 int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
@@ -2023,7 +1976,6 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 		goto fail;
 
 	cq->id = le32_to_cpu(resp.xid);
-	cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
 	cq->period = BNXT_QPLIB_QUEUE_START_PERIOD;
 	init_waitqueue_head(&cq->waitq);
 	INIT_LIST_HEAD(&cq->sqf_head);
@@ -2031,7 +1983,13 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	spin_lock_init(&cq->compl_lock);
 	spin_lock_init(&cq->flush_lock);
 
-	bnxt_qplib_arm_cq_enable(cq);
+	cq->dbinfo.hwq = &cq->hwq;
+	cq->dbinfo.xid = cq->id;
+	cq->dbinfo.db = cq->dpi->dbr;
+	cq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem;
+
+	bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA);
+
 	return 0;
 
 fail:
@@ -2188,8 +2146,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
 		sq->send_phantom = true;
 
 		/* TODO: Only ARM if the previous SQE is ARMALL */
-		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL);
-
+		bnxt_qplib_ring_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMALL);
 		rc = -EAGAIN;
 		goto out;
 	}
@@ -2859,7 +2816,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 	}
 	if (cq->hwq.cons != raw_cons) {
 		cq->hwq.cons = raw_cons;
-		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ);
+		bnxt_qplib_ring_db(&cq->dbinfo, DBC_DBC_TYPE_CQ);
 	}
 exit:
 	return num_cqes - budget;
@@ -2868,7 +2825,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
 {
 	if (arm_type)
-		bnxt_qplib_arm_cq(cq, arm_type);
+		bnxt_qplib_ring_db(&cq->dbinfo, arm_type);
 	/* Using cq->arm_state variable to track whether to issue cq handler */
 	atomic_set(&cq->arm_state, 1);
 }
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 765e5d272963..9e8d1c5c3f4a 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -42,7 +42,7 @@
 struct bnxt_qplib_srq {
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_dpi		*dpi;
-	void __iomem			*dbr_base;
+	struct bnxt_qplib_db_info	dbinfo;
 	u64				srq_handle;
 	u32				id;
 	u32				max_wqe;
@@ -236,6 +236,7 @@ struct bnxt_qplib_swqe {
 struct bnxt_qplib_q {
 	struct bnxt_qplib_hwq		hwq;
 	struct bnxt_qplib_swq		*swq;
+	struct bnxt_qplib_db_info	dbinfo;
 	struct bnxt_qplib_sg_info	sg_info;
 	u32				max_wqe;
 	u16				q_full_delta;
@@ -370,7 +371,7 @@ struct bnxt_qplib_cqe {
 #define BNXT_QPLIB_QUEUE_START_PERIOD		0x01
 struct bnxt_qplib_cq {
 	struct bnxt_qplib_dpi		*dpi;
-	void __iomem			*dbr_base;
+	struct bnxt_qplib_db_info	dbinfo;
 	u32				max_wqe;
 	u32				id;
 	u16				count;
@@ -433,46 +434,9 @@ struct bnxt_qplib_cq {
 					 NQ_DB_IDX_VALID |	\
 					 NQ_DB_IRQ_DIS)
 
-static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index,
-					   u32 xid, bool arm)
-{
-	u64 val;
-
-	val = xid & DBC_DBC_XID_MASK;
-	val |= DBC_DBC_PATH_ROCE;
-	val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
-	val <<= 32;
-	val |= index & DBC_DBC_INDEX_MASK;
-	writeq(val, db);
-}
-
-static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons,
-					       u32 max_elements, u32 xid,
-					       bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_nq_db64(db, index, xid, true);
-	else
-		writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db);
-}
-
-static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
-					 u32 max_elements, u32 xid,
-					 bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_nq_db64(db, index, xid, false);
-	else
-		writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
-}
-
 struct bnxt_qplib_nq_db {
 	struct bnxt_qplib_reg_desc	reg;
-	void __iomem			*db;
+	struct bnxt_qplib_db_info	dbinfo;
 };
 
 typedef int (*cqn_handler_t)(struct bnxt_qplib_nq *nq,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 119113ecfb64..b0b050e5cd12 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -379,7 +379,6 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 static void bnxt_qplib_service_creq(unsigned long data)
 {
 	struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
 	struct bnxt_qplib_creq_ctx *creq = &rcfw->creq;
 	u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
 	struct bnxt_qplib_hwq *hwq = &creq->hwq;
@@ -429,9 +428,8 @@ static void bnxt_qplib_service_creq(unsigned long data)
 
 	if (hwq->cons != raw_cons) {
 		hwq->cons = raw_cons;
-		bnxt_qplib_ring_creq_db_rearm(creq->creq_db.db,
-					      raw_cons, hwq->max_elements,
-					      creq->ring_id, gen_p5);
+		bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo,
+				      rcfw->res->cctx, true);
 	}
 	spin_unlock_irqrestore(&hwq->lock, flags);
 }
@@ -660,15 +658,12 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 
 void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
 	struct bnxt_qplib_creq_ctx *creq;
 
 	creq = &rcfw->creq;
 	tasklet_disable(&creq->creq_tasklet);
 	/* Mask h/w interrupts */
-	bnxt_qplib_ring_creq_db(creq->creq_db.db, creq->hwq.cons,
-				creq->hwq.max_elements, creq->ring_id,
-				gen_p5);
+	bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false);
 	/* Sync with last running IRQ-handler */
 	synchronize_irq(creq->msix_vec);
 	if (kill)
@@ -708,7 +703,6 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
 	struct bnxt_qplib_creq_ctx *creq;
 	int rc;
 
@@ -728,9 +722,8 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 	if (rc)
 		return rc;
 	creq->requested = true;
-	bnxt_qplib_ring_creq_db_rearm(creq->creq_db.db,
-				      creq->hwq.cons, creq->hwq.max_elements,
-				      creq->ring_id, gen_p5);
+
+	bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true);
 
 	return 0;
 }
@@ -799,7 +792,9 @@ static int bnxt_qplib_map_creq_db(struct bnxt_qplib_rcfw *rcfw, u32 reg_offt)
 			creq_db->reg.bar_id);
 		return -ENOMEM;
 	}
-	creq_db->db = creq_db->reg.bar_reg;
+	creq_db->dbinfo.db = creq_db->reg.bar_reg;
+	creq_db->dbinfo.hwq = &rcfw->creq.hwq;
+	creq_db->dbinfo.xid = rcfw->creq.ring_id;
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index 1aff6d458ac5..411fce3493b6 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -247,7 +247,7 @@ struct bnxt_qplib_cmdq_ctx {
 
 struct bnxt_qplib_creq_db {
 	struct bnxt_qplib_reg_desc	reg;
-	void __iomem			*db;
+	struct bnxt_qplib_db_info	dbinfo;
 };
 
 struct bnxt_qplib_creq_stat {
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index 5fa278e744eb..95b645dbbc2d 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -133,6 +133,13 @@ struct bnxt_qplib_hwq {
 	u8				is_user;
 };
 
+struct bnxt_qplib_db_info {
+	void __iomem		*db;
+	void __iomem		*priv_db;
+	struct bnxt_qplib_hwq	*hwq;
+	u32			xid;
+};
+
 /* Tables */
 struct bnxt_qplib_pd_tbl {
 	unsigned long			*tbl;
@@ -290,4 +297,75 @@ void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res,
 int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5);
+
+static inline void bnxt_qplib_ring_db32(struct bnxt_qplib_db_info *info,
+					bool arm)
+{
+	u32 key;
+
+	key = info->hwq->cons & (info->hwq->max_elements - 1);
+	key |= (CMPL_DOORBELL_IDX_VALID |
+		(CMPL_DOORBELL_KEY_CMPL & CMPL_DOORBELL_KEY_MASK));
+	if (!arm)
+		key |= CMPL_DOORBELL_MASK;
+	writel(key, info->db);
+}
+
+static inline void bnxt_qplib_ring_db(struct bnxt_qplib_db_info *info,
+				      u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	key |= (info->hwq->cons & (info->hwq->max_elements - 1)) &
+		DBC_DBC_INDEX_MASK;
+	writeq(key, info->db);
+}
+
+static inline void bnxt_qplib_ring_prod_db(struct bnxt_qplib_db_info *info,
+					   u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	key |= (info->hwq->prod & (info->hwq->max_elements - 1)) &
+		DBC_DBC_INDEX_MASK;
+	writeq(key, info->db);
+}
+
+static inline void bnxt_qplib_armen_db(struct bnxt_qplib_db_info *info,
+				       u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	writeq(key, info->priv_db);
+}
+
+static inline void bnxt_qplib_srq_arm_db(struct bnxt_qplib_db_info *info,
+					 u32 th)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | th;
+	key <<= 32;
+	key |=  th & DBC_DBC_INDEX_MASK;
+	writeq(key, info->priv_db);
+}
+
+static inline void bnxt_qplib_ring_nq_db(struct bnxt_qplib_db_info *info,
+					 struct bnxt_qplib_chip_ctx *cctx,
+					 bool arm)
+{
+	u32 type;
+
+	type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+	if (bnxt_qplib_is_chip_gen_p5(cctx))
+		bnxt_qplib_ring_db(info, type);
+	else
+		bnxt_qplib_ring_db32(info, arm);
+}
 #endif /* __BNXT_QPLIB_RES_H__ */

From 6ccad8483b28666bdafbb52fe94e64f41e7836be Mon Sep 17 00:00:00 2001
From: Devesh Sharma <devesh.sharma@broadcom.com>
Date: Sat, 15 Feb 2020 12:11:05 -0500
Subject: [PATCH 051/327] RDMA/bnxt_re: use ibdev based message printing
 functions

Replacing the dev_err/dbg/warn with ibdev_err/dbg/warn. In the IB device
provider driver these functions are recommended to use.

Currently qplib layer function calls has not been replaced due to
unavailability of ib_device pointer at that layer.

Link: https://lore.kernel.org/r/1581786665-23705-9-git-send-email-devesh.sharma@broadcom.com
Signed-off-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 280 +++++++++++------------
 drivers/infiniband/hw/bnxt_re/main.c     | 131 ++++++-----
 2 files changed, 208 insertions(+), 203 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 4368aacdd482..ad3e524187e3 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -313,8 +313,8 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context)
 		if (ctx->idx == 0 &&
 		    rdma_link_local_addr((struct in6_addr *)gid_to_del) &&
 		    ctx->refcnt == 1 && rdev->gsi_ctx.gsi_sqp) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Trying to delete GID0 while QP1 is alive\n");
+			ibdev_dbg(&rdev->ibdev,
+				  "Trying to delete GID0 while QP1 is alive\n");
 			return -EFAULT;
 		}
 		ctx->refcnt--;
@@ -322,8 +322,8 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context)
 			rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del,
 						 vlan_id,  true);
 			if (rc) {
-				dev_err(rdev_to_dev(rdev),
-					"Failed to remove GID: %#x", rc);
+				ibdev_err(&rdev->ibdev,
+					  "Failed to remove GID: %#x", rc);
 			} else {
 				ctx_tbl = sgid_tbl->ctx;
 				ctx_tbl[ctx->idx] = NULL;
@@ -360,7 +360,7 @@ int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context)
 	}
 
 	if (rc < 0) {
-		dev_err(rdev_to_dev(rdev), "Failed to add GID: %#x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to add GID: %#x", rc);
 		return rc;
 	}
 
@@ -423,12 +423,12 @@ static int bnxt_re_bind_fence_mw(struct bnxt_qplib_qp *qplib_qp)
 	wqe.bind.r_key = fence->bind_rkey;
 	fence->bind_rkey = ib_inc_rkey(fence->bind_rkey);
 
-	dev_dbg(rdev_to_dev(qp->rdev),
-		"Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
+	ibdev_dbg(&qp->rdev->ibdev,
+		  "Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
 		wqe.bind.r_key, qp->qplib_qp.id, pd);
 	rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 	if (rc) {
-		dev_err(rdev_to_dev(qp->rdev), "Failed to bind fence-WQE\n");
+		ibdev_err(&qp->rdev->ibdev, "Failed to bind fence-WQE\n");
 		return rc;
 	}
 	bnxt_qplib_post_send_db(&qp->qplib_qp);
@@ -479,7 +479,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
 				  DMA_BIDIRECTIONAL);
 	rc = dma_mapping_error(dev, dma_addr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to dma-map fence-MR-mem\n");
+		ibdev_err(&rdev->ibdev, "Failed to dma-map fence-MR-mem\n");
 		rc = -EIO;
 		fence->dma_addr = 0;
 		goto fail;
@@ -499,7 +499,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
 	mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to alloc fence-HW-MR\n");
+		ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n");
 		goto fail;
 	}
 
@@ -511,7 +511,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
 	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl,
 			       BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n");
+		ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n");
 		goto fail;
 	}
 	mr->ib_mr.rkey = mr->qplib_mr.rkey;
@@ -519,8 +519,8 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd)
 	/* Create a fence MW only for kernel consumers */
 	mw = bnxt_re_alloc_mw(&pd->ib_pd, IB_MW_TYPE_1, NULL);
 	if (IS_ERR(mw)) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to create fence-MW for PD: %p\n", pd);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to create fence-MW for PD: %p\n", pd);
 		rc = PTR_ERR(mw);
 		goto fail;
 	}
@@ -558,7 +558,7 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 
 	pd->rdev = rdev;
 	if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
+		ibdev_err(&rdev->ibdev, "Failed to allocate HW PD");
 		rc = -ENOMEM;
 		goto fail;
 	}
@@ -585,16 +585,16 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to copy user response\n");
+			ibdev_err(&rdev->ibdev,
+				  "Failed to copy user response\n");
 			goto dbfail;
 		}
 	}
 
 	if (!udata)
 		if (bnxt_re_create_fence_mr(pd))
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to create Fence-MR\n");
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to create Fence-MR\n");
 	return 0;
 dbfail:
 	bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
@@ -645,7 +645,7 @@ int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
 	int rc;
 
 	if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
-		dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
+		ibdev_err(&rdev->ibdev, "Failed to alloc AH: GRH not set");
 		return -EINVAL;
 	}
 
@@ -675,7 +675,7 @@ int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
 	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah,
 				  !(flags & RDMA_CREATE_AH_SLEEPABLE));
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH");
+		ibdev_err(&rdev->ibdev, "Failed to allocate HW AH");
 		return rc;
 	}
 
@@ -759,16 +759,16 @@ static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
 	mutex_unlock(&rdev->qp_lock);
 	atomic_dec(&rdev->qp_count);
 
-	dev_dbg(rdev_to_dev(rdev), "Destroy the shadow AH\n");
+	ibdev_dbg(&rdev->ibdev, "Destroy the shadow AH\n");
 	bnxt_qplib_destroy_ah(&rdev->qplib_res,
 			      &gsi_sah->qplib_ah,
 			      true);
 	bnxt_qplib_clean_qp(&qp->qplib_qp);
 
-	dev_dbg(rdev_to_dev(rdev), "Destroy the shadow QP\n");
+	ibdev_dbg(&rdev->ibdev, "Destroy the shadow QP\n");
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &gsi_sqp->qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Destroy Shadow QP failed");
+		ibdev_err(&rdev->ibdev, "Destroy Shadow QP failed");
 		goto fail;
 	}
 	bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
@@ -802,7 +802,7 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to destroy HW QP");
 		return rc;
 	}
 
@@ -938,8 +938,8 @@ static struct bnxt_re_ah *bnxt_re_create_shadow_qp_ah
 
 	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, false);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to allocate HW AH for Shadow QP");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW AH for Shadow QP");
 		goto fail;
 	}
 
@@ -1032,7 +1032,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp,
 
 		srq = container_of(init_attr->srq, struct bnxt_re_srq, ib_srq);
 		if (!srq) {
-			dev_err(rdev_to_dev(rdev), "SRQ not found");
+			ibdev_err(&rdev->ibdev, "SRQ not found");
 			return -EINVAL;
 		}
 		qplqp->srq = &srq->qplib_srq;
@@ -1140,8 +1140,7 @@ static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
 
 	qptype = __from_ib_qp_type(init_attr->qp_type);
 	if (qptype == IB_QPT_MAX) {
-		dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
-			qptype);
+		ibdev_err(&rdev->ibdev, "QP type 0x%x not supported", qptype);
 		qptype = -EINVAL;
 		goto out;
 	}
@@ -1188,15 +1187,15 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
 	qplqp->mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
 	qplqp->dpi = &rdev->dpi_privileged; /* Doorbell page */
 	if (init_attr->create_flags)
-		dev_dbg(rdev_to_dev(rdev),
-			"QP create flags 0x%x not supported",
-			init_attr->create_flags);
+		ibdev_dbg(&rdev->ibdev,
+			  "QP create flags 0x%x not supported",
+			  init_attr->create_flags);
 
 	/* Setup CQs */
 	if (init_attr->send_cq) {
 		cq = container_of(init_attr->send_cq, struct bnxt_re_cq, ib_cq);
 		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Send CQ not found");
+			ibdev_err(&rdev->ibdev, "Send CQ not found");
 			rc = -EINVAL;
 			goto out;
 		}
@@ -1207,7 +1206,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
 	if (init_attr->recv_cq) {
 		cq = container_of(init_attr->recv_cq, struct bnxt_re_cq, ib_cq);
 		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Receive CQ not found");
+			ibdev_err(&rdev->ibdev, "Receive CQ not found");
 			rc = -EINVAL;
 			goto out;
 		}
@@ -1253,8 +1252,7 @@ static int bnxt_re_create_shadow_gsi(struct bnxt_re_qp *qp,
 	sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res, &qp->qplib_qp);
 	if (!sqp) {
 		rc = -ENODEV;
-		dev_err(rdev_to_dev(rdev),
-			"Failed to create Shadow QP for QP1");
+		ibdev_err(&rdev->ibdev, "Failed to create Shadow QP for QP1");
 		goto out;
 	}
 	rdev->gsi_ctx.gsi_sqp = sqp;
@@ -1267,8 +1265,8 @@ static int bnxt_re_create_shadow_gsi(struct bnxt_re_qp *qp,
 		bnxt_qplib_destroy_qp(&rdev->qplib_res,
 				      &sqp->qplib_qp);
 		rc = -ENODEV;
-		dev_err(rdev_to_dev(rdev),
-			"Failed to create AH entry for ShadowQP");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to create AH entry for ShadowQP");
 		goto out;
 	}
 	rdev->gsi_ctx.gsi_sah = sah;
@@ -1296,7 +1294,7 @@ static int bnxt_re_create_gsi_qp(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
 
 	rc = bnxt_qplib_create_qp1(&rdev->qplib_res, qplqp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "create HW QP1 failed!");
+		ibdev_err(&rdev->ibdev, "create HW QP1 failed!");
 		goto out;
 	}
 
@@ -1316,14 +1314,14 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev,
 	    init_attr->cap.max_send_sge > dev_attr->max_qp_sges ||
 	    init_attr->cap.max_recv_sge > dev_attr->max_qp_sges ||
 	    init_attr->cap.max_inline_data > dev_attr->max_inline_data) {
-		dev_err(rdev_to_dev(rdev),
-			"Create QP failed - max exceeded! 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x",
-			init_attr->cap.max_send_wr, dev_attr->max_qp_wqes,
-			init_attr->cap.max_recv_wr, dev_attr->max_qp_wqes,
-			init_attr->cap.max_send_sge, dev_attr->max_qp_sges,
-			init_attr->cap.max_recv_sge, dev_attr->max_qp_sges,
-			init_attr->cap.max_inline_data,
-			dev_attr->max_inline_data);
+		ibdev_err(&rdev->ibdev,
+			  "Create QP failed - max exceeded! 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x",
+			  init_attr->cap.max_send_wr, dev_attr->max_qp_wqes,
+			  init_attr->cap.max_recv_wr, dev_attr->max_qp_wqes,
+			  init_attr->cap.max_send_sge, dev_attr->max_qp_sges,
+			  init_attr->cap.max_recv_sge, dev_attr->max_qp_sges,
+			  init_attr->cap.max_inline_data,
+			  dev_attr->max_inline_data);
 		rc = false;
 	}
 	return rc;
@@ -1365,7 +1363,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 	} else {
 		rc = bnxt_qplib_create_qp(&rdev->qplib_res, &qp->qplib_qp);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to create HW QP");
+			ibdev_err(&rdev->ibdev, "Failed to create HW QP");
 			goto free_umem;
 		}
 		if (udata) {
@@ -1375,7 +1373,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 			resp.rsvd = 0;
 			rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 			if (rc) {
-				dev_err(rdev_to_dev(rdev), "Failed to copy QP udata");
+				ibdev_err(&rdev->ibdev, "Failed to copy QP udata");
 				goto qp_destroy;
 			}
 		}
@@ -1548,7 +1546,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
 	int rc, entries;
 
 	if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) {
-		dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded");
+		ibdev_err(&rdev->ibdev, "Create CQ failed - max exceeded");
 		rc = -EINVAL;
 		goto exit;
 	}
@@ -1583,7 +1581,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
 
 	rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!");
+		ibdev_err(&rdev->ibdev, "Create HW SRQ failed!");
 		goto fail;
 	}
 
@@ -1593,7 +1591,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq,
 		resp.srqid = srq->qplib_srq.id;
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!");
+			ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!");
 			bnxt_qplib_destroy_srq(&rdev->qplib_res,
 					       &srq->qplib_srq);
 			goto fail;
@@ -1632,7 +1630,7 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
 		srq->qplib_srq.threshold = srq_attr->srq_limit;
 		rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!");
+			ibdev_err(&rdev->ibdev, "Modify HW SRQ failed!");
 			return rc;
 		}
 		/* On success, update the shadow */
@@ -1640,8 +1638,8 @@ int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
 		/* No need to Build and send response back to udata */
 		break;
 	default:
-		dev_err(rdev_to_dev(rdev),
-			"Unsupported srq_attr_mask 0x%x", srq_attr_mask);
+		ibdev_err(&rdev->ibdev,
+			  "Unsupported srq_attr_mask 0x%x", srq_attr_mask);
 		return -EINVAL;
 	}
 	return 0;
@@ -1659,7 +1657,7 @@ int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr)
 	tsrq.qplib_srq.id = srq->qplib_srq.id;
 	rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Query HW SRQ failed!");
+		ibdev_err(&rdev->ibdev, "Query HW SRQ failed!");
 		return rc;
 	}
 	srq_attr->max_wr = srq->qplib_srq.max_wqe;
@@ -1725,8 +1723,7 @@ static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev,
 
 	rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to modify Shadow QP for QP1");
+		ibdev_err(&rdev->ibdev, "Failed to modify Shadow QP for QP1");
 	return rc;
 }
 
@@ -1747,15 +1744,15 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		new_qp_state = qp_attr->qp_state;
 		if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state,
 					ib_qp->qp_type, qp_attr_mask)) {
-			dev_err(rdev_to_dev(rdev),
-				"Invalid attribute mask: %#x specified ",
-				qp_attr_mask);
-			dev_err(rdev_to_dev(rdev),
-				"for qpn: %#x type: %#x",
-				ib_qp->qp_num, ib_qp->qp_type);
-			dev_err(rdev_to_dev(rdev),
-				"curr_qp_state=0x%x, new_qp_state=0x%x\n",
-				curr_qp_state, new_qp_state);
+			ibdev_err(&rdev->ibdev,
+				  "Invalid attribute mask: %#x specified ",
+				  qp_attr_mask);
+			ibdev_err(&rdev->ibdev,
+				  "for qpn: %#x type: %#x",
+				  ib_qp->qp_num, ib_qp->qp_type);
+			ibdev_err(&rdev->ibdev,
+				  "curr_qp_state=0x%x, new_qp_state=0x%x\n",
+				  curr_qp_state, new_qp_state);
 			return -EINVAL;
 		}
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE;
@@ -1763,18 +1760,16 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 
 		if (!qp->sumem &&
 		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Move QP = %p to flush list\n",
-				qp);
+			ibdev_dbg(&rdev->ibdev,
+				  "Move QP = %p to flush list\n", qp);
 			flags = bnxt_re_lock_cqs(qp);
 			bnxt_qplib_add_flush_qp(&qp->qplib_qp);
 			bnxt_re_unlock_cqs(qp, flags);
 		}
 		if (!qp->sumem &&
 		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_RESET) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Move QP = %p out of flush list\n",
-				qp);
+			ibdev_dbg(&rdev->ibdev,
+				  "Move QP = %p out of flush list\n", qp);
 			flags = bnxt_re_lock_cqs(qp);
 			bnxt_qplib_clean_qp(&qp->qplib_qp);
 			bnxt_re_unlock_cqs(qp, flags);
@@ -1905,10 +1900,10 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 	if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
 		if (qp_attr->max_dest_rd_atomic >
 		    dev_attr->max_qp_init_rd_atom) {
-			dev_err(rdev_to_dev(rdev),
-				"max_dest_rd_atomic requested%d is > dev_max%d",
-				qp_attr->max_dest_rd_atomic,
-				dev_attr->max_qp_init_rd_atom);
+			ibdev_err(&rdev->ibdev,
+				  "max_dest_rd_atomic requested%d is > dev_max%d",
+				  qp_attr->max_dest_rd_atomic,
+				  dev_attr->max_qp_init_rd_atom);
 			return -EINVAL;
 		}
 
@@ -1929,8 +1924,8 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 		    (qp_attr->cap.max_recv_sge >= dev_attr->max_qp_sges) ||
 		    (qp_attr->cap.max_inline_data >=
 						dev_attr->max_inline_data)) {
-			dev_err(rdev_to_dev(rdev),
-				"Create QP failed - max exceeded");
+			ibdev_err(&rdev->ibdev,
+				  "Create QP failed - max exceeded");
 			return -EINVAL;
 		}
 		entries = roundup_pow_of_two(qp_attr->cap.max_send_wr);
@@ -1963,7 +1958,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 	}
 	rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to modify HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to modify HW QP");
 		return rc;
 	}
 	if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp)
@@ -1988,7 +1983,7 @@ int bnxt_re_query_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
 
 	rc = bnxt_qplib_query_qp(&rdev->qplib_res, qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to query HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to query HW QP");
 		goto out;
 	}
 	qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state);
@@ -2193,7 +2188,7 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp,
 		wqe->num_sge++;
 
 	} else {
-		dev_err(rdev_to_dev(qp->rdev), "QP1 buffer is empty!");
+		ibdev_err(&qp->rdev->ibdev, "QP1 buffer is empty!");
 		rc = -ENOMEM;
 	}
 	return rc;
@@ -2429,8 +2424,8 @@ static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev,
 
 		if ((sge_len + wqe->inline_len) >
 		    BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
-			dev_err(rdev_to_dev(rdev),
-				"Inline data size requested > supported value");
+			ibdev_err(&rdev->ibdev,
+				  "Inline data size requested > supported value");
 			return -EINVAL;
 		}
 		sge_len = wr->sg_list[i].length;
@@ -2490,8 +2485,8 @@ static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev,
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
-			dev_err(rdev_to_dev(rdev),
-				"Limit exceeded for Send SGEs");
+			ibdev_err(&rdev->ibdev,
+				  "Limit exceeded for Send SGEs");
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2510,9 +2505,9 @@ static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev,
 			rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 bad:
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Post send failed opcode = %#x rc = %d",
-				wr->opcode, rc);
+			ibdev_err(&rdev->ibdev,
+				  "Post send failed opcode = %#x rc = %d",
+				  wr->opcode, rc);
 			break;
 		}
 		wr = wr->next;
@@ -2539,8 +2534,8 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"Limit exceeded for Send SGEs");
+			ibdev_err(&qp->rdev->ibdev,
+				  "Limit exceeded for Send SGEs");
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2585,8 +2580,8 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
 			rc = bnxt_re_build_atomic_wqe(wr, &wqe);
 			break;
 		case IB_WR_RDMA_READ_WITH_INV:
-			dev_err(rdev_to_dev(qp->rdev),
-				"RDMA Read with Invalidate is not supported");
+			ibdev_err(&qp->rdev->ibdev,
+				  "RDMA Read with Invalidate is not supported");
 			rc = -EINVAL;
 			goto bad;
 		case IB_WR_LOCAL_INV:
@@ -2597,8 +2592,8 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
 			break;
 		default:
 			/* Unsupported WRs */
-			dev_err(rdev_to_dev(qp->rdev),
-				"WR (%#x) is not supported", wr->opcode);
+			ibdev_err(&qp->rdev->ibdev,
+				  "WR (%#x) is not supported", wr->opcode);
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2606,9 +2601,9 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr,
 			rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 bad:
 		if (rc) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"post_send failed op:%#x qps = %#x rc = %d\n",
-				wr->opcode, qp->qplib_qp.state, rc);
+			ibdev_err(&qp->rdev->ibdev,
+				  "post_send failed op:%#x qps = %#x rc = %d\n",
+				  wr->opcode, qp->qplib_qp.state, rc);
 			*bad_wr = wr;
 			break;
 		}
@@ -2636,8 +2631,8 @@ static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev,
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
-			dev_err(rdev_to_dev(rdev),
-				"Limit exceeded for Receive SGEs");
+			ibdev_err(&rdev->ibdev,
+				  "Limit exceeded for Receive SGEs");
 			rc = -EINVAL;
 			break;
 		}
@@ -2673,8 +2668,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"Limit exceeded for Receive SGEs");
+			ibdev_err(&qp->rdev->ibdev,
+				  "Limit exceeded for Receive SGEs");
 			rc = -EINVAL;
 			*bad_wr = wr;
 			break;
@@ -2745,7 +2740,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 
 	/* Validate CQ fields */
 	if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
-		dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
+		ibdev_err(&rdev->ibdev, "Failed to create CQ -max exceeded");
 		return -EINVAL;
 	}
 
@@ -2801,7 +2796,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 
 	rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to create HW CQ");
+		ibdev_err(&rdev->ibdev, "Failed to create HW CQ");
 		goto fail;
 	}
 
@@ -2821,7 +2816,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		resp.rsvd = 0;
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to copy CQ udata");
+			ibdev_err(&rdev->ibdev, "Failed to copy CQ udata");
 			bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
 			goto c2fail;
 		}
@@ -3100,7 +3095,7 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp,
 	pkt_type = bnxt_re_check_packet_type(cqe->raweth_qp1_flags,
 					     cqe->raweth_qp1_flags2);
 	if (pkt_type < 0) {
-		dev_err(rdev_to_dev(rdev), "Invalid packet\n");
+		ibdev_err(&rdev->ibdev, "Invalid packet\n");
 		return -EINVAL;
 	}
 
@@ -3149,8 +3144,8 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp,
 
 	rc = bnxt_re_post_recv_shadow_qp(rdev, gsi_sqp, &rwr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to post Rx buffers to shadow QP");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to post Rx buffers to shadow QP");
 		return -ENOMEM;
 	}
 
@@ -3305,11 +3300,11 @@ static int send_phantom_wqe(struct bnxt_re_qp *qp)
 	rc = bnxt_re_bind_fence_mw(lib_qp);
 	if (!rc) {
 		lib_qp->sq.phantom_wqe_cnt++;
-		dev_dbg(&lib_qp->sq.hwq.pdev->dev,
-			"qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
-			lib_qp->id, lib_qp->sq.hwq.prod,
-			HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
-			lib_qp->sq.phantom_wqe_cnt);
+		ibdev_dbg(&qp->rdev->ibdev,
+			  "qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
+			  lib_qp->id, lib_qp->sq.hwq.prod,
+			  HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
+			  lib_qp->sq.phantom_wqe_cnt);
 	}
 
 	spin_unlock_irqrestore(&qp->sq_lock, flags);
@@ -3332,7 +3327,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 	budget = min_t(u32, num_entries, cq->max_cql);
 	num_entries = budget;
 	if (!cq->cql) {
-		dev_err(rdev_to_dev(cq->rdev), "POLL CQ : no CQL to use");
+		ibdev_err(&cq->rdev->ibdev, "POLL CQ : no CQL to use");
 		goto exit;
 	}
 	cqe = &cq->cql[0];
@@ -3345,8 +3340,8 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 				qp = container_of(lib_qp,
 						  struct bnxt_re_qp, qplib_qp);
 				if (send_phantom_wqe(qp) == -ENOMEM)
-					dev_err(rdev_to_dev(cq->rdev),
-						"Phantom failed! Scheduled to send again\n");
+					ibdev_err(&cq->rdev->ibdev,
+						  "Phantom failed! Scheduled to send again\n");
 				else
 					sq->send_phantom = false;
 			}
@@ -3370,8 +3365,7 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 				 (unsigned long)(cqe->qp_handle),
 				 struct bnxt_re_qp, qplib_qp);
 			if (!qp) {
-				dev_err(rdev_to_dev(cq->rdev),
-					"POLL CQ : bad QP handle");
+				ibdev_err(&cq->rdev->ibdev, "POLL CQ : bad QP handle");
 				continue;
 			}
 			wc->qp = &qp->ib_qp;
@@ -3436,9 +3430,9 @@ int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 				bnxt_re_process_res_ud_wc(qp, wc, cqe);
 				break;
 			default:
-				dev_err(rdev_to_dev(cq->rdev),
-					"POLL CQ : type 0x%x not handled",
-					cqe->opcode);
+				ibdev_err(&cq->rdev->ibdev,
+					  "POLL CQ : type 0x%x not handled",
+					  cqe->opcode);
 				continue;
 			}
 			wc++;
@@ -3531,7 +3525,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 
 	rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Dereg MR failed: %#x\n", rc);
 		return rc;
 	}
 
@@ -3578,7 +3572,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
 	int rc;
 
 	if (type != IB_MR_TYPE_MEM_REG) {
-		dev_dbg(rdev_to_dev(rdev), "MR type 0x%x not supported", type);
+		ibdev_dbg(&rdev->ibdev, "MR type 0x%x not supported", type);
 		return ERR_PTR(-EINVAL);
 	}
 	if (max_num_sg > MAX_PBL_LVL_1_PGS)
@@ -3608,8 +3602,8 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
 	rc = bnxt_qplib_alloc_fast_reg_page_list(&rdev->qplib_res,
 						 &mr->qplib_frpl, max_num_sg);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to allocate HW FR page list");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW FR page list");
 		goto fail_mr;
 	}
 
@@ -3644,7 +3638,7 @@ struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
 			       CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B);
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mw->qplib_mw);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Allocate MW failed!");
+		ibdev_err(&rdev->ibdev, "Allocate MW failed!");
 		goto fail;
 	}
 	mw->ib_mw.rkey = mw->qplib_mw.rkey;
@@ -3665,7 +3659,7 @@ int bnxt_re_dealloc_mw(struct ib_mw *ib_mw)
 
 	rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mw->qplib_mw);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Free MW failed: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Free MW failed: %#x\n", rc);
 		return rc;
 	}
 
@@ -3717,8 +3711,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	int umem_pgs, page_shift, rc;
 
 	if (length > BNXT_RE_MAX_MR_SIZE) {
-		dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%lld\n",
-			length, BNXT_RE_MAX_MR_SIZE);
+		ibdev_err(&rdev->ibdev, "MR Size: %lld > Max supported:%lld\n",
+			  length, BNXT_RE_MAX_MR_SIZE);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -3733,7 +3727,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate MR");
+		ibdev_err(&rdev->ibdev, "Failed to allocate MR");
 		goto free_mr;
 	}
 	/* The fixed portion of the rkey is the same as the lkey */
@@ -3741,7 +3735,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 
 	umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
 	if (IS_ERR(umem)) {
-		dev_err(rdev_to_dev(rdev), "Failed to get umem");
+		ibdev_err(&rdev->ibdev, "Failed to get umem");
 		rc = -EFAULT;
 		goto free_mrw;
 	}
@@ -3750,7 +3744,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	mr->qplib_mr.va = virt_addr;
 	umem_pgs = ib_umem_page_count(umem);
 	if (!umem_pgs) {
-		dev_err(rdev_to_dev(rdev), "umem is invalid!");
+		ibdev_err(&rdev->ibdev, "umem is invalid!");
 		rc = -EINVAL;
 		goto free_umem;
 	}
@@ -3767,15 +3761,15 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 				virt_addr));
 
 	if (!bnxt_re_page_size_ok(page_shift)) {
-		dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
+		ibdev_err(&rdev->ibdev, "umem page size unsupported!");
 		rc = -EFAULT;
 		goto fail;
 	}
 
 	if (page_shift == BNXT_RE_PAGE_SHIFT_4K &&
 	    length > BNXT_RE_MAX_MR_SIZE_LOW) {
-		dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu",
-			length,	(u64)BNXT_RE_MAX_MR_SIZE_LOW);
+		ibdev_err(&rdev->ibdev, "Requested MR Sz:%llu Max sup:%llu",
+			  length, (u64)BNXT_RE_MAX_MR_SIZE_LOW);
 		rc = -EINVAL;
 		goto fail;
 	}
@@ -3785,7 +3779,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl,
 			       umem_pgs, false, 1 << page_shift);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to register user MR");
+		ibdev_err(&rdev->ibdev, "Failed to register user MR");
 		goto fail;
 	}
 
@@ -3818,12 +3812,11 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 	u32 chip_met_rev_num = 0;
 	int rc;
 
-	dev_dbg(rdev_to_dev(rdev), "ABI version requested %u",
-		ibdev->ops.uverbs_abi_ver);
+	ibdev_dbg(ibdev, "ABI version requested %u", ibdev->ops.uverbs_abi_ver);
 
 	if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
-		dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
-			BNXT_RE_ABI_VERSION);
+		ibdev_dbg(ibdev, " is different from the device %d ",
+			  BNXT_RE_ABI_VERSION);
 		return -EPERM;
 	}
 
@@ -3855,7 +3848,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
 
 	rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to copy user context");
+		ibdev_err(ibdev, "Failed to copy user context");
 		rc = -EFAULT;
 		goto cfail;
 	}
@@ -3905,15 +3898,14 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma)
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 		if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 				       PAGE_SIZE, vma->vm_page_prot)) {
-			dev_err(rdev_to_dev(rdev), "Failed to map DPI");
+			ibdev_err(&rdev->ibdev, "Failed to map DPI");
 			return -EAGAIN;
 		}
 	} else {
 		pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT;
 		if (remap_pfn_range(vma, vma->vm_start,
 				    pfn, PAGE_SIZE, vma->vm_page_prot)) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to map shared page");
+			ibdev_err(&rdev->ibdev, "Failed to map shared page");
 			return -EAGAIN;
 		}
 	}
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 65f106f84c33..b5128cce8e21 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -269,7 +269,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
 		 * to f/w will timeout and that will set the
 		 * timeout bit.
 		 */
-		dev_err(rdev_to_dev(rdev), "Failed to re-start IRQs\n");
+		ibdev_err(&rdev->ibdev, "Failed to re-start IRQs\n");
 		return;
 	}
 
@@ -286,8 +286,8 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
 		rc = bnxt_qplib_nq_start_irq(nq, indx - 1,
 					     msix_ent[indx].vector, false);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to reinit NQ index %d\n", indx - 1);
+			ibdev_warn(&rdev->ibdev, "Failed to reinit NQ index %d\n",
+				   indx - 1);
 	}
 }
 
@@ -373,9 +373,9 @@ static int bnxt_re_request_msix(struct bnxt_re_dev *rdev)
 		goto done;
 	}
 	if (num_msix_got != num_msix_want) {
-		dev_warn(rdev_to_dev(rdev),
-			 "Requested %d MSI-X vectors, got %d\n",
-			 num_msix_want, num_msix_got);
+		ibdev_warn(&rdev->ibdev,
+			   "Requested %d MSI-X vectors, got %d\n",
+			   num_msix_want, num_msix_got);
 	}
 	rdev->num_msix = num_msix_got;
 done:
@@ -422,8 +422,8 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to free HW ring:%d :%#x", req.ring_id, rc);
+		ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x",
+			  req.ring_id, rc);
 	return rc;
 }
 
@@ -483,8 +483,8 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev,
 			    sizeof(req), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to free HW stats context %#x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x",
+			  rc);
 
 	return rc;
 }
@@ -757,8 +757,8 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
 	/* Allocate bnxt_re_dev instance here */
 	rdev = ib_alloc_device(bnxt_re_dev, ibdev);
 	if (!rdev) {
-		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: bnxt_re_dev allocation failure!",
+			  ROCE_DRV_MODULE_NAME);
 		return NULL;
 	}
 	/* Default values */
@@ -887,8 +887,8 @@ static int bnxt_re_srqn_handler(struct bnxt_qplib_nq *nq,
 	int rc = 0;
 
 	if (!srq) {
-		dev_err(NULL, "%s: SRQ is NULL, SRQN not handled",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: SRQ is NULL, SRQN not handled",
+			  ROCE_DRV_MODULE_NAME);
 		rc = -EINVAL;
 		goto done;
 	}
@@ -915,8 +915,8 @@ static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq,
 					     qplib_cq);
 
 	if (!cq) {
-		dev_err(NULL, "%s: CQ is NULL, CQN not handled",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: CQ is NULL, CQN not handled",
+			  ROCE_DRV_MODULE_NAME);
 		return -EINVAL;
 	}
 	if (cq->ib_cq.comp_handler) {
@@ -963,8 +963,8 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev)
 					  db_offt, &bnxt_re_cqn_handler,
 					  &bnxt_re_srqn_handler);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to enable NQ with rc = 0x%x", rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to enable NQ with rc = 0x%x", rc);
 			goto fail;
 		}
 		num_vec_enabled++;
@@ -1039,8 +1039,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 					qplib_ctx->srqc_count + 2);
 		rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
-				i, rc);
+			ibdev_err(&rdev->ibdev, "Alloc Failed NQ%d rc:%#x",
+				  i, rc);
 			goto free_nq;
 		}
 		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
@@ -1052,9 +1052,9 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 		rattr.lrid = rdev->msix_entries[i + 1].ring_idx;
 		rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to allocate NQ fw id with rc = 0x%x",
-				rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to allocate NQ fw id with rc = 0x%x",
+				  rc);
 			bnxt_qplib_free_nq(&rdev->nq[i]);
 			goto free_nq;
 		}
@@ -1128,10 +1128,10 @@ static int bnxt_re_query_hwrm_pri2cos(struct bnxt_re_dev *rdev, u8 dir,
 		return rc;
 
 	if (resp.queue_cfg_info) {
-		dev_warn(rdev_to_dev(rdev),
-			 "Asymmetric cos queue configuration detected");
-		dev_warn(rdev_to_dev(rdev),
-			 " on device, QoS may not be fully functional\n");
+		ibdev_warn(&rdev->ibdev,
+			   "Asymmetric cos queue configuration detected");
+		ibdev_warn(&rdev->ibdev,
+			   " on device, QoS may not be fully functional\n");
 	}
 	qcfgmap = &resp.pri0_cos_queue_id;
 	tmp_map = (u8 *)cid_map;
@@ -1184,7 +1184,7 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev)
 		return 0;
 
 	if (!sgid_tbl) {
-		dev_err(rdev_to_dev(rdev), "QPLIB: SGID table not allocated");
+		ibdev_err(&rdev->ibdev, "QPLIB: SGID table not allocated");
 		return -EINVAL;
 	}
 
@@ -1261,7 +1261,7 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 	/* Get cosq id for this priority */
 	rc = bnxt_re_query_hwrm_pri2cos(rdev, 0, &cid_map);
 	if (rc) {
-		dev_warn(rdev_to_dev(rdev), "no cos for p_mask %x\n", prio_map);
+		ibdev_warn(&rdev->ibdev, "no cos for p_mask %x\n", prio_map);
 		return rc;
 	}
 	/* Parse CoS IDs for app priority */
@@ -1270,8 +1270,8 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev)
 	/* Config BONO. */
 	rc = bnxt_qplib_map_tc2cos(&rdev->qplib_res, rdev->cosq);
 	if (rc) {
-		dev_warn(rdev_to_dev(rdev), "no tc for cos{%x, %x}\n",
-			 rdev->cosq[0], rdev->cosq[1]);
+		ibdev_warn(&rdev->ibdev, "no tc for cos{%x, %x}\n",
+			   rdev->cosq[0], rdev->cosq[1]);
 		return rc;
 	}
 
@@ -1306,8 +1306,8 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to query HW version, rc = 0x%x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
+			  rc);
 		return;
 	}
 	rdev->qplib_ctx.hwrm_intf_ver =
@@ -1338,8 +1338,8 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 	if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
 		rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to deinitialize RCFW: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to deinitialize RCFW: %#x", rc);
 		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 		bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
@@ -1350,16 +1350,16 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
 		rc = bnxt_re_free_msix(rdev);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to free MSI-X vectors: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to free MSI-X vectors: %#x", rc);
 	}
 
 	bnxt_re_destroy_chip_ctx(rdev);
 	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
 		rc = bnxt_re_unregister_netdev(rdev);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to unregister with netdev: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to unregister with netdev: %#x", rc);
 	}
 }
 
@@ -1392,14 +1392,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	rc = bnxt_re_register_netdev(rdev);
 	if (rc) {
 		rtnl_unlock();
-		pr_err("Failed to register with netedev: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to register with netedev: %#x\n", rc);
 		return -EINVAL;
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
 	rc = bnxt_re_setup_chip_ctx(rdev);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to get chip context\n");
+		ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
 		return -EINVAL;
 	}
 
@@ -1408,7 +1409,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 
 	rc = bnxt_re_request_msix(rdev);
 	if (rc) {
-		pr_err("Failed to get MSI-X vectors: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to get MSI-X vectors: %#x\n", rc);
 		rc = -EINVAL;
 		goto fail;
 	}
@@ -1423,7 +1425,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 					   &rdev->qplib_ctx,
 					   BNXT_RE_MAX_QPC_COUNT);
 	if (rc) {
-		pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate RCFW Channel: %#x\n", rc);
 		goto fail;
 	}
 
@@ -1437,7 +1440,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	rattr.lrid = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
 	rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
 	if (rc) {
-		pr_err("Failed to allocate CREQ: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc);
 		goto free_rcfw;
 	}
 	db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
@@ -1446,7 +1449,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 					    vid, db_offt, rdev->is_virtfn,
 					    &bnxt_re_aeq_handler);
 	if (rc) {
-		pr_err("Failed to enable RCFW channel: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n",
+			  rc);
 		goto free_ring;
 	}
 
@@ -1460,21 +1464,24 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	rc = bnxt_qplib_alloc_ctx(&rdev->qplib_res, &rdev->qplib_ctx, 0,
 				  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx));
 	if (rc) {
-		pr_err("Failed to allocate QPLIB context: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate QPLIB context: %#x\n", rc);
 		goto disable_rcfw;
 	}
 	rc = bnxt_re_net_stats_ctx_alloc(rdev,
 					 rdev->qplib_ctx.stats.dma_map,
 					 &rdev->qplib_ctx.stats.fw_id);
 	if (rc) {
-		pr_err("Failed to allocate stats context: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate stats context: %#x\n", rc);
 		goto free_ctx;
 	}
 
 	rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx,
 				  rdev->is_virtfn);
 	if (rc) {
-		pr_err("Failed to initialize RCFW: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to initialize RCFW: %#x\n", rc);
 		goto free_sctx;
 	}
 	set_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
@@ -1482,13 +1489,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	/* Resources based on the 'new' device caps */
 	rc = bnxt_re_alloc_res(rdev);
 	if (rc) {
-		pr_err("Failed to allocate resources: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate resources: %#x\n", rc);
 		goto fail;
 	}
 	set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags);
 	rc = bnxt_re_init_res(rdev);
 	if (rc) {
-		pr_err("Failed to initialize resources: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to initialize resources: %#x\n", rc);
 		goto fail;
 	}
 
@@ -1497,7 +1506,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	if (!rdev->is_virtfn) {
 		rc = bnxt_re_setup_qos(rdev);
 		if (rc)
-			pr_info("RoCE priority not yet configured\n");
+			ibdev_info(&rdev->ibdev,
+				   "RoCE priority not yet configured\n");
 
 		INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker);
 		set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags);
@@ -1510,11 +1520,12 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 	/* Register ib dev */
 	rc = bnxt_re_register_ib(rdev);
 	if (rc) {
-		pr_err("Failed to register with IB: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to register with IB: %#x\n", rc);
 		goto fail;
 	}
 	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
-	dev_info(rdev_to_dev(rdev), "Device registered successfully");
+	ibdev_info(&rdev->ibdev, "Device registered successfully");
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
 	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
@@ -1563,7 +1574,8 @@ static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
 	en_dev = bnxt_re_dev_probe(netdev);
 	if (IS_ERR(en_dev)) {
 		if (en_dev != ERR_PTR(-ENODEV))
-			pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME);
+			ibdev_err(&(*rdev)->ibdev, "%s: Failed to probe\n",
+				  ROCE_DRV_MODULE_NAME);
 		rc = PTR_ERR(en_dev);
 		goto exit;
 	}
@@ -1600,8 +1612,8 @@ static void bnxt_re_task(struct work_struct *work)
 	case NETDEV_REGISTER:
 		rc = bnxt_re_ib_reg(rdev);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to register with IB: %#x", rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to register with IB: %#x", rc);
 			bnxt_re_remove_one(rdev);
 			bnxt_re_dev_unreg(rdev);
 			goto exit;
@@ -1678,8 +1690,9 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		if (rc == -ENODEV)
 			break;
 		if (rc) {
-			pr_err("Failed to register with the device %s: %#x\n",
-			       real_dev->name, rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to register with the device %s: %#x\n",
+				  real_dev->name, rc);
 			break;
 		}
 		bnxt_re_init_one(rdev);
@@ -1764,7 +1777,7 @@ static void __exit bnxt_re_mod_exit(void)
 	* cleanup is done before PF cleanup
 	*/
 	list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) {
-		dev_info(rdev_to_dev(rdev), "Unregistering Device");
+		ibdev_info(&rdev->ibdev, "Unregistering Device");
 		/*
 		 * Flush out any scheduled tasks before destroying the
 		 * resources

From f90f3cafe8d56d593fc509a4185da1d5800efea4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 21 Feb 2020 08:43:23 -0600
Subject: [PATCH 052/327] proc: Use d_invalidate in proc_prune_siblings_dcache

The function d_prune_aliases has the problem that it will only prune
aliases thare are completely unused.  It will not remove aliases for
the dcache or even think of removing mounts from the dcache.  For that
behavior d_invalidate is needed.

To use d_invalidate replace d_prune_aliases with d_find_alias followed
by d_invalidate and dput.

For completeness the directory and the non-directory cases are
separated because in theory (although not in currently in practice for
proc) directories can only ever have a single dentry while
non-directories can have hardlinks and thus multiple dentries.
As part of this separation use d_find_any_alias for directories
to spare d_find_alias the extra work of doing that.

Plus the differences between d_find_any_alias and d_find_alias makes
it clear why the directory and non-directory code and not share code.

To make it clear these routines now invalidate dentries rename
proc_prune_siblings_dache to proc_invalidate_siblings_dcache, and rename
proc_sys_prune_dcache proc_sys_invalidate_dcache.

V2: Split the directory and non-directory cases.  To make this
    code robust to future changes in proc.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/inode.c       | 16 ++++++++++++++--
 fs/proc/internal.h    |  2 +-
 fs/proc/proc_sysctl.c |  8 ++++----
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fa2dc732cd77..ba6acd300ce1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -103,7 +103,7 @@ void __init proc_init_kmemcache(void)
 	BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE);
 }
 
-void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
 {
 	struct inode *inode;
 	struct proc_inode *ei;
@@ -137,7 +137,19 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock)
 			continue;
 		}
 
-		d_prune_aliases(inode);
+		if (S_ISDIR(inode->i_mode)) {
+			struct dentry *dir = d_find_any_alias(inode);
+			if (dir) {
+				d_invalidate(dir);
+				dput(dir);
+			}
+		} else {
+			struct dentry *dentry;
+			while ((dentry = d_find_alias(inode))) {
+				d_invalidate(dentry);
+				dput(dentry);
+			}
+		}
 		iput(inode);
 
 		rcu_read_lock();
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ba9a991824a5..fd470172675f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -210,7 +210,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
 extern const struct super_operations proc_sops;
 
 void proc_init_kmemcache(void);
-void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
+void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock);
 void set_proc_pid_nlink(void);
 extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5da9d7f7ae34..b6f5d459b087 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -267,9 +267,9 @@ static void unuse_table(struct ctl_table_header *p)
 			complete(p->unregistering);
 }
 
-static void proc_sys_prune_dcache(struct ctl_table_header *head)
+static void proc_sys_invalidate_dcache(struct ctl_table_header *head)
 {
-	proc_prune_siblings_dcache(&head->inodes, &sysctl_lock);
+	proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock);
 }
 
 /* called under sysctl_lock, will reacquire if has to wait */
@@ -291,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p)
 		spin_unlock(&sysctl_lock);
 	}
 	/*
-	 * Prune dentries for unregistered sysctls: namespaced sysctls
+	 * Invalidate dentries for unregistered sysctls: namespaced sysctls
 	 * can have duplicate names and contaminate dcache very badly.
 	 */
-	proc_sys_prune_dcache(p);
+	proc_sys_invalidate_dcache(p);
 	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.

From 71448011ea2a1cd36d8f5cbdab0ed716c454d565 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 20 Feb 2020 11:17:28 -0600
Subject: [PATCH 053/327] proc: Clear the pieces of proc_inode that
 proc_evict_inode cares about

This just keeps everything tidier, and allows for using flags like
SLAB_TYPESAFE_BY_RCU where slabs are not always cleared before reuse.
I don't see reuse without reinitializing happening with the proc_inode
but I had a false alarm while reworking flushing of proc dentries and
indoes when a process dies that caused me to tidy this up.

The code is a little easier to follow and reason about this
way so I figured the changes might as well be kept.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/inode.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index ba6acd300ce1..d9243b24554a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
 	struct ctl_table_header *head;
+	struct proc_inode *ei = PROC_I(inode);
 
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	/* Stop tracking associated processes */
-	put_pid(PROC_I(inode)->pid);
+	if (ei->pid) {
+		put_pid(ei->pid);
+		ei->pid = NULL;
+	}
 
 	/* Let go of any associated proc directory entry */
-	de = PDE(inode);
-	if (de)
+	de = ei->pde;
+	if (de) {
 		pde_put(de);
+		ei->pde = NULL;
+	}
 
-	head = PROC_I(inode)->sysctl;
+	head = ei->sysctl;
 	if (head) {
-		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
+		RCU_INIT_POINTER(ei->sysctl, NULL);
 		proc_sys_evict_inode(inode, head);
 	}
 }

From 7bc3e6e55acf065500a24621f3b313e7e5998acf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 19 Feb 2020 18:22:26 -0600
Subject: [PATCH 054/327] proc: Use a list of inodes to flush from proc

Rework the flushing of proc to use a list of directory inodes that
need to be flushed.

The list is kept on struct pid not on struct task_struct, as there is
a fixed connection between proc inodes and pids but at least for the
case of de_thread the pid of a task_struct changes.

This removes the dependency on proc_mnt which allows for different
mounts of proc having different mount options even in the same pid
namespace and this allows for the removal of proc_mnt which will
trivially the first mount of proc to honor it's mount options.

This flushing remains an optimization.  The functions
pid_delete_dentry and pid_revalidate ensure that ordinary dcache
management will not attempt to use dentries past the point their
respective task has died.  When unused the shrinker will
eventually be able to remove these dentries.

There is a case in de_thread where proc_flush_pid can be
called early for a given pid.  Which winds up being
safe (if suboptimal) as this is just an optiimization.

Only pid directories are put on the list as the other
per pid files are children of those directories and
d_invalidate on the directory will get them as well.

So that the pid can be used during flushing it's reference count is
taken in release_task and dropped in proc_flush_pid.  Further the call
of proc_flush_pid is moved after the tasklist_lock is released in
release_task so that it is certain that the pid has already been
unhashed when flushing it taking place.  This removes a small race
where a dentry could recreated.

As struct pid is supposed to be small and I need a per pid lock
I reuse the only lock that currently exists in struct pid the
the wait_pidfd.lock.

The net result is that this adds all of this functionality
with just a little extra list management overhead and
a single extra pointer in struct pid.

v2: Initialize pid->inodes.  I somehow failed to get that
    initialization into the initial version of the patch.  A boot
    failure was reported by "kernel test robot <lkp@intel.com>", and
    failure to initialize that pid->inodes matches all of the reported
    symptoms.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c          | 111 +++++++++++++---------------------------
 fs/proc/inode.c         |   2 +-
 fs/proc/internal.h      |   1 +
 include/linux/pid.h     |   1 +
 include/linux/proc_fs.h |   4 +-
 kernel/exit.c           |   4 +-
 kernel/pid.c            |   1 +
 7 files changed, 45 insertions(+), 79 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..e7efe9d6f3d6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	*rgid = gid;
 }
 
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+	struct pid *pid = ei->pid;
+
+	if (S_ISDIR(ei->vfs_inode.i_mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_del_init_rcu(&ei->sibling_inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
+	put_pid(pid);
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
+	struct pid *pid;
 
 	/* We need a new inode */
 
@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	/*
 	 * grab the reference to task.
 	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
-	if (!ei->pid)
+	pid = get_task_pid(task, PIDTYPE_PID);
+	if (!pid)
 		goto out_unlock;
 
+	/* Let the pid remember us for quick removal */
+	ei->pid = pid;
+	if (S_ISDIR(mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 
@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.permission	= proc_pid_permission,
 };
 
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
-	struct dentry *dentry, *leader, *dir;
-	char buf[10 + 1];
-	struct qstr name;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	/* no ->d_hash() rejects on procfs */
-	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	if (pid == tgid)
-		return;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
-	leader = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (!leader)
-		goto out;
-
-	name.name = "task";
-	name.len = strlen(name.name);
-	dir = d_hash_and_lookup(leader, &name);
-	if (!dir)
-		goto out_put_leader;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	dentry = d_hash_and_lookup(dir, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	dput(dir);
-out_put_leader:
-	dput(leader);
-out:
-	return;
-}
-
 /**
- * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
  *
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
  *
  * It is safe and reasonable to cache /proc entries for a task until
  * that task exits.  After that they just clog up the dcache with
  * useless entries, possibly causing useful dcache entries to be
- * flushed instead.  This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead.  This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
  *
  * NOTE: This routine is just an optimization so it does not guarantee
- *       that no dcache entries will exist at process exit time it
- *       just makes it very unlikely that any will persist.
+ *       that no dcache entries will exist after a process is reaped
+ *       it just makes it very unlikely that any will persist.
  */
 
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
 {
-	int i;
-	struct pid *pid, *tgid;
-	struct upid *upid;
-
-	pid = task_pid(task);
-	tgid = task_tgid(task);
-
-	for (i = 0; i <= pid->level; i++) {
-		upid = &pid->numbers[i];
-		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-					tgid->numbers[i].nr);
-	}
+	proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+	put_pid(pid);
 }
 
 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d9243b24554a..1e730ea1dcd6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
 
 	/* Stop tracking associated processes */
 	if (ei->pid) {
-		put_pid(ei->pid);
+		proc_pid_evict_inode(ei);
 		ei->pid = NULL;
 	}
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index fd470172675f..9e294f0290e5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 998ae7d24450..01a0d4e28506 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -62,6 +62,7 @@ struct pid
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
+	struct hlist_head inodes;
 	/* wait queue for pidfd notifications */
 	wait_queue_head_t wait_pidfd;
 	struct rcu_head rcu;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3dfa92633af3..40a7982b7285 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -32,7 +32,7 @@ struct proc_ops {
 typedef int (*proc_write_t)(struct file *, char *, size_t);
 
 extern void proc_root_init(void);
-extern void proc_flush_task(struct task_struct *);
+extern void proc_flush_pid(struct pid *);
 
 extern struct proc_dir_entry *proc_symlink(const char *,
 		struct proc_dir_entry *, const char *);
@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
 {
 }
 
-static inline void proc_flush_task(struct task_struct *task)
+static inline void proc_flush_pid(struct pid *pid)
 {
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..502b4995b688 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
+	struct pid *thread_pid;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -199,11 +200,11 @@ void release_task(struct task_struct *p)
 	atomic_dec(&__task_cred(p)->user->processes);
 	rcu_read_unlock();
 
-	proc_flush_task(p);
 	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
+	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
 
 	/*
@@ -226,6 +227,7 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	proc_flush_pid(thread_pid);
 	release_thread(p);
 	put_task_struct_rcu_user(p);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 0f4ecb57214c..ca08d6a3aa77 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -258,6 +258,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
 	init_waitqueue_head(&pid->wait_pidfd);
+	INIT_HLIST_HEAD(&pid->inodes);
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);

From 65a166201552113f9e1e8d1bb4a55a1eb70cb19c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 26 Feb 2020 10:17:24 -0400
Subject: [PATCH 055/327] RDMA/bnxt_re: Using vmalloc requires including
 vmalloc.h

Add it

Fixes: 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory allocation")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Devesh Sharma <devesh.sharma@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_res.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 4346b95963cf..fc5909c7f2e0 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -44,6 +44,7 @@
 #include <linux/inetdevice.h>
 #include <linux/dma-mapping.h>
 #include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
 #include "roce_hsi.h"
 #include "qplib_res.h"
 #include "qplib_sp.h"

From d8e93e3f22d9fd2e6a3ccae3623c3af8789ccfc0 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 27 Feb 2020 07:37:40 -0500
Subject: [PATCH 056/327] XArray: Optimise xas_sibling() if
 !CONFIG_XARRAY_MULTI

If CONFIG_XARRAY_MULTI is disabled, then xas_sibling() must be false.

Reported-by: JaeJoon Jung <rgbi3307@gmail.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 lib/xarray.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/xarray.c b/lib/xarray.c
index 05324cf571f4..f448bcd263ac 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1836,7 +1836,7 @@ static bool xas_sibling(struct xa_state *xas)
 	struct xa_node *node = xas->xa_node;
 	unsigned long mask;
 
-	if (!node)
+	if (!IS_ENABLED(CONFIG_XARRAY_MULTI) || !node)
 		return false;
 	mask = (XA_CHUNK_SIZE << node->shift) - 1;
 	return (xas->xa_index & mask) >

From 34eee836a9dd3e1987c10ed6afc7ece4131a993d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 27 Feb 2020 12:25:47 -0500
Subject: [PATCH 057/327] radix tree test suite: Support kmem_cache alignment

The radix tree doesn't use alignment, so the argument was ignored.
The maple tree needs its nodes to be aligned, so we need to pay attention
to the alignment argument.  Also change the types of 'size' and 'align'
to unsigned int to match commit f4957d5bd0916.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 tools/testing/radix-tree/linux.c      | 32 +++++++++++++++++----------
 tools/testing/radix-tree/linux/slab.h |  6 ++---
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
index 44a0d1ad4408..2d9c59df60de 100644
--- a/tools/testing/radix-tree/linux.c
+++ b/tools/testing/radix-tree/linux.c
@@ -19,37 +19,44 @@ int test_verbose;
 
 struct kmem_cache {
 	pthread_mutex_t lock;
-	int size;
+	unsigned int size;
+	unsigned int align;
 	int nr_objs;
 	void *objs;
 	void (*ctor)(void *);
 };
 
-void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
+void *kmem_cache_alloc(struct kmem_cache *cachep, int gfp)
 {
-	struct radix_tree_node *node;
+	void *p;
 
-	if (!(flags & __GFP_DIRECT_RECLAIM))
+	if (!(gfp & __GFP_DIRECT_RECLAIM))
 		return NULL;
 
 	pthread_mutex_lock(&cachep->lock);
 	if (cachep->nr_objs) {
+		struct radix_tree_node *node = cachep->objs;
 		cachep->nr_objs--;
-		node = cachep->objs;
 		cachep->objs = node->parent;
 		pthread_mutex_unlock(&cachep->lock);
 		node->parent = NULL;
+		p = node;
 	} else {
 		pthread_mutex_unlock(&cachep->lock);
-		node = malloc(cachep->size);
+		if (cachep->align)
+			posix_memalign(&p, cachep->align, cachep->size);
+		else
+			p = malloc(cachep->size);
 		if (cachep->ctor)
-			cachep->ctor(node);
+			cachep->ctor(p);
+		else if (gfp & __GFP_ZERO)
+			memset(p, 0, cachep->size);
 	}
 
 	uatomic_inc(&nr_allocated);
 	if (kmalloc_verbose)
-		printf("Allocating %p from slab\n", node);
-	return node;
+		printf("Allocating %p from slab\n", p);
+	return p;
 }
 
 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
@@ -59,7 +66,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	if (kmalloc_verbose)
 		printf("Freeing %p to slab\n", objp);
 	pthread_mutex_lock(&cachep->lock);
-	if (cachep->nr_objs > 10) {
+	if (cachep->nr_objs > 10 || cachep->align) {
 		memset(objp, POISON_FREE, cachep->size);
 		free(objp);
 	} else {
@@ -98,13 +105,14 @@ void kfree(void *p)
 }
 
 struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t offset,
-	unsigned long flags, void (*ctor)(void *))
+kmem_cache_create(const char *name, unsigned int size, unsigned int align,
+		unsigned int flags, void (*ctor)(void *))
 {
 	struct kmem_cache *ret = malloc(sizeof(*ret));
 
 	pthread_mutex_init(&ret->lock, NULL);
 	ret->size = size;
+	ret->align = align;
 	ret->nr_objs = 0;
 	ret->objs = NULL;
 	ret->ctor = ctor;
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
index a037def0dec6..2958830ce4d7 100644
--- a/tools/testing/radix-tree/linux/slab.h
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -20,8 +20,8 @@ static inline void *kzalloc(size_t size, gfp_t gfp)
 void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
 void kmem_cache_free(struct kmem_cache *cachep, void *objp);
 
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t offset,
-	unsigned long flags, void (*ctor)(void *));
+struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
+			unsigned int align, unsigned int flags,
+			void (*ctor)(void *));
 
 #endif		/* SLAB_H */

From 26df6034fdb211857e069e7b07d25a368da646df Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Jan 2020 10:44:29 -0500
Subject: [PATCH 058/327] fix automount/automount race properly

Protection against automount/automount races (two threads hitting the same
referral point at the same time) is based upon do_add_mount() prevention of
identical overmounts - trying to overmount the root of mounted tree with
the same tree fails with -EBUSY.  It's unreliable (the other thread might've
mounted something on top of the automount it has triggered) *and* causes
no end of headache for follow_automount() and its caller, since
finish_automount() behaves like do_new_mount() - if the mountpoint to be is
overmounted, it mounts on top what's overmounting it.  It's not only wrong
(we want to go into what's overmounting the automount point and quietly
discard what we planned to mount there), it introduces the possibility of
original parent mount getting dropped.  That's what 8aef18845266 (VFS: Fix
vfsmount overput on simultaneous automount) deals with, but it can't do
anything about the reliability of conflict detection - if something had
been overmounted the other thread's automount (e.g. that other thread
having stepped into automount in mount(2)), we don't get that -EBUSY and
the result is
	 referral point under automounted NFS under explicit overmount
under another copy of automounted NFS

What we need is finish_automount() *NOT* digging into overmounts - if it
finds one, it should just quietly discard the thing it was asked to mount.
And don't bother with actually crossing into the results of finish_automount() -
the same loop that calls follow_automount() will do that just fine on the
next iteration.

IOW, instead of calling lock_mount() have finish_automount() do it manually,
_without_ the "move into overmount and retry" part.  And leave crossing into
the results to the caller of follow_automount(), which simplifies it a lot.

Moral: if you end up with a lot of glue working around the calling conventions
of something, perhaps these calling conventions are simply wrong...

Fixes: 8aef18845266 (VFS: Fix vfsmount overput on simultaneous automount)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c     | 29 ++++-------------------------
 fs/namespace.c | 40 +++++++++++++++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index db6565c99825..626eddb33508 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1208,11 +1208,9 @@ EXPORT_SYMBOL(follow_up);
  * - return -EISDIR to tell follow_managed() to stop and return the path we
  *   were called with.
  */
-static int follow_automount(struct path *path, struct nameidata *nd,
-			    bool *need_mntput)
+static int follow_automount(struct path *path, struct nameidata *nd)
 {
 	struct vfsmount *mnt;
-	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
 		return -EREMOTE;
@@ -1253,29 +1251,10 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 		return PTR_ERR(mnt);
 	}
 
-	if (!mnt) /* mount collision */
+	if (!mnt)
 		return 0;
 
-	if (!*need_mntput) {
-		/* lock_mount() may release path->mnt on error */
-		mntget(path->mnt);
-		*need_mntput = true;
-	}
-	err = finish_automount(mnt, path);
-
-	switch (err) {
-	case -EBUSY:
-		/* Someone else made a mount here whilst we were busy */
-		return 0;
-	case 0:
-		path_put(path);
-		path->mnt = mnt;
-		path->dentry = dget(mnt->mnt_root);
-		return 0;
-	default:
-		return err;
-	}
-
+	return finish_automount(mnt, path);
 }
 
 /*
@@ -1333,7 +1312,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 
 		/* Handle an automount point */
 		if (flags & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, nd, &need_mntput);
+			ret = follow_automount(path, nd);
 			if (ret < 0)
 				break;
 			continue;
diff --git a/fs/namespace.c b/fs/namespace.c
index dcd015fafe01..777c3116e62e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2823,6 +2823,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 
 int finish_automount(struct vfsmount *m, struct path *path)
 {
+	struct dentry *dentry = path->dentry;
 	struct mount *mnt = real_mount(m);
 	struct mountpoint *mp;
 	int err;
@@ -2832,21 +2833,46 @@ int finish_automount(struct vfsmount *m, struct path *path)
 	BUG_ON(mnt_get_count(mnt) < 2);
 
 	if (m->mnt_sb == path->mnt->mnt_sb &&
-	    m->mnt_root == path->dentry) {
+	    m->mnt_root == dentry) {
 		err = -ELOOP;
-		goto fail;
+		goto discard;
 	}
 
-	mp = lock_mount(path);
+	/*
+	 * we don't want to use lock_mount() - in this case finding something
+	 * that overmounts our mountpoint to be means "quitely drop what we've
+	 * got", not "try to mount it on top".
+	 */
+	inode_lock(dentry->d_inode);
+	namespace_lock();
+	if (unlikely(cant_mount(dentry))) {
+		err = -ENOENT;
+		goto discard_locked;
+	}
+	rcu_read_lock();
+	if (unlikely(__lookup_mnt(path->mnt, dentry))) {
+		rcu_read_unlock();
+		err = 0;
+		goto discard_locked;
+	}
+	rcu_read_unlock();
+	mp = get_mountpoint(dentry);
 	if (IS_ERR(mp)) {
 		err = PTR_ERR(mp);
-		goto fail;
+		goto discard_locked;
 	}
+
 	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
 	unlock_mount(mp);
-	if (!err)
-		return 0;
-fail:
+	if (unlikely(err))
+		goto discard;
+	mntput(m);
+	return 0;
+
+discard_locked:
+	namespace_unlock();
+	inode_unlock(dentry->d_inode);
+discard:
 	/* remove m from any expiration list it may be on */
 	if (!list_empty(&mnt->mnt_expire)) {
 		namespace_lock();

From 25e195aa1e607f129ab912d29fcfc79239703307 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Jan 2020 11:27:46 -0500
Subject: [PATCH 059/327] follow_automount(): get rid of dead^Wstillborn code

1) no instances of ->d_automount() have ever made use of the "return
ERR_PTR(-EISDIR) if you don't feel like mounting anything" - that's
a rudiment of plans that got superseded before the thing went into
the tree.  Despite the comment in follow_automount(), autofs has
never done that.

2) if there's no ->d_automount() in dentry_operations, filesystems
should not set DCACHE_NEED_AUTOMOUNT in the first place.  None have
ever done so...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c     | 28 +++-------------------------
 fs/namespace.c |  9 ++++++++-
 2 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 626eddb33508..39dd56f5171f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1210,10 +1210,7 @@ EXPORT_SYMBOL(follow_up);
  */
 static int follow_automount(struct path *path, struct nameidata *nd)
 {
-	struct vfsmount *mnt;
-
-	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
-		return -EREMOTE;
+	struct dentry *dentry = path->dentry;
 
 	/* We don't want to mount if someone's just doing a stat -
 	 * unless they're stat'ing a directory and appended a '/' to
@@ -1228,33 +1225,14 @@ static int follow_automount(struct path *path, struct nameidata *nd)
 	 */
 	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
 			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
-	    path->dentry->d_inode)
+	    dentry->d_inode)
 		return -EISDIR;
 
 	nd->total_link_count++;
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
-	mnt = path->dentry->d_op->d_automount(path);
-	if (IS_ERR(mnt)) {
-		/*
-		 * The filesystem is allowed to return -EISDIR here to indicate
-		 * it doesn't want to automount.  For instance, autofs would do
-		 * this so that its userspace daemon can mount on this dentry.
-		 *
-		 * However, we can only permit this if it's a terminal point in
-		 * the path being looked up; if it wasn't then the remainder of
-		 * the path is inaccessible and we should say so.
-		 */
-		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
-			return -EREMOTE;
-		return PTR_ERR(mnt);
-	}
-
-	if (!mnt)
-		return 0;
-
-	return finish_automount(mnt, path);
+	return finish_automount(dentry->d_op->d_automount(path), path);
 }
 
 /*
diff --git a/fs/namespace.c b/fs/namespace.c
index 777c3116e62e..743980380a8f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2824,9 +2824,16 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
 int finish_automount(struct vfsmount *m, struct path *path)
 {
 	struct dentry *dentry = path->dentry;
-	struct mount *mnt = real_mount(m);
 	struct mountpoint *mp;
+	struct mount *mnt;
 	int err;
+
+	if (!m)
+		return 0;
+	if (IS_ERR(m))
+		return PTR_ERR(m);
+
+	mnt = real_mount(m);
 	/* The new mount record should have at least 2 refs to prevent it being
 	 * expired before we get a chance to add it
 	 */

From 1c9f5e06a613cc48608db1a4207b8bd5ebe70a81 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 16 Jan 2020 22:05:18 -0500
Subject: [PATCH 060/327] follow_automount() doesn't need the entire nameidata

Only the address of ->total_link_count and the flags.
And fix an off-by-one is ELOOP detection - make it
consistent with symlink following, where we check if
the pre-increment value has reached 40, rather than
check the post-increment one.

[kudos to Christian Brauner for spotted braino]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 39dd56f5171f..6721c5f7e9d5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1208,7 +1208,7 @@ EXPORT_SYMBOL(follow_up);
  * - return -EISDIR to tell follow_managed() to stop and return the path we
  *   were called with.
  */
-static int follow_automount(struct path *path, struct nameidata *nd)
+static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
 {
 	struct dentry *dentry = path->dentry;
 
@@ -1223,13 +1223,12 @@ static int follow_automount(struct path *path, struct nameidata *nd)
 	 * as being automount points.  These will need the attentions
 	 * of the daemon to instantiate them before they can be used.
 	 */
-	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
 			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
 	    dentry->d_inode)
 		return -EISDIR;
 
-	nd->total_link_count++;
-	if (nd->total_link_count >= 40)
+	if (count && (*count)++ >= MAXSYMLINKS)
 		return -ELOOP;
 
 	return finish_automount(dentry->d_op->d_automount(path), path);
@@ -1290,7 +1289,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 
 		/* Handle an automount point */
 		if (flags & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, nd);
+			ret = follow_automount(path, &nd->total_link_count,
+						nd->flags);
 			if (ret < 0)
 				break;
 			continue;

From 31d1726d7250021c66c9f16d8a128444676db782 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 8 Jan 2020 20:19:38 -0500
Subject: [PATCH 061/327] make build_open_flags() treat O_CREAT | O_EXCL as
 implying O_NOFOLLOW

O_CREAT | O_EXCL means "-EEXIST if we run into a trailing symlink".
As it is, we might or might not have LOOKUP_FOLLOW in op->intent
in that case - that depends upon having O_NOFOLLOW in open flags.
It doesn't matter, since we won't be checking it in that case -
do_last() bails out earlier.

However, making sure it's not set (i.e. acting as if we had an explicit
O_NOFOLLOW) makes the behaviour more explicit and allows to reorder the
check for O_CREAT | O_EXCL in do_last() with the call of step_into()
immediately following it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 +++++----------
 fs/open.c  |  4 +++-
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6721c5f7e9d5..6938d20aa73a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3396,22 +3396,17 @@ static int do_last(struct nameidata *nd,
 	if (unlikely(error < 0))
 		return error;
 
-	/*
-	 * create/update audit record if it already exists.
-	 */
-	audit_inode(nd->name, path.dentry, 0);
-
-	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
-		path_to_nameidata(&path, nd);
-		return -EEXIST;
-	}
-
 	seq = 0;	/* out of RCU mode, so the value doesn't matter */
 	inode = d_backing_inode(path.dentry);
 finish_lookup:
 	error = step_into(nd, &path, 0, inode, seq);
 	if (unlikely(error))
 		return error;
+
+	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
+		audit_inode(nd->name, nd->path.dentry, 0);
+		return -EEXIST;
+	}
 finish_open:
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 	error = complete_walk(nd);
diff --git a/fs/open.c b/fs/open.c
index 0788b3715731..e5227cd533f4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1049,8 +1049,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
 
 	if (flags & O_CREAT) {
 		op->intent |= LOOKUP_CREATE;
-		if (flags & O_EXCL)
+		if (flags & O_EXCL) {
 			op->intent |= LOOKUP_EXCL;
+			flags |= O_NOFOLLOW;
+		}
 	}
 
 	if (flags & O_DIRECTORY)

From bd7c4b508344680c843e2d2436d56b9fc8aedc9d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 8 Jan 2020 20:37:23 -0500
Subject: [PATCH 062/327] handle_mounts(): start building a sane wrapper for
 follow_managed()

All callers of follow_managed() follow it on success with the same steps -
d_backing_inode(path->dentry) is calculated and stored into some struct inode *
variable and, in all but one case, an unsigned variable (nd->seq to be) is
zeroed.  The single exception is lookup_fast() and there zeroing is correct
thing to do - not doing it is a pointless microoptimization.

	Add a wrapper for follow_managed() that would do that combination.
It's mostly a vehicle for code massage - it will be changing quite a bit,
and the current calling conventions are by no means final.  Right now it
takes path, nameidata and (as out params) inode and seq, similar to
__follow_mount_rcu().  Which will soon get folded into it...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6938d20aa73a..c104ec75faef 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1385,6 +1385,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
+static inline int handle_mounts(struct path *path, struct nameidata *nd,
+			  struct inode **inode, unsigned int *seqp)
+{
+	int ret = follow_managed(path, nd);
+
+	if (likely(ret >= 0)) {
+		*inode = d_backing_inode(path->dentry);
+		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
+	}
+	return ret;
+}
+
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct inode *inode = nd->inode;
@@ -1607,7 +1619,6 @@ static int lookup_fast(struct nameidata *nd,
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
 	int status = 1;
-	int err;
 
 	/*
 	 * Rename seqlock is not required here because in the off chance
@@ -1677,10 +1688,7 @@ static int lookup_fast(struct nameidata *nd,
 
 	path->mnt = mnt;
 	path->dentry = dentry;
-	err = follow_managed(path, nd);
-	if (likely(err > 0))
-		*inode = d_backing_inode(path->dentry);
-	return err;
+	return handle_mounts(path, nd, inode, seqp);
 }
 
 /* Fast lookup failed, do it the slow way */
@@ -1875,12 +1883,9 @@ static int walk_component(struct nameidata *nd, int flags)
 			return PTR_ERR(path.dentry);
 
 		path.mnt = nd->path.mnt;
-		err = follow_managed(&path, nd);
+		err = handle_mounts(&path, nd, &inode, &seq);
 		if (unlikely(err < 0))
 			return err;
-
-		seq = 0;	/* we are already out of RCU mode */
-		inode = d_backing_inode(path.dentry);
 	}
 
 	return step_into(nd, &path, flags, inode, seq);
@@ -2365,11 +2370,9 @@ static int handle_lookup_down(struct nameidata *nd)
 			return -ECHILD;
 	} else {
 		dget(path.dentry);
-		err = follow_managed(&path, nd);
+		err = handle_mounts(&path, nd, &inode, &seq);
 		if (unlikely(err < 0))
 			return err;
-		inode = d_backing_inode(path.dentry);
-		seq = 0;
 	}
 	path_to_nameidata(&path, nd);
 	nd->inode = inode;
@@ -3392,12 +3395,9 @@ static int do_last(struct nameidata *nd,
 		got_write = false;
 	}
 
-	error = follow_managed(&path, nd);
+	error = handle_mounts(&path, nd, &inode, &seq);
 	if (unlikely(error < 0))
 		return error;
-
-	seq = 0;	/* out of RCU mode, so the value doesn't matter */
-	inode = d_backing_inode(path.dentry);
 finish_lookup:
 	error = step_into(nd, &path, 0, inode, seq);
 	if (unlikely(error))

From 239eb983383b1b140e588fe8f1c7d1ce953285f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:12:40 -0500
Subject: [PATCH 063/327] atomic_open(): saner calling conventions (return
 dentry on success)

Currently it either returns -E... or puts (nd->path.mnt,dentry)
into *path and returns 0.  Make it return ERR_PTR(-E...) or
dentry; adjust the caller.  Fewer arguments and it's easier
to keep track of *path contents that way.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c104ec75faef..5f8b791a6d6e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3087,10 +3087,10 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
  *
  * Returns an error code otherwise.
  */
-static int atomic_open(struct nameidata *nd, struct dentry *dentry,
-			struct path *path, struct file *file,
-			const struct open_flags *op,
-			int open_flag, umode_t mode)
+static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
+				  struct file *file,
+				  const struct open_flags *op,
+				  int open_flag, umode_t mode)
 {
 	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
 	struct inode *dir =  nd->path.dentry->d_inode;
@@ -3131,17 +3131,15 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 			}
 			if (file->f_mode & FMODE_CREATED)
 				fsnotify_create(dir, dentry);
-			if (unlikely(d_is_negative(dentry))) {
+			if (unlikely(d_is_negative(dentry)))
 				error = -ENOENT;
-			} else {
-				path->dentry = dentry;
-				path->mnt = nd->path.mnt;
-				return 0;
-			}
 		}
 	}
-	dput(dentry);
-	return error;
+	if (error) {
+		dput(dentry);
+		dentry = ERR_PTR(error);
+	}
+	return dentry;
 }
 
 /*
@@ -3236,11 +3234,20 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 	}
 
 	if (dir_inode->i_op->atomic_open) {
-		error = atomic_open(nd, dentry, path, file, op, open_flag,
-				    mode);
-		if (unlikely(error == -ENOENT) && create_error)
-			error = create_error;
-		return error;
+		dentry = atomic_open(nd, dentry, file, op, open_flag, mode);
+		if (IS_ERR(dentry)) {
+			error = PTR_ERR(dentry);
+			if (unlikely(error == -ENOENT) && create_error)
+				error = create_error;
+			return error;
+		}
+		if (file->f_mode & FMODE_OPENED) {
+			dput(dentry);
+			return 0;
+		}
+		path->mnt = nd->path.mnt;
+		path->dentry = dentry;
+		return 0;
 	}
 
 no_open:

From 25baba217cdfc1dfc75a9f427f4551b9a90e372b Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalheib1@gmail.com>
Date: Tue, 18 Feb 2020 11:59:11 +0200
Subject: [PATCH 064/327] RDMA/siw: Fix setting active_{speed, width}
 attributes

Make sure to set the active_{speed, width} attributes to avoid reporting
the same values regardless of the underlying device.

Fixes: 303ae1cdfdf7 ("rdma/siw: application interface")
Link: https://lore.kernel.org/r/20200218095911.26614-1-kamalheib1@gmail.com
Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
Tested-by: Bernard Metzler <bmt@zurich.ibm.com>
Reviewed-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_verbs.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 73485d0da907..d5390d498c61 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -165,11 +165,12 @@ int siw_query_port(struct ib_device *base_dev, u8 port,
 		   struct ib_port_attr *attr)
 {
 	struct siw_device *sdev = to_siw_dev(base_dev);
+	int rv;
 
 	memset(attr, 0, sizeof(*attr));
 
-	attr->active_speed = 2;
-	attr->active_width = 2;
+	rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
+			 &attr->active_width);
 	attr->gid_tbl_len = 1;
 	attr->max_msg_sz = -1;
 	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
@@ -192,7 +193,7 @@ int siw_query_port(struct ib_device *base_dev, u8 port,
 	 * attr->subnet_timeout = 0;
 	 * attr->init_type_repy = 0;
 	 */
-	return 0;
+	return rv;
 }
 
 int siw_get_port_immutable(struct ib_device *base_dev, u8 port,

From 7c11910783a1ea17e88777552ef146cace607b3c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 18 Feb 2020 15:45:38 -0400
Subject: [PATCH 065/327] RDMA/ucma: Put a lock around every call to the
 rdma_cm layer

The rdma_cm must be used single threaded.

This appears to be a bug in the design, as it does have lots of locking
that seems like it should allow concurrency. However, when it is all said
and done every single place that uses the cma_exch() scheme is broken, and
all the unlocked reads from the ucma of the cm_id data are wrong too.

syzkaller has been finding endless bugs related to this.

Fixing this in any elegant way is some enormous amount of work. Take a
very big hammer and put a mutex around everything to do with the
ucma_context at the top of every syscall.

Fixes: 75216638572f ("RDMA/cma: Export rdma cm interface to userspace")
Link: https://lore.kernel.org/r/20200218210432.GA31966@ziepe.ca
Reported-by: syzbot+adb15cf8c2798e4e0db4@syzkaller.appspotmail.com
Reported-by: syzbot+e5579222b6a3edd96522@syzkaller.appspotmail.com
Reported-by: syzbot+4b628fcc748474003457@syzkaller.appspotmail.com
Reported-by: syzbot+29ee8f76017ce6cf03da@syzkaller.appspotmail.com
Reported-by: syzbot+6956235342b7317ec564@syzkaller.appspotmail.com
Reported-by: syzbot+b358909d8d01556b790b@syzkaller.appspotmail.com
Reported-by: syzbot+6b46b135602a3f3ac99e@syzkaller.appspotmail.com
Reported-by: syzbot+8458d13b13562abf6b77@syzkaller.appspotmail.com
Reported-by: syzbot+bd034f3fdc0402e942ed@syzkaller.appspotmail.com
Reported-by: syzbot+c92378b32760a4eef756@syzkaller.appspotmail.com
Reported-by: syzbot+68b44a1597636e0b342c@syzkaller.appspotmail.com
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/ucma.c | 49 ++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 66ad29c672fc..16b6cf57fa85 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -91,6 +91,7 @@ struct ucma_context {
 
 	struct ucma_file	*file;
 	struct rdma_cm_id	*cm_id;
+	struct mutex		mutex;
 	u64			uid;
 
 	struct list_head	list;
@@ -216,6 +217,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
 	init_completion(&ctx->comp);
 	INIT_LIST_HEAD(&ctx->mc_list);
 	ctx->file = file;
+	mutex_init(&ctx->mutex);
 
 	if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
 		goto error;
@@ -589,6 +591,7 @@ static int ucma_free_ctx(struct ucma_context *ctx)
 	}
 
 	events_reported = ctx->events_reported;
+	mutex_destroy(&ctx->mutex);
 	kfree(ctx);
 	return events_reported;
 }
@@ -658,7 +661,10 @@ static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+	mutex_unlock(&ctx->mutex);
+
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -681,7 +687,9 @@ static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -705,8 +713,10 @@ static ssize_t ucma_resolve_ip(struct ucma_file *file,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
 				(struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -731,8 +741,10 @@ static ssize_t ucma_resolve_addr(struct ucma_file *file,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
 				(struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -752,7 +764,9 @@ static ssize_t ucma_resolve_route(struct ucma_file *file,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -841,6 +855,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	memset(&resp, 0, sizeof resp);
 	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
 	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
@@ -864,6 +879,7 @@ static ssize_t ucma_query_route(struct ucma_file *file,
 		ucma_copy_iw_route(&resp, &ctx->cm_id->route);
 
 out:
+	mutex_unlock(&ctx->mutex);
 	if (copy_to_user(u64_to_user_ptr(cmd.response),
 			 &resp, sizeof(resp)))
 		ret = -EFAULT;
@@ -1014,6 +1030,7 @@ static ssize_t ucma_query(struct ucma_file *file,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	switch (cmd.option) {
 	case RDMA_USER_CM_QUERY_ADDR:
 		ret = ucma_query_addr(ctx, response, out_len);
@@ -1028,6 +1045,7 @@ static ssize_t ucma_query(struct ucma_file *file,
 		ret = -ENOSYS;
 		break;
 	}
+	mutex_unlock(&ctx->mutex);
 
 	ucma_put_ctx(ctx);
 	return ret;
@@ -1068,7 +1086,9 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
 		return PTR_ERR(ctx);
 
 	ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+	mutex_lock(&ctx->mutex);
 	ret = rdma_connect(ctx->cm_id, &conn_param);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -1089,7 +1109,9 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
 
 	ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
 		       cmd.backlog : max_backlog;
+	mutex_lock(&ctx->mutex);
 	ret = rdma_listen(ctx->cm_id, ctx->backlog);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -1112,13 +1134,17 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
 	if (cmd.conn_param.valid) {
 		ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
 		mutex_lock(&file->mut);
+		mutex_lock(&ctx->mutex);
 		ret = __rdma_accept(ctx->cm_id, &conn_param, NULL);
+		mutex_unlock(&ctx->mutex);
 		if (!ret)
 			ctx->uid = cmd.uid;
 		mutex_unlock(&file->mut);
-	} else
+	} else {
+		mutex_lock(&ctx->mutex);
 		ret = __rdma_accept(ctx->cm_id, NULL, NULL);
-
+		mutex_unlock(&ctx->mutex);
+	}
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -1137,7 +1163,9 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -1156,7 +1184,9 @@ static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	ret = rdma_disconnect(ctx->cm_id);
+	mutex_unlock(&ctx->mutex);
 	ucma_put_ctx(ctx);
 	return ret;
 }
@@ -1187,7 +1217,9 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file,
 	resp.qp_attr_mask = 0;
 	memset(&qp_attr, 0, sizeof qp_attr);
 	qp_attr.qp_state = cmd.qp_state;
+	mutex_lock(&ctx->mutex);
 	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	mutex_unlock(&ctx->mutex);
 	if (ret)
 		goto out;
 
@@ -1273,9 +1305,13 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
 		struct sa_path_rec opa;
 
 		sa_convert_path_ib_to_opa(&opa, &sa_path);
+		mutex_lock(&ctx->mutex);
 		ret = rdma_set_ib_path(ctx->cm_id, &opa);
+		mutex_unlock(&ctx->mutex);
 	} else {
+		mutex_lock(&ctx->mutex);
 		ret = rdma_set_ib_path(ctx->cm_id, &sa_path);
+		mutex_unlock(&ctx->mutex);
 	}
 	if (ret)
 		return ret;
@@ -1308,7 +1344,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level,
 
 	switch (level) {
 	case RDMA_OPTION_ID:
+		mutex_lock(&ctx->mutex);
 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
+		mutex_unlock(&ctx->mutex);
 		break;
 	case RDMA_OPTION_IB:
 		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
@@ -1368,8 +1406,10 @@ static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	mutex_lock(&ctx->mutex);
 	if (ctx->cm_id->device)
 		ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event);
+	mutex_unlock(&ctx->mutex);
 
 	ucma_put_ctx(ctx);
 	return ret;
@@ -1412,8 +1452,10 @@ static ssize_t ucma_process_join(struct ucma_file *file,
 	mc->join_state = join_state;
 	mc->uid = cmd->uid;
 	memcpy(&mc->addr, addr, cmd->addr_size);
+	mutex_lock(&ctx->mutex);
 	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
 				  join_state, mc);
+	mutex_unlock(&ctx->mutex);
 	if (ret)
 		goto err2;
 
@@ -1513,7 +1555,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
 		goto out;
 	}
 
+	mutex_lock(&mc->ctx->mutex);
 	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+	mutex_unlock(&mc->ctx->mutex);
+
 	mutex_lock(&mc->ctx->file->mut);
 	ucma_cleanup_mc_events(mc);
 	list_del(&mc->list);

From 968707207141aba5b32871b300dcc601da3afe5c Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Thu, 20 Feb 2020 09:12:38 +0200
Subject: [PATCH 066/327] RDMA/ipoib: Don't set constant driver version

There is no need to set driver version in in-tree kernel code.

Link: https://lore.kernel.org/r/20200220071239.231800-2-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h         | 2 --
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 3 ---
 drivers/infiniband/ulp/ipoib/ipoib_main.c    | 4 ----
 3 files changed, 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2aa3457a30ce..e188a95984b5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -838,6 +838,4 @@ extern int ipoib_debug_level;
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
-extern const char ipoib_driver_version[];
-
 #endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 63e4f9d15fd9..a47097d4577c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -68,9 +68,6 @@ static void ipoib_get_drvinfo(struct net_device *netdev,
 	strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent),
 		sizeof(drvinfo->bus_info));
 
-	strlcpy(drvinfo->version, ipoib_driver_version,
-		sizeof(drvinfo->version));
-
 	strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
 }
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 4a0d3a9e72e1..81b8227214f1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -52,10 +52,6 @@
 #include <linux/inetdevice.h>
 #include <rdma/ib_cache.h>
 
-#define DRV_VERSION "1.0.0"
-
-const char ipoib_driver_version[] = DRV_VERSION;
-
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");

From 699d9e7542825464acb4e08212bca1b6d5a82593 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Thu, 20 Feb 2020 09:12:39 +0200
Subject: [PATCH 067/327] RDMA/opa_vnic: Delete driver version

The default version provided by "ethtool -i" it the correct way
to identify Driver version. There is no need to overwrite it.

Link: https://lore.kernel.org/r/20200220071239.231800-3-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c  | 2 --
 drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h | 1 -
 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c     | 5 -----
 3 files changed, 8 deletions(-)

diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
index 8ad7da989a0e..42d557dff19d 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_ethtool.c
@@ -125,8 +125,6 @@ static void vnic_get_drvinfo(struct net_device *netdev,
 			     struct ethtool_drvinfo *drvinfo)
 {
 	strlcpy(drvinfo->driver, opa_vnic_driver_name, sizeof(drvinfo->driver));
-	strlcpy(drvinfo->version, opa_vnic_driver_version,
-		sizeof(drvinfo->version));
 	strlcpy(drvinfo->bus_info, dev_name(netdev->dev.parent),
 		sizeof(drvinfo->bus_info));
 }
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
index 6dbc08e1a6a6..dd942dd642bd 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_internal.h
@@ -292,7 +292,6 @@ struct opa_vnic_mac_tbl_node {
 		hlist_for_each_entry(obj, &name[bkt], member)
 
 extern char opa_vnic_driver_name[];
-extern const char opa_vnic_driver_version[];
 
 struct opa_vnic_adapter *opa_vnic_add_netdev(struct ib_device *ibdev,
 					     u8 port_num, u8 vport_num);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
index be5befd92d16..6e8d650c17c7 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
@@ -59,9 +59,7 @@
 
 #include "opa_vnic_internal.h"
 
-#define DRV_VERSION "1.0"
 char opa_vnic_driver_name[] = "opa_vnic";
-const char opa_vnic_driver_version[] = DRV_VERSION;
 
 /*
  * The trap service level is kept in bits 3 to 7 in the trap_sl_rsvd
@@ -1041,9 +1039,6 @@ static int __init opa_vnic_init(void)
 {
 	int rc;
 
-	pr_info("OPA Virtual Network Driver - v%s\n",
-		opa_vnic_driver_version);
-
 	rc = ib_register_client(&opa_vnic_client);
 	if (rc)
 		pr_err("VNIC driver register failed %d\n", rc);

From dfaf2854b02e5c45bf806c7a1d8b4706b87ddc8e Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Sat, 22 Feb 2020 17:17:19 +0800
Subject: [PATCH 068/327] RDMA/hns: Treat revision HIP08_A as a special case

Set revisions that equal to or higher than HIP08_B as default to maintain
backward compatibility.

Link: https://lore.kernel.org/r/1582363039-10714-1-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 28 ++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index dee1cc8ffb42..593bf8d986db 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1680,7 +1680,7 @@ static void set_default_caps(struct hns_roce_dev *hr_dev)
 	caps->max_srq_wrs	= HNS_ROCE_V2_MAX_SRQ_WR;
 	caps->max_srq_sges	= HNS_ROCE_V2_MAX_SRQ_SGE;
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
 		caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
 			       HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
 			       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
@@ -1928,7 +1928,7 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
 		   caps->srqc_bt_num, &caps->srqc_buf_pg_sz,
 		   &caps->srqc_ba_pg_sz, HEM_TYPE_SRQC);
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
 		caps->sccc_hop_num = ctx_hop_num;
 		caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
 		caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
@@ -1988,7 +1988,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B) {
 		ret = hns_roce_query_pf_timer_resource(hr_dev);
 		if (ret) {
 			dev_err(hr_dev->dev,
@@ -1996,6 +1996,14 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 				ret);
 			return ret;
 		}
+
+		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+		if (ret) {
+			dev_err(hr_dev->dev,
+				"Set function switch param fail, ret = %d.\n",
+				ret);
+			return ret;
+		}
 	}
 
 	ret = hns_roce_alloc_vf_resource(hr_dev);
@@ -2005,16 +2013,6 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) {
-		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"Set function switch param fail, ret = %d.\n",
-				ret);
-			return ret;
-		}
-	}
-
 	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
 	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
 
@@ -2287,7 +2285,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B)
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B)
 		hns_roce_function_clear(hr_dev);
 
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
@@ -4472,7 +4470,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
 		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
-	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B && is_udp)
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP08_B && is_udp)
 		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
 	else

From 0fc99566f6eeb8c1d7e1795710b95c03a987124d Mon Sep 17 00:00:00 2001
From: Yixian Liu <liuyixian@huawei.com>
Date: Sat, 22 Feb 2020 18:25:57 +0800
Subject: [PATCH 069/327] RDMA/hns: Use flush framework for the case in aeq

As now we already have flush framework, using it instead of current flush
process for qp error in asynchronized interrupt (aeq).

Link: https://lore.kernel.org/r/1582367158-27030-2-git-send-email-liuyixian@huawei.com
Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 36 ----------------------
 drivers/infiniband/hw/hns/hns_roce_qp.c    |  9 ++++++
 2 files changed, 9 insertions(+), 36 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 593bf8d986db..2b372ee55fb5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5184,39 +5184,6 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 	return ret;
 }
 
-static void hns_roce_set_qps_to_err(struct hns_roce_dev *hr_dev, u32 qpn)
-{
-	struct hns_roce_qp *hr_qp;
-	struct ib_qp_attr attr;
-	int attr_mask;
-	int ret;
-
-	hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
-	if (!hr_qp) {
-		dev_warn(hr_dev->dev, "no hr_qp can be found!\n");
-		return;
-	}
-
-	if (hr_qp->ibqp.uobject) {
-		if (hr_qp->sdb_en == 1) {
-			hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
-			if (hr_qp->rdb_en == 1)
-				hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
-		} else {
-			dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n");
-			return;
-		}
-	}
-
-	attr_mask = IB_QP_STATE;
-	attr.qp_state = IB_QPS_ERR;
-	ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, attr_mask,
-				    hr_qp->state, IB_QPS_ERR);
-	if (ret)
-		dev_err(hr_dev->dev, "failed to modify qp %d to err state.\n",
-			qpn);
-}
-
 static void hns_roce_irq_work_handle(struct work_struct *work)
 {
 	struct hns_roce_work *irq_work =
@@ -5240,17 +5207,14 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
 	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
 		dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n",
 			qpn, irq_work->sub_type);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
 		dev_err(dev, "Invalid request local work queue 0x%x error.\n",
 			qpn);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
 		dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n",
 			qpn, irq_work->sub_type);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
 		dev_warn(dev, "SRQ limit reach.\n");
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index c52e1b00f30d..6c3f0f737963 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -98,6 +98,15 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 		return;
 	}
 
+	if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 &&
+	    (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR ||
+	     event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR ||
+	     event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR)) {
+		qp->state = IB_QPS_ERR;
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+			init_flush_work(hr_dev, qp);
+	}
+
 	qp->event(qp, (enum hns_roce_event)event_type);
 
 	if (atomic_dec_and_test(&qp->refcount))

From 75c994e6943c2aa887871eb6e6cea6c073b1cfd3 Mon Sep 17 00:00:00 2001
From: Yixian Liu <liuyixian@huawei.com>
Date: Sat, 22 Feb 2020 18:25:58 +0800
Subject: [PATCH 070/327] RDMA/hns: Stop doorbell update while qp state error

There are two paths to update qp producer index into hardware now, one
path is doorbell in post verbs (send and recv), the another is mailbox in
modify qp verb which is called by flush process. This will lead the
hardware to be broken to correctly generate flush cqe.  With stopping
doorbell update and holding qp spinlock in modify qp during flush process,
the problem can be solved.

Fixes: 0425e3e6e0c7 ("RDMA/hns: Support flush cqe for hip08 in kernel space")
Link: https://lore.kernel.org/r/1582367158-27030-3-git-send-email-liuyixian@huawei.com
Signed-off-by: Yixian Liu <liuyixian@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 76 ++++++++++++----------
 1 file changed, 41 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 2b372ee55fb5..b19dedeba066 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -244,6 +244,38 @@ static int check_send_valid(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
+static inline void update_sq_db(struct hns_roce_dev *hr_dev,
+				struct hns_roce_qp *qp)
+{
+	/*
+	 * Hip08 hardware cannot flush the WQEs in SQ if the QP state
+	 * gets into errored mode. Hence, as a workaround to this
+	 * hardware limitation, driver needs to assist in flushing. But
+	 * the flushing operation uses mailbox to convey the QP state to
+	 * the hardware and which can sleep due to the mutex protection
+	 * around the mailbox calls. Hence, use the deferred flush for
+	 * now.
+	 */
+	if (qp->state == IB_QPS_ERR) {
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+			init_flush_work(hr_dev, qp);
+	} else {
+		struct hns_roce_v2_db sq_db = {};
+
+		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
+			       V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
+		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,
+			       V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB);
+		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M,
+			       V2_DB_PARAMETER_IDX_S,
+			       qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
+		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
+			       V2_DB_PARAMETER_SL_S, qp->sl);
+
+		hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
+	}
+}
+
 static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				 const struct ib_send_wr *wr,
 				 const struct ib_send_wr **bad_wr)
@@ -255,7 +287,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_v2_db sq_db = {};
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
@@ -580,36 +611,10 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 out:
 	if (likely(nreq)) {
 		qp->sq.head += nreq;
+		qp->next_sge = sge_idx;
 		/* Memory barrier */
 		wmb();
-
-		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
-			       V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
-		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,
-			       V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB);
-		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M,
-			       V2_DB_PARAMETER_IDX_S,
-			       qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
-		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
-			       V2_DB_PARAMETER_SL_S, qp->sl);
-
-		hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
-
-		qp->next_sge = sge_idx;
-
-		/*
-		 * Hip08 hardware cannot flush the WQEs in SQ if the QP state
-		 * gets into errored mode. Hence, as a workaround to this
-		 * hardware limitation, driver needs to assist in flushing. But
-		 * the flushing operation uses mailbox to convey the QP state to
-		 * the hardware and which can sleep due to the mutex protection
-		 * around the mailbox calls. Hence, use the deferred flush for
-		 * now.
-		 */
-		if (qp->state == IB_QPS_ERR)
-			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
-					      &qp->flush_flag))
-				init_flush_work(hr_dev, qp);
+		update_sq_db(hr_dev, qp);
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
@@ -706,8 +711,6 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 		/* Memory barrier */
 		wmb();
 
-		*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
-
 		/*
 		 * Hip08 hardware cannot flush the WQEs in RQ if the QP state
 		 * gets into errored mode. Hence, as a workaround to this
@@ -717,10 +720,13 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 		 * around the mailbox calls. Hence, use the deferred flush for
 		 * now.
 		 */
-		if (hr_qp->state == IB_QPS_ERR)
+		if (hr_qp->state == IB_QPS_ERR) {
 			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
 					      &hr_qp->flush_flag))
 				init_flush_work(hr_dev, hr_qp);
+		} else {
+			*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
+		}
 	}
 	spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
 
@@ -4750,7 +4756,6 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	/* When QP state is err, SQ and RQ WQE should be flushed */
 	if (new_state == IB_QPS_ERR) {
 		spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
-		spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
 		hr_qp->state = IB_QPS_ERR;
 		roce_set_field(context->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
@@ -4759,8 +4764,10 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+		spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
 
 		if (!ibqp->srq) {
+			spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
 			roce_set_field(context->byte_84_rq_ci_pi,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
@@ -4768,9 +4775,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 			roce_set_field(qpc_mask->byte_84_rq_ci_pi,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+			spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
 		}
-		spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
-		spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
 	}
 
 	/* Configure the optional fields */

From e365b26c6b66fe2bc800399912f7cbc875edf693 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:32 +0800
Subject: [PATCH 071/327] RDMA/hns: Optimize qp destroy flow

Wrap the duplicate code in hip08 and hip06 qp destruction process as
hns_roce_qp_destroy() to simply the qp destroy flow.

Link: https://lore.kernel.org/r/1582526258-13825-2-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  5 +--
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  | 19 +---------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 39 +-------------------
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 41 +++++++++++++++++++++
 4 files changed, 46 insertions(+), 58 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index d0a83926dc8f..f7335c9ff9d3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1250,9 +1250,8 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq,
 void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
 			 struct hns_roce_cq *recv_cq);
 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
-void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
-void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
-			       int cnt);
+void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			 struct ib_udata *udata);
 __be32 send_ieth(const struct ib_send_wr *wr);
 int to_hr_qp_type(int qp_type);
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 89dac44b3cef..c05a905c518e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -3618,26 +3618,11 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 		if (send_cq && send_cq != recv_cq)
 			__hns_roce_v1_cq_clean(send_cq, hr_qp->qpn, NULL);
 	}
+	hns_roce_qp_remove(hr_dev, hr_qp);
 	hns_roce_unlock_cqs(send_cq, recv_cq);
 
-	hns_roce_qp_remove(hr_dev, hr_qp);
-	hns_roce_qp_free(hr_dev, hr_qp);
+	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
 
-	/* RC QP, release QPN */
-	if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-
-	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
-
-	ib_umem_release(hr_qp->umem);
-	if (!udata) {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
-
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-	}
-
-	kfree(hr_qp);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index b19dedeba066..96ccef0a3097 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5040,43 +5040,6 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 	hns_roce_unlock_cqs(send_cq, recv_cq);
 	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
 
-	hns_roce_qp_free(hr_dev, hr_qp);
-
-	/* Not special_QP, free their QPN */
-	if ((hr_qp->ibqp.qp_type == IB_QPT_RC) ||
-	    (hr_qp->ibqp.qp_type == IB_QPT_UC) ||
-	    (hr_qp->ibqp.qp_type == IB_QPT_UD))
-		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-
-	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
-
-	if (udata) {
-		struct hns_roce_ucontext *context =
-			rdma_udata_to_drv_context(
-				udata,
-				struct hns_roce_ucontext,
-				ibucontext);
-
-		if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->sdb);
-
-		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->rdb);
-	} else {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-		if (hr_qp->rq.wqe_cnt)
-			hns_roce_free_db(hr_dev, &hr_qp->rdb);
-	}
-	ib_umem_release(hr_qp->umem);
-
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	     hr_qp->rq.wqe_cnt) {
-		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
-		kfree(hr_qp->rq_inl_buf.wqe_list);
-	}
-
 	return ret;
 }
 
@@ -5091,7 +5054,7 @@ static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx failed(%d)\n",
 			  hr_qp->qpn, ret);
 
-	kfree(hr_qp);
+	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 6c3f0f737963..da25b1d7b5d7 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1087,6 +1087,47 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	return ret;
 }
 
+void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			 struct ib_udata *udata)
+{
+	hns_roce_qp_free(hr_dev, hr_qp);
+
+	/* Not special_QP, free their QPN */
+	if (hr_qp->ibqp.qp_type != IB_QPT_GSI)
+		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
+
+	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+
+	if (udata) {
+		struct hns_roce_ucontext *context =
+			rdma_udata_to_drv_context(
+				udata,
+				struct hns_roce_ucontext,
+				ibucontext);
+
+		if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
+			hns_roce_db_unmap_user(context, &hr_qp->sdb);
+
+		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
+			hns_roce_db_unmap_user(context, &hr_qp->rdb);
+	} else {
+		kfree(hr_qp->sq.wrid);
+		kfree(hr_qp->rq.wrid);
+		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
+		if (hr_qp->rq.wqe_cnt)
+			hns_roce_free_db(hr_dev, &hr_qp->rdb);
+	}
+	ib_umem_release(hr_qp->umem);
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hr_qp->rq.wqe_cnt) {
+		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
+		kfree(hr_qp->rq_inl_buf.wqe_list);
+	}
+
+	kfree(hr_qp);
+}
+
 struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata)

From b71961d1daa040c4979760e50606f34e24642764 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:33 +0800
Subject: [PATCH 072/327] RDMA/hns: Optimize qp context create and destroy flow

Rename the qp context related functions and adjusts the code location to
distinguish between the qp context and the entire qp.

Link: https://lore.kernel.org/r/1582526258-13825-3-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c |   4 -
 drivers/infiniband/hw/hns/hns_roce_qp.c    | 166 ++++++++++-----------
 2 files changed, 81 insertions(+), 89 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 96ccef0a3097..82021fa46d9d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5019,10 +5019,6 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
 	hns_roce_lock_cqs(send_cq, recv_cq);
 
-	list_del(&hr_qp->node);
-	list_del(&hr_qp->sq_node);
-	list_del(&hr_qp->rq_node);
-
 	if (!udata) {
 		if (recv_cq)
 			__hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn,
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index da25b1d7b5d7..396356a11b1b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -187,50 +187,75 @@ enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state)
 	}
 }
 
-static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
-				 struct hns_roce_qp *hr_qp)
+static void add_qp_to_list(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_qp *hr_qp,
+			   struct ib_cq *send_cq, struct ib_cq *recv_cq)
+{
+	struct hns_roce_cq *hr_send_cq, *hr_recv_cq;
+	unsigned long flags;
+
+	hr_send_cq = send_cq ? to_hr_cq(send_cq) : NULL;
+	hr_recv_cq = recv_cq ? to_hr_cq(recv_cq) : NULL;
+
+	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
+	hns_roce_lock_cqs(hr_send_cq, hr_recv_cq);
+
+	list_add_tail(&hr_qp->node, &hr_dev->qp_list);
+	if (hr_send_cq)
+		list_add_tail(&hr_qp->sq_node, &hr_send_cq->sq_list);
+	if (hr_recv_cq)
+		list_add_tail(&hr_qp->rq_node, &hr_recv_cq->rq_list);
+
+	hns_roce_unlock_cqs(hr_send_cq, hr_recv_cq);
+	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
+}
+
+static int hns_roce_qp_store(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp,
+			     struct ib_qp_init_attr *init_attr)
 {
 	struct xarray *xa = &hr_dev->qp_table_xa;
 	int ret;
 
-	if (!qpn)
+	if (!hr_qp->qpn)
 		return -EINVAL;
 
-	hr_qp->qpn = qpn;
-	atomic_set(&hr_qp->refcount, 1);
-	init_completion(&hr_qp->free);
-
-	ret = xa_err(xa_store_irq(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1),
-				hr_qp, GFP_KERNEL));
+	ret = xa_err(xa_store_irq(xa, hr_qp->qpn, hr_qp, GFP_KERNEL));
 	if (ret)
-		dev_err(hr_dev->dev, "QPC xa_store failed\n");
+		dev_err(hr_dev->dev, "Failed to xa store for QPC\n");
+	else
+		/* add QP to device's QP list for softwc */
+		add_qp_to_list(hr_dev, hr_qp, init_attr->send_cq,
+			       init_attr->recv_cq);
 
 	return ret;
 }
 
-static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
-			     struct hns_roce_qp *hr_qp)
+static int alloc_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 	struct device *dev = hr_dev->dev;
 	int ret;
 
-	if (!qpn)
+	if (!hr_qp->qpn)
 		return -EINVAL;
 
-	hr_qp->qpn = qpn;
+	/* In v1 engine, GSI QP context is saved in the RoCE hw's register */
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI &&
+	    hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+		return 0;
 
 	/* Alloc memory for QPC */
 	ret = hns_roce_table_get(hr_dev, &qp_table->qp_table, hr_qp->qpn);
 	if (ret) {
-		dev_err(dev, "QPC table get failed\n");
+		dev_err(dev, "Failed to get QPC table\n");
 		goto err_out;
 	}
 
 	/* Alloc memory for IRRL */
 	ret = hns_roce_table_get(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
 	if (ret) {
-		dev_err(dev, "IRRL table get failed\n");
+		dev_err(dev, "Failed to get IRRL table\n");
 		goto err_put_qp;
 	}
 
@@ -239,7 +264,7 @@ static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
 		ret = hns_roce_table_get(hr_dev, &qp_table->trrl_table,
 					 hr_qp->qpn);
 		if (ret) {
-			dev_err(dev, "TRRL table get failed\n");
+			dev_err(dev, "Failed to get TRRL table\n");
 			goto err_put_irrl;
 		}
 	}
@@ -249,22 +274,13 @@ static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
 		ret = hns_roce_table_get(hr_dev, &qp_table->sccc_table,
 					 hr_qp->qpn);
 		if (ret) {
-			dev_err(dev, "SCC CTX table get failed\n");
+			dev_err(dev, "Failed to get SCC CTX table\n");
 			goto err_put_trrl;
 		}
 	}
 
-	ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
-	if (ret)
-		goto err_put_sccc;
-
 	return 0;
 
-err_put_sccc:
-	if (hr_dev->caps.sccc_entry_sz)
-		hns_roce_table_put(hr_dev, &qp_table->sccc_table,
-				   hr_qp->qpn);
-
 err_put_trrl:
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
@@ -284,25 +300,27 @@ void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 	struct xarray *xa = &hr_dev->qp_table_xa;
 	unsigned long flags;
 
+	list_del(&hr_qp->node);
+	list_del(&hr_qp->sq_node);
+	list_del(&hr_qp->rq_node);
+
 	xa_lock_irqsave(xa, flags);
 	__xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
 	xa_unlock_irqrestore(xa, flags);
 }
 
-void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+static void free_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-	if (atomic_dec_and_test(&hr_qp->refcount))
-		complete(&hr_qp->free);
-	wait_for_completion(&hr_qp->free);
+	/* In v1 engine, GSI QP context is saved in the RoCE hw's register */
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI &&
+	    hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+		return;
 
-	if ((hr_qp->ibqp.qp_type) != IB_QPT_GSI) {
-		if (hr_dev->caps.trrl_entry_sz)
-			hns_roce_table_put(hr_dev, &qp_table->trrl_table,
-					   hr_qp->qpn);
-		hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
-	}
+	if (hr_dev->caps.trrl_entry_sz)
+		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
+	hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
 }
 
 void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
@@ -728,29 +746,6 @@ static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
 	kfree(hr_qp->rq_inl_buf.wqe_list);
 }
 
-static void add_qp_to_list(struct hns_roce_dev *hr_dev,
-			   struct hns_roce_qp *hr_qp,
-			   struct ib_cq *send_cq, struct ib_cq *recv_cq)
-{
-	struct hns_roce_cq *hr_send_cq, *hr_recv_cq;
-	unsigned long flags;
-
-	hr_send_cq = send_cq ? to_hr_cq(send_cq) : NULL;
-	hr_recv_cq = recv_cq ? to_hr_cq(recv_cq) : NULL;
-
-	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
-	hns_roce_lock_cqs(hr_send_cq, hr_recv_cq);
-
-	list_add_tail(&hr_qp->node, &hr_dev->qp_list);
-	if (hr_send_cq)
-		list_add_tail(&hr_qp->sq_node, &hr_send_cq->sq_list);
-	if (hr_recv_cq)
-		list_add_tail(&hr_qp->rq_node, &hr_recv_cq->rq_list);
-
-	hns_roce_unlock_cqs(hr_send_cq, hr_recv_cq);
-	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
-}
-
 static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_pd *ib_pd,
 				     struct ib_qp_init_attr *init_attr,
@@ -975,6 +970,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		}
 	}
 
+	hr_qp->qpn = qpn;
+
 	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
 							hr_qp->region_cnt);
 	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
@@ -986,20 +983,16 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		goto err_mtr;
 	}
 
-	if (init_attr->qp_type == IB_QPT_GSI &&
-	    hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
-		/* In v1 engine, GSI QP context in RoCE engine's register */
-		ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_qp_alloc failed!\n");
-			goto err_qpn;
-		}
-	} else {
-		ret = hns_roce_qp_alloc(hr_dev, qpn, hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_qp_alloc failed!\n");
-			goto err_qpn;
-		}
+	ret = alloc_qpc(hr_dev, hr_qp);
+	if (ret) {
+		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QP context\n");
+		goto err_qpn;
+	}
+
+	ret = hns_roce_qp_store(hr_dev, hr_qp, init_attr);
+	if (ret) {
+		ibdev_err(&hr_dev->ib_dev, "Failed to store QP\n");
+		goto err_qpc;
 	}
 
 	if (sqpn)
@@ -1011,29 +1004,28 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		ret = ib_copy_to_udata(udata, &resp,
 				       min(udata->outlen, sizeof(resp)));
 		if (ret)
-			goto err_qp;
+			goto err_store;
 	}
 
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
 		ret = hr_dev->hw->qp_flow_control_init(hr_dev, hr_qp);
 		if (ret)
-			goto err_qp;
+			goto err_store;
 	}
 
 	hr_qp->event = hns_roce_ib_qp_event;
-
-	add_qp_to_list(hr_dev, hr_qp, init_attr->send_cq, init_attr->recv_cq);
+	atomic_set(&hr_qp->refcount, 1);
+	init_completion(&hr_qp->free);
 
 	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
 
 	return 0;
 
-err_qp:
-	if (init_attr->qp_type == IB_QPT_GSI &&
-		hr_dev->hw_rev == HNS_ROCE_HW_VER1)
-		hns_roce_qp_remove(hr_dev, hr_qp);
-	else
-		hns_roce_qp_free(hr_dev, hr_qp);
+err_store:
+	hns_roce_qp_remove(hr_dev, hr_qp);
+
+err_qpc:
+	free_qpc(hr_dev, hr_qp);
 
 err_qpn:
 	if (!sqpn)
@@ -1090,7 +1082,11 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			 struct ib_udata *udata)
 {
-	hns_roce_qp_free(hr_dev, hr_qp);
+	if (atomic_dec_and_test(&hr_qp->refcount))
+		complete(&hr_qp->free);
+	wait_for_completion(&hr_qp->free);
+
+	free_qpc(hr_dev, hr_qp);
 
 	/* Not special_QP, free their QPN */
 	if (hr_qp->ibqp.qp_type != IB_QPT_GSI)

From df83a66e1b2e1194cd6cd744f09f06448e0b8b3c Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:34 +0800
Subject: [PATCH 073/327] RDMA/hns: Optimize qp number assign flow

Encapsulate the code associated with the qp number assignment into
alloc_qpn() and free_qpn().

Link: https://lore.kernel.org/r/1582526258-13825-4-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 91 ++++++++++++-------------
 1 file changed, 44 insertions(+), 47 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 396356a11b1b..8ec0ea96badf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -156,15 +156,34 @@ static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp,
 	}
 }
 
-static int hns_roce_reserve_range_qp(struct hns_roce_dev *hr_dev, int cnt,
-				     int align, unsigned long *base)
+static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+	unsigned long num = 0;
+	int ret;
 
-	return hns_roce_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
-					   base) ?
-		       -ENOMEM :
-		       0;
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+		/* when hw version is v1, the sqpn is allocated */
+		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+			num = HNS_ROCE_MAX_PORTS +
+			      hr_dev->iboe.phy_port[hr_qp->port];
+		else
+			num = 1;
+
+		hr_qp->doorbell_qpn = 1;
+	} else {
+		ret = hns_roce_bitmap_alloc_range(&hr_dev->qp_table.bitmap,
+						  1, 1, &num);
+		if (ret) {
+			ibdev_err(&hr_dev->ib_dev, "Failed to alloc bitmap\n");
+			return -ENOMEM;
+		}
+
+		hr_qp->doorbell_qpn = (u32)num;
+	}
+
+	hr_qp->qpn = num;
+
+	return 0;
 }
 
 enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state)
@@ -323,15 +342,17 @@ static void free_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 	hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
 }
 
-void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
-			       int cnt)
+static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-	if (base_qpn < hr_dev->caps.reserved_qps)
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
 		return;
 
-	hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR);
+	if (hr_qp->qpn < hr_dev->caps.reserved_qps)
+		return;
+
+	hns_roce_bitmap_free_range(&qp_table->bitmap, hr_qp->qpn, 1, BITMAP_RR);
 }
 
 static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
@@ -749,7 +770,7 @@ static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
 static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_pd *ib_pd,
 				     struct ib_qp_init_attr *init_attr,
-				     struct ib_udata *udata, unsigned long sqpn,
+				     struct ib_udata *udata,
 				     struct hns_roce_qp *hr_qp)
 {
 	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
@@ -759,7 +780,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
 		udata, struct hns_roce_ucontext, ibucontext);
 	struct hns_roce_buf_region *r;
-	unsigned long qpn = 0;
 	u32 page_shift;
 	int buf_count;
 	int ret;
@@ -959,19 +979,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		}
 	}
 
-	if (sqpn) {
-		qpn = sqpn;
-	} else {
-		/* Get QPN */
-		ret = hns_roce_reserve_range_qp(hr_dev, 1, 1, &qpn);
-		if (ret) {
-			dev_err(dev, "hns_roce_reserve_range_qp alloc qpn error\n");
-			goto err_wrid;
-		}
-	}
-
-	hr_qp->qpn = qpn;
-
 	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
 							hr_qp->region_cnt);
 	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
@@ -980,6 +987,12 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				  hr_qp->regions, hr_qp->region_cnt);
 	if (ret) {
 		dev_err(dev, "mtr attach error for create qp\n");
+		goto err_wrid;
+	}
+
+	ret = alloc_qpn(hr_dev, hr_qp);
+	if (ret) {
+		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QPN\n");
 		goto err_mtr;
 	}
 
@@ -995,11 +1008,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		goto err_qpc;
 	}
 
-	if (sqpn)
-		hr_qp->doorbell_qpn = 1;
-	else
-		hr_qp->doorbell_qpn = (u32)hr_qp->qpn;
-
 	if (udata) {
 		ret = ib_copy_to_udata(udata, &resp,
 				       min(udata->outlen, sizeof(resp)));
@@ -1013,6 +1021,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 			goto err_store;
 	}
 
+	hr_qp->ibqp.qp_num = hr_qp->qpn;
 	hr_qp->event = hns_roce_ib_qp_event;
 	atomic_set(&hr_qp->refcount, 1);
 	init_completion(&hr_qp->free);
@@ -1028,8 +1037,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	free_qpc(hr_dev, hr_qp);
 
 err_qpn:
-	if (!sqpn)
-		hns_roce_release_range_qp(hr_dev, qpn, 1);
+	free_qpn(hr_dev, hr_qp);
 
 err_mtr:
 	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
@@ -1088,9 +1096,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 
 	free_qpc(hr_dev, hr_qp);
 
-	/* Not special_QP, free their QPN */
-	if (hr_qp->ibqp.qp_type != IB_QPT_GSI)
-		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
+	free_qpn(hr_dev, hr_qp);
 
 	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
 
@@ -1139,7 +1145,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 		if (!hr_qp)
 			return ERR_PTR(-ENOMEM);
 
-		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, 0,
+		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata,
 						hr_qp);
 		if (ret) {
 			ibdev_err(ibdev, "Create QP 0x%06lx failed(%d)\n",
@@ -1148,8 +1154,6 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 			return ERR_PTR(ret);
 		}
 
-		hr_qp->ibqp.qp_num = hr_qp->qpn;
-
 		break;
 	}
 	case IB_QPT_GSI: {
@@ -1166,15 +1170,8 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 		hr_qp->port = init_attr->port_num - 1;
 		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
 
-		/* when hw version is v1, the sqpn is allocated */
-		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
-			hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS +
-					     hr_dev->iboe.phy_port[hr_qp->port];
-		else
-			hr_qp->ibqp.qp_num = 1;
-
 		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata,
-						hr_qp->ibqp.qp_num, hr_qp);
+						hr_qp);
 		if (ret) {
 			ibdev_err(ibdev, "Create GSI QP failed!\n");
 			kfree(hr_qp);

From 24c22112b9c2084a362c6fe7d7ddb2bbef5b4a2e Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:35 +0800
Subject: [PATCH 074/327] RDMA/hns: Optimize qp buffer allocation flow

Encapsulate qp buffer allocation related code into 3 functions:
alloc_qp_buf(), map_wqe_buf() and free_qp_buf().

Link: https://lore.kernel.org/r/1582526258-13825-5-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   1 -
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 266 +++++++++++---------
 2 files changed, 144 insertions(+), 123 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index f7335c9ff9d3..d7dcf6ebc526 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -673,7 +673,6 @@ struct hns_roce_qp {
 	/* this define must less than HNS_ROCE_MAX_BT_REGION */
 #define HNS_ROCE_WQE_REGION_MAX	 3
 	struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX];
-	int			region_cnt;
 	int                     wqe_bt_pg_shift;
 
 	u32			buff_size;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 8ec0ea96badf..fea77f00340b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -767,23 +767,147 @@ static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
 	kfree(hr_qp->rq_inl_buf.wqe_list);
 }
 
+static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+		       u32 page_shift, bool is_user)
+{
+	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_region *r;
+	int region_count;
+	int buf_count;
+	int ret;
+	int i;
+
+	region_count = split_wqe_buf_region(hr_dev, hr_qp, hr_qp->regions,
+					ARRAY_SIZE(hr_qp->regions), page_shift);
+
+	/* alloc a tmp list to store WQE buffers address */
+	ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list, region_count);
+	if (ret) {
+		ibdev_err(ibdev, "Failed to alloc WQE buffer list\n");
+		return ret;
+	}
+
+	for (i = 0; i < region_count; i++) {
+		r = &hr_qp->regions[i];
+		if (is_user)
+			buf_count = hns_roce_get_umem_bufs(hr_dev, buf_list[i],
+					r->count, r->offset, hr_qp->umem,
+					page_shift);
+		else
+			buf_count = hns_roce_get_kmem_bufs(hr_dev, buf_list[i],
+					r->count, r->offset, &hr_qp->hr_buf);
+
+		if (buf_count != r->count) {
+			ibdev_err(ibdev, "Failed to get %s WQE buf, expect %d = %d.\n",
+				  is_user ? "user" : "kernel",
+				  r->count, buf_count);
+			ret = -ENOBUFS;
+			goto done;
+		}
+	}
+
+	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
+							region_count);
+	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
+			  page_shift);
+	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, hr_qp->regions,
+				  region_count);
+	if (ret)
+		ibdev_err(ibdev, "Failed to attatch WQE's mtr\n");
+
+	goto done;
+
+	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+done:
+	hns_roce_free_buf_list(buf_list, region_count);
+
+	return ret;
+}
+
+static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			struct ib_qp_init_attr *init_attr,
+			struct ib_udata *udata, unsigned long addr)
+{
+	u32 page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	bool is_rq_buf_inline;
+	int ret;
+
+	is_rq_buf_inline = (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+			   hns_roce_qp_has_rq(init_attr);
+	if (is_rq_buf_inline) {
+		ret = alloc_rq_inline_buf(hr_qp, init_attr);
+		if (ret) {
+			ibdev_err(ibdev, "Failed to alloc inline RQ buffer\n");
+			return ret;
+		}
+	}
+
+	if (udata) {
+		hr_qp->umem = ib_umem_get(udata, addr, hr_qp->buff_size, 0);
+		if (IS_ERR(hr_qp->umem)) {
+			ret = PTR_ERR(hr_qp->umem);
+			goto err_inline;
+		}
+	} else {
+		ret = hns_roce_buf_alloc(hr_dev, hr_qp->buff_size,
+					 (1 << page_shift) * 2,
+					 &hr_qp->hr_buf, page_shift);
+		if (ret)
+			goto err_inline;
+	}
+
+	ret = map_wqe_buf(hr_dev, hr_qp, page_shift, udata);
+	if (ret)
+		goto err_alloc;
+
+	return 0;
+
+err_inline:
+	if (is_rq_buf_inline)
+		free_rq_inline_buf(hr_qp);
+
+err_alloc:
+	if (udata) {
+		ib_umem_release(hr_qp->umem);
+		hr_qp->umem = NULL;
+	} else {
+		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
+	}
+
+	ibdev_err(ibdev, "Failed to alloc WQE buffer, ret %d.\n", ret);
+
+	return ret;
+}
+
+static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+{
+	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+	if (hr_qp->umem) {
+		ib_umem_release(hr_qp->umem);
+		hr_qp->umem = NULL;
+	}
+
+	if (hr_qp->hr_buf.nbufs > 0)
+		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
+
+	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
+	     hr_qp->rq.wqe_cnt)
+		free_rq_inline_buf(hr_qp);
+}
 static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_pd *ib_pd,
 				     struct ib_qp_init_attr *init_attr,
 				     struct ib_udata *udata,
 				     struct hns_roce_qp *hr_qp)
 {
-	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_qp ucmd;
 	struct hns_roce_ib_create_qp_resp resp = {};
 	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
 		udata, struct hns_roce_ucontext, ibucontext);
-	struct hns_roce_buf_region *r;
-	u32 page_shift;
-	int buf_count;
 	int ret;
-	int i;
 
 	mutex_init(&hr_qp->mutex);
 	spin_lock_init(&hr_qp->sq.lock);
@@ -806,59 +930,18 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		goto err_out;
 	}
 
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	    hns_roce_qp_has_rq(init_attr)) {
-		ret = alloc_rq_inline_buf(hr_qp, init_attr);
-		if (ret) {
-			dev_err(dev, "allocate receive inline buffer failed\n");
-			goto err_out;
-		}
-	}
-
-	page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
 	if (udata) {
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
 			dev_err(dev, "ib_copy_from_udata error for create qp\n");
 			ret = -EFAULT;
-			goto err_alloc_rq_inline_buf;
+			goto err_out;
 		}
 
 		ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp,
 						&ucmd);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n");
-			goto err_alloc_rq_inline_buf;
-		}
-
-		hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr,
-					  hr_qp->buff_size, 0);
-		if (IS_ERR(hr_qp->umem)) {
-			dev_err(dev, "ib_umem_get error for create qp\n");
-			ret = PTR_ERR(hr_qp->umem);
-			goto err_alloc_rq_inline_buf;
-		}
-		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
-				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
-				page_shift);
-		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
-					      hr_qp->region_cnt);
-		if (ret) {
-			dev_err(dev, "alloc buf_list error for create qp\n");
-			goto err_alloc_list;
-		}
-
-		for (i = 0; i < hr_qp->region_cnt; i++) {
-			r = &hr_qp->regions[i];
-			buf_count = hns_roce_get_umem_bufs(hr_dev,
-					buf_list[i], r->count, r->offset,
-					hr_qp->umem, page_shift);
-			if (buf_count != r->count) {
-				dev_err(dev,
-					"get umem buf err, expect %d,ret %d.\n",
-					r->count, buf_count);
-				ret = -ENOBUFS;
-				goto err_get_bufs;
-			}
+			goto err_out;
 		}
 
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
@@ -869,7 +952,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 						   &hr_qp->sdb);
 			if (ret) {
 				dev_err(dev, "sq record doorbell map failed!\n");
-				goto err_get_bufs;
+				goto err_out;
 			}
 
 			/* indicate kernel supports sq record db */
@@ -896,13 +979,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_alloc_rq_inline_buf;
+			goto err_out;
 		}
 
 		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
 			dev_err(dev, "init_attr->create_flags error!\n");
 			ret = -EINVAL;
-			goto err_alloc_rq_inline_buf;
+			goto err_out;
 		}
 
 		/* Set SQ size */
@@ -910,7 +993,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 						  hr_qp);
 		if (ret) {
 			dev_err(dev, "hns_roce_set_kernel_sq_size error!\n");
-			goto err_alloc_rq_inline_buf;
+			goto err_out;
 		}
 
 		/* QP doorbell register address */
@@ -924,49 +1007,17 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 			ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
 			if (ret) {
 				dev_err(dev, "rq record doorbell alloc failed!\n");
-				goto err_alloc_rq_inline_buf;
+				goto err_out;
 			}
 			*hr_qp->rdb.db_record = 0;
 			hr_qp->rdb_en = 1;
 		}
 
-		/* Allocate QP buf */
-		if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size,
-				       (1 << page_shift) * 2,
-				       &hr_qp->hr_buf, page_shift)) {
-			dev_err(dev, "hns_roce_buf_alloc error!\n");
-			ret = -ENOMEM;
-			goto err_db;
-		}
-		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
-				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
-				page_shift);
-		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
-					      hr_qp->region_cnt);
-		if (ret) {
-			dev_err(dev, "alloc buf_list error for create qp!\n");
-			goto err_alloc_list;
-		}
-
-		for (i = 0; i < hr_qp->region_cnt; i++) {
-			r = &hr_qp->regions[i];
-			buf_count = hns_roce_get_kmem_bufs(hr_dev,
-					buf_list[i], r->count, r->offset,
-					&hr_qp->hr_buf);
-			if (buf_count != r->count) {
-				dev_err(dev,
-					"get kmem buf err, expect %d,ret %d.\n",
-					r->count, buf_count);
-				ret = -ENOBUFS;
-				goto err_get_bufs;
-			}
-		}
-
 		hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
 					 GFP_KERNEL);
 		if (ZERO_OR_NULL_PTR(hr_qp->sq.wrid)) {
 			ret = -ENOMEM;
-			goto err_get_bufs;
+			goto err_db;
 		}
 
 		if (hr_qp->rq.wqe_cnt) {
@@ -979,21 +1030,16 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		}
 	}
 
-	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
-							hr_qp->region_cnt);
-	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
-			  page_shift);
-	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list,
-				  hr_qp->regions, hr_qp->region_cnt);
+	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
 	if (ret) {
-		dev_err(dev, "mtr attach error for create qp\n");
-		goto err_wrid;
+		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QP buffer\n");
+		goto err_db;
 	}
 
 	ret = alloc_qpn(hr_dev, hr_qp);
 	if (ret) {
 		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QPN\n");
-		goto err_mtr;
+		goto err_buf;
 	}
 
 	ret = alloc_qpc(hr_dev, hr_qp);
@@ -1026,8 +1072,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	atomic_set(&hr_qp->refcount, 1);
 	init_completion(&hr_qp->free);
 
-	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
-
 	return 0;
 
 err_store:
@@ -1039,10 +1083,9 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 err_qpn:
 	free_qpn(hr_dev, hr_qp);
 
-err_mtr:
-	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+err_buf:
+	free_qp_buf(hr_dev, hr_qp);
 
-err_wrid:
 	if (udata) {
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
@@ -1065,24 +1108,11 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	if (!udata)
 		kfree(hr_qp->sq.wrid);
 
-err_get_bufs:
-	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
-
-err_alloc_list:
-	if (!hr_qp->umem)
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-	ib_umem_release(hr_qp->umem);
-
 err_db:
 	if (!udata && hns_roce_qp_has_rq(init_attr) &&
 	    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
 		hns_roce_free_db(hr_dev, &hr_qp->rdb);
 
-err_alloc_rq_inline_buf:
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	     hns_roce_qp_has_rq(init_attr))
-		free_rq_inline_buf(hr_qp);
-
 err_out:
 	return ret;
 }
@@ -1098,7 +1128,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 
 	free_qpn(hr_dev, hr_qp);
 
-	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
+	free_qp_buf(hr_dev, hr_qp);
 
 	if (udata) {
 		struct hns_roce_ucontext *context =
@@ -1115,17 +1145,9 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	} else {
 		kfree(hr_qp->sq.wrid);
 		kfree(hr_qp->rq.wrid);
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
 		if (hr_qp->rq.wqe_cnt)
 			hns_roce_free_db(hr_dev, &hr_qp->rdb);
 	}
-	ib_umem_release(hr_qp->umem);
-
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	     hr_qp->rq.wqe_cnt) {
-		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
-		kfree(hr_qp->rq_inl_buf.wqe_list);
-	}
 
 	kfree(hr_qp);
 }

From ae85bf92effc1e4d4f5b3a3a291d2440a7200e25 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:36 +0800
Subject: [PATCH 075/327] RDMA/hns: Optimize qp param setup flow

Encapsulate the qp param setup related code into set_qp_param().

Link: https://lore.kernel.org/r/1582526258-13825-6-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 136 +++++++++++++-----------
 1 file changed, 72 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index fea77f00340b..51c6318fcc42 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -355,18 +355,18 @@ static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 	hns_roce_bitmap_free_range(&qp_table->bitmap, hr_qp->qpn, 1, BITMAP_RR);
 }
 
-static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
+static int set_rq_size(struct hns_roce_dev *hr_dev,
 				struct ib_qp_cap *cap, bool is_user, int has_rq,
 				struct hns_roce_qp *hr_qp)
 {
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u32 max_cnt;
 
 	/* Check the validity of QP support capacity */
 	if (cap->max_recv_wr > hr_dev->caps.max_wqes ||
 	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
-		dev_err(dev, "RQ WR or sge error!max_recv_wr=%d max_recv_sge=%d\n",
-			cap->max_recv_wr, cap->max_recv_sge);
+		ibdev_err(ibdev, "Failed to check max recv WR %d and SGE %d\n",
+			  cap->max_recv_wr, cap->max_recv_sge);
 		return -EINVAL;
 	}
 
@@ -378,7 +378,7 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 		cap->max_recv_sge = 0;
 	} else {
 		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
-			dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n");
+			ibdev_err(ibdev, "Failed to check user max recv WR and SGE\n");
 			return -EINVAL;
 		}
 
@@ -390,7 +390,7 @@ static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
 		hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt);
 
 		if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) {
-			dev_err(dev, "while setting rq size, rq.wqe_cnt too large\n");
+			ibdev_err(ibdev, "Failed to check RQ WQE count limit\n");
 			return -EINVAL;
 		}
 
@@ -421,12 +421,12 @@ static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev,
 	/* Sanity check SQ size before proceeding */
 	if (ucmd->log_sq_stride > max_sq_stride ||
 	    ucmd->log_sq_stride < HNS_ROCE_IB_MIN_SQ_STRIDE) {
-		ibdev_err(&hr_dev->ib_dev, "check SQ size error!\n");
+		ibdev_err(&hr_dev->ib_dev, "Failed to check SQ stride size\n");
 		return -EINVAL;
 	}
 
 	if (cap->max_send_sge > hr_dev->caps.max_sq_sg) {
-		ibdev_err(&hr_dev->ib_dev, "SQ sge error! max_send_sge=%d\n",
+		ibdev_err(&hr_dev->ib_dev, "Failed to check SQ SGE size %d\n",
 			  cap->max_send_sge);
 		return -EINVAL;
 	}
@@ -434,10 +434,9 @@ static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
-static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
-				     struct ib_qp_cap *cap,
-				     struct hns_roce_qp *hr_qp,
-				     struct hns_roce_ib_create_qp *ucmd)
+static int set_user_sq_size(struct hns_roce_dev *hr_dev,
+			    struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp,
+			    struct hns_roce_ib_create_qp *ucmd)
 {
 	u32 ex_sge_num;
 	u32 page_size;
@@ -450,7 +449,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 
 	ret = check_sq_size_with_integrity(hr_dev, cap, ucmd);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Sanity check sq size failed\n");
+		ibdev_err(&hr_dev->ib_dev, "Failed to check user SQ size limit\n");
 		return ret;
 	}
 
@@ -469,9 +468,9 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
 	if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE &&
 	    hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) {
 		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
-			dev_err(hr_dev->dev,
-				"The extended sge cnt error! sge_cnt=%d\n",
-				hr_qp->sge.sge_cnt);
+			ibdev_err(&hr_dev->ib_dev,
+				  "Failed to check extended SGE size limit %d\n",
+				  hr_qp->sge.sge_cnt);
 			return -EINVAL;
 		}
 	}
@@ -635,9 +634,8 @@ static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
-static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
-				       struct ib_qp_cap *cap,
-				       struct hns_roce_qp *hr_qp)
+static int set_kernel_sq_size(struct hns_roce_dev *hr_dev,
+			      struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp)
 {
 	struct device *dev = hr_dev->dev;
 	u32 page_size;
@@ -896,6 +894,58 @@ static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 	     hr_qp->rq.wqe_cnt)
 		free_rq_inline_buf(hr_qp);
 }
+
+static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			struct ib_qp_init_attr *init_attr,
+			struct ib_udata *udata,
+			struct hns_roce_ib_create_qp *ucmd)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	hr_qp->ibqp.qp_type = init_attr->qp_type;
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
+	else
+		hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR;
+
+	ret = set_rq_size(hr_dev, &init_attr->cap, udata,
+			  hns_roce_qp_has_rq(init_attr), hr_qp);
+	if (ret) {
+		ibdev_err(ibdev, "Failed to set user RQ size\n");
+		return ret;
+	}
+
+	if (udata) {
+		if (ib_copy_from_udata(ucmd, udata, sizeof(*ucmd))) {
+			ibdev_err(ibdev, "Failed to copy QP ucmd\n");
+			return -EFAULT;
+		}
+
+		ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd);
+		if (ret)
+			ibdev_err(ibdev, "Failed to set user SQ size\n");
+	} else {
+		if (init_attr->create_flags &
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+			ibdev_err(ibdev, "Failed to check multicast loopback\n");
+			return -EINVAL;
+		}
+
+		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
+			ibdev_err(ibdev, "Failed to check ipoib ud lso\n");
+			return -EINVAL;
+		}
+
+		ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp);
+		if (ret)
+			ibdev_err(ibdev, "Failed to set kernel SQ size\n");
+	}
+
+	return ret;
+}
+
 static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_pd *ib_pd,
 				     struct ib_qp_init_attr *init_attr,
@@ -916,34 +966,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	hr_qp->state = IB_QPS_RESET;
 	hr_qp->flush_flag = 0;
 
-	hr_qp->ibqp.qp_type = init_attr->qp_type;
-
-	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
-	else
-		hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR;
-
-	ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, udata,
-				   hns_roce_qp_has_rq(init_attr), hr_qp);
+	ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd);
 	if (ret) {
-		dev_err(dev, "hns_roce_set_rq_size failed\n");
-		goto err_out;
+		ibdev_err(&hr_dev->ib_dev, "Failed to set QP param\n");
+		return ret;
 	}
 
 	if (udata) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-			dev_err(dev, "ib_copy_from_udata error for create qp\n");
-			ret = -EFAULT;
-			goto err_out;
-		}
-
-		ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp,
-						&ucmd);
-		if (ret) {
-			dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n");
-			goto err_out;
-		}
-
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
 		    (udata->inlen >= sizeof(ucmd)) &&
 		    (udata->outlen >= sizeof(resp)) &&
@@ -975,27 +1004,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 			hr_qp->rdb_en = 1;
 		}
 	} else {
-		if (init_attr->create_flags &
-		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
-			dev_err(dev, "init_attr->create_flags error!\n");
-			ret = -EINVAL;
-			goto err_out;
-		}
-
-		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
-			dev_err(dev, "init_attr->create_flags error!\n");
-			ret = -EINVAL;
-			goto err_out;
-		}
-
-		/* Set SQ size */
-		ret = hns_roce_set_kernel_sq_size(hr_dev, &init_attr->cap,
-						  hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_set_kernel_sq_size error!\n");
-			goto err_out;
-		}
-
 		/* QP doorbell register address */
 		hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset +
 				     DB_REG_OFFSET * hr_dev->priv_uar.index;

From b37c413997100d1ad5627bf38f0ebfeee170f305 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:37 +0800
Subject: [PATCH 076/327] RDMA/hns: Optimize kernel qp wrid allocation flow

Encapsulate the kernel qp wrid allocation related code into 2 functions:
alloc_kernel_wrid() and free_kernel_wrid().

Link: https://lore.kernel.org/r/1582526258-13825-7-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 72 +++++++++++++++----------
 1 file changed, 45 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 51c6318fcc42..a27c3dad8aae 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -895,6 +895,45 @@ static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 		free_rq_inline_buf(hr_qp);
 }
 
+static int alloc_kernel_wrid(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u64 *sq_wrid = NULL;
+	u64 *rq_wrid = NULL;
+	int ret;
+
+	sq_wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64), GFP_KERNEL);
+	if (ZERO_OR_NULL_PTR(sq_wrid)) {
+		ibdev_err(ibdev, "Failed to alloc SQ wrid\n");
+		return -ENOMEM;
+	}
+
+	if (hr_qp->rq.wqe_cnt) {
+		rq_wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64), GFP_KERNEL);
+		if (ZERO_OR_NULL_PTR(rq_wrid)) {
+			ibdev_err(ibdev, "Failed to alloc RQ wrid\n");
+			ret = -ENOMEM;
+			goto err_sq;
+		}
+	}
+
+	hr_qp->sq.wrid = sq_wrid;
+	hr_qp->rq.wrid = rq_wrid;
+	return 0;
+err_sq:
+	kfree(sq_wrid);
+
+	return ret;
+}
+
+static void free_kernel_wrid(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp)
+{
+	kfree(hr_qp->rq.wrid);
+	kfree(hr_qp->sq.wrid);
+}
+
 static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			struct ib_qp_init_attr *init_attr,
 			struct ib_udata *udata,
@@ -1021,21 +1060,11 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 			hr_qp->rdb_en = 1;
 		}
 
-		hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
-					 GFP_KERNEL);
-		if (ZERO_OR_NULL_PTR(hr_qp->sq.wrid)) {
-			ret = -ENOMEM;
+		ret = alloc_kernel_wrid(hr_dev, hr_qp);
+		if (ret) {
+			ibdev_err(&hr_dev->ib_dev, "Failed to alloc wrid\n");
 			goto err_db;
 		}
-
-		if (hr_qp->rq.wqe_cnt) {
-			hr_qp->rq.wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64),
-						 GFP_KERNEL);
-			if (ZERO_OR_NULL_PTR(hr_qp->rq.wrid)) {
-				ret = -ENOMEM;
-				goto err_sq_wrid;
-			}
-		}
 	}
 
 	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
@@ -1084,24 +1113,20 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 
 err_store:
 	hns_roce_qp_remove(hr_dev, hr_qp);
-
 err_qpc:
 	free_qpc(hr_dev, hr_qp);
-
 err_qpn:
 	free_qpn(hr_dev, hr_qp);
-
 err_buf:
 	free_qp_buf(hr_dev, hr_qp);
 
+	free_kernel_wrid(hr_dev, hr_qp);
+
 	if (udata) {
 		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
 		    (udata->outlen >= sizeof(resp)) &&
 		    hns_roce_qp_has_rq(init_attr))
 			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
-	} else {
-		if (hr_qp->rq.wqe_cnt)
-			kfree(hr_qp->rq.wrid);
 	}
 
 err_sq_dbmap:
@@ -1112,10 +1137,6 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		    hns_roce_qp_has_sq(init_attr))
 			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
 
-err_sq_wrid:
-	if (!udata)
-		kfree(hr_qp->sq.wrid);
-
 err_db:
 	if (!udata && hns_roce_qp_has_rq(init_attr) &&
 	    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
@@ -1133,10 +1154,9 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	wait_for_completion(&hr_qp->free);
 
 	free_qpc(hr_dev, hr_qp);
-
 	free_qpn(hr_dev, hr_qp);
-
 	free_qp_buf(hr_dev, hr_qp);
+	free_kernel_wrid(hr_dev, hr_qp);
 
 	if (udata) {
 		struct hns_roce_ucontext *context =
@@ -1151,8 +1171,6 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
 			hns_roce_db_unmap_user(context, &hr_qp->rdb);
 	} else {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
 		if (hr_qp->rq.wqe_cnt)
 			hns_roce_free_db(hr_dev, &hr_qp->rdb);
 	}

From cfec045b822a1c3f8f47604520d3194729d9d2af Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Mon, 24 Feb 2020 14:37:38 +0800
Subject: [PATCH 077/327] RDMA/hns: Optimize qp doorbell allocation flow

Encapsulate the kernel qp doorbell allocation related code into 2
functions: alloc_qp_db() and free_qp_db().

Link: https://lore.kernel.org/r/1582526258-13825-8-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 234 +++++++++++++-----------
 1 file changed, 132 insertions(+), 102 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index a27c3dad8aae..2a7535534ea8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -843,7 +843,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	}
 
 	if (udata) {
-		hr_qp->umem = ib_umem_get(udata, addr, hr_qp->buff_size, 0);
+		hr_qp->umem = ib_umem_get(ibdev, addr, hr_qp->buff_size, 0);
 		if (IS_ERR(hr_qp->umem)) {
 			ret = PTR_ERR(hr_qp->umem);
 			goto err_inline;
@@ -895,6 +895,114 @@ static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 		free_rq_inline_buf(hr_qp);
 }
 
+static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct ib_udata *udata,
+				   struct hns_roce_ib_create_qp_resp *resp,
+				   struct hns_roce_ib_create_qp *ucmd)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
+		udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
+		hns_roce_qp_has_sq(init_attr) &&
+		udata->inlen >= offsetofend(typeof(*ucmd), sdb_addr));
+}
+
+static inline bool user_qp_has_rdb(struct hns_roce_dev *hr_dev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct ib_udata *udata,
+				   struct hns_roce_ib_create_qp_resp *resp)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+		udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
+		hns_roce_qp_has_rq(init_attr));
+}
+
+static inline bool kernel_qp_has_rdb(struct hns_roce_dev *hr_dev,
+				     struct ib_qp_init_attr *init_attr)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+		hns_roce_qp_has_rq(init_attr));
+}
+
+static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+		       struct ib_qp_init_attr *init_attr,
+		       struct ib_udata *udata,
+		       struct hns_roce_ib_create_qp *ucmd,
+		       struct hns_roce_ib_create_qp_resp *resp)
+{
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	if (udata) {
+		if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) {
+			ret = hns_roce_db_map_user(uctx, udata, ucmd->sdb_addr,
+						   &hr_qp->sdb);
+			if (ret) {
+				ibdev_err(ibdev,
+					  "Failed to map user SQ doorbell\n");
+				goto err_out;
+			}
+			hr_qp->sdb_en = 1;
+			resp->cap_flags |= HNS_ROCE_SUPPORT_SQ_RECORD_DB;
+		}
+
+		if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) {
+			ret = hns_roce_db_map_user(uctx, udata, ucmd->db_addr,
+						   &hr_qp->rdb);
+			if (ret) {
+				ibdev_err(ibdev,
+					  "Failed to map user RQ doorbell\n");
+				goto err_sdb;
+			}
+			hr_qp->rdb_en = 1;
+			resp->cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
+		}
+	} else {
+		/* QP doorbell register address */
+		hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset +
+				     DB_REG_OFFSET * hr_dev->priv_uar.index;
+		hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset +
+				     DB_REG_OFFSET * hr_dev->priv_uar.index;
+
+		if (kernel_qp_has_rdb(hr_dev, init_attr)) {
+			ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
+			if (ret) {
+				ibdev_err(ibdev,
+					  "Failed to alloc kernel RQ doorbell\n");
+				goto err_out;
+			}
+			*hr_qp->rdb.db_record = 0;
+			hr_qp->rdb_en = 1;
+		}
+	}
+
+	return 0;
+err_sdb:
+	if (udata && hr_qp->sdb_en)
+		hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+err_out:
+	return ret;
+}
+
+static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+		       struct ib_udata *udata)
+{
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
+
+	if (udata) {
+		if (hr_qp->rdb_en)
+			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
+		if (hr_qp->sdb_en)
+			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+	} else {
+		if (hr_qp->rdb_en)
+			hns_roce_free_db(hr_dev, &hr_qp->rdb);
+	}
+}
+
 static int alloc_kernel_wrid(struct hns_roce_dev *hr_dev,
 			     struct hns_roce_qp *hr_qp)
 {
@@ -991,11 +1099,9 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 				     struct ib_udata *udata,
 				     struct hns_roce_qp *hr_qp)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_ib_create_qp ucmd;
 	struct hns_roce_ib_create_qp_resp resp = {};
-	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
-		udata, struct hns_roce_ucontext, ibucontext);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_ib_create_qp ucmd;
 	int ret;
 
 	mutex_init(&hr_qp->mutex);
@@ -1007,95 +1113,55 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 
 	ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Failed to set QP param\n");
+		ibdev_err(ibdev, "Failed to set QP param\n");
 		return ret;
 	}
 
-	if (udata) {
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
-		    (udata->inlen >= sizeof(ucmd)) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_sq(init_attr)) {
-			ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr,
-						   &hr_qp->sdb);
-			if (ret) {
-				dev_err(dev, "sq record doorbell map failed!\n");
-				goto err_out;
-			}
-
-			/* indicate kernel supports sq record db */
-			resp.cap_flags |= HNS_ROCE_SUPPORT_SQ_RECORD_DB;
-			hr_qp->sdb_en = 1;
-		}
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_rq(init_attr)) {
-			ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr,
-						   &hr_qp->rdb);
-			if (ret) {
-				dev_err(dev, "rq record doorbell map failed!\n");
-				goto err_sq_dbmap;
-			}
-
-			/* indicate kernel supports rq record db */
-			resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
-			hr_qp->rdb_en = 1;
-		}
-	} else {
-		/* QP doorbell register address */
-		hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset +
-				     DB_REG_OFFSET * hr_dev->priv_uar.index;
-		hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset +
-				     DB_REG_OFFSET * hr_dev->priv_uar.index;
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    hns_roce_qp_has_rq(init_attr)) {
-			ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
-			if (ret) {
-				dev_err(dev, "rq record doorbell alloc failed!\n");
-				goto err_out;
-			}
-			*hr_qp->rdb.db_record = 0;
-			hr_qp->rdb_en = 1;
-		}
-
+	if (!udata) {
 		ret = alloc_kernel_wrid(hr_dev, hr_qp);
 		if (ret) {
-			ibdev_err(&hr_dev->ib_dev, "Failed to alloc wrid\n");
-			goto err_db;
+			ibdev_err(ibdev, "Failed to alloc wrid\n");
+			return ret;
 		}
 	}
 
+	ret = alloc_qp_db(hr_dev, hr_qp, init_attr, udata, &ucmd, &resp);
+	if (ret) {
+		ibdev_err(ibdev, "Failed to alloc QP doorbell\n");
+		goto err_wrid;
+	}
+
 	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QP buffer\n");
+		ibdev_err(ibdev, "Failed to alloc QP buffer\n");
 		goto err_db;
 	}
 
 	ret = alloc_qpn(hr_dev, hr_qp);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QPN\n");
+		ibdev_err(ibdev, "Failed to alloc QPN\n");
 		goto err_buf;
 	}
 
 	ret = alloc_qpc(hr_dev, hr_qp);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Failed to alloc QP context\n");
+		ibdev_err(ibdev, "Failed to alloc QP context\n");
 		goto err_qpn;
 	}
 
 	ret = hns_roce_qp_store(hr_dev, hr_qp, init_attr);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Failed to store QP\n");
+		ibdev_err(ibdev, "Failed to store QP\n");
 		goto err_qpc;
 	}
 
 	if (udata) {
 		ret = ib_copy_to_udata(udata, &resp,
 				       min(udata->outlen, sizeof(resp)));
-		if (ret)
+		if (ret) {
+			ibdev_err(ibdev, "copy qp resp failed!\n");
 			goto err_store;
+		}
 	}
 
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
@@ -1119,30 +1185,10 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 	free_qpn(hr_dev, hr_qp);
 err_buf:
 	free_qp_buf(hr_dev, hr_qp);
-
-	free_kernel_wrid(hr_dev, hr_qp);
-
-	if (udata) {
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_rq(init_attr))
-			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
-	}
-
-err_sq_dbmap:
-	if (udata)
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
-		    (udata->inlen >= sizeof(ucmd)) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_sq(init_attr))
-			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
-
 err_db:
-	if (!udata && hns_roce_qp_has_rq(init_attr) &&
-	    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
-		hns_roce_free_db(hr_dev, &hr_qp->rdb);
-
-err_out:
+	free_qp_db(hr_dev, hr_qp, udata);
+err_wrid:
+	free_kernel_wrid(hr_dev, hr_qp);
 	return ret;
 }
 
@@ -1157,23 +1203,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	free_qpn(hr_dev, hr_qp);
 	free_qp_buf(hr_dev, hr_qp);
 	free_kernel_wrid(hr_dev, hr_qp);
-
-	if (udata) {
-		struct hns_roce_ucontext *context =
-			rdma_udata_to_drv_context(
-				udata,
-				struct hns_roce_ucontext,
-				ibucontext);
-
-		if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->sdb);
-
-		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->rdb);
-	} else {
-		if (hr_qp->rq.wqe_cnt)
-			hns_roce_free_db(hr_dev, &hr_qp->rdb);
-	}
+	free_qp_db(hr_dev, hr_qp, udata);
 
 	kfree(hr_qp);
 }

From 88d033077b2c7ca556a96b255bfa0ec502881b10 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galpress@amazon.com>
Date: Tue, 25 Feb 2020 13:40:08 +0200
Subject: [PATCH 078/327] RDMA/efa: Unified getters/setters for device structs
 bitmask access

Use unified macros for device structs access instead of open coding the
shifts and masks over and over again.

Link: https://lore.kernel.org/r/20200225114010.21790-2-galpress@amazon.com
Reviewed-by: Firas JahJah <firasj@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 .../infiniband/hw/efa/efa_admin_cmds_defs.h   |   7 +-
 drivers/infiniband/hw/efa/efa_admin_defs.h    |   4 +-
 drivers/infiniband/hw/efa/efa_com.c           | 150 ++++++++----------
 drivers/infiniband/hw/efa/efa_com_cmd.c       |  29 ++--
 drivers/infiniband/hw/efa/efa_common_defs.h   |  13 +-
 drivers/infiniband/hw/efa/efa_regs_defs.h     |  22 +--
 6 files changed, 99 insertions(+), 126 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index 74b787a90660..96b104ab5415 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_ADMIN_CMDS_H_
@@ -801,21 +801,16 @@ struct efa_admin_mmio_req_read_less_resp {
 
 /* create_qp_cmd */
 #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK                BIT(0)
-#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_SHIFT               1
 #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK                BIT(1)
 
 /* reg_mr_cmd */
 #define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
-#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_SHIFT     7
 #define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
 #define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
-#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_SHIFT       2
 #define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK        BIT(2)
 
 /* create_cq_cmd */
-#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5
 #define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5)
-#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_SHIFT                  6
 #define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK                   BIT(6)
 #define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK    GENMASK(4, 0)
 
diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h
index c8e0c8b905be..29d53ed63b3e 100644
--- a/drivers/infiniband/hw/efa/efa_admin_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_ADMIN_H_
@@ -121,9 +121,7 @@ struct efa_admin_aenq_entry {
 /* aq_common_desc */
 #define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK            GENMASK(11, 0)
 #define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK                 BIT(0)
-#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT            1
 #define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK             BIT(1)
-#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT   2
 #define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK    BIT(2)
 
 /* acq_common_desc */
diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c
index 0778f4f7dccd..9e6399bfcf6d 100644
--- a/drivers/infiniband/hw/efa/efa_com.c
+++ b/drivers/infiniband/hw/efa/efa_com.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include "efa_com.h"
@@ -16,21 +16,10 @@
 #define EFA_ASYNC_QUEUE_DEPTH 16
 #define EFA_ADMIN_QUEUE_DEPTH 32
 
-#define MIN_EFA_VER\
-	((EFA_ADMIN_API_VERSION_MAJOR << EFA_REGS_VERSION_MAJOR_VERSION_SHIFT) | \
-	 (EFA_ADMIN_API_VERSION_MINOR & EFA_REGS_VERSION_MINOR_VERSION_MASK))
-
 #define EFA_CTRL_MAJOR          0
 #define EFA_CTRL_MINOR          0
 #define EFA_CTRL_SUB_MINOR      1
 
-#define MIN_EFA_CTRL_VER \
-	(((EFA_CTRL_MAJOR) << \
-	(EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \
-	((EFA_CTRL_MINOR) << \
-	(EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \
-	(EFA_CTRL_SUB_MINOR))
-
 #define EFA_DMA_ADDR_TO_UINT32_LOW(x)   ((u32)((u64)(x)))
 #define EFA_DMA_ADDR_TO_UINT32_HIGH(x)  ((u32)(((u64)(x)) >> 32))
 
@@ -84,7 +73,7 @@ static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset)
 	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
 	struct efa_admin_mmio_req_read_less_resp *read_resp;
 	unsigned long exp_time;
-	u32 mmio_read_reg;
+	u32 mmio_read_reg = 0;
 	u32 err;
 
 	read_resp = mmio_read->read_resp;
@@ -94,10 +83,9 @@ static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset)
 
 	/* trash DMA req_id to identify when hardware is done */
 	read_resp->req_id = mmio_read->seq_num + 0x9aL;
-	mmio_read_reg = (offset << EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) &
-			EFA_REGS_MMIO_REG_READ_REG_OFF_MASK;
-	mmio_read_reg |= mmio_read->seq_num &
-			 EFA_REGS_MMIO_REG_READ_REQ_ID_MASK;
+	EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset);
+	EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID,
+		mmio_read->seq_num);
 
 	writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF);
 
@@ -137,9 +125,9 @@ static int efa_com_admin_init_sq(struct efa_com_dev *edev)
 	struct efa_com_admin_queue *aq = &edev->aq;
 	struct efa_com_admin_sq *sq = &aq->sq;
 	u16 size = aq->depth * sizeof(*sq->entries);
+	u32 aq_caps = 0;
 	u32 addr_high;
 	u32 addr_low;
-	u32 aq_caps;
 
 	sq->entries =
 		dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL);
@@ -160,10 +148,9 @@ static int efa_com_admin_init_sq(struct efa_com_dev *edev)
 	writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF);
 
-	aq_caps = aq->depth & EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK;
-	aq_caps |= (sizeof(struct efa_admin_aq_entry) <<
-			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) &
-			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK;
+	EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth);
+	EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_aq_entry));
 
 	writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF);
 
@@ -175,9 +162,9 @@ static int efa_com_admin_init_cq(struct efa_com_dev *edev)
 	struct efa_com_admin_queue *aq = &edev->aq;
 	struct efa_com_admin_cq *cq = &aq->cq;
 	u16 size = aq->depth * sizeof(*cq->entries);
+	u32 acq_caps = 0;
 	u32 addr_high;
 	u32 addr_low;
-	u32 acq_caps;
 
 	cq->entries =
 		dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL);
@@ -195,13 +182,11 @@ static int efa_com_admin_init_cq(struct efa_com_dev *edev)
 	writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF);
 
-	acq_caps = aq->depth & EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK;
-	acq_caps |= (sizeof(struct efa_admin_acq_entry) <<
-			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) &
-			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK;
-	acq_caps |= (aq->msix_vector_idx <<
-			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT) &
-			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK;
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth);
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_acq_entry));
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR,
+		aq->msix_vector_idx);
 
 	writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF);
 
@@ -212,7 +197,8 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev,
 				   struct efa_aenq_handlers *aenq_handlers)
 {
 	struct efa_com_aenq *aenq = &edev->aenq;
-	u32 addr_low, addr_high, aenq_caps;
+	u32 addr_low, addr_high;
+	u32 aenq_caps = 0;
 	u16 size;
 
 	if (!aenq_handlers) {
@@ -237,13 +223,11 @@ static int efa_com_admin_init_aenq(struct efa_com_dev *edev,
 	writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF);
 
-	aenq_caps = aenq->depth & EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK;
-	aenq_caps |= (sizeof(struct efa_admin_aenq_entry) <<
-		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) &
-		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK;
-	aenq_caps |= (aenq->msix_vector_idx
-		      << EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT) &
-		     EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK;
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth);
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_aenq_entry));
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR,
+		aenq->msix_vector_idx);
 	writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF);
 
 	/*
@@ -280,8 +264,8 @@ static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq,
 static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
 					struct efa_comp_ctx *comp_ctx)
 {
-	u16 cmd_id = comp_ctx->user_cqe->acq_common_descriptor.command &
-		     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+	u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command,
+			     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
 	u16 ctx_id = cmd_id & (aq->depth - 1);
 
 	ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id);
@@ -335,8 +319,8 @@ static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queu
 	cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
 
 	cmd->aq_common_descriptor.command_id = cmd_id;
-	cmd->aq_common_descriptor.flags |= aq->sq.phase &
-		EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
+	EFA_SET(&cmd->aq_common_descriptor.flags,
+		EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase);
 
 	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true);
 	if (!comp_ctx) {
@@ -427,8 +411,8 @@ static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *a
 	struct efa_comp_ctx *comp_ctx;
 	u16 cmd_id;
 
-	cmd_id = cqe->acq_common_descriptor.command &
-		 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+	cmd_id = EFA_GET(&cqe->acq_common_descriptor.command,
+			 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
 
 	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false);
 	if (!comp_ctx) {
@@ -743,7 +727,7 @@ int efa_com_admin_init(struct efa_com_dev *edev,
 	int err;
 
 	dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
-	if (!(dev_sts & EFA_REGS_DEV_STS_READY_MASK)) {
+	if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) {
 		ibdev_err(edev->efa_dev,
 			  "Device isn't ready, abort com init %#x\n", dev_sts);
 		return -ENODEV;
@@ -778,8 +762,7 @@ int efa_com_admin_init(struct efa_com_dev *edev,
 		goto err_destroy_cq;
 
 	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
-	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
-		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO);
 	if (timeout)
 		/* the resolution of timeout reg is 100ms */
 		aq->completion_timeout = timeout * 100000;
@@ -940,7 +923,9 @@ void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev)
 
 int efa_com_validate_version(struct efa_com_dev *edev)
 {
+	u32 min_ctrl_ver = 0;
 	u32 ctrl_ver_masked;
+	u32 min_ver = 0;
 	u32 ctrl_ver;
 	u32 ver;
 
@@ -953,33 +938,42 @@ int efa_com_validate_version(struct efa_com_dev *edev)
 				      EFA_REGS_CONTROLLER_VERSION_OFF);
 
 	ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n",
-		  (ver & EFA_REGS_VERSION_MAJOR_VERSION_MASK) >>
-			  EFA_REGS_VERSION_MAJOR_VERSION_SHIFT,
-		  ver & EFA_REGS_VERSION_MINOR_VERSION_MASK);
+		  EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION),
+		  EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION));
 
-	if (ver < MIN_EFA_VER) {
+	EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION,
+		EFA_ADMIN_API_VERSION_MAJOR);
+	EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION,
+		EFA_ADMIN_API_VERSION_MINOR);
+	if (ver < min_ver) {
 		ibdev_err(edev->efa_dev,
 			  "EFA version is lower than the minimal version the driver supports\n");
 		return -EOPNOTSUPP;
 	}
 
-	ibdev_dbg(edev->efa_dev,
-		  "efa controller version: %d.%d.%d implementation version %d\n",
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT,
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT,
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK),
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT);
+	ibdev_dbg(
+		edev->efa_dev,
+		"efa controller version: %d.%d.%d implementation version %d\n",
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION),
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION),
+		EFA_GET(&ctrl_ver,
+			EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION),
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID));
 
 	ctrl_ver_masked =
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) |
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) |
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK);
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) |
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) |
+		EFA_GET(&ctrl_ver,
+			EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION);
 
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION,
+		EFA_CTRL_MAJOR);
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION,
+		EFA_CTRL_MINOR);
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION,
+		EFA_CTRL_SUB_MINOR);
 	/* Validate the ctrl version without the implementation ID */
-	if (ctrl_ver_masked < MIN_EFA_CTRL_VER) {
+	if (ctrl_ver_masked < min_ctrl_ver) {
 		ibdev_err(edev->efa_dev,
 			  "EFA ctrl version is lower than the minimal ctrl version the driver supports\n");
 		return -EOPNOTSUPP;
@@ -1002,8 +996,7 @@ int efa_com_get_dma_width(struct efa_com_dev *edev)
 	u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
 	int width;
 
-	width = (caps & EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >>
-		EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT;
+	width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH);
 
 	ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width);
 
@@ -1017,16 +1010,14 @@ int efa_com_get_dma_width(struct efa_com_dev *edev)
 	return width;
 }
 
-static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout,
-				u16 exp_state)
+static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on)
 {
 	u32 val, i;
 
 	for (i = 0; i < timeout; i++) {
 		val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
 
-		if ((val & EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) ==
-		    exp_state)
+		if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on)
 			return 0;
 
 		ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val);
@@ -1046,36 +1037,34 @@ static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout,
 int efa_com_dev_reset(struct efa_com_dev *edev,
 		      enum efa_regs_reset_reason_types reset_reason)
 {
-	u32 stat, timeout, cap, reset_val;
+	u32 stat, timeout, cap;
+	u32 reset_val = 0;
 	int err;
 
 	stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
 	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
 
-	if (!(stat & EFA_REGS_DEV_STS_READY_MASK)) {
+	if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) {
 		ibdev_err(edev->efa_dev,
 			  "Device isn't ready, can't reset device\n");
 		return -EINVAL;
 	}
 
-	timeout = (cap & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >>
-		  EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT);
 	if (!timeout) {
 		ibdev_err(edev->efa_dev, "Invalid timeout value\n");
 		return -EINVAL;
 	}
 
 	/* start reset */
-	reset_val = EFA_REGS_DEV_CTL_DEV_RESET_MASK;
-	reset_val |= (reset_reason << EFA_REGS_DEV_CTL_RESET_REASON_SHIFT) &
-		     EFA_REGS_DEV_CTL_RESET_REASON_MASK;
+	EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1);
+	EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason);
 	writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
 
 	/* reset clears the mmio readless address, restore it */
 	efa_com_mmio_reg_read_resp_addr_init(edev);
 
-	err = wait_for_reset_state(edev, timeout,
-				   EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK);
+	err = wait_for_reset_state(edev, timeout, 1);
 	if (err) {
 		ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n");
 		return err;
@@ -1089,8 +1078,7 @@ int efa_com_dev_reset(struct efa_com_dev *edev,
 		return err;
 	}
 
-	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
-		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO);
 	if (timeout)
 		/* the resolution of timeout reg is 100ms */
 		edev->aq.completion_timeout = timeout * 100000;
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index e20bd84a1014..eea5574a62e8 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include "efa_com.h"
@@ -161,8 +161,9 @@ int efa_com_create_cq(struct efa_com_dev *edev,
 	int err;
 
 	create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ;
-	create_cmd.cq_caps_2 = (params->entry_size_in_bytes / 4) &
-				EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK;
+	EFA_SET(&create_cmd.cq_caps_2,
+		EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS,
+		params->entry_size_in_bytes / 4);
 	create_cmd.cq_depth = params->cq_depth;
 	create_cmd.num_sub_cqs = params->num_sub_cqs;
 	create_cmd.uar = params->uarn;
@@ -227,8 +228,8 @@ int efa_com_register_mr(struct efa_com_dev *edev,
 	mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR;
 	mr_cmd.pd = params->pd;
 	mr_cmd.mr_length = params->mr_length_in_bytes;
-	mr_cmd.flags |= params->page_shift &
-		EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK;
+	EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT,
+		params->page_shift);
 	mr_cmd.iova = params->iova;
 	mr_cmd.permissions = params->permissions;
 
@@ -242,11 +243,11 @@ int efa_com_register_mr(struct efa_com_dev *edev,
 			params->pbl.pbl.address.mem_addr_low;
 		mr_cmd.pbl.pbl.address.mem_addr_high =
 			params->pbl.pbl.address.mem_addr_high;
-		mr_cmd.aq_common_desc.flags |=
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK;
+		EFA_SET(&mr_cmd.aq_common_desc.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1);
 		if (params->indirect)
-			mr_cmd.aq_common_desc.flags |=
-				EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+			EFA_SET(&mr_cmd.aq_common_desc.flags,
+				EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1);
 	}
 
 	err = efa_com_cmd_exec(aq,
@@ -386,9 +387,8 @@ static int efa_com_get_feature_ex(struct efa_com_dev *edev,
 	get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE;
 
 	if (control_buff_size)
-		get_cmd.aq_common_descriptor.flags =
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
-
+		EFA_SET(&get_cmd.aq_common_descriptor.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1);
 
 	efa_com_set_dma_addr(control_buf_dma_addr,
 			     &get_cmd.control_buffer.address.mem_addr_high,
@@ -538,8 +538,9 @@ static int efa_com_set_feature_ex(struct efa_com_dev *edev,
 
 	set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE;
 	if (control_buff_size) {
-		set_cmd->aq_common_descriptor.flags =
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+		set_cmd->aq_common_descriptor.flags = 0;
+		EFA_SET(&set_cmd->aq_common_descriptor.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1);
 		efa_com_set_dma_addr(control_buf_dma_addr,
 				     &set_cmd->control_buffer.address.mem_addr_high,
 				     &set_cmd->control_buffer.address.mem_addr_low);
diff --git a/drivers/infiniband/hw/efa/efa_common_defs.h b/drivers/infiniband/hw/efa/efa_common_defs.h
index c559ec08898e..90af1c82c9c6 100644
--- a/drivers/infiniband/hw/efa/efa_common_defs.h
+++ b/drivers/infiniband/hw/efa/efa_common_defs.h
@@ -1,14 +1,25 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_COMMON_H_
 #define _EFA_COMMON_H_
 
+#include <linux/bitfield.h>
+
 #define EFA_COMMON_SPEC_VERSION_MAJOR        2
 #define EFA_COMMON_SPEC_VERSION_MINOR        0
 
+#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr))
+
+#define EFA_SET(ptr, mask, value)                                              \
+	({                                                                     \
+		typeof(ptr) _ptr = ptr;                                        \
+		*_ptr = (*_ptr & ~(mask##_MASK)) |                             \
+			FIELD_PREP(mask##_MASK, value);                        \
+	})
+
 struct efa_common_mem_addr {
 	u32 mem_addr_low;
 
diff --git a/drivers/infiniband/hw/efa/efa_regs_defs.h b/drivers/infiniband/hw/efa/efa_regs_defs.h
index bb9cad3d6a15..322a2c0d4ef9 100644
--- a/drivers/infiniband/hw/efa/efa_regs_defs.h
+++ b/drivers/infiniband/hw/efa/efa_regs_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_REGS_H_
@@ -45,69 +45,49 @@ enum efa_regs_reset_reason_types {
 
 /* version register */
 #define EFA_REGS_VERSION_MINOR_VERSION_MASK                 0xff
-#define EFA_REGS_VERSION_MAJOR_VERSION_SHIFT                8
 #define EFA_REGS_VERSION_MAJOR_VERSION_MASK                 0xff00
 
 /* controller_version register */
 #define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK   0xff
-#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT     8
 #define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK      0xff00
-#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT     16
 #define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK      0xff0000
-#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT           24
 #define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK            0xff000000
 
 /* caps register */
 #define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK        0x1
-#define EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT                   1
 #define EFA_REGS_CAPS_RESET_TIMEOUT_MASK                    0x3e
-#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT                  8
 #define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK                   0xff00
-#define EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT                    16
 #define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK                     0xf0000
 
 /* aq_caps register */
 #define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK                      0xffff
-#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT                16
 #define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK                 0xffff0000
 
 /* acq_caps register */
 #define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK                    0xffff
-#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT              16
 #define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK               0xff0000
-#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT             24
 #define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK              0xff000000
 
 /* aenq_caps register */
 #define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK                  0xffff
-#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT            16
 #define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK             0xff0000
-#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT           24
 #define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK            0xff000000
 
 /* dev_ctl register */
 #define EFA_REGS_DEV_CTL_DEV_RESET_MASK                     0x1
-#define EFA_REGS_DEV_CTL_AQ_RESTART_SHIFT                   1
 #define EFA_REGS_DEV_CTL_AQ_RESTART_MASK                    0x2
-#define EFA_REGS_DEV_CTL_RESET_REASON_SHIFT                 28
 #define EFA_REGS_DEV_CTL_RESET_REASON_MASK                  0xf0000000
 
 /* dev_sts register */
 #define EFA_REGS_DEV_STS_READY_MASK                         0x1
-#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT       1
 #define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK        0x2
-#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT          2
 #define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK           0x4
-#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT            3
 #define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK             0x8
-#define EFA_REGS_DEV_STS_RESET_FINISHED_SHIFT               4
 #define EFA_REGS_DEV_STS_RESET_FINISHED_MASK                0x10
-#define EFA_REGS_DEV_STS_FATAL_ERROR_SHIFT                  5
 #define EFA_REGS_DEV_STS_FATAL_ERROR_MASK                   0x20
 
 /* mmio_reg_read register */
 #define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK                  0xffff
-#define EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT                16
 #define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK                 0xffff0000
 
 #endif /* _EFA_REGS_H_ */

From 56a7a721dd54fc8cead3d0eeeec4336db24b00fa Mon Sep 17 00:00:00 2001
From: Gal Pressman <galpress@amazon.com>
Date: Tue, 25 Feb 2020 13:40:09 +0200
Subject: [PATCH 079/327] RDMA/efa: Properly document the interrupt mask
 register

The fact that the LSB in the register is the enable bit should not be an
implicit assumption between the driver and the device, properly document
that in the register definition.

Link: https://lore.kernel.org/r/20200225114010.21790-3-galpress@amazon.com
Reviewed-by: Firas JahJah <firasj@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/efa/efa_com.c       | 4 +---
 drivers/infiniband/hw/efa/efa_regs_defs.h | 3 +++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c
index 9e6399bfcf6d..7fce69f5568f 100644
--- a/drivers/infiniband/hw/efa/efa_com.c
+++ b/drivers/infiniband/hw/efa/efa_com.c
@@ -23,8 +23,6 @@
 #define EFA_DMA_ADDR_TO_UINT32_LOW(x)   ((u32)((u64)(x)))
 #define EFA_DMA_ADDR_TO_UINT32_HIGH(x)  ((u32)(((u64)(x)) >> 32))
 
-#define EFA_REGS_ADMIN_INTR_MASK 1
-
 enum efa_cmd_status {
 	EFA_CMD_SUBMITTED,
 	EFA_CMD_COMPLETED,
@@ -689,7 +687,7 @@ void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling)
 	u32 mask_value = 0;
 
 	if (polling)
-		mask_value = EFA_REGS_ADMIN_INTR_MASK;
+		EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1);
 
 	writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF);
 	if (polling)
diff --git a/drivers/infiniband/hw/efa/efa_regs_defs.h b/drivers/infiniband/hw/efa/efa_regs_defs.h
index 322a2c0d4ef9..4017982fe13b 100644
--- a/drivers/infiniband/hw/efa/efa_regs_defs.h
+++ b/drivers/infiniband/hw/efa/efa_regs_defs.h
@@ -73,6 +73,9 @@ enum efa_regs_reset_reason_types {
 #define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK             0xff0000
 #define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK            0xff000000
 
+/* intr_mask register */
+#define EFA_REGS_INTR_MASK_EN_MASK                          0x1
+
 /* dev_ctl register */
 #define EFA_REGS_DEV_CTL_DEV_RESET_MASK                     0x1
 #define EFA_REGS_DEV_CTL_AQ_RESTART_MASK                    0x2

From ff6629f88c529b07d9704c656c64dae76910e3e9 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galpress@amazon.com>
Date: Tue, 25 Feb 2020 13:40:10 +0200
Subject: [PATCH 080/327] RDMA/efa: Do not delay freeing of DMA pages

When destroying a DMA mmapped object, there is no need to artificially
delay the freeing of the pages to the mmap entry removal.  Since the vma
keeps a reference count on these pages, free_pages_exact can be called on
the destroy verb as it won't really free the pages until the reference
count is cleared (in case the user hasn't called munmap yet).

Remove the special handling of DMA pages and call free_pages_exact on
destroy_qp/cq. The mmap entry removal is moved to the beginning of the
destroy flows, so the driver can safely free the pages.

Link: https://lore.kernel.org/r/20200225114010.21790-4-galpress@amazon.com
Reviewed-by: Firas JahJah <firasj@amazon.com>
Reviewed-by: Yossi Leybovich <sleybo@amazon.com>
Signed-off-by: Gal Pressman <galpress@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/efa/efa_verbs.c | 44 +++++++++++++--------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index ec5545870554..bf3120f140f7 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include <linux/vmalloc.h>
@@ -169,6 +169,14 @@ static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
 	return addr;
 }
 
+static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr,
+			    dma_addr_t dma_addr,
+			    size_t size, enum dma_data_direction dir)
+{
+	dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir);
+	free_pages_exact(cpu_addr, size);
+}
+
 int efa_query_device(struct ib_device *ibdev,
 		     struct ib_device_attr *props,
 		     struct ib_udata *udata)
@@ -402,6 +410,9 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	int err;
 
 	ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
+
+	efa_qp_user_mmap_entries_remove(qp);
+
 	err = efa_destroy_qp_handle(dev, qp->qp_handle);
 	if (err)
 		return err;
@@ -411,11 +422,10 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 			  "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
 			  qp->rq_cpu_addr, qp->rq_size,
 			  &qp->rq_dma_addr);
-		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
-				 DMA_TO_DEVICE);
+		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
+				qp->rq_size, DMA_TO_DEVICE);
 	}
 
-	efa_qp_user_mmap_entries_remove(qp);
 	kfree(qp);
 	return 0;
 }
@@ -720,13 +730,9 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
 err_destroy_qp:
 	efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
 err_free_mapped:
-	if (qp->rq_size) {
-		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
-				 DMA_TO_DEVICE);
-
-		if (!qp->rq_mmap_entry)
-			free_pages_exact(qp->rq_cpu_addr, qp->rq_size);
-	}
+	if (qp->rq_size)
+		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
+				qp->rq_size, DMA_TO_DEVICE);
 err_free_qp:
 	kfree(qp);
 err_out:
@@ -845,10 +851,10 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
 		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
 
-	efa_destroy_cq_idx(dev, cq->cq_idx);
-	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
-			 DMA_FROM_DEVICE);
 	rdma_user_mmap_entry_remove(cq->mmap_entry);
+	efa_destroy_cq_idx(dev, cq->cq_idx);
+	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+			DMA_FROM_DEVICE);
 }
 
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
@@ -985,10 +991,8 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 err_destroy_cq:
 	efa_destroy_cq_idx(dev, cq->cq_idx);
 err_free_mapped:
-	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
-			 DMA_FROM_DEVICE);
-	if (!cq->mmap_entry)
-		free_pages_exact(cq->cpu_addr, cq->size);
+	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+			DMA_FROM_DEVICE);
 
 err_out:
 	atomic64_inc(&dev->stats.sw_stats.create_cq_err);
@@ -1550,10 +1554,6 @@ void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
 {
 	struct efa_user_mmap_entry *entry = to_emmap(rdma_entry);
 
-	/* DMA mapping is already gone, now free the pages */
-	if (entry->mmap_flag == EFA_MMAP_DMA_PAGE)
-		free_pages_exact(phys_to_virt(entry->address),
-				 entry->rdma_entry.npages * PAGE_SIZE);
 	kfree(entry);
 }
 

From af1abab986b85fdeb3e98c037988cfa4bf9d4018 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 27 Jun 2018 14:39:54 -0500
Subject: [PATCH 081/327] uml: Don't consult current to find the proc_mnt in
 mconsole_proc

Inspection of the control flow reveals that mconsole_proc is either
called from mconsole_stop called from mc_work_proc or from
mc_work_proc directly.  The function mc_work_proc is dispatched to a
kernel thread with schedule_work.

All of the threads that run dispatched by schedule_work are in the
init pid namespace.

So make the code clearer and by using init_pid_ns instead of
task_active_pid_ns(current).

Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/um/drivers/mconsole_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index b80a1d616e4e..e8f5c81c2c6c 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req)
 
 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
+	struct vfsmount *mnt = init_pid_ns.proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;

From 76313c70c52f930af4afd21684509ca52297ea71 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 19 Feb 2020 10:37:15 -0600
Subject: [PATCH 082/327] uml: Create a private mount of proc for mconsole

The mconsole code only ever accesses proc for the initial pid
namespace.  Instead of depending upon the proc_mnt which is
for proc_flush_task have uml create it's own mount of proc
instead.

This allows proc_flush_task to evolve and remove the
need for having a proc_mnt to do it's job.

Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/um/drivers/mconsole_kern.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index e8f5c81c2c6c..30575bd92975 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -36,6 +36,8 @@
 #include "mconsole_kern.h"
 #include <os.h>
 
+static struct vfsmount *proc_mnt = NULL;
+
 static int do_unlink_socket(struct notifier_block *notifier,
 			    unsigned long what, void *data)
 {
@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)
 
 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = init_pid_ns.proc_mnt;
+	struct vfsmount *mnt = proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;
@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
 	ptr += strlen("proc");
 	ptr = skip_spaces(ptr);
 
+	if (!mnt) {
+		mconsole_reply(req, "Proc not available", 1, 0);
+		goto out;
+	}
 	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
 	with_console(req, stack_proc, to);
 }
 
+static int __init mount_proc(void)
+{
+	struct file_system_type *proc_fs_type;
+	struct vfsmount *mnt;
+
+	proc_fs_type = get_fs_type("proc");
+	if (!proc_fs_type)
+		return -ENODEV;
+
+	mnt = kern_mount(proc_fs_type);
+	put_filesystem(proc_fs_type);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	proc_mnt = mnt;
+	return 0;
+}
+
 /*
  * Changed by mconsole_setup, which is __setup, and called before SMP is
  * active.
@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
 	int err;
 	char file[UNIX_PATH_MAX];
 
+	mount_proc();
+
 	if (umid_file_name("mconsole", file, sizeof(file)))
 		return -1;
 	snprintf(mconsole_socket_name, sizeof(file), "%s", file);

From 69879c01a0c3f70e0887cfb4d9ff439814361e46 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 20 Feb 2020 08:08:20 -0600
Subject: [PATCH 083/327] proc: Remove the now unnecessary internal mount of
 proc

There remains no more code in the kernel using pids_ns->proc_mnt,
therefore remove it from the kernel.

The big benefit of this change is that one of the most error prone and
tricky parts of the pid namespace implementation, maintaining kernel
mounts of proc is removed.

In addition removing the unnecessary complexity of the kernel mount
fixes a regression that caused the proc mount options to be ignored.
Now that the initial mount of proc comes from userspace, those mount
options are again honored.  This fixes Android's usage of the proc
hidepid option.

Reported-by: Alistair Strachan <astrachan@google.com>
Fixes: e94591d0d90c ("proc: Convert proc_mount to use mount_ns.")
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/root.c                | 36 -----------------------------------
 include/linux/pid_namespace.h |  2 --
 include/linux/proc_ns.h       |  5 -----
 kernel/pid.c                  |  8 --------
 kernel/pid_namespace.c        |  7 -------
 5 files changed, 58 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 608233dfd29c..2633f10446c3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
 	.subdir		= RB_ROOT,
 	.name		= "/proc",
 };
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
-	struct proc_fs_context *ctx;
-	struct fs_context *fc;
-	struct vfsmount *mnt;
-
-	fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
-	if (IS_ERR(fc))
-		return PTR_ERR(fc);
-
-	if (fc->user_ns != ns->user_ns) {
-		put_user_ns(fc->user_ns);
-		fc->user_ns = get_user_ns(ns->user_ns);
-	}
-
-	ctx = fc->fs_private;
-	if (ctx->pid_ns != ns) {
-		put_pid_ns(ctx->pid_ns);
-		get_pid_ns(ns);
-		ctx->pid_ns = ns;
-	}
-
-	mnt = fc_mount(fc);
-	put_fs_context(fc);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
-
-	ns->proc_mnt = mnt;
-	return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
-	kern_unmount(ns->proc_mnt);
-}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 2ed6af88794b..4956e362e55e 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -33,7 +33,6 @@ struct pid_namespace {
 	unsigned int level;
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
-	struct vfsmount *proc_mnt;
 	struct dentry *proc_self;
 	struct dentry *proc_thread_self;
 #endif
@@ -42,7 +41,6 @@ struct pid_namespace {
 #endif
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
-	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 4626b1ac3b6c..e1106a077c1a 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -50,16 +50,11 @@ enum {
 
 #ifdef CONFIG_PROC_FS
 
-extern int pid_ns_prepare_proc(struct pid_namespace *ns);
-extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
 
 #else /* CONFIG_PROC_FS */
 
-static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
-static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
-
 static inline int proc_alloc_inum(unsigned int *inum)
 {
 	*inum = 1;
diff --git a/kernel/pid.c b/kernel/pid.c
index ca08d6a3aa77..60820e72634c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -144,9 +144,6 @@ void free_pid(struct pid *pid)
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
 			ns->pid_allocated = 0;
-			/* fall through */
-		case 0:
-			schedule_work(&ns->proc_work);
 			break;
 		}
 
@@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		tmp = tmp->parent;
 	}
 
-	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
-			goto out_free;
-	}
-
 	get_pid_ns(ns);
 	refcount_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d40017e79ebe..318fcc6ba301 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
 	return READ_ONCE(*pkc);
 }
 
-static void proc_cleanup_work(struct work_struct *work)
-{
-	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
-	pid_ns_release_proc(ns);
-}
-
 static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
 {
 	return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 	ns->pid_allocated = PIDNS_ADDING;
-	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	return ns;
 

From af9fe6d607c9f456fb6c1cb87e1dc66a43846efd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Feb 2020 16:29:12 -0600
Subject: [PATCH 084/327] pid: Improve the comment about waiting in
 zap_pid_ns_processes

Oleg wrote a very informative comment, but with the removal of
proc_cleanup_work it is no longer accurate.

Rewrite the comment so that it only talks about the details
that are still relevant, and hopefully is a little clearer.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid_namespace.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 318fcc6ba301..01f8ba32cc0c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -224,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	} while (rc != -ECHILD);
 
 	/*
-	 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
-	 * really care, we could reparent them to the global init. We could
-	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
-	 * pid_ns can not go away until proc_kill_sb() drops the reference.
+	 * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
+	 * process whose parents processes are outside of the pid
+	 * namespace.  Such processes are created with setns()+fork().
 	 *
-	 * But this ns can also have other tasks injected by setns()+fork().
-	 * Again, ignoring the user visible semantics we do not really need
-	 * to wait until they are all reaped, but they can be reparented to
-	 * us and thus we need to ensure that pid->child_reaper stays valid
-	 * until they all go away. See free_pid()->wake_up_process().
+	 * If those EXIT_ZOMBIE processes are not reaped by their
+	 * parents before their parents exit, they will be reparented
+	 * to pid_ns->child_reaper.  Thus pidns->child_reaper needs to
+	 * stay valid until they all go away.
 	 *
-	 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
-	 * if reparented.
+	 * The code relies on the the pid_ns->child_reaper ignoring
+	 * SIGCHILD to cause those EXIT_ZOMBIE processes to be
+	 * autoreaped if reparented.
+	 *
+	 * Semantically it is also desirable to wait for EXIT_ZOMBIE
+	 * processes before allowing the child_reaper to be reaped, as
+	 * that gives the invariant that when the init process of a
+	 * pid namespace is reaped all of the processes in the pid
+	 * namespace are gone.
+	 *
+	 * Once all of the other tasks are gone from the pid_namespace
+	 * free_pid() will awaken this task.
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);

From c2b777a9592395bf68f17bcfa76813eb507a001c Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Wed, 26 Feb 2020 07:45:31 -0800
Subject: [PATCH 085/327] RDMA/bnxt_re: Refactor device add/remove
 functionalities

 - bnxt_re_ib_reg() handles two main functionalities - initializing the
   device and registering with the IB stack.  Split it into 2 functions
   i.e. bnxt_re_dev_init() and bnxt_re_ib_init() to account for the same
   thereby improve modularity. Do the same for
   bnxt_re_ib_unreg()i.e. split into two functions - bnxt_re_dev_uninit()
   and bnxt_re_ib_uninit().

 - Simplify the code by combining the different steps to add and remove
   the device into two functions.

 - Report correct netdev link state during device register

Link: https://lore.kernel.org/r/1582731932-26574-2-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 139 ++++++++++++++++-----------
 1 file changed, 82 insertions(+), 57 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index b5128cce8e21..5f8fd74f5629 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -78,7 +78,8 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
 /* Mutex to protect the list of bnxt_re devices added */
 static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
+static void bnxt_re_ib_uninit(struct bnxt_re_dev *rdev);
 
 static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 {
@@ -237,7 +238,9 @@ static void bnxt_re_shutdown(void *p)
 	if (!rdev)
 		return;
 
-	bnxt_re_ib_unreg(rdev);
+	bnxt_re_ib_uninit(rdev);
+	ASSERT_RTNL();
+	bnxt_re_remove_device(rdev);
 }
 
 static void bnxt_re_stop_irq(void *handle)
@@ -1317,7 +1320,41 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
 		le16_to_cpu(resp.hwrm_intf_patch);
 }
 
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
+static void bnxt_re_ib_uninit(struct bnxt_re_dev *rdev)
+{
+	/* Cleanup ib dev */
+	if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
+		ib_unregister_device(&rdev->ibdev);
+		clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
+	}
+}
+
+int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
+{
+	int rc = 0;
+	u32 event;
+
+	/* Register ib dev */
+	rc = bnxt_re_register_ib(rdev);
+	if (rc) {
+		pr_err("Failed to register with IB: %#x\n", rc);
+		return rc;
+	}
+	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
+	dev_info(rdev_to_dev(rdev), "Device registered successfully");
+	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
+			 &rdev->active_width);
+	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
+
+	event = netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev) ?
+		IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, event);
+
+	return rc;
+}
+
+static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
 {
 	u8 type;
 	int rc;
@@ -1373,20 +1410,15 @@ static void bnxt_re_worker(struct work_struct *work)
 	schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 }
 
-static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
+static int bnxt_re_dev_init(struct bnxt_re_dev *rdev)
 {
 	struct bnxt_qplib_creq_ctx *creq;
 	struct bnxt_re_ring_attr rattr;
 	u32 db_offt;
-	bool locked;
 	int vid;
 	u8 type;
 	int rc;
 
-	/* Acquire rtnl lock through out this function */
-	rtnl_lock();
-	locked = true;
-
 	/* Registered a new RoCE device instance to netdev */
 	memset(&rattr, 0, sizeof(rattr));
 	rc = bnxt_re_register_netdev(rdev);
@@ -1514,23 +1546,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 		schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 	}
 
-	rtnl_unlock();
-	locked = false;
-
-	/* Register ib dev */
-	rc = bnxt_re_register_ib(rdev);
-	if (rc) {
-		ibdev_err(&rdev->ibdev,
-			  "Failed to register with IB: %#x\n", rc);
-		goto fail;
-	}
-	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
-	ibdev_info(&rdev->ibdev, "Device registered successfully");
-	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
-			 &rdev->active_width);
-	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
-	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE);
-
 	return 0;
 free_sctx:
 	bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
@@ -1544,10 +1559,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
-	if (!locked)
-		rtnl_lock();
-	bnxt_re_ib_unreg(rdev);
-	rtnl_unlock();
+	bnxt_re_dev_uninit(rdev);
 
 	return rc;
 }
@@ -1589,9 +1601,35 @@ static int bnxt_re_dev_reg(struct bnxt_re_dev **rdev, struct net_device *netdev)
 	return rc;
 }
 
-static void bnxt_re_remove_one(struct bnxt_re_dev *rdev)
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev)
 {
+	bnxt_re_dev_uninit(rdev);
 	pci_dev_put(rdev->en_dev->pdev);
+	bnxt_re_dev_unreg(rdev);
+}
+
+static int bnxt_re_add_device(struct bnxt_re_dev **rdev,
+			      struct net_device *netdev)
+{
+	int rc;
+
+	rc = bnxt_re_dev_reg(rdev, netdev);
+	if (rc == -ENODEV)
+		return rc;
+	if (rc) {
+		pr_err("Failed to register with the device %s: %#x\n",
+		       netdev->name, rc);
+		return rc;
+	}
+
+	pci_dev_get((*rdev)->en_dev->pdev);
+	rc = bnxt_re_dev_init(*rdev);
+	if (rc) {
+		pci_dev_put((*rdev)->en_dev->pdev);
+		bnxt_re_dev_unreg(*rdev);
+	}
+
+	return rc;
 }
 
 /* Handle all deferred netevents tasks */
@@ -1606,16 +1644,17 @@ static void bnxt_re_task(struct work_struct *work)
 
 	if (re_work->event != NETDEV_REGISTER &&
 	    !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
-		return;
+		goto done;
 
 	switch (re_work->event) {
 	case NETDEV_REGISTER:
-		rc = bnxt_re_ib_reg(rdev);
+		rc = bnxt_re_ib_init(rdev);
 		if (rc) {
 			ibdev_err(&rdev->ibdev,
 				  "Failed to register with IB: %#x", rc);
-			bnxt_re_remove_one(rdev);
-			bnxt_re_dev_unreg(rdev);
+			rtnl_lock();
+			bnxt_re_remove_device(rdev);
+			rtnl_unlock();
 			goto exit;
 		}
 		break;
@@ -1638,17 +1677,13 @@ static void bnxt_re_task(struct work_struct *work)
 	default:
 		break;
 	}
+done:
 	smp_mb__before_atomic();
 	atomic_dec(&rdev->sched_count);
 exit:
 	kfree(re_work);
 }
 
-static void bnxt_re_init_one(struct bnxt_re_dev *rdev)
-{
-	pci_dev_get(rdev->en_dev->pdev);
-}
-
 /*
  * "Notifier chain callback can be invoked for the same chain from
  * different CPUs at the same time".
@@ -1686,17 +1721,9 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 	case NETDEV_REGISTER:
 		if (rdev)
 			break;
-		rc = bnxt_re_dev_reg(&rdev, real_dev);
-		if (rc == -ENODEV)
-			break;
-		if (rc) {
-			ibdev_err(&rdev->ibdev,
-				  "Failed to register with the device %s: %#x\n",
-				  real_dev->name, rc);
-			break;
-		}
-		bnxt_re_init_one(rdev);
-		sch_work = true;
+		rc = bnxt_re_add_device(&rdev, real_dev);
+		if (!rc)
+			sch_work = true;
 		break;
 
 	case NETDEV_UNREGISTER:
@@ -1705,9 +1732,8 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		 */
 		if (atomic_read(&rdev->sched_count) > 0)
 			goto exit;
-		bnxt_re_ib_unreg(rdev);
-		bnxt_re_remove_one(rdev);
-		bnxt_re_dev_unreg(rdev);
+		bnxt_re_ib_uninit(rdev);
+		bnxt_re_remove_device(rdev);
 		break;
 
 	default:
@@ -1784,12 +1810,11 @@ static void __exit bnxt_re_mod_exit(void)
 		 */
 		flush_workqueue(bnxt_re_wq);
 		bnxt_re_dev_stop(rdev);
+		bnxt_re_ib_uninit(rdev);
 		/* Acquire the rtnl_lock as the L2 resources are freed here */
 		rtnl_lock();
-		bnxt_re_ib_unreg(rdev);
+		bnxt_re_remove_device(rdev);
 		rtnl_unlock();
-		bnxt_re_remove_one(rdev);
-		bnxt_re_dev_unreg(rdev);
 	}
 	unregister_netdevice_notifier(&bnxt_re_netdev_notifier);
 	if (bnxt_re_wq)

From 66832705c4d01e52df78570e72a9392a1271d2e9 Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Wed, 26 Feb 2020 07:45:32 -0800
Subject: [PATCH 086/327] RDMA/bnxt_re: Use driver_unregister and
 unregistration API

Using the new unregister APIs provided by the core.  Provide the
dealloc_driver hook for the core to callback at the time of device
un-registration.

bnxt_re VF resources are created by the corresponding PF driver.  During
ib_unregister_driver, PF might get removed before VF and this could cause
failure when VFs are removed. Driver is explicitly queuing the removal of
VF devices before calling ib_unregister_driver.

Link: https://lore.kernel.org/r/1582731932-26574-3-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 106 +++++++++++----------------
 1 file changed, 42 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 5f8fd74f5629..415693f8015a 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -79,7 +79,8 @@ static struct list_head bnxt_re_dev_list = LIST_HEAD_INIT(bnxt_re_dev_list);
 static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
 static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
-static void bnxt_re_ib_uninit(struct bnxt_re_dev *rdev);
+static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
+static void bnxt_re_stop_irq(void *handle);
 
 static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 {
@@ -237,10 +238,10 @@ static void bnxt_re_shutdown(void *p)
 
 	if (!rdev)
 		return;
-
-	bnxt_re_ib_uninit(rdev);
 	ASSERT_RTNL();
-	bnxt_re_remove_device(rdev);
+	/* Release the MSIx vectors before queuing unregister */
+	bnxt_re_stop_irq(rdev);
+	ib_unregister_device_queued(&rdev->ibdev);
 }
 
 static void bnxt_re_stop_irq(void *handle)
@@ -542,17 +543,12 @@ static bool is_bnxt_re_dev(struct net_device *netdev)
 
 static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev)
 {
-	struct bnxt_re_dev *rdev;
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(netdev, RDMA_DRIVER_BNXT_RE);
+	if (!ibdev)
+		return NULL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) {
-		if (rdev->netdev == netdev) {
-			rcu_read_unlock();
-			return rdev;
-		}
-	}
-	rcu_read_unlock();
-	return NULL;
+	return container_of(ibdev, struct bnxt_re_dev, ibdev);
 }
 
 static void bnxt_re_dev_unprobe(struct net_device *netdev,
@@ -626,11 +622,6 @@ static const struct attribute_group bnxt_re_dev_attr_group = {
 	.attrs = bnxt_re_attributes,
 };
 
-static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
-{
-	ib_unregister_device(&rdev->ibdev);
-}
-
 static const struct ib_device_ops bnxt_re_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_BNXT_RE,
@@ -645,6 +636,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
 	.create_cq = bnxt_re_create_cq,
 	.create_qp = bnxt_re_create_qp,
 	.create_srq = bnxt_re_create_srq,
+	.dealloc_driver = bnxt_re_dealloc_driver,
 	.dealloc_pd = bnxt_re_dealloc_pd,
 	.dealloc_ucontext = bnxt_re_dealloc_ucontext,
 	.del_gid = bnxt_re_del_gid,
@@ -741,15 +733,11 @@ static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
 {
 	dev_put(rdev->netdev);
 	rdev->netdev = NULL;
-
 	mutex_lock(&bnxt_re_dev_lock);
 	list_del_rcu(&rdev->list);
 	mutex_unlock(&bnxt_re_dev_lock);
 
 	synchronize_rcu();
-
-	ib_dealloc_device(&rdev->ibdev);
-	/* rdev is gone */
 }
 
 static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
@@ -1320,15 +1308,6 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
 		le16_to_cpu(resp.hwrm_intf_patch);
 }
 
-static void bnxt_re_ib_uninit(struct bnxt_re_dev *rdev)
-{
-	/* Cleanup ib dev */
-	if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
-		ib_unregister_device(&rdev->ibdev);
-		clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
-	}
-}
-
 int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
 {
 	int rc = 0;
@@ -1359,10 +1338,6 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
 	u8 type;
 	int rc;
 
-	if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
-		/* Cleanup ib dev */
-		bnxt_re_unregister_ib(rdev);
-	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
 		cancel_delayed_work_sync(&rdev->worker);
 
@@ -1632,6 +1607,19 @@ static int bnxt_re_add_device(struct bnxt_re_dev **rdev,
 	return rc;
 }
 
+static void bnxt_re_dealloc_driver(struct ib_device *ib_dev)
+{
+	struct bnxt_re_dev *rdev =
+		container_of(ib_dev, struct bnxt_re_dev, ibdev);
+
+	clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
+	dev_info(rdev_to_dev(rdev), "Unregistering Device");
+
+	rtnl_lock();
+	bnxt_re_remove_device(rdev);
+	rtnl_unlock();
+}
+
 /* Handle all deferred netevents tasks */
 static void bnxt_re_task(struct work_struct *work)
 {
@@ -1706,6 +1694,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 	struct bnxt_re_dev *rdev;
 	int rc = 0;
 	bool sch_work = false;
+	bool release = true;
 
 	real_dev = rdma_vlan_dev_real_dev(netdev);
 	if (!real_dev)
@@ -1713,7 +1702,8 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 
 	rdev = bnxt_re_from_netdev(real_dev);
 	if (!rdev && event != NETDEV_REGISTER)
-		goto exit;
+		return NOTIFY_OK;
+
 	if (real_dev != netdev)
 		goto exit;
 
@@ -1724,6 +1714,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		rc = bnxt_re_add_device(&rdev, real_dev);
 		if (!rc)
 			sch_work = true;
+		release = false;
 		break;
 
 	case NETDEV_UNREGISTER:
@@ -1732,8 +1723,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		 */
 		if (atomic_read(&rdev->sched_count) > 0)
 			goto exit;
-		bnxt_re_ib_uninit(rdev);
-		bnxt_re_remove_device(rdev);
+		ib_unregister_device_queued(&rdev->ibdev);
 		break;
 
 	default:
@@ -1755,6 +1745,8 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 	}
 
 exit:
+	if (rdev && release)
+		ib_device_put(&rdev->ibdev);
 	return NOTIFY_DONE;
 }
 
@@ -1790,35 +1782,21 @@ static int __init bnxt_re_mod_init(void)
 
 static void __exit bnxt_re_mod_exit(void)
 {
-	struct bnxt_re_dev *rdev, *next;
-	LIST_HEAD(to_be_deleted);
+	struct bnxt_re_dev *rdev;
 
-	mutex_lock(&bnxt_re_dev_lock);
-	/* Free all adapter allocated resources */
-	if (!list_empty(&bnxt_re_dev_list))
-		list_splice_init(&bnxt_re_dev_list, &to_be_deleted);
-	mutex_unlock(&bnxt_re_dev_lock);
-       /*
-	* Cleanup the devices in reverse order so that the VF device
-	* cleanup is done before PF cleanup
-	*/
-	list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) {
-		ibdev_info(&rdev->ibdev, "Unregistering Device");
-		/*
-		 * Flush out any scheduled tasks before destroying the
-		 * resources
-		 */
-		flush_workqueue(bnxt_re_wq);
-		bnxt_re_dev_stop(rdev);
-		bnxt_re_ib_uninit(rdev);
-		/* Acquire the rtnl_lock as the L2 resources are freed here */
-		rtnl_lock();
-		bnxt_re_remove_device(rdev);
-		rtnl_unlock();
-	}
 	unregister_netdevice_notifier(&bnxt_re_netdev_notifier);
 	if (bnxt_re_wq)
 		destroy_workqueue(bnxt_re_wq);
+	list_for_each_entry(rdev, &bnxt_re_dev_list, list) {
+		/* VF device removal should be called before the removal
+		 * of PF device. Queue VFs unregister first, so that VFs
+		 * shall be removed before the PF during the call of
+		 * ib_unregister_driver.
+		 */
+		if (rdev->is_virtfn)
+			ib_unregister_device(&rdev->ibdev);
+	}
+	ib_unregister_driver(RDMA_DRIVER_BNXT_RE);
 }
 
 module_init(bnxt_re_mod_init);

From 6be2067d1e31477354b08d1ac85ff9ec6dec898e Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 27 Feb 2020 06:42:09 +0000
Subject: [PATCH 087/327] RDMA/bnxt_re: Remove set but not used variable
 'pg_size'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/infiniband/hw/bnxt_re/qplib_res.c: In function '__alloc_pbl':
drivers/infiniband/hw/bnxt_re/qplib_res.c:109:13: warning:
 variable 'pg_size' set but not used [-Wunused-but-set-variable]

commit 0c4dcd602817 ("RDMA/bnxt_re: Refactor hardware queue memory
allocation") involved this, but not used, so remove it.

Link: https://lore.kernel.org/r/20200227064209.87893-1-yuehaibing@huawei.com
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_res.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index fc5909c7f2e0..cab1adf1fed9 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -106,13 +106,12 @@ static int __alloc_pbl(struct bnxt_qplib_res *res,
 	struct pci_dev *pdev = res->pdev;
 	struct scatterlist *sghead;
 	bool is_umem = false;
-	u32 pages, pg_size;
+	u32 pages;
 	int i;
 
 	if (sginfo->nopte)
 		return 0;
 	pages = sginfo->npages;
-	pg_size = sginfo->pgsize;
 	sghead = sginfo->sghead;
 	/* page ptr arrays */
 	pbl->pg_arr = vmalloc(pages * sizeof(void *));

From a0b404a98e274b5fc0cfb7c108d99127d482e5ff Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 27 Feb 2020 06:45:42 +0000
Subject: [PATCH 088/327] RDMA/bnxt_re: Remove set but not used variable
 'dev_attr'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/infiniband/hw/bnxt_re/ib_verbs.c: In function 'bnxt_re_create_gsi_qp':
drivers/infiniband/hw/bnxt_re/ib_verbs.c:1283:30: warning:
 variable 'dev_attr' set but not used [-Wunused-but-set-variable]

commit 8dae419f9ec7 ("RDMA/bnxt_re: Refactor queue pair creation code")
involved this, but not used, so remove it.

Link: https://lore.kernel.org/r/20200227064542.91205-1-yuehaibing@huawei.com
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index ad3e524187e3..7e74efd15d6d 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -1280,14 +1280,12 @@ static int bnxt_re_create_shadow_gsi(struct bnxt_re_qp *qp,
 static int bnxt_re_create_gsi_qp(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
 				 struct ib_qp_init_attr *init_attr)
 {
-	struct bnxt_qplib_dev_attr *dev_attr;
 	struct bnxt_re_dev *rdev;
 	struct bnxt_qplib_qp *qplqp;
 	int rc = 0;
 
 	rdev = qp->rdev;
 	qplqp = &qp->qplib_qp;
-	dev_attr = &rdev->dev_attr;
 
 	qplqp->rq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
 	qplqp->sq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2;

From 75d03665081e00881e76eaa3a7635c9202a82600 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 27 Feb 2020 06:49:00 +0000
Subject: [PATCH 089/327] RDMA/bnxt_re: Remove set but not used variables 'pg'
 and 'idx'

Fixes gcc '-Wunused-but-set-variable' warning:

drivers/infiniband/hw/bnxt_re/qplib_rcfw.c: In function '__send_message':
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:101:10: warning:
 variable 'idx' set but not used [-Wunused-but-set-variable]
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c:101:6: warning:
 variable 'pg' set but not used [-Wunused-but-set-variable]

commit cee0c7bba486 ("RDMA/bnxt_re: Refactor command queue management
code") involved this, but not used.

Link: https://lore.kernel.org/r/20200227064900.92255-1-yuehaibing@huawei.com
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index b0b050e5cd12..f01e864bb611 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -98,7 +98,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	unsigned long flags;
 	u32 size, opcode;
 	u16 cookie, cbit;
-	int pg, idx;
 	u8 *preq;
 
 	pdev = rcfw->pdev;
@@ -167,9 +166,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	hwq_ptr = (struct bnxt_qplib_cmdqe **)hwq->pbl_ptr;
 	preq = (u8 *)req;
 	do {
-		pg = 0;
-		idx = 0;
-
 		/* Locate the next cmdq slot */
 		sw_prod = HWQ_CMP(hwq->prod, hwq);
 		cmdqe = &hwq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]

From 5e29d1443c46b6ca70a4c940a67e8c09f05dcb7e Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Thu, 27 Feb 2020 13:38:34 +0200
Subject: [PATCH 090/327] RDMA/mlx5: Prevent UMR usage with RO only when we
 have RO caps

Relaxed ordering is not supported in UMR so we are disabling UMR usage
when user passes relaxed ordering access flag.

Enable using UMR when user requested relaxed ordering but there are no
relaxed ordering capabilities.

This will prevent user from unnecessarily registering a new mkey.

Fixes: d6de0bb1850f ("RDMA/mlx5: Set relaxed ordering when requested")
Link: https://lore.kernel.org/r/20200227113834.94233-1-leon@kernel.org
Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d9bffcc93587..f21d446249b8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1532,7 +1532,9 @@ static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
 	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
 		return false;
 
-	if (access_flags & IB_ACCESS_RELAXED_ORDERING)
+	if (access_flags & IB_ACCESS_RELAXED_ORDERING &&
+	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) ||
+	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)))
 		return false;
 
 	return true;

From bb8865f435d81223596f1abd6dec0b12ed122af0 Mon Sep 17 00:00:00 2001
From: Kamal Heib <kamalheib1@gmail.com>
Date: Thu, 30 Jan 2020 10:20:49 +0200
Subject: [PATCH 091/327] RDMA/providers: Fix return value when QP type isn't
 supported

The proper return code is "-EOPNOTSUPP" when the requested QP type is
not supported by the provider.

Link: https://lore.kernel.org/r/20200130082049.463-1-kamalheib1@gmail.com
Signed-off-by: Kamal Heib <kamalheib1@gmail.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c     | 2 +-
 drivers/infiniband/hw/cxgb4/qp.c             | 2 +-
 drivers/infiniband/hw/hns/hns_roce_qp.c      | 2 +-
 drivers/infiniband/hw/i40iw/i40iw_verbs.c    | 2 +-
 drivers/infiniband/hw/mlx4/qp.c              | 2 +-
 drivers/infiniband/hw/mlx5/qp.c              | 2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c | 2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  | 2 +-
 drivers/infiniband/hw/qedr/verbs.c           | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 2 +-
 drivers/infiniband/sw/rdmavt/qp.c            | 2 +-
 drivers/infiniband/sw/siw/siw_verbs.c        | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 7e74efd15d6d..47b0b50b71e7 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -1141,7 +1141,7 @@ static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
 	qptype = __from_ib_qp_type(init_attr->qp_type);
 	if (qptype == IB_QPT_MAX) {
 		ibdev_err(&rdev->ibdev, "QP type 0x%x not supported", qptype);
-		qptype = -EINVAL;
+		qptype = -EOPNOTSUPP;
 		goto out;
 	}
 
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index bbcac539777a..708216d82852 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -2127,7 +2127,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	pr_debug("ib_pd %p\n", pd);
 
 	if (attrs->qp_type != IB_QPT_RC)
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 
 	php = to_c4iw_pd(pd);
 	rhp = php->rhp;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 2a7535534ea8..7bec0ceb160b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1261,7 +1261,7 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 	default:{
 		ibdev_err(ibdev, "not support QP type %d\n",
 			  init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 	}
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index c335de91508f..fa1292932b88 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -617,7 +617,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
 	iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
 
 	if (init_attr->qp_type != IB_QPT_RC) {
-		err_code = -EINVAL;
+		err_code = -EOPNOTSUPP;
 		goto error;
 	}
 	if (iwdev->push_mode)
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 26425dd2d960..2f9f78912267 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1636,7 +1636,7 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
 	}
 	default:
 		/* Don't support raw QPs */
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	return &qp->ibqp;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index a4f8e7030787..a597c9043b1d 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2789,7 +2789,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
 		mlx5_ib_dbg(dev, "unsupported qp type %d\n",
 			    init_attr->qp_type);
 		/* Don't support raw QPs */
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	if (verbs_init_attr->qp_type == IB_QPT_DRIVER)
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index ac19d57803b5..69a3e4f62fb1 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -561,7 +561,7 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
 	}
 	default:
 		/* Don't support raw QPs */
-		return ERR_PTR(-ENOSYS);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	if (err) {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index d47ea675734b..10e343894595 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -1111,7 +1111,7 @@ static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
 	    (attrs->qp_type != IB_QPT_UD)) {
 		pr_err("%s(%d) unsupported qp type=0x%x requested\n",
 		       __func__, dev->id, attrs->qp_type);
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 	/* Skip the check for QP1 to support CM size of 128 */
 	if ((attrs->qp_type != IB_QPT_GSI) &&
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 484b555150e0..a5bd3adaf90a 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -1186,7 +1186,7 @@ static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev,
 		DP_DEBUG(dev, QEDR_MSG_QP,
 			 "create qp: unsupported qp type=0x%x requested\n",
 			 attrs->qp_type);
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (attrs->cap.max_send_wr > qattr->max_sqe) {
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 556b8e44a51c..71f82339446c 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -504,7 +504,7 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 	if (init_attr->qp_type != IB_QPT_UD) {
 		usnic_err("%s asked to make a non-UD QP: %d\n",
 			  dev_name(&us_ibdev->ib_dev.dev), init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	trans_spec = cmd.spec;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index 9de1281f9a3b..afcc2abcf55c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -217,7 +217,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 	    init_attr->qp_type != IB_QPT_GSI) {
 		dev_warn(&dev->pdev->dev, "queuepair type %d not supported\n",
 			 init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	if (is_srq && !dev->dsr->caps.max_srq) {
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 3cdf75d0c7a4..762d4dc11c41 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1196,7 +1196,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 
 	default:
 		/* Don't support raw QPs */
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	init_attr->cap.max_inline_data = 0;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index d5390d498c61..aeb842bc7a1e 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -323,7 +323,7 @@ struct ib_qp *siw_create_qp(struct ib_pd *pd,
 	}
 	if (attrs->qp_type != IB_QPT_RC) {
 		siw_dbg(base_dev, "only RC QP's supported\n");
-		rv = -EINVAL;
+		rv = -EOPNOTSUPP;
 		goto err_out;
 	}
 	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||

From 91b74bf5310b22ac3286d8e9b5354b77f41af178 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@dlink.ru>
Date: Mon, 17 Feb 2020 10:36:29 +0300
Subject: [PATCH 092/327] IB/mlx5: Optimize u64 division on 32-bit arches

Commit f164be8c0366 ("IB/mlx5: Extend caps stage to handle VAR
capabilities") introduced a straight "/" division of the u64 variable
"bar_size".

This was fixed with commit 685eff513183 ("IB/mlx5: Use div64_u64 for
num_var_hw_entries calculation"). However, div64_u64() is redundant here
as mlx5_var_table::stride_size is of type u32.  Make the actual code way
more optimized on 32-bit kernels using div_u64() and fix 80 chars
break-through by the way.

Fixes: 685eff513183 ("IB/mlx5: Use div64_u64 for num_var_hw_entries calculation")
Link: https://lore.kernel.org/r/20200217073629.8051-1-alobakin@dlink.ru
Signed-off-by: Alexander Lobakin <alobakin@dlink.ru>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e4bcfa81b70a..026391e4ceb4 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6545,7 +6545,8 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
 					doorbell_bar_offset);
 	bar_size = (1ULL << log_doorbell_bar_size) * 4096;
 	var_table->stride_size = 1ULL << log_doorbell_stride;
-	var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size);
+	var_table->num_var_hw_entries = div_u64(bar_size,
+						var_table->stride_size);
 	mutex_init(&var_table->bitmap_lock);
 	var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
 					  GFP_KERNEL);

From 9e3aaf6883b3b47d9f14579c1a03c911dd17cf10 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 27 Feb 2020 14:52:46 +0200
Subject: [PATCH 093/327] IB/mlx5: Add np_min_time_between_cnps and rp_max_rate
 debug params

Add two debugfs parameters described below.

np_min_time_between_cnps - Minimum time between sending CNPs from the
                           port.
                           Unit = microseconds.
                           Default = 0 (no min wait time; generated
                           based on incoming ECN marked packets).

rp_max_rate - Maximum rate at which reaction point node can transmit.
              Once this limit is reached, RP is no longer rate limited.
              Unit = Mbits/sec
              Default = 0 (full speed)

Link: https://lore.kernel.org/r/20200227125246.99472-1-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cong.c    | 20 ++++++++++++++++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
index 8ba439fabf7f..de4da92b81a6 100644
--- a/drivers/infiniband/hw/mlx5/cong.c
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -47,6 +47,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = {
 	"rp_byte_reset",
 	"rp_threshold",
 	"rp_ai_rate",
+	"rp_max_rate",
 	"rp_hai_rate",
 	"rp_min_dec_fac",
 	"rp_min_rate",
@@ -56,6 +57,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = {
 	"rp_rate_reduce_monitor_period",
 	"rp_initial_alpha_value",
 	"rp_gd",
+	"np_min_time_between_cnps",
 	"np_cnp_dscp",
 	"np_cnp_prio_mode",
 	"np_cnp_prio",
@@ -66,6 +68,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = {
 #define MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
 #define MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
 #define MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
+#define MLX5_IB_RP_MAX_RATE_ATTR			BIT(6)
 #define MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
 #define MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
 #define MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
@@ -77,6 +80,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = {
 #define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
 #define MLX5_IB_RP_GD_ATTR				BIT(16)
 
+#define MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR		BIT(2)
 #define MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
 #define MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
 
@@ -111,6 +115,9 @@ static u32 mlx5_get_cc_param_val(void *field, int offset)
 	case MLX5_IB_DBG_CC_RP_AI_RATE:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_ai_rate);
+	case MLX5_IB_DBG_CC_RP_MAX_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_max_rate);
 	case MLX5_IB_DBG_CC_RP_HAI_RATE:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_hai_rate);
@@ -138,6 +145,9 @@ static u32 mlx5_get_cc_param_val(void *field, int offset)
 	case MLX5_IB_DBG_CC_RP_GD:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_gd);
+	case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS:
+		return MLX5_GET(cong_control_r_roce_ecn_np, field,
+				min_time_between_cnps);
 	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
 				cnp_dscp);
@@ -186,6 +196,11 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset,
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
 			 rpg_ai_rate, var);
 		break;
+	case MLX5_IB_DBG_CC_RP_MAX_RATE:
+		*attr_mask |= MLX5_IB_RP_MAX_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_max_rate, var);
+		break;
 	case MLX5_IB_DBG_CC_RP_HAI_RATE:
 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
@@ -231,6 +246,11 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset,
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
 			 rpg_gd, var);
 		break;
+	case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS:
+		*attr_mask |= MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_np, field,
+			 min_time_between_cnps, var);
+		break;
 	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index f21d446249b8..3976071a5dc9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -792,6 +792,7 @@ enum mlx5_ib_dbg_cc_types {
 	MLX5_IB_DBG_CC_RP_BYTE_RESET,
 	MLX5_IB_DBG_CC_RP_THRESHOLD,
 	MLX5_IB_DBG_CC_RP_AI_RATE,
+	MLX5_IB_DBG_CC_RP_MAX_RATE,
 	MLX5_IB_DBG_CC_RP_HAI_RATE,
 	MLX5_IB_DBG_CC_RP_MIN_DEC_FAC,
 	MLX5_IB_DBG_CC_RP_MIN_RATE,
@@ -801,6 +802,7 @@ enum mlx5_ib_dbg_cc_types {
 	MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD,
 	MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE,
 	MLX5_IB_DBG_CC_RP_GD,
+	MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS,
 	MLX5_IB_DBG_CC_NP_CNP_DSCP,
 	MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE,
 	MLX5_IB_DBG_CC_NP_CNP_PRIO,

From 79db784e794b6e7b7fb9b1dd464a34e4c0c039af Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 27 Feb 2020 14:54:07 +0200
Subject: [PATCH 094/327] IB/mlx5: Fix missing congestion control debugfs on
 rep rdma device

Cited commit missed to include low level congestion control related
debugfs stage initialization.  This resulted in missing debugfs entries
for cc_params of a RDMA device.

Add them back.

Fixes: b5ca15ad7e61 ("IB/mlx5: Add proper representors support")
Link: https://lore.kernel.org/r/20200227125407.99803-1-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 026391e4ceb4..709ef3f57a06 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -7078,6 +7078,9 @@ const struct mlx5_ib_profile raw_eth_profile = {
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
 		     mlx5_ib_stage_counters_init,
 		     mlx5_ib_stage_counters_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
+		     mlx5_ib_stage_cong_debugfs_init,
+		     mlx5_ib_stage_cong_debugfs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_UAR,
 		     mlx5_ib_stage_uar_init,
 		     mlx5_ib_stage_uar_cleanup),

From 33fb27fd54465c74cbffba6315b2f043e90cec4c Mon Sep 17 00:00:00 2001
From: Bernard Metzler <bmt@zurich.ibm.com>
Date: Fri, 28 Feb 2020 18:35:34 +0100
Subject: [PATCH 095/327] RDMA/siw: Fix passive connection establishment

Holding the rtnl_lock while iterating a devices interface address list
potentially causes deadlocks with the cma_netdev_callback. While this was
implemented to limit the scope of a wildcard listen to addresses of the
current device only, a better solution limits the scope of the socket to
the device. This completely avoiding locking, and also results in
significant code simplification.

Fixes: c421651fa229 ("RDMA/siw: Add missing rtnl_lock around access to ifa")
Link: https://lore.kernel.org/r/20200228173534.26815-1-bmt@zurich.ibm.com
Reported-by: syzbot+55de90ab5f44172b0c90@syzkaller.appspotmail.com
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_cm.c | 137 +++++++----------------------
 1 file changed, 31 insertions(+), 106 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
index c5651a96b196..559e5fd3bad8 100644
--- a/drivers/infiniband/sw/siw/siw_cm.c
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -1769,14 +1769,23 @@ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
 	return 0;
 }
 
-static int siw_listen_address(struct iw_cm_id *id, int backlog,
-			      struct sockaddr *laddr, int addr_family)
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * Starts listen on the socket address id->local_addr.
+ *
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
 {
 	struct socket *s;
 	struct siw_cep *cep = NULL;
 	struct siw_device *sdev = to_siw_dev(id->device);
+	int addr_family = id->local_addr.ss_family;
 	int rv = 0, s_val;
 
+	if (addr_family != AF_INET && addr_family != AF_INET6)
+		return -EAFNOSUPPORT;
+
 	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
 	if (rv < 0)
 		return rv;
@@ -1791,9 +1800,25 @@ static int siw_listen_address(struct iw_cm_id *id, int backlog,
 		siw_dbg(id->device, "setsockopt error: %d\n", rv);
 		goto error;
 	}
-	rv = s->ops->bind(s, laddr, addr_family == AF_INET ?
-				    sizeof(struct sockaddr_in) :
-				    sizeof(struct sockaddr_in6));
+	if (addr_family == AF_INET) {
+		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
+
+		/* For wildcard addr, limit binding to current device only */
+		if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
+			s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
+
+		rv = s->ops->bind(s, (struct sockaddr *)laddr,
+				  sizeof(struct sockaddr_in));
+	} else {
+		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
+
+		/* For wildcard addr, limit binding to current device only */
+		if (ipv6_addr_any(&laddr->sin6_addr))
+			s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
+
+		rv = s->ops->bind(s, (struct sockaddr *)laddr,
+				  sizeof(struct sockaddr_in6));
+	}
 	if (rv) {
 		siw_dbg(id->device, "socket bind error: %d\n", rv);
 		goto error;
@@ -1852,7 +1877,7 @@ static int siw_listen_address(struct iw_cm_id *id, int backlog,
 	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
 	cep->state = SIW_EPSTATE_LISTENING;
 
-	siw_dbg(id->device, "Listen at laddr %pISp\n", laddr);
+	siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
 
 	return 0;
 
@@ -1910,106 +1935,6 @@ static void siw_drop_listeners(struct iw_cm_id *id)
 	}
 }
 
-/*
- * siw_create_listen - Create resources for a listener's IWCM ID @id
- *
- * Listens on the socket address id->local_addr.
- *
- * If the listener's @id provides a specific local IP address, at most one
- * listening socket is created and associated with @id.
- *
- * If the listener's @id provides the wildcard (zero) local IP address,
- * a separate listen is performed for each local IP address of the device
- * by creating a listening socket and binding to that local IP address.
- *
- */
-int siw_create_listen(struct iw_cm_id *id, int backlog)
-{
-	struct net_device *dev = to_siw_dev(id->device)->netdev;
-	int rv = 0, listeners = 0;
-
-	siw_dbg(id->device, "backlog %d\n", backlog);
-
-	/*
-	 * For each attached address of the interface, create a
-	 * listening socket, if id->local_addr is the wildcard
-	 * IP address or matches the IP address.
-	 */
-	if (id->local_addr.ss_family == AF_INET) {
-		struct in_device *in_dev = in_dev_get(dev);
-		struct sockaddr_in s_laddr;
-		const struct in_ifaddr *ifa;
-
-		if (!in_dev) {
-			rv = -ENODEV;
-			goto out;
-		}
-		memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr));
-
-		siw_dbg(id->device, "laddr %pISp\n", &s_laddr);
-
-		rtnl_lock();
-		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
-			if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
-			    s_laddr.sin_addr.s_addr == ifa->ifa_address) {
-				s_laddr.sin_addr.s_addr = ifa->ifa_address;
-
-				rv = siw_listen_address(id, backlog,
-						(struct sockaddr *)&s_laddr,
-						AF_INET);
-				if (!rv)
-					listeners++;
-			}
-		}
-		rtnl_unlock();
-		in_dev_put(in_dev);
-	} else if (id->local_addr.ss_family == AF_INET6) {
-		struct inet6_dev *in6_dev = in6_dev_get(dev);
-		struct inet6_ifaddr *ifp;
-		struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr);
-
-		if (!in6_dev) {
-			rv = -ENODEV;
-			goto out;
-		}
-		siw_dbg(id->device, "laddr %pISp\n", &s_laddr);
-
-		rtnl_lock();
-		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-			if (ifp->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED))
-				continue;
-			if (ipv6_addr_any(&s_laddr->sin6_addr) ||
-			    ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) {
-				struct sockaddr_in6 bind_addr  = {
-					.sin6_family = AF_INET6,
-					.sin6_port = s_laddr->sin6_port,
-					.sin6_flowinfo = 0,
-					.sin6_addr = ifp->addr,
-					.sin6_scope_id = dev->ifindex };
-
-				rv = siw_listen_address(id, backlog,
-						(struct sockaddr *)&bind_addr,
-						AF_INET6);
-				if (!rv)
-					listeners++;
-			}
-		}
-		rtnl_unlock();
-		in6_dev_put(in6_dev);
-	} else {
-		rv = -EAFNOSUPPORT;
-	}
-out:
-	if (listeners)
-		rv = 0;
-	else if (!rv)
-		rv = -EINVAL;
-
-	siw_dbg(id->device, "%s\n", rv ? "FAIL" : "OK");
-
-	return rv;
-}
-
 int siw_destroy_listen(struct iw_cm_id *id)
 {
 	if (!id->provider_data) {

From 0aeb3622ea6f14f36232a33f1d08c2ff02f4048b Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 4 Mar 2020 09:10:45 +0100
Subject: [PATCH 096/327] RDMA/hns: fix spelling mistake "attatch" -> "attach"

There is a spelling mistake in an error message. Fix it.

Link: https://lore.kernel.org/r/20200304081045.81164-1-colin.king@canonical.com
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 7bec0ceb160b..5a28d62008e4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -812,7 +812,7 @@ static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, hr_qp->regions,
 				  region_count);
 	if (ret)
-		ibdev_err(ibdev, "Failed to attatch WQE's mtr\n");
+		ibdev_err(ibdev, "Failed to attach WQE's mtr\n");
 
 	goto done;
 

From 30f2fe40c72bfbdde7bc066cb862bd05014be9f1 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Wed, 19 Feb 2020 21:05:18 +0200
Subject: [PATCH 097/327] IB/mlx5: Introduce UAPIs to manage packet pacing

Introduce packet pacing uobject and its alloc and destroy
methods.

This uobject holds mlx5 packet pacing context according to the device
specification and enables managing packet pacing device entries that are
needed by DEVX applications.

Link: https://lore.kernel.org/r/20200219190518.200912-3-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/Makefile       |   1 +
 drivers/infiniband/hw/mlx5/main.c         |   1 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h      |   6 +
 drivers/infiniband/hw/mlx5/qos.c          | 136 ++++++++++++++++++++++
 include/uapi/rdma/mlx5_user_ioctl_cmds.h  |  17 +++
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   4 +
 6 files changed, 165 insertions(+)
 create mode 100644 drivers/infiniband/hw/mlx5/qos.c

diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index d0a043ccbe58..2a334800f109 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -8,3 +8,4 @@ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
 mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
 mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o
+mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += qos.o
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e4bcfa81b70a..7fc5ce9bdf3d 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6251,6 +6251,7 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
 static const struct uapi_definition mlx5_ib_defs[] = {
 	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
 	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
+	UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
 
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
 				&mlx5_ib_flow_action),
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d9bffcc93587..09ce80febec7 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -203,6 +203,11 @@ struct mlx5_ib_flow_matcher {
 	u8			match_criteria_enable;
 };
 
+struct mlx5_ib_pp {
+	u16 index;
+	struct mlx5_core_dev *mdev;
+};
+
 struct mlx5_ib_flow_db {
 	struct mlx5_ib_flow_prio	prios[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_ib_flow_prio	egress_prios[MLX5_IB_NUM_FLOW_FT];
@@ -1381,6 +1386,7 @@ int mlx5_ib_fill_stat_entry(struct sk_buff *msg,
 
 extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
+extern const struct uapi_definition mlx5_ib_qos_defs[];
 
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
diff --git a/drivers/infiniband/hw/mlx5/qos.c b/drivers/infiniband/hw/mlx5/qos.c
new file mode 100644
index 000000000000..f822b06e7c9e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qos.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_ib.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+static bool pp_is_supported(struct ib_device *device)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return (MLX5_CAP_GEN(dev->mdev, qos) &&
+		MLX5_CAP_QOS(dev->mdev, packet_pacing) &&
+		MLX5_CAP_QOS(dev->mdev, packet_pacing_uid));
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {};
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE);
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_ucontext *c;
+	struct mlx5_ib_pp *pp_entry;
+	void *in_ctx;
+	u16 uid;
+	int inlen;
+	u32 flags;
+	int err;
+
+	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	/* The allocated entry can be used only by a DEVX context */
+	if (!c->devx_uid)
+		return -EINVAL;
+
+	dev = to_mdev(c->ibucontext.device);
+	pp_entry = kzalloc(sizeof(*pp_entry), GFP_KERNEL);
+	if (IS_ERR(pp_entry))
+		return PTR_ERR(pp_entry);
+
+	in_ctx = uverbs_attr_get_alloced_ptr(attrs,
+					     MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX);
+	inlen = uverbs_attr_get_len(attrs,
+				    MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX);
+	memcpy(rl_raw, in_ctx, inlen);
+	err = uverbs_get_flags32(&flags, attrs,
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS,
+		MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX);
+	if (err)
+		goto err;
+
+	uid = (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX) ?
+		c->devx_uid : MLX5_SHARED_RESOURCE_UID;
+
+	err = mlx5_rl_add_rate_raw(dev->mdev, rl_raw, uid,
+			(flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX),
+			&pp_entry->index);
+	if (err)
+		goto err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX,
+			     &pp_entry->index, sizeof(pp_entry->index));
+	if (err)
+		goto clean;
+
+	pp_entry->mdev = dev->mdev;
+	uobj->object = pp_entry;
+	return 0;
+
+clean:
+	mlx5_rl_remove_rate_raw(dev->mdev, pp_entry->index);
+err:
+	kfree(pp_entry);
+	return err;
+}
+
+static int pp_obj_cleanup(struct ib_uobject *uobject,
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_pp *pp_entry = uobject->object;
+
+	mlx5_rl_remove_rate_raw(pp_entry->mdev, pp_entry->index);
+	kfree(pp_entry);
+	return 0;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_PP_OBJ_ALLOC,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_PP,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX,
+		UVERBS_ATTR_SIZE(1,
+			MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS,
+			enum mlx5_ib_uapi_pp_alloc_flags,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX,
+			   UVERBS_ATTR_TYPE(u16),
+			   UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_PP_OBJ_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_PP,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_PP,
+			    UVERBS_TYPE_ALLOC_IDR(pp_obj_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_ALLOC),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_DESTROY));
+
+
+const struct uapi_definition mlx5_ib_qos_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_PP,
+		UAPI_DEF_IS_OBJ_SUPPORTED(pp_is_supported)),
+	{},
+};
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index afe7da6f2b8e..8f4a417fc70a 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -143,6 +143,22 @@ enum mlx5_ib_devx_umem_dereg_attrs {
 	MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
 };
 
+enum mlx5_ib_pp_obj_methods {
+	MLX5_IB_METHOD_PP_OBJ_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_METHOD_PP_OBJ_DESTROY,
+};
+
+enum mlx5_ib_pp_alloc_attrs {
+	MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX,
+	MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS,
+	MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX,
+};
+
+enum mlx5_ib_pp_obj_destroy_attrs {
+	MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
 enum mlx5_ib_devx_umem_methods {
 	MLX5_IB_METHOD_DEVX_UMEM_REG = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_METHOD_DEVX_UMEM_DEREG,
@@ -173,6 +189,7 @@ enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
 	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 	MLX5_IB_OBJECT_VAR,
+	MLX5_IB_OBJECT_PP,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 88b6ca70c2fe..b4641a7865f7 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -73,5 +73,9 @@ struct mlx5_ib_uapi_devx_async_event_hdr {
 	__u8		out_data[];
 };
 
+enum mlx5_ib_uapi_pp_alloc_flags {
+	MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX = 1 << 0,
+};
+
 #endif
 

From 679824148364773c8390abbf122bd7d9825e81eb Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Thu, 20 Feb 2020 12:08:19 +0200
Subject: [PATCH 098/327] RDMA/rw: map P2P memory correctly for signature
 operations

Since RDMA rw API support operations with P2P memory sg list, make sure to
map/unmap the scatter list for signature operation correctly.

Link: https://lore.kernel.org/r/20200220100819.41860-2-maxg@mellanox.com
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rw.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 06e5b6787443..557efbf29197 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -391,13 +391,13 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 		return -EINVAL;
 	}
 
-	ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+	ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
 	if (!ret)
 		return -ENOMEM;
 	sg_cnt = ret;
 
 	if (prot_sg_cnt) {
-		ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir);
+		ret = rdma_rw_map_sg(dev, prot_sg, prot_sg_cnt, dir);
 		if (!ret) {
 			ret = -ENOMEM;
 			goto out_unmap_sg;
@@ -466,9 +466,9 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	kfree(ctx->reg);
 out_unmap_prot_sg:
 	if (prot_sg_cnt)
-		ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+		rdma_rw_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
-	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+	rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
@@ -628,9 +628,9 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
 	kfree(ctx->reg);
 
-	ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 	if (prot_sg_cnt)
-		ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+		rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+	rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 

From 32ac9e4399b12d3e54d312a0e0e30ed5cd19bd4e Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 27 Feb 2020 16:36:51 -0400
Subject: [PATCH 099/327] RDMA/cma: Teach lockdep about the order of rtnl and
 lock

This lock ordering only happens when bonding is enabled and a certain
bonding related event fires. However, since it can happen this is a global
restriction on lock ordering.

Teach lockdep about the order directly and unconditionally so bugs here
are found quickly.

See https://syzkaller.appspot.com/bug?extid=55de90ab5f44172b0c90

Link: https://lore.kernel.org/r/20200227203651.GA27185@ziepe.ca
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 468814e8c504..4df75ab4ee9d 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -4796,6 +4796,19 @@ static int __init cma_init(void)
 {
 	int ret;
 
+	/*
+	 * There is a rare lock ordering dependency in cma_netdev_callback()
+	 * that only happens when bonding is enabled. Teach lockdep that rtnl
+	 * must never be nested under lock so it can find these without having
+	 * to test with bonding.
+	 */
+	if (IS_ENABLED(CONFIG_LOCKDEP)) {
+		rtnl_lock();
+		mutex_lock(&lock);
+		mutex_unlock(&lock);
+		rtnl_unlock();
+	}
+
 	cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
 	if (!cma_wq)
 		return -ENOMEM;

From 24a5b0ce714210c69a8870d4a11ab6a8cff650c6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 8 Mar 2020 07:54:42 +0100
Subject: [PATCH 100/327] RDMA/bnxt_re: Remove a redundant 'memset'

'wqe' is already zeroed at the top of the 'while' loop, just a few lines
below, and is not used outside of the loop.

So there is no need to zero it again, or for the variable to be declared
outside the loop.

Link: https://lore.kernel.org/r/20200308065442.5415-1-christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 47b0b50b71e7..95f6d493d1b9 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -2470,15 +2470,12 @@ static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev,
 				       struct bnxt_re_qp *qp,
 				       const struct ib_send_wr *wr)
 {
-	struct bnxt_qplib_swqe wqe;
 	int rc = 0, payload_sz = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&qp->sq_lock, flags);
-	memset(&wqe, 0, sizeof(wqe));
 	while (wr) {
-		/* House keeping */
-		memset(&wqe, 0, sizeof(wqe));
+		struct bnxt_qplib_swqe wqe = {};
 
 		/* Common */
 		wqe.num_sge = wr->num_sge;

From 2d870c5bd0acd42eced3860c3911ed6005654a2d Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjunz@mellanox.com>
Date: Tue, 10 Mar 2020 11:16:56 +0200
Subject: [PATCH 101/327] RDMA/core: Remove the duplicate header file

The header file rdma_core.h is duplicate, so let's remove it.

Fixes: 622db5b6439a ("RDMA/core: Add trace points to follow MR allocation")
Link: https://lore.kernel.org/r/20200310091656.249696-1-leon@kernel.org
Signed-off-by: Zhu Yanjun <yanjunz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/verbs.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 610ce9a6c6b9..56a71337112c 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -54,8 +54,6 @@
 #include "core_priv.h"
 #include <trace/events/rdma_core.h>
 
-#include <trace/events/rdma_core.h>
-
 static int ib_resolve_eth_dmac(struct ib_device *device,
 			       struct rdma_ah_attr *ah_attr);
 

From 0897f301bc285409ab6453839fa6a43d9ceb26e0 Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@mellanox.com>
Date: Tue, 10 Mar 2020 09:57:06 +0200
Subject: [PATCH 102/327] RDMA/mlx5: Remove duplicate definitions of SW_ICM
 macros

Those macros are already defined in include/linux/mlx5/driver.h, so delete
their duplicate variants.

Link: https://lore.kernel.org/r/20200310075706.238592-1-leon@kernel.org
Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Reviewed-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 798366bf6dd5..2e42258e6fce 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -128,10 +128,6 @@ enum mlx5_ib_mmap_type {
 	MLX5_IB_MMAP_TYPE_VAR = 2,
 };
 
-#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)                                        \
-	(MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
-#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
-
 struct mlx5_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct list_head	db_page_list;

From 41e684ef3f37ce6e5eac3fb5b9c7c1853f4b0447 Mon Sep 17 00:00:00 2001
From: Alex Vesker <valex@mellanox.com>
Date: Thu, 5 Mar 2020 14:38:41 +0200
Subject: [PATCH 103/327] IB/mlx5: Replace tunnel mpls capability bits for
 tunnel_offloads

Until now the flex parser capability was used in ib_query_device() to
indicate tunnel_offloads_caps support for mpls_over_gre/mpls_over_udp.

Newer devices and firmware will have configurations with the flexparser
but without mpls support.

Testing for the flex parser capability was a mistake, the tunnel_stateless
capability was intended for detecting mpls and was introduced at the same
time as the flex parser capability.

Otherwise userspace will be incorrectly informed that a future device
supports MPLS when it does not.

Link: https://lore.kernel.org/r/20200305123841.196086-1-leon@kernel.org
Cc: <stable@vger.kernel.org> # 4.17
Fixes: e818e255a58d ("IB/mlx5: Expose MPLS related tunneling offloads")
Signed-off-by: Alex Vesker <valex@mellanox.com>
Reviewed-by: Ariel Levkovich <lariel@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 6 ++----
 include/linux/mlx5/mlx5_ifc.h     | 6 +++++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index eba7604eaa76..9c3993c7e9a1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1192,12 +1192,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_GRE;
-		if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-		    MLX5_FLEX_PROTO_CW_MPLS_GRE)
+		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
-		if (MLX5_CAP_GEN(mdev, flex_parser_protocols) &
-		    MLX5_FLEX_PROTO_CW_MPLS_UDP)
+		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
 			resp.tunnel_offloads_caps |=
 				MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
 	}
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f9bcbe653fda..f3a7189f9d6d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -877,7 +877,11 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         swp_csum[0x1];
 	u8         swp_lso[0x1];
 	u8         cqe_checksum_full[0x1];
-	u8         reserved_at_24[0x5];
+	u8         tunnel_stateless_geneve_tx[0x1];
+	u8         tunnel_stateless_mpls_over_udp[0x1];
+	u8         tunnel_stateless_mpls_over_gre[0x1];
+	u8         tunnel_stateless_vxlan_gpe[0x1];
+	u8         tunnel_stateless_ipv4_over_vxlan[0x1];
 	u8         tunnel_stateless_ip_over_ip[0x1];
 	u8         reserved_at_2a[0x6];
 	u8         max_vxlan_udp_ports[0x8];

From 7e934cf5ace1dceeb804f7493fa28bb697ed3c52 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 12 Mar 2020 17:29:11 -0400
Subject: [PATCH 104/327] xarray: Fix early termination of xas_for_each_marked

xas_for_each_marked() is using entry == NULL as a termination condition
of the iteration. When xas_for_each_marked() is used protected only by
RCU, this can however race with xas_store(xas, NULL) in the following
way:

TASK1                                   TASK2
page_cache_delete()         	        find_get_pages_range_tag()
                                          xas_for_each_marked()
                                            xas_find_marked()
                                              off = xas_find_chunk()

  xas_store(&xas, NULL)
    xas_init_marks(&xas);
    ...
    rcu_assign_pointer(*slot, NULL);
                                              entry = xa_entry(off);

And thus xas_for_each_marked() terminates prematurely possibly leading
to missed entries in the iteration (translating to missing writeback of
some pages or a similar problem).

If we find a NULL entry that has been marked, skip it (unless we're trying
to allocate an entry).

Reported-by: Jan Kara <jack@suse.cz>
CC: stable@vger.kernel.org
Fixes: ef8e5717db01 ("page cache: Convert delete_batch to XArray")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/xarray.h                       |  6 +-
 lib/xarray.c                                 |  2 +
 tools/testing/radix-tree/Makefile            |  4 +-
 tools/testing/radix-tree/iteration_check_2.c | 87 ++++++++++++++++++++
 tools/testing/radix-tree/main.c              |  1 +
 tools/testing/radix-tree/test.h              |  1 +
 6 files changed, 98 insertions(+), 3 deletions(-)
 create mode 100644 tools/testing/radix-tree/iteration_check_2.c

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index a491653d8882..14c893433139 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1648,6 +1648,7 @@ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
 								xa_mark_t mark)
 {
 	struct xa_node *node = xas->xa_node;
+	void *entry;
 	unsigned int offset;
 
 	if (unlikely(xas_not_node(node) || node->shift))
@@ -1659,7 +1660,10 @@ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
 		return NULL;
 	if (offset == XA_CHUNK_SIZE)
 		return xas_find_marked(xas, max, mark);
-	return xa_entry(xas->xa, node, offset);
+	entry = xa_entry(xas->xa, node, offset);
+	if (!entry)
+		return xas_find_marked(xas, max, mark);
+	return entry;
 }
 
 /*
diff --git a/lib/xarray.c b/lib/xarray.c
index f448bcd263ac..e9e641d3c0c3 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1208,6 +1208,8 @@ void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
 		}
 
 		entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
+		if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
+			continue;
 		if (!xa_is_node(entry))
 			return entry;
 		xas->xa_node = xa_to_node(entry);
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 397d6b612502..aa6abfe0749c 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -7,8 +7,8 @@ LDLIBS+= -lpthread -lurcu
 TARGETS = main idr-test multiorder xarray
 CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o
 OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
-	 regression4.o \
-	 tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o
+	 regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \
+	 iteration_check_2.o benchmark.o
 
 ifndef SHIFT
 	SHIFT=3
diff --git a/tools/testing/radix-tree/iteration_check_2.c b/tools/testing/radix-tree/iteration_check_2.c
new file mode 100644
index 000000000000..aac5c50a3674
--- /dev/null
+++ b/tools/testing/radix-tree/iteration_check_2.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * iteration_check_2.c: Check that deleting a tagged entry doesn't cause
+ * an RCU walker to finish early.
+ * Copyright (c) 2020 Oracle
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+#include <pthread.h>
+#include "test.h"
+
+static volatile bool test_complete;
+
+static void *iterator(void *arg)
+{
+	XA_STATE(xas, arg, 0);
+	void *entry;
+
+	rcu_register_thread();
+
+	while (!test_complete) {
+		xas_set(&xas, 0);
+		rcu_read_lock();
+		xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
+			;
+		rcu_read_unlock();
+		assert(xas.xa_index >= 100);
+	}
+
+	rcu_unregister_thread();
+	return NULL;
+}
+
+static void *throbber(void *arg)
+{
+	struct xarray *xa = arg;
+
+	rcu_register_thread();
+
+	while (!test_complete) {
+		int i;
+
+		for (i = 0; i < 100; i++) {
+			xa_store(xa, i, xa_mk_value(i), GFP_KERNEL);
+			xa_set_mark(xa, i, XA_MARK_0);
+		}
+		for (i = 0; i < 100; i++)
+			xa_erase(xa, i);
+	}
+
+	rcu_unregister_thread();
+	return NULL;
+}
+
+void iteration_test2(unsigned test_duration)
+{
+	pthread_t threads[2];
+	DEFINE_XARRAY(array);
+	int i;
+
+	printv(1, "Running iteration test 2 for %d seconds\n", test_duration);
+
+	test_complete = false;
+
+	xa_store(&array, 100, xa_mk_value(100), GFP_KERNEL);
+	xa_set_mark(&array, 100, XA_MARK_0);
+
+	if (pthread_create(&threads[0], NULL, iterator, &array)) {
+		perror("create iterator thread");
+		exit(1);
+	}
+	if (pthread_create(&threads[1], NULL, throbber, &array)) {
+		perror("create throbber thread");
+		exit(1);
+	}
+
+	sleep(test_duration);
+	test_complete = true;
+
+	for (i = 0; i < 2; i++) {
+		if (pthread_join(threads[i], NULL)) {
+			perror("pthread_join");
+			exit(1);
+		}
+	}
+
+	xa_destroy(&array);
+}
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index 7a22d6e3732e..f2cbc8e5b97c 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -311,6 +311,7 @@ int main(int argc, char **argv)
 	regression4_test();
 	iteration_test(0, 10 + 90 * long_run);
 	iteration_test(7, 10 + 90 * long_run);
+	iteration_test2(10 + 90 * long_run);
 	single_thread_tests(long_run);
 
 	/* Free any remaining preallocated nodes */
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 1ee4b2c0ad10..34dab4d18744 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -34,6 +34,7 @@ void xarray_tests(void);
 void tag_check(void);
 void multiorder_checks(void);
 void iteration_test(unsigned order, unsigned duration);
+void iteration_test2(unsigned duration);
 void benchmark(void);
 void idr_checks(void);
 void ida_tests(void);

From da5ebf5aa676c4106d1e7032d22679831cb6ba72 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:25:14 -0500
Subject: [PATCH 105/327] lookup_open(): saner calling conventions (return
 dentry on success)

same story as for atomic_open() in the previous commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 46 +++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 27 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 5f8b791a6d6e..b350e1e2b46f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3157,10 +3157,9 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
  *
  * An error code is returned on failure.
  */
-static int lookup_open(struct nameidata *nd, struct path *path,
-			struct file *file,
-			const struct open_flags *op,
-			bool got_write)
+static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
+				  const struct open_flags *op,
+				  bool got_write)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct inode *dir_inode = dir->d_inode;
@@ -3171,7 +3170,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	if (unlikely(IS_DEADDIR(dir_inode)))
-		return -ENOENT;
+		return ERR_PTR(-ENOENT);
 
 	file->f_mode &= ~FMODE_CREATED;
 	dentry = d_lookup(dir, &nd->last);
@@ -3179,7 +3178,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 		if (!dentry) {
 			dentry = d_alloc_parallel(dir, &nd->last, &wq);
 			if (IS_ERR(dentry))
-				return PTR_ERR(dentry);
+				return dentry;
 		}
 		if (d_in_lookup(dentry))
 			break;
@@ -3195,7 +3194,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 	}
 	if (dentry->d_inode) {
 		/* Cached positive dentry: will open in f_op->open */
-		goto out_no_open;
+		return dentry;
 	}
 
 	/*
@@ -3235,19 +3234,9 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 
 	if (dir_inode->i_op->atomic_open) {
 		dentry = atomic_open(nd, dentry, file, op, open_flag, mode);
-		if (IS_ERR(dentry)) {
-			error = PTR_ERR(dentry);
-			if (unlikely(error == -ENOENT) && create_error)
-				error = create_error;
-			return error;
-		}
-		if (file->f_mode & FMODE_OPENED) {
-			dput(dentry);
-			return 0;
-		}
-		path->mnt = nd->path.mnt;
-		path->dentry = dentry;
-		return 0;
+		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
+			dentry = ERR_PTR(create_error);
+		return dentry;
 	}
 
 no_open:
@@ -3283,14 +3272,11 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 		error = create_error;
 		goto out_dput;
 	}
-out_no_open:
-	path->dentry = dentry;
-	path->mnt = nd->path.mnt;
-	return 0;
+	return dentry;
 
 out_dput:
 	dput(dentry);
-	return error;
+	return ERR_PTR(error);
 }
 
 /*
@@ -3309,6 +3295,7 @@ static int do_last(struct nameidata *nd,
 	unsigned seq;
 	struct inode *inode;
 	struct path path;
+	struct dentry *dentry;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -3365,14 +3352,18 @@ static int do_last(struct nameidata *nd,
 		inode_lock(dir->d_inode);
 	else
 		inode_lock_shared(dir->d_inode);
-	error = lookup_open(nd, &path, file, op, got_write);
+	dentry = lookup_open(nd, file, op, got_write);
 	if (open_flag & O_CREAT)
 		inode_unlock(dir->d_inode);
 	else
 		inode_unlock_shared(dir->d_inode);
 
-	if (error)
+	if (IS_ERR(dentry)) {
+		error = PTR_ERR(dentry);
 		goto out;
+	}
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
 
 	if (file->f_mode & FMODE_OPENED) {
 		if ((file->f_mode & FMODE_CREATED) ||
@@ -3380,6 +3371,7 @@ static int do_last(struct nameidata *nd,
 			will_truncate = false;
 
 		audit_inode(nd->name, file->f_path.dentry, 0);
+		dput(dentry);
 		goto opened;
 	}
 

From e73cabff591706d7eb643d4c6981a236cb9d76ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:30:08 -0500
Subject: [PATCH 106/327] do_last(): collapse the call of path_to_nameidata()

... and shift filling struct path to just before the call of
handle_mounts().  All callers of handle_mounts() are
immediately preceded by path->mnt = nd->path.mnt now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index b350e1e2b46f..79029a1048be 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3362,8 +3362,6 @@ static int do_last(struct nameidata *nd,
 		error = PTR_ERR(dentry);
 		goto out;
 	}
-	path.mnt = nd->path.mnt;
-	path.dentry = dentry;
 
 	if (file->f_mode & FMODE_OPENED) {
 		if ((file->f_mode & FMODE_CREATED) ||
@@ -3380,7 +3378,8 @@ static int do_last(struct nameidata *nd,
 		open_flag &= ~O_TRUNC;
 		will_truncate = false;
 		acc_mode = 0;
-		path_to_nameidata(&path, nd);
+		dput(nd->path.dentry);
+		nd->path.dentry = dentry;
 		goto finish_open_created;
 	}
 
@@ -3394,6 +3393,8 @@ static int do_last(struct nameidata *nd,
 		got_write = false;
 	}
 
+	path.mnt = nd->path.mnt;
+	path.dentry = dentry;
 	error = handle_mounts(&path, nd, &inode, &seq);
 	if (unlikely(error < 0))
 		return error;

From db3c9ade50b1f210c750886c23ba40dbf44b2bc8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:41:00 -0500
Subject: [PATCH 107/327] handle_mounts(): pass dentry in, turn path into a
 pure out argument

All callers are equivalent to
	path->dentry = dentry;
	path->mnt = nd->path.mnt;
	err = handle_mounts(path, ...)
Pass dentry as an explicit argument, fill *path in handle_mounts()
itself.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 79029a1048be..a613fa52a174 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1385,11 +1385,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
-static inline int handle_mounts(struct path *path, struct nameidata *nd,
-			  struct inode **inode, unsigned int *seqp)
+static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
+			  struct path *path, struct inode **inode,
+			  unsigned int *seqp)
 {
-	int ret = follow_managed(path, nd);
+	int ret;
 
+	path->mnt = nd->path.mnt;
+	path->dentry = dentry;
+	ret = follow_managed(path, nd);
 	if (likely(ret >= 0)) {
 		*inode = d_backing_inode(path->dentry);
 		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
@@ -1685,10 +1689,7 @@ static int lookup_fast(struct nameidata *nd,
 		dput(dentry);
 		return status;
 	}
-
-	path->mnt = mnt;
-	path->dentry = dentry;
-	return handle_mounts(path, nd, inode, seqp);
+	return handle_mounts(nd, dentry, path, inode, seqp);
 }
 
 /* Fast lookup failed, do it the slow way */
@@ -1859,6 +1860,7 @@ static inline int step_into(struct nameidata *nd, struct path *path,
 static int walk_component(struct nameidata *nd, int flags)
 {
 	struct path path;
+	struct dentry *dentry;
 	struct inode *inode;
 	unsigned seq;
 	int err;
@@ -1877,13 +1879,11 @@ static int walk_component(struct nameidata *nd, int flags)
 	if (unlikely(err <= 0)) {
 		if (err < 0)
 			return err;
-		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
-					  nd->flags);
-		if (IS_ERR(path.dentry))
-			return PTR_ERR(path.dentry);
+		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
 
-		path.mnt = nd->path.mnt;
-		err = handle_mounts(&path, nd, &inode, &seq);
+		err = handle_mounts(nd, dentry, &path, &inode, &seq);
 		if (unlikely(err < 0))
 			return err;
 	}
@@ -2355,7 +2355,7 @@ static inline int lookup_last(struct nameidata *nd)
 
 static int handle_lookup_down(struct nameidata *nd)
 {
-	struct path path = nd->path;
+	struct path path;
 	struct inode *inode = nd->inode;
 	unsigned seq = nd->seq;
 	int err;
@@ -2366,11 +2366,12 @@ static int handle_lookup_down(struct nameidata *nd)
 		 * at the very beginning of walk, so we lose nothing
 		 * if we simply redo everything in non-RCU mode
 		 */
+		path = nd->path;
 		if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
 			return -ECHILD;
 	} else {
-		dget(path.dentry);
-		err = handle_mounts(&path, nd, &inode, &seq);
+		dget(nd->path.dentry);
+		err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq);
 		if (unlikely(err < 0))
 			return err;
 	}
@@ -3393,9 +3394,7 @@ static int do_last(struct nameidata *nd,
 		got_write = false;
 	}
 
-	path.mnt = nd->path.mnt;
-	path.dentry = dentry;
-	error = handle_mounts(&path, nd, &inode, &seq);
+	error = handle_mounts(nd, dentry, &path, &inode, &seq);
 	if (unlikely(error < 0))
 		return error;
 finish_lookup:

From 282e79c1c61a119a8b41bf0c1d82a0ac7cc76488 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 10 Mar 2020 11:14:29 +0200
Subject: [PATCH 108/327] RDMA/mlx4: Delete duplicated offsetofend
 implementation

Convert mlx4 to use in-kernel offsetofend() instead
of its duplicated implementation.

Link: https://lore.kernel.org/r/20200310091438.248429-3-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx4/main.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 2f5d9b181848..a66518a5c938 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -434,9 +434,6 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
 	return real_index;
 }
 
-#define field_avail(type, fld, sz) (offsetof(type, fld) + \
-				    sizeof(((type *)0)->fld) <= (sz))
-
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props,
 				struct ib_udata *uhw)
@@ -447,7 +444,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 	int err;
 	int have_ib_ports;
 	struct mlx4_uverbs_ex_query_device cmd;
-	struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0};
+	struct mlx4_uverbs_ex_query_device_resp resp = {};
 	struct mlx4_clock_params clock_params;
 
 	if (uhw->inlen) {
@@ -602,7 +599,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 			sizeof(struct mlx4_wqe_data_seg);
 	}
 
-	if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
+	if (offsetofend(typeof(resp), rss_caps) <= uhw->outlen) {
 		if (props->rss_caps.supported_qpts) {
 			resp.rss_caps.rx_hash_function =
 				MLX4_IB_RX_HASH_FUNC_TOEPLITZ;
@@ -626,7 +623,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 				       sizeof(resp.rss_caps);
 	}
 
-	if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+	if (offsetofend(typeof(resp), tso_caps) <= uhw->outlen) {
 		if (dev->dev->caps.max_gso_sz &&
 		    ((mlx4_ib_port_link_layer(ibdev, 1) ==
 		    IB_LINK_LAYER_ETHERNET) ||

From a762d460a06abc8d462ac513ba57dc3c31dd8c73 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 10 Mar 2020 11:14:31 +0200
Subject: [PATCH 109/327] RDMA/mlx5: Use offsetofend() instead of duplicated
 variant

Convert mlx5 driver to use offsetofend() instead of its duplicated
variant.

Link: https://lore.kernel.org/r/20200310091438.248429-5-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c    | 42 ++++++++++++++--------------
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 16 ++++-------
 2 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 9c3993c7e9a1..de5275a34da3 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -898,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			props->raw_packet_caps |=
 				IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
 
-		if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
+		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
 			max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
 			if (max_tso) {
 				resp.tso_caps.max_tso = 1 << max_tso;
@@ -908,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			}
 		}
 
-		if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
+		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
 			resp.rss_caps.rx_hash_function =
 						MLX5_RX_HASH_FUNC_TOEPLITZ;
 			resp.rss_caps.rx_hash_fields_mask =
@@ -928,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			resp.response_length += sizeof(resp.rss_caps);
 		}
 	} else {
-		if (field_avail(typeof(resp), tso_caps, uhw_outlen))
+		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
 			resp.response_length += sizeof(resp.tso_caps);
-		if (field_avail(typeof(resp), rss_caps, uhw_outlen))
+		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
 			resp.response_length += sizeof(resp.rss_caps);
 	}
 
@@ -1072,7 +1072,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 						MLX5_MAX_CQ_PERIOD;
 	}
 
-	if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.cqe_comp_caps);
 
 		if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
@@ -1090,7 +1090,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		}
 	}
 
-	if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
+	if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
 	    raw_support) {
 		if (MLX5_CAP_QOS(mdev, packet_pacing) &&
 		    MLX5_CAP_GEN(mdev, qos)) {
@@ -1108,8 +1108,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		resp.response_length += sizeof(resp.packet_pacing_caps);
 	}
 
-	if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
-			uhw_outlen)) {
+	if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
+	    uhw_outlen) {
 		if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
 			resp.mlx5_ib_support_multi_pkt_send_wqes =
 				MLX5_IB_ALLOW_MPW;
@@ -1122,7 +1122,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
 	}
 
-	if (field_avail(typeof(resp), flags, uhw_outlen)) {
+	if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.flags);
 
 		if (MLX5_CAP_GEN(mdev, cqe_compression_128))
@@ -1138,7 +1138,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
 	}
 
-	if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.sw_parsing_caps);
 		if (MLX5_CAP_ETH(mdev, swp)) {
 			resp.sw_parsing_caps.sw_parsing_offloads |=
@@ -1158,7 +1158,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		}
 	}
 
-	if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
+	if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
 	    raw_support) {
 		resp.response_length += sizeof(resp.striding_rq_caps);
 		if (MLX5_CAP_GEN(mdev, striding_rq)) {
@@ -1181,7 +1181,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 		}
 	}
 
-	if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.tunnel_offloads_caps);
 		if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
 			resp.tunnel_offloads_caps |=
@@ -1899,16 +1899,16 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	resp.tot_bfregs = req.total_num_bfregs;
 	resp.num_ports = dev->num_ports;
 
-	if (field_avail(typeof(resp), cqe_version, udata->outlen))
+	if (offsetofend(typeof(resp), cqe_version) <= udata->outlen)
 		resp.response_length += sizeof(resp.cqe_version);
 
-	if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
+	if (offsetofend(typeof(resp), cmds_supp_uhw) <= udata->outlen) {
 		resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
 				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
 		resp.response_length += sizeof(resp.cmds_supp_uhw);
 	}
 
-	if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
+	if (offsetofend(typeof(resp), eth_min_inline) <= udata->outlen) {
 		if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
 			mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
 			resp.eth_min_inline++;
@@ -1916,7 +1916,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 		resp.response_length += sizeof(resp.eth_min_inline);
 	}
 
-	if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
+	if (offsetofend(typeof(resp), clock_info_versions) <= udata->outlen) {
 		if (mdev->clock_info)
 			resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
 		resp.response_length += sizeof(resp.clock_info_versions);
@@ -1928,7 +1928,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	 * pretend we don't support reading the HCA's core clock. This is also
 	 * forced by mmap function.
 	 */
-	if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
+	if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) {
 		if (PAGE_SIZE <= 4096) {
 			resp.comp_mask |=
 				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
@@ -1938,18 +1938,18 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 		resp.response_length += sizeof(resp.hca_core_clock_offset);
 	}
 
-	if (field_avail(typeof(resp), log_uar_size, udata->outlen))
+	if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen)
 		resp.response_length += sizeof(resp.log_uar_size);
 
-	if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
+	if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen)
 		resp.response_length += sizeof(resp.num_uars_per_page);
 
-	if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
+	if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) {
 		resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
 		resp.response_length += sizeof(resp.num_dyn_bfregs);
 	}
 
-	if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
+	if (offsetofend(typeof(resp), dump_fill_mkey) <= udata->outlen) {
 		if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
 			resp.dump_fill_mkey = dump_fill_mkey;
 			resp.comp_mask |=
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 2e42258e6fce..4b7d0dfabea2 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -64,8 +64,6 @@
 	dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,     \
 		 __LINE__, current->pid, ##arg)
 
-#define field_avail(type, fld, sz) (offsetof(type, fld) +		\
-				    sizeof(((type *)0)->fld) <= (sz))
 #define MLX5_IB_DEFAULT_UIDX 0xffffff
 #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
 
@@ -1475,12 +1473,11 @@ static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext,
 {
 	u8 cqe_version = ucontext->cqe_version;
 
-	if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) &&
-	    !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+	    (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
 		return 0;
 
-	if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) !=
-	       !!cqe_version))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
 		return -EINVAL;
 
 	return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
@@ -1493,12 +1490,11 @@ static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext,
 {
 	u8 cqe_version = ucontext->cqe_version;
 
-	if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) &&
-	    !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+	    (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
 		return 0;
 
-	if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) !=
-	       !!cqe_version))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
 		return -EINVAL;
 
 	return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);

From a4f994a05926c96845ad40fb661d566a4eacf2b9 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 10 Mar 2020 11:14:32 +0200
Subject: [PATCH 110/327] RDMA/cm: Delete not implemented CM peer to peer
 communication

Peer to peer support was never implemented, so delete it to make code less
clutter.

Link: https://lore.kernel.org/r/20200310091438.248429-6-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 7 -------
 include/rdma/ib_cm.h         | 1 -
 2 files changed, 8 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 5c58619b2399..21ede1c08eea 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -261,7 +261,6 @@ struct cm_id_private {
 	__be16 pkey;
 	u8 private_data_len;
 	u8 max_cm_retries;
-	u8 peer_to_peer;
 	u8 responder_resources;
 	u8 initiator_depth;
 	u8 retry_count;
@@ -1381,10 +1380,6 @@ static void cm_format_req(struct cm_req_msg *req_msg,
 
 static int cm_validate_req_param(struct ib_cm_req_param *param)
 {
-	/* peer-to-peer not supported */
-	if (param->peer_to_peer)
-		return -EINVAL;
-
 	if (!param->primary_path)
 		return -EINVAL;
 
@@ -2437,8 +2432,6 @@ static int cm_rep_handler(struct cm_work *work)
 			cm_ack_timeout(cm_id_priv->target_ack_delay,
 				       cm_id_priv->alt_av.timeout - 1);
 
-	/* todo: handle peer_to_peer */
-
 	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
 	ret = atomic_inc_and_test(&cm_id_priv->work_count);
 	if (!ret)
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 8ec482e391aa..058cfbc2b37f 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -360,7 +360,6 @@ struct ib_cm_req_param {
 	u32			starting_psn;
 	const void		*private_data;
 	u8			private_data_len;
-	u8			peer_to_peer;
 	u8			responder_resources;
 	u8			initiator_depth;
 	u8			remote_cm_response_timeout;

From f743ff3b37dfc8e8cf4a32d4254cddd5f0f1add8 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:29 +0200
Subject: [PATCH 111/327] RDMA/mlx5: Replace spinlock protected write with
 atomic var

mkey variant calculation was spinlock protected to make it atomic, replace
that with one atomic variable.

Link: https://lore.kernel.org/r/20200310082238.239865-4-leon@kernel.org
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c    | 2 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +----
 drivers/infiniband/hw/mlx5/mr.c      | 6 +-----
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fce863621414..e4f8bee486c2 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6390,7 +6390,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 	spin_lock_init(&dev->reset_flow_resource_lock);
 	xa_init(&dev->odp_mkeys);
 	xa_init(&dev->sig_mrs);
-	spin_lock_init(&dev->mkey_lock);
+	atomic_set(&dev->mkey_var, 0);
 
 	spin_lock_init(&dev->dm.lock);
 	dev->dm.dev = mdev;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 89a050e516a8..3445402b23cc 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -993,10 +993,7 @@ struct mlx5_ib_dev {
 	 */
 	struct mlx5_ib_resources	devr;
 
-	/* protect mkey key part */
-	spinlock_t			mkey_lock;
-	u8				mkey_key;
-
+	atomic_t			mkey_var;
 	struct mlx5_mr_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8508af500972..a1e6ab9b0bed 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -54,12 +54,8 @@ static void
 assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
 		    u32 *in)
 {
+	u8 key = atomic_inc_return(&dev->mkey_var);
 	void *mkc;
-	u8 key;
-
-	spin_lock_irq(&dev->mkey_lock);
-	key = dev->mkey_key++;
-	spin_unlock_irq(&dev->mkey_lock);
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, mkey_7_0, key);

From 7c8691a396bd2084c7abc02d7aa34dade597814d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:31 +0200
Subject: [PATCH 112/327] RDMA/mlx5: Rename the tracking variables for the MR
 cache

The old names do not clearly indicate the intent.

Link: https://lore.kernel.org/r/20200310082238.239865-6-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 19 +++++++---
 drivers/infiniband/hw/mlx5/mr.c      | 54 ++++++++++++++--------------
 2 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 3445402b23cc..731b0f7bbe5b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -699,15 +699,26 @@ struct mlx5_cache_ent {
 	u32			access_mode;
 	u32			page;
 
-	u32			size;
-	u32                     cur;
+	/*
+	 * - available_mrs is the length of list head, ie the number of MRs
+	 *   available for immediate allocation.
+	 * - total_mrs is available_mrs plus all in use MRs that could be
+	 *   returned to the cache.
+	 * - limit is the low water mark for available_mrs, 2* limit is the
+	 *   upper water mark.
+	 * - pending is the number of MRs currently being created
+	 */
+	u32 total_mrs;
+	u32 available_mrs;
+	u32 limit;
+	u32 pending;
+
+	/* Statistics */
 	u32                     miss;
-	u32			limit;
 
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
-	int			pending;
 	struct completion	compl;
 };
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index a1e6ab9b0bed..9f5afa24896d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -144,8 +144,8 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 
 	spin_lock_irqsave(&ent->lock, flags);
 	list_add_tail(&mr->list, &ent->head);
-	ent->cur++;
-	ent->size++;
+	ent->available_mrs++;
+	ent->total_mrs++;
 	spin_unlock_irqrestore(&ent->lock, flags);
 
 	if (!completion_done(&ent->compl))
@@ -231,8 +231,8 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_move(&mr->list, &del_list);
-		ent->cur--;
-		ent->size--;
+		ent->available_mrs--;
+		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
@@ -265,16 +265,16 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 	if (var < ent->limit)
 		return -EINVAL;
 
-	if (var > ent->size) {
+	if (var > ent->total_mrs) {
 		do {
-			err = add_keys(dev, c, var - ent->size);
+			err = add_keys(dev, c, var - ent->total_mrs);
 			if (err && err != -EAGAIN)
 				return err;
 
 			usleep_range(3000, 5000);
 		} while (err);
-	} else if (var < ent->size) {
-		remove_keys(dev, c, ent->size - var);
+	} else if (var < ent->total_mrs) {
+		remove_keys(dev, c, ent->total_mrs - var);
 	}
 
 	return count;
@@ -287,7 +287,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
 	if (err < 0)
 		return err;
 
@@ -320,13 +320,13 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	if (sscanf(lbuf, "%u", &var) != 1)
 		return -EINVAL;
 
-	if (var > ent->size)
+	if (var > ent->total_mrs)
 		return -EINVAL;
 
 	ent->limit = var;
 
-	if (ent->cur < ent->limit) {
-		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
+	if (ent->available_mrs < ent->limit) {
+		err = add_keys(dev, c, 2 * ent->limit - ent->available_mrs);
 		if (err)
 			return err;
 	}
@@ -360,7 +360,7 @@ static int someone_adding(struct mlx5_mr_cache *cache)
 	int i;
 
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		if (cache->ent[i].cur < cache->ent[i].limit)
+		if (cache->ent[i].available_mrs < cache->ent[i].limit)
 			return 1;
 	}
 
@@ -378,9 +378,9 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		return;
 
 	ent = &dev->cache.ent[i];
-	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+	if (ent->available_mrs < 2 * ent->limit && !dev->fill_delay) {
 		err = add_keys(dev, i, 1);
-		if (ent->cur < 2 * ent->limit) {
+		if (ent->available_mrs < 2 * ent->limit) {
 			if (err == -EAGAIN) {
 				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
 					    i + 2);
@@ -395,7 +395,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 				queue_work(cache->wq, &ent->work);
 			}
 		}
-	} else if (ent->cur > 2 * ent->limit) {
+	} else if (ent->available_mrs > 2 * ent->limit) {
 		/*
 		 * The remove_keys() logic is performed as garbage collection
 		 * task. Such task is intended to be run when no other active
@@ -411,7 +411,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		if (!need_resched() && !someone_adding(cache) &&
 		    time_after(jiffies, cache->last_add + 300 * HZ)) {
 			remove_keys(dev, i, 1);
-			if (ent->cur > ent->limit)
+			if (ent->available_mrs > ent->limit)
 				queue_work(cache->wq, &ent->work);
 		} else {
 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
@@ -462,9 +462,9 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 					      list);
 			list_del(&mr->list);
-			ent->cur--;
+			ent->available_mrs--;
 			spin_unlock_irq(&ent->lock);
-			if (ent->cur < ent->limit)
+			if (ent->available_mrs < ent->limit)
 				queue_work(cache->wq, &ent->work);
 			return mr;
 		}
@@ -497,9 +497,9 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 					      list);
 			list_del(&mr->list);
-			ent->cur--;
+			ent->available_mrs--;
 			spin_unlock_irq(&ent->lock);
-			if (ent->cur < ent->limit)
+			if (ent->available_mrs < ent->limit)
 				queue_work(cache->wq, &ent->work);
 			break;
 		}
@@ -531,7 +531,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		mr->allocated_from_cache = false;
 		destroy_mkey(dev, mr);
 		ent = &cache->ent[c];
-		if (ent->cur < ent->limit)
+		if (ent->available_mrs < ent->limit)
 			queue_work(cache->wq, &ent->work);
 		return;
 	}
@@ -539,8 +539,8 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	ent = &cache->ent[c];
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
-	ent->cur++;
-	if (ent->cur > 2 * ent->limit)
+	ent->available_mrs++;
+	if (ent->available_mrs > 2 * ent->limit)
 		shrink = 1;
 	spin_unlock_irq(&ent->lock);
 
@@ -565,8 +565,8 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_move(&mr->list, &del_list);
-		ent->cur--;
-		ent->size--;
+		ent->available_mrs--;
+		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
@@ -604,7 +604,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_u32("cur", 0400, dir, &ent->cur);
+		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
 }

From b91e1751fbcee7692e45308e74d8816c43802ede Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:32 +0200
Subject: [PATCH 113/327] RDMA/mlx5: Simplify how the MR cache bucket is
 located

There are many bad APIs here that are accepting a cache bucket index
instead of a bucket pointer. Many of the callers already have a bucket
pointer, so this results in a lot of confusing uses of order2idx().

Pass the struct mlx5_cache_ent into add_keys(), remove_keys(), and
alloc_cached_mr().

Once the MR is in the cache, store the cache bucket pointer directly in
the MR, replacing the 'bool allocated_from cache'.

In the end there is only one place that needs to form index from order,
alloc_mr_from_cache(). Increase the safety of this function by disallowing
it from accessing cache entries in the ODP special area.

Link: https://lore.kernel.org/r/20200310082238.239865-7-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   7 +-
 drivers/infiniband/hw/mlx5/mr.c      | 160 +++++++++++----------------
 drivers/infiniband/hw/mlx5/odp.c     |   2 +-
 3 files changed, 71 insertions(+), 98 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 731b0f7bbe5b..7208946d2787 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -617,8 +617,8 @@ struct mlx5_ib_mr {
 	struct ib_umem	       *umem;
 	struct mlx5_shared_mr_info	*smr_info;
 	struct list_head	list;
-	int			order;
-	bool			allocated_from_cache;
+	unsigned int		order;
+	struct mlx5_cache_ent  *cache_ent;
 	int			npages;
 	struct mlx5_ib_dev     *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
@@ -1274,7 +1274,8 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+				       unsigned int entry);
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9f5afa24896d..55e31f6effda 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -99,16 +99,6 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 }
 
-static int order2idx(struct mlx5_ib_dev *dev, int order)
-{
-	struct mlx5_mr_cache *cache = &dev->cache;
-
-	if (order < cache->ent[0].order)
-		return 0;
-	else
-		return order - cache->ent[0].order;
-}
-
 static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
 {
 	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
@@ -120,9 +110,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	struct mlx5_ib_mr *mr =
 		container_of(context, struct mlx5_ib_mr, cb_work);
 	struct mlx5_ib_dev *dev = mr->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
-	int c = order2idx(dev, mr->order);
-	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_cache_ent *ent = mr->cache_ent;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ent->lock, flags);
@@ -140,7 +128,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	mr->mmkey.key |= mlx5_idx_to_mkey(
 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
 
-	cache->last_add = jiffies;
+	dev->cache.last_add = jiffies;
 
 	spin_lock_irqsave(&ent->lock, flags);
 	list_add_tail(&mr->list, &ent->head);
@@ -152,10 +140,8 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 		complete(&ent->compl);
 }
 
-static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
+static int add_keys(struct mlx5_cache_ent *ent, int num)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
 	void *mkc;
@@ -180,8 +166,8 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 			break;
 		}
 		mr->order = ent->order;
-		mr->allocated_from_cache = true;
-		mr->dev = dev;
+		mr->cache_ent = ent;
+		mr->dev = ent->dev;
 
 		MLX5_SET(mkc, mkc, free, 1);
 		MLX5_SET(mkc, mkc, umr_en, 1);
@@ -196,15 +182,15 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 		spin_lock_irq(&ent->lock);
 		ent->pending++;
 		spin_unlock_irq(&ent->lock);
-		err = mlx5_ib_create_mkey_cb(dev, &mr->mmkey,
-					       &dev->async_ctx, in, inlen,
-					       mr->out, sizeof(mr->out),
-					       &mr->cb_work);
+		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
+					     &ent->dev->async_ctx, in, inlen,
+					     mr->out, sizeof(mr->out),
+					     &mr->cb_work);
 		if (err) {
 			spin_lock_irq(&ent->lock);
 			ent->pending--;
 			spin_unlock_irq(&ent->lock);
-			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
+			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 			kfree(mr);
 			break;
 		}
@@ -214,10 +200,8 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 	return err;
 }
 
-static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
+static void remove_keys(struct mlx5_cache_ent *ent, int num)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
 	struct mlx5_ib_mr *tmp_mr;
 	struct mlx5_ib_mr *mr;
 	LIST_HEAD(del_list);
@@ -234,7 +218,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
 		ent->available_mrs--;
 		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+		mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
 	}
 
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
@@ -247,18 +231,14 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 			  size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	struct mlx5_ib_dev *dev = ent->dev;
 	char lbuf[20] = {0};
 	u32 var;
 	int err;
-	int c;
 
 	count = min(count, sizeof(lbuf) - 1);
 	if (copy_from_user(lbuf, buf, count))
 		return -EFAULT;
 
-	c = order2idx(dev, ent->order);
-
 	if (sscanf(lbuf, "%u", &var) != 1)
 		return -EINVAL;
 
@@ -267,14 +247,14 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 
 	if (var > ent->total_mrs) {
 		do {
-			err = add_keys(dev, c, var - ent->total_mrs);
+			err = add_keys(ent, var - ent->total_mrs);
 			if (err && err != -EAGAIN)
 				return err;
 
 			usleep_range(3000, 5000);
 		} while (err);
 	} else if (var < ent->total_mrs) {
-		remove_keys(dev, c, ent->total_mrs - var);
+		remove_keys(ent, ent->total_mrs - var);
 	}
 
 	return count;
@@ -305,18 +285,14 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 			   size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	struct mlx5_ib_dev *dev = ent->dev;
 	char lbuf[20] = {0};
 	u32 var;
 	int err;
-	int c;
 
 	count = min(count, sizeof(lbuf) - 1);
 	if (copy_from_user(lbuf, buf, count))
 		return -EFAULT;
 
-	c = order2idx(dev, ent->order);
-
 	if (sscanf(lbuf, "%u", &var) != 1)
 		return -EINVAL;
 
@@ -326,7 +302,7 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	ent->limit = var;
 
 	if (ent->available_mrs < ent->limit) {
-		err = add_keys(dev, c, 2 * ent->limit - ent->available_mrs);
+		err = add_keys(ent, 2 * ent->limit - ent->available_mrs);
 		if (err)
 			return err;
 	}
@@ -371,24 +347,22 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
 	struct mlx5_mr_cache *cache = &dev->cache;
-	int i = order2idx(dev, ent->order);
 	int err;
 
 	if (cache->stopped)
 		return;
 
-	ent = &dev->cache.ent[i];
 	if (ent->available_mrs < 2 * ent->limit && !dev->fill_delay) {
-		err = add_keys(dev, i, 1);
+		err = add_keys(ent, 1);
 		if (ent->available_mrs < 2 * ent->limit) {
 			if (err == -EAGAIN) {
 				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
-					    i + 2);
+					    ent->order);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(3));
 			} else if (err) {
 				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
-					     i + 2, err);
+					     ent->order, err);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(1000));
 			} else {
@@ -410,7 +384,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		 */
 		if (!need_resched() && !someone_adding(cache) &&
 		    time_after(jiffies, cache->last_add + 300 * HZ)) {
-			remove_keys(dev, i, 1);
+			remove_keys(ent, 1);
 			if (ent->available_mrs > ent->limit)
 				queue_work(cache->wq, &ent->work);
 		} else {
@@ -435,17 +409,18 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
+/* Allocate a special entry from the cache */
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+				       unsigned int entry)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
 	int err;
 
-	if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
-		mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
+	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
+		    entry >= ARRAY_SIZE(cache->ent)))
 		return ERR_PTR(-EINVAL);
-	}
 
 	ent = &cache->ent[entry];
 	while (1) {
@@ -453,7 +428,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 		if (list_empty(&ent->head)) {
 			spin_unlock_irq(&ent->lock);
 
-			err = add_keys(dev, entry, 1);
+			err = add_keys(ent, 1);
 			if (err && err != -EAGAIN)
 				return ERR_PTR(err);
 
@@ -471,26 +446,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 	}
 }
 
-static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
+static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_cache_ent *req_ent)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_ib_dev *dev = req_ent->dev;
 	struct mlx5_ib_mr *mr = NULL;
-	struct mlx5_cache_ent *ent;
-	int last_umr_cache_entry;
-	int c;
-	int i;
+	struct mlx5_cache_ent *ent = req_ent;
 
-	c = order2idx(dev, order);
-	last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev));
-	if (c < 0 || c > last_umr_cache_entry) {
-		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
-		return NULL;
-	}
-
-	for (i = c; i <= last_umr_cache_entry; i++) {
-		ent = &cache->ent[i];
-
-		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
+	/* Try larger MR pools from the cache to satisfy the allocation */
+	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
+		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
+			    ent - dev->cache.ent);
 
 		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
@@ -500,43 +465,36 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
 			ent->available_mrs--;
 			spin_unlock_irq(&ent->lock);
 			if (ent->available_mrs < ent->limit)
-				queue_work(cache->wq, &ent->work);
+				queue_work(dev->cache.wq, &ent->work);
 			break;
 		}
 		spin_unlock_irq(&ent->lock);
 
-		queue_work(cache->wq, &ent->work);
+		queue_work(dev->cache.wq, &ent->work);
 	}
 
 	if (!mr)
-		cache->ent[c].miss++;
+		req_ent->miss++;
 
 	return mr;
 }
 
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent;
+	struct mlx5_cache_ent *ent = mr->cache_ent;
 	int shrink = 0;
-	int c;
 
-	if (!mr->allocated_from_cache)
+	if (!ent)
 		return;
 
-	c = order2idx(dev, mr->order);
-	WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
-
 	if (mlx5_mr_cache_invalidate(mr)) {
-		mr->allocated_from_cache = false;
+		mr->cache_ent = NULL;
 		destroy_mkey(dev, mr);
-		ent = &cache->ent[c];
 		if (ent->available_mrs < ent->limit)
-			queue_work(cache->wq, &ent->work);
+			queue_work(dev->cache.wq, &ent->work);
 		return;
 	}
 
-	ent = &cache->ent[c];
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
 	ent->available_mrs++;
@@ -545,7 +503,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	spin_unlock_irq(&ent->lock);
 
 	if (shrink)
-		queue_work(cache->wq, &ent->work);
+		queue_work(dev->cache.wq, &ent->work);
 }
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
@@ -872,22 +830,38 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 	return err;
 }
 
-static struct mlx5_ib_mr *alloc_mr_from_cache(
-				  struct ib_pd *pd, struct ib_umem *umem,
-				  u64 virt_addr, u64 len, int npages,
-				  int page_shift, int order, int access_flags)
+static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
+						      unsigned int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+
+	if (order < cache->ent[0].order)
+		return &cache->ent[0];
+	order = order - cache->ent[0].order;
+	if (order > MR_CACHE_LAST_STD_ENTRY)
+		return NULL;
+	return &cache->ent[order];
+}
+
+static struct mlx5_ib_mr *
+alloc_mr_from_cache(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr,
+		    u64 len, int npages, int page_shift, unsigned int order,
+		    int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_cache_ent *ent = mr_cache_ent_from_order(dev, order);
 	struct mlx5_ib_mr *mr;
 	int err = 0;
 	int i;
 
+	if (!ent)
+		return ERR_PTR(-E2BIG);
 	for (i = 0; i < 1; i++) {
-		mr = alloc_cached_mr(dev, order);
+		mr = alloc_cached_mr(ent);
 		if (mr)
 			break;
 
-		err = add_keys(dev, order2idx(dev, order), 1);
+		err = add_keys(ent, 1);
 		if (err && err != -EAGAIN) {
 			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
 			break;
@@ -1470,7 +1444,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		/*
 		 * UMR can't be used - MKey needs to be replaced.
 		 */
-		if (mr->allocated_from_cache)
+		if (mr->cache_ent)
 			err = mlx5_mr_cache_invalidate(mr);
 		else
 			err = destroy_mkey(dev, mr);
@@ -1486,7 +1460,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			goto err;
 		}
 
-		mr->allocated_from_cache = false;
+		mr->cache_ent = NULL;
 	} else {
 		/*
 		 * Send a UMR WQE
@@ -1573,8 +1547,6 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
 
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	int allocated_from_cache = mr->allocated_from_cache;
-
 	if (mr->sig) {
 		if (mlx5_core_destroy_psv(dev->mdev,
 					  mr->sig->psv_memory.psv_idx))
@@ -1589,7 +1561,7 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		mr->sig = NULL;
 	}
 
-	if (!allocated_from_cache) {
+	if (!mr->cache_ent) {
 		destroy_mkey(dev, mr);
 		mlx5_free_priv_descs(mr);
 	}
@@ -1606,7 +1578,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	else
 		clean_mr(dev, mr);
 
-	if (mr->allocated_from_cache)
+	if (mr->cache_ent)
 		mlx5_mr_cache_free(dev, mr);
 	else
 		kfree(mr);
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4216814ba871..224f480fc441 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -197,7 +197,7 @@ static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
 	odp->private = NULL;
 	mutex_unlock(&odp->umem_mutex);
 
-	if (!mr->allocated_from_cache) {
+	if (!mr->cache_ent) {
 		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
 		WARN_ON(mr->descs);
 	}

From 1769c4c575489be28891c98f1e3f0a4252ca750a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:33 +0200
Subject: [PATCH 114/327] RDMA/mlx5: Always remove MRs from the cache before
 destroying them

The cache bucket tracks the total number of MRs that exists, both inside
and outside of the cache. Removing a MR from the cache (by setting
cache_ent to NULL) without updating total_mrs will cause the tracking to
leak and be inflated.

Further fix the rereg_mr path to always destroy the MR. reg_create will
always overwrite all the MR data in mlx5_ib_mr, so the MR must be
completely destroyed, in all cases, before this function can be
called. Detach the MR from the cache and unconditionally destroy it to
avoid leaking HW mkeys.

Fixes: afd1417404fb ("IB/mlx5: Use direct mkey destroy command upon UMR unreg failure")
Fixes: 56e11d628c5d ("IB/mlx5: Added support for re-registration of MRs")
Link: https://lore.kernel.org/r/20200310082238.239865-8-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 55e31f6effda..9b980ef326b4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -479,6 +479,16 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_cache_ent *req_ent)
 	return mr;
 }
 
+static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
+{
+	struct mlx5_cache_ent *ent = mr->cache_ent;
+
+	mr->cache_ent = NULL;
+	spin_lock_irq(&ent->lock);
+	ent->total_mrs--;
+	spin_unlock_irq(&ent->lock);
+}
+
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_cache_ent *ent = mr->cache_ent;
@@ -488,7 +498,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		return;
 
 	if (mlx5_mr_cache_invalidate(mr)) {
-		mr->cache_ent = NULL;
+		detach_mr_from_cache(mr);
 		destroy_mkey(dev, mr);
 		if (ent->available_mrs < ent->limit)
 			queue_work(dev->cache.wq, &ent->work);
@@ -1445,9 +1455,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		 * UMR can't be used - MKey needs to be replaced.
 		 */
 		if (mr->cache_ent)
-			err = mlx5_mr_cache_invalidate(mr);
-		else
-			err = destroy_mkey(dev, mr);
+			detach_mr_from_cache(mr);
+		err = destroy_mkey(dev, mr);
 		if (err)
 			goto err;
 
@@ -1459,8 +1468,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			mr = to_mmr(ib_mr);
 			goto err;
 		}
-
-		mr->cache_ent = NULL;
 	} else {
 		/*
 		 * Send a UMR WQE

From a1d8854aae4ee19df6161a276a99d3c9c2abc4f3 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:34 +0200
Subject: [PATCH 115/327] RDMA/mlx5: Fix MR cache size and limit debugfs

The size_write function is supposed to adjust the total_mr's to match the
user's request, but lacks locking and safety checking.

total_mrs can only be adjusted by at most available_mrs. mrs already
assigned to users cannot be revoked. Ensure that the user provides a
target value within the range of available_mrs and within the high/low
water mark.

limit_write has confusing and wrong sanity checking, and doesn't have the
ability to deallocate on limit reduction.

Since both functions use the same algorithm to adjust the available_mrs,
consolidate it into one function and write it correctly. Fix the locking
and by holding the spinlock for all accesses to ent->X.

Always fail if the user provides a malformed string.

Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters")
Link: https://lore.kernel.org/r/20200310082238.239865-9-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 152 ++++++++++++++++++--------------
 1 file changed, 88 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9b980ef326b4..091e24c58e2c 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -140,7 +140,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 		complete(&ent->compl);
 }
 
-static int add_keys(struct mlx5_cache_ent *ent, int num)
+static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
@@ -200,30 +200,54 @@ static int add_keys(struct mlx5_cache_ent *ent, int num)
 	return err;
 }
 
-static void remove_keys(struct mlx5_cache_ent *ent, int num)
+static void remove_cache_mr(struct mlx5_cache_ent *ent)
 {
-	struct mlx5_ib_mr *tmp_mr;
 	struct mlx5_ib_mr *mr;
-	LIST_HEAD(del_list);
-	int i;
 
-	for (i = 0; i < num; i++) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-			break;
-		}
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_move(&mr->list, &del_list);
-		ent->available_mrs--;
-		ent->total_mrs--;
+	spin_lock_irq(&ent->lock);
+	if (list_empty(&ent->head)) {
 		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
+		return;
 	}
+	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+	list_del(&mr->list);
+	ent->available_mrs--;
+	ent->total_mrs--;
+	spin_unlock_irq(&ent->lock);
+	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
+	kfree(mr);
+}
 
-	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
-		list_del(&mr->list);
-		kfree(mr);
+static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
+				bool limit_fill)
+{
+	int err;
+
+	lockdep_assert_held(&ent->lock);
+
+	while (true) {
+		if (limit_fill)
+			target = ent->limit * 2;
+		if (target == ent->available_mrs + ent->pending)
+			return 0;
+		if (target > ent->available_mrs + ent->pending) {
+			u32 todo = target - (ent->available_mrs + ent->pending);
+
+			spin_unlock_irq(&ent->lock);
+			err = add_keys(ent, todo);
+			if (err == -EAGAIN)
+				usleep_range(3000, 5000);
+			spin_lock_irq(&ent->lock);
+			if (err) {
+				if (err != -EAGAIN)
+					return err;
+			} else
+				return 0;
+		} else {
+			spin_unlock_irq(&ent->lock);
+			remove_cache_mr(ent);
+			spin_lock_irq(&ent->lock);
+		}
 	}
 }
 
@@ -231,33 +255,38 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 			  size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	char lbuf[20] = {0};
-	u32 var;
+	u32 target;
 	int err;
 
-	count = min(count, sizeof(lbuf) - 1);
-	if (copy_from_user(lbuf, buf, count))
-		return -EFAULT;
+	err = kstrtou32_from_user(buf, count, 0, &target);
+	if (err)
+		return err;
 
-	if (sscanf(lbuf, "%u", &var) != 1)
-		return -EINVAL;
-
-	if (var < ent->limit)
-		return -EINVAL;
-
-	if (var > ent->total_mrs) {
-		do {
-			err = add_keys(ent, var - ent->total_mrs);
-			if (err && err != -EAGAIN)
-				return err;
-
-			usleep_range(3000, 5000);
-		} while (err);
-	} else if (var < ent->total_mrs) {
-		remove_keys(ent, ent->total_mrs - var);
+	/*
+	 * Target is the new value of total_mrs the user requests, however we
+	 * cannot free MRs that are in use. Compute the target value for
+	 * available_mrs.
+	 */
+	spin_lock_irq(&ent->lock);
+	if (target < ent->total_mrs - ent->available_mrs) {
+		err = -EINVAL;
+		goto err_unlock;
 	}
+	target = target - (ent->total_mrs - ent->available_mrs);
+	if (target < ent->limit || target > ent->limit*2) {
+		err = -EINVAL;
+		goto err_unlock;
+	}
+	err = resize_available_mrs(ent, target, false);
+	if (err)
+		goto err_unlock;
+	spin_unlock_irq(&ent->lock);
 
 	return count;
+
+err_unlock:
+	spin_unlock_irq(&ent->lock);
+	return err;
 }
 
 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
@@ -285,28 +314,23 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 			   size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	char lbuf[20] = {0};
 	u32 var;
 	int err;
 
-	count = min(count, sizeof(lbuf) - 1);
-	if (copy_from_user(lbuf, buf, count))
-		return -EFAULT;
-
-	if (sscanf(lbuf, "%u", &var) != 1)
-		return -EINVAL;
-
-	if (var > ent->total_mrs)
-		return -EINVAL;
+	err = kstrtou32_from_user(buf, count, 0, &var);
+	if (err)
+		return err;
 
+	/*
+	 * Upon set we immediately fill the cache to high water mark implied by
+	 * the limit.
+	 */
+	spin_lock_irq(&ent->lock);
 	ent->limit = var;
-
-	if (ent->available_mrs < ent->limit) {
-		err = add_keys(ent, 2 * ent->limit - ent->available_mrs);
-		if (err)
-			return err;
-	}
-
+	err = resize_available_mrs(ent, 0, true);
+	spin_unlock_irq(&ent->lock);
+	if (err)
+		return err;
 	return count;
 }
 
@@ -371,20 +395,20 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		}
 	} else if (ent->available_mrs > 2 * ent->limit) {
 		/*
-		 * The remove_keys() logic is performed as garbage collection
-		 * task. Such task is intended to be run when no other active
-		 * processes are running.
+		 * The remove_cache_mr() logic is performed as garbage
+		 * collection task. Such task is intended to be run when no
+		 * other active processes are running.
 		 *
 		 * The need_resched() will return TRUE if there are user tasks
 		 * to be activated in near future.
 		 *
-		 * In such case, we don't execute remove_keys() and postpone
-		 * the garbage collection work to try to run in next cycle,
-		 * in order to free CPU resources to other tasks.
+		 * In such case, we don't execute remove_cache_mr() and postpone
+		 * the garbage collection work to try to run in next cycle, in
+		 * order to free CPU resources to other tasks.
 		 */
 		if (!need_resched() && !someone_adding(cache) &&
 		    time_after(jiffies, cache->last_add + 300 * HZ)) {
-			remove_keys(ent, 1);
+			remove_cache_mr(ent);
 			if (ent->available_mrs > ent->limit)
 				queue_work(cache->wq, &ent->work);
 		} else {

From ad2d3ef46d2a88f2906d8d0cc6b912199ec3f1d6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:35 +0200
Subject: [PATCH 116/327] RDMA/mlx5: Lock access to ent->available_mrs/limit
 when doing queue_work

Accesses to these members needs to be locked. There is no reason not to
hold a spinlock while calling queue_work(), so move the tests into a
helper and always call it under lock.

The helper should be called when available_mrs is adjusted.

Link: https://lore.kernel.org/r/20200310082238.239865-10-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 40 ++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 091e24c58e2c..b46039d86b98 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -134,6 +134,10 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	list_add_tail(&mr->list, &ent->head);
 	ent->available_mrs++;
 	ent->total_mrs++;
+	/*
+	 * Creating is always done in response to some demand, so do not call
+	 * queue_adjust_cache_locked().
+	 */
 	spin_unlock_irqrestore(&ent->lock, flags);
 
 	if (!completion_done(&ent->compl))
@@ -367,6 +371,20 @@ static int someone_adding(struct mlx5_mr_cache *cache)
 	return 0;
 }
 
+/*
+ * Check if the bucket is outside the high/low water mark and schedule an async
+ * update. The cache refill has hysteresis, once the low water mark is hit it is
+ * refilled up to the high mark.
+ */
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
+{
+	lockdep_assert_held(&ent->lock);
+
+	if (ent->available_mrs < ent->limit ||
+	    ent->available_mrs > 2 * ent->limit)
+		queue_work(ent->dev->cache.wq, &ent->work);
+}
+
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
@@ -462,9 +480,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 					      list);
 			list_del(&mr->list);
 			ent->available_mrs--;
+			queue_adjust_cache_locked(ent);
 			spin_unlock_irq(&ent->lock);
-			if (ent->available_mrs < ent->limit)
-				queue_work(cache->wq, &ent->work);
 			return mr;
 		}
 	}
@@ -487,14 +504,12 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_cache_ent *req_ent)
 					      list);
 			list_del(&mr->list);
 			ent->available_mrs--;
+			queue_adjust_cache_locked(ent);
 			spin_unlock_irq(&ent->lock);
-			if (ent->available_mrs < ent->limit)
-				queue_work(dev->cache.wq, &ent->work);
 			break;
 		}
+		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
-
-		queue_work(dev->cache.wq, &ent->work);
 	}
 
 	if (!mr)
@@ -516,7 +531,6 @@ static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_cache_ent *ent = mr->cache_ent;
-	int shrink = 0;
 
 	if (!ent)
 		return;
@@ -524,20 +538,14 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	if (mlx5_mr_cache_invalidate(mr)) {
 		detach_mr_from_cache(mr);
 		destroy_mkey(dev, mr);
-		if (ent->available_mrs < ent->limit)
-			queue_work(dev->cache.wq, &ent->work);
 		return;
 	}
 
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
 	ent->available_mrs++;
-	if (ent->available_mrs > 2 * ent->limit)
-		shrink = 1;
+	queue_adjust_cache_locked(ent);
 	spin_unlock_irq(&ent->lock);
-
-	if (shrink)
-		queue_work(dev->cache.wq, &ent->work);
 }
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
@@ -653,7 +661,9 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
 		else
 			ent->limit = 0;
-		queue_work(cache->wq, &ent->work);
+		spin_lock_irq(&ent->lock);
+		queue_adjust_cache_locked(ent);
+		spin_unlock_irq(&ent->lock);
 	}
 
 	mlx5_mr_cache_debugfs_init(dev);

From b9358bdbc713cb64b8701bcbd450edc155d761a1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:36 +0200
Subject: [PATCH 117/327] RDMA/mlx5: Fix locking in MR cache work queue

All of the members of mlx5_cache_ent must be accessed while holding the
spinlock, add the missing spinlock in the __cache_work_func().

Using cache->stopped and flush_workqueue() is an inherently racy way to
shutdown self-scheduling work on a queue. Replace it with ent->disabled
under lock, and always check disabled before queuing any new work. Use
cancel_work_sync() to shutdown the queue.

Use READ_ONCE/WRITE_ONCE for dev->last_add to manage concurrency as
coherency is less important here.

Split fill_delay from the bitfield. C bitfield updates are not atomic and
this is just a mess. Use READ_ONCE/WRITE_ONCE, but this could also use
test_bit()/set_bit().

Link: https://lore.kernel.org/r/20200310082238.239865-11-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   5 +-
 drivers/infiniband/hw/mlx5/mr.c      | 121 +++++++++++++++++----------
 2 files changed, 80 insertions(+), 46 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7208946d2787..a22932ffb9c8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -699,6 +699,8 @@ struct mlx5_cache_ent {
 	u32			access_mode;
 	u32			page;
 
+	u8 disabled:1;
+
 	/*
 	 * - available_mrs is the length of list head, ie the number of MRs
 	 *   available for immediate allocation.
@@ -725,7 +727,6 @@ struct mlx5_cache_ent {
 struct mlx5_mr_cache {
 	struct workqueue_struct *wq;
 	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
-	int			stopped;
 	struct dentry		*root;
 	unsigned long		last_add;
 };
@@ -995,10 +996,10 @@ struct mlx5_ib_dev {
 	 */
 	struct mutex			cap_mask_mutex;
 	u8				ib_active:1;
-	u8				fill_delay:1;
 	u8				is_rep:1;
 	u8				lag_active:1;
 	u8				wc_support:1;
+	u8				fill_delay;
 	struct umr_common		umrc;
 	/* sync used page count stats
 	 */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index b46039d86b98..424ce3de3865 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -113,13 +113,13 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	struct mlx5_cache_ent *ent = mr->cache_ent;
 	unsigned long flags;
 
-	spin_lock_irqsave(&ent->lock, flags);
-	ent->pending--;
-	spin_unlock_irqrestore(&ent->lock, flags);
 	if (status) {
 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 		kfree(mr);
-		dev->fill_delay = 1;
+		spin_lock_irqsave(&ent->lock, flags);
+		ent->pending--;
+		WRITE_ONCE(dev->fill_delay, 1);
+		spin_unlock_irqrestore(&ent->lock, flags);
 		mod_timer(&dev->delay_timer, jiffies + HZ);
 		return;
 	}
@@ -128,12 +128,13 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	mr->mmkey.key |= mlx5_idx_to_mkey(
 		MLX5_GET(create_mkey_out, mr->out, mkey_index));
 
-	dev->cache.last_add = jiffies;
+	WRITE_ONCE(dev->cache.last_add, jiffies);
 
 	spin_lock_irqsave(&ent->lock, flags);
 	list_add_tail(&mr->list, &ent->head);
 	ent->available_mrs++;
 	ent->total_mrs++;
+	ent->pending--;
 	/*
 	 * Creating is always done in response to some demand, so do not call
 	 * queue_adjust_cache_locked().
@@ -159,11 +160,6 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	for (i = 0; i < num; i++) {
-		if (ent->pending >= MAX_PENDING_REG_MR) {
-			err = -EAGAIN;
-			break;
-		}
-
 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 		if (!mr) {
 			err = -ENOMEM;
@@ -184,6 +180,12 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 		MLX5_SET(mkc, mkc, log_page_size, ent->page);
 
 		spin_lock_irq(&ent->lock);
+		if (ent->pending >= MAX_PENDING_REG_MR) {
+			err = -EAGAIN;
+			spin_unlock_irq(&ent->lock);
+			kfree(mr);
+			break;
+		}
 		ent->pending++;
 		spin_unlock_irq(&ent->lock);
 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
@@ -204,15 +206,13 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 	return err;
 }
 
-static void remove_cache_mr(struct mlx5_cache_ent *ent)
+static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_mr *mr;
 
-	spin_lock_irq(&ent->lock);
-	if (list_empty(&ent->head)) {
-		spin_unlock_irq(&ent->lock);
+	lockdep_assert_held(&ent->lock);
+	if (list_empty(&ent->head))
 		return;
-	}
 	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 	list_del(&mr->list);
 	ent->available_mrs--;
@@ -220,6 +220,7 @@ static void remove_cache_mr(struct mlx5_cache_ent *ent)
 	spin_unlock_irq(&ent->lock);
 	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
 	kfree(mr);
+	spin_lock_irq(&ent->lock);
 }
 
 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
@@ -248,9 +249,7 @@ static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 			} else
 				return 0;
 		} else {
-			spin_unlock_irq(&ent->lock);
-			remove_cache_mr(ent);
-			spin_lock_irq(&ent->lock);
+			remove_cache_mr_locked(ent);
 		}
 	}
 }
@@ -359,16 +358,21 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static int someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mr_cache *cache)
 {
-	int i;
+	unsigned int i;
 
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		if (cache->ent[i].available_mrs < cache->ent[i].limit)
-			return 1;
-	}
+		struct mlx5_cache_ent *ent = &cache->ent[i];
+		bool ret;
 
-	return 0;
+		spin_lock_irq(&ent->lock);
+		ret = ent->available_mrs < ent->limit;
+		spin_unlock_irq(&ent->lock);
+		if (ret)
+			return true;
+	}
+	return false;
 }
 
 /*
@@ -380,6 +384,8 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
 	lockdep_assert_held(&ent->lock);
 
+	if (ent->disabled)
+		return;
 	if (ent->available_mrs < ent->limit ||
 	    ent->available_mrs > 2 * ent->limit)
 		queue_work(ent->dev->cache.wq, &ent->work);
@@ -391,27 +397,42 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 	struct mlx5_mr_cache *cache = &dev->cache;
 	int err;
 
-	if (cache->stopped)
-		return;
+	spin_lock_irq(&ent->lock);
+	if (ent->disabled)
+		goto out;
 
-	if (ent->available_mrs < 2 * ent->limit && !dev->fill_delay) {
+	if (ent->available_mrs + ent->pending < 2 * ent->limit &&
+	    !READ_ONCE(dev->fill_delay)) {
+		spin_unlock_irq(&ent->lock);
 		err = add_keys(ent, 1);
-		if (ent->available_mrs < 2 * ent->limit) {
+
+		spin_lock_irq(&ent->lock);
+		if (ent->disabled)
+			goto out;
+		if (err) {
 			if (err == -EAGAIN) {
 				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
 					    ent->order);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(3));
-			} else if (err) {
-				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
-					     ent->order, err);
+			} else {
+				mlx5_ib_warn(
+					dev,
+					"command failed order %d, err %d\n",
+					ent->order, err);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(1000));
-			} else {
-				queue_work(cache->wq, &ent->work);
 			}
 		}
+		/*
+		 * Once we start populating due to hitting a low water mark
+		 * continue until we pass the high water mark.
+		 */
+		if (ent->available_mrs + ent->pending < 2 * ent->limit)
+			queue_work(cache->wq, &ent->work);
 	} else if (ent->available_mrs > 2 * ent->limit) {
+		bool need_delay;
+
 		/*
 		 * The remove_cache_mr() logic is performed as garbage
 		 * collection task. Such task is intended to be run when no
@@ -424,15 +445,20 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		 * the garbage collection work to try to run in next cycle, in
 		 * order to free CPU resources to other tasks.
 		 */
-		if (!need_resched() && !someone_adding(cache) &&
-		    time_after(jiffies, cache->last_add + 300 * HZ)) {
-			remove_cache_mr(ent);
-			if (ent->available_mrs > ent->limit)
-				queue_work(cache->wq, &ent->work);
-		} else {
+		spin_unlock_irq(&ent->lock);
+		need_delay = need_resched() || someone_adding(cache) ||
+			     time_after(jiffies,
+					READ_ONCE(cache->last_add) + 300 * HZ);
+		spin_lock_irq(&ent->lock);
+		if (ent->disabled)
+			goto out;
+		if (need_delay)
 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
-		}
+		remove_cache_mr_locked(ent);
+		queue_adjust_cache_locked(ent);
 	}
+out:
+	spin_unlock_irq(&ent->lock);
 }
 
 static void delayed_cache_work_func(struct work_struct *work)
@@ -613,7 +639,7 @@ static void delay_time_func(struct timer_list *t)
 {
 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 
-	dev->fill_delay = 0;
+	WRITE_ONCE(dev->fill_delay, 0);
 }
 
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
@@ -673,13 +699,20 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 {
-	int i;
+	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	dev->cache.stopped = 1;
-	flush_workqueue(dev->cache.wq);
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
+
+		spin_lock_irq(&ent->lock);
+		ent->disabled = true;
+		spin_unlock_irq(&ent->lock);
+		cancel_work_sync(&ent->work);
+		cancel_delayed_work_sync(&ent->dwork);
+	}
 
 	mlx5_mr_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

From 1c78a21a0c6f8df98a2281c1c0232f9c85e1c44d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:37 +0200
Subject: [PATCH 118/327] RDMA/mlx5: Revise how the hysteresis scheme works for
 cache filling

Currently if the work queue is running then it is in 'hysteresis' mode and
will fill until the cache reaches the high water mark. This implicit state
is very tricky and doesn't interact with pending very well.

Instead of self re-scheduling the work queue after the add_keys() has
started to create the new MR, have the queue scheduled from
reg_mr_callback() only after the requested MR has been added.

This avoids the bad design of an in-rush of queue'd work doing back to
back add_keys() until EAGAIN then sleeping. The add_keys() will be paced
one at a time as they complete, slowly filling up the cache.

Also, fix pending to be only manipulated under lock.

Link: https://lore.kernel.org/r/20200310082238.239865-12-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  1 +
 drivers/infiniband/hw/mlx5/mr.c      | 41 ++++++++++++++++++----------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a22932ffb9c8..1216575292a7 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -700,6 +700,7 @@ struct mlx5_cache_ent {
 	u32			page;
 
 	u8 disabled:1;
+	u8 fill_to_high_water:1;
 
 	/*
 	 * - available_mrs is the length of list head, ie the number of MRs
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 424ce3de3865..afacaf8981fa 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -86,6 +86,7 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 {
@@ -134,11 +135,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	list_add_tail(&mr->list, &ent->head);
 	ent->available_mrs++;
 	ent->total_mrs++;
+	/* If we are doing fill_to_high_water then keep going. */
+	queue_adjust_cache_locked(ent);
 	ent->pending--;
-	/*
-	 * Creating is always done in response to some demand, so do not call
-	 * queue_adjust_cache_locked().
-	 */
 	spin_unlock_irqrestore(&ent->lock, flags);
 
 	if (!completion_done(&ent->compl))
@@ -384,11 +383,29 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
 	lockdep_assert_held(&ent->lock);
 
-	if (ent->disabled)
+	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 		return;
-	if (ent->available_mrs < ent->limit ||
-	    ent->available_mrs > 2 * ent->limit)
+	if (ent->available_mrs < ent->limit) {
+		ent->fill_to_high_water = true;
 		queue_work(ent->dev->cache.wq, &ent->work);
+	} else if (ent->fill_to_high_water &&
+		   ent->available_mrs + ent->pending < 2 * ent->limit) {
+		/*
+		 * Once we start populating due to hitting a low water mark
+		 * continue until we pass the high water mark.
+		 */
+		queue_work(ent->dev->cache.wq, &ent->work);
+	} else if (ent->available_mrs == 2 * ent->limit) {
+		ent->fill_to_high_water = false;
+	} else if (ent->available_mrs > 2 * ent->limit) {
+		/* Queue deletion of excess entries */
+		ent->fill_to_high_water = false;
+		if (ent->pending)
+			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
+					   msecs_to_jiffies(1000));
+		else
+			queue_work(ent->dev->cache.wq, &ent->work);
+	}
 }
 
 static void __cache_work_func(struct mlx5_cache_ent *ent)
@@ -401,11 +418,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 	if (ent->disabled)
 		goto out;
 
-	if (ent->available_mrs + ent->pending < 2 * ent->limit &&
+	if (ent->fill_to_high_water &&
+	    ent->available_mrs + ent->pending < 2 * ent->limit &&
 	    !READ_ONCE(dev->fill_delay)) {
 		spin_unlock_irq(&ent->lock);
 		err = add_keys(ent, 1);
-
 		spin_lock_irq(&ent->lock);
 		if (ent->disabled)
 			goto out;
@@ -424,12 +441,6 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 						   msecs_to_jiffies(1000));
 			}
 		}
-		/*
-		 * Once we start populating due to hitting a low water mark
-		 * continue until we pass the high water mark.
-		 */
-		if (ent->available_mrs + ent->pending < 2 * ent->limit)
-			queue_work(cache->wq, &ent->work);
 	} else if (ent->available_mrs > 2 * ent->limit) {
 		bool need_delay;
 

From aad719dcf379f1413dcb168413a53fea66e2ef90 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 10:22:38 +0200
Subject: [PATCH 119/327] RDMA/mlx5: Allow MRs to be created in the cache
 synchronously

If the cache is completely out of MRs, and we are running in cache mode,
then directly, and synchronously, create an MR that is compatible with the
cache bucket using a sleeping mailbox command. This ensures that the
thread that is waiting for the MR absolutely will get one.

When a MR allocated in this way becomes freed then it is compatible with
the cache bucket and will be recycled back into it.

Deletes the very buggy ent->compl scheme to create a synchronous MR
allocation.

Link: https://lore.kernel.org/r/20200310082238.239865-13-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   1 -
 drivers/infiniband/hw/mlx5/mr.c      | 149 ++++++++++++++++-----------
 2 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 1216575292a7..a5da2d5cf659 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -722,7 +722,6 @@ struct mlx5_cache_ent {
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
-	struct completion	compl;
 };
 
 struct mlx5_mr_cache {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index afacaf8981fa..a401931189b7 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -139,14 +139,34 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	queue_adjust_cache_locked(ent);
 	ent->pending--;
 	spin_unlock_irqrestore(&ent->lock, flags);
-
-	if (!completion_done(&ent->compl))
-		complete(&ent->compl);
 }
 
+static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
+{
+	struct mlx5_ib_mr *mr;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return NULL;
+	mr->order = ent->order;
+	mr->cache_ent = ent;
+	mr->dev = ent->dev;
+
+	MLX5_SET(mkc, mkc, free, 1);
+	MLX5_SET(mkc, mkc, umr_en, 1);
+	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
+
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
+	MLX5_SET(mkc, mkc, log_page_size, ent->page);
+	return mr;
+}
+
+/* Asynchronously schedule new MRs to be populated in the cache. */
 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
-	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -159,25 +179,11 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	for (i = 0; i < num; i++) {
-		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		mr = alloc_cache_mr(ent, mkc);
 		if (!mr) {
 			err = -ENOMEM;
 			break;
 		}
-		mr->order = ent->order;
-		mr->cache_ent = ent;
-		mr->dev = ent->dev;
-
-		MLX5_SET(mkc, mkc, free, 1);
-		MLX5_SET(mkc, mkc, umr_en, 1);
-		MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
-		MLX5_SET(mkc, mkc, access_mode_4_2,
-			 (ent->access_mode >> 2) & 0x7);
-
-		MLX5_SET(mkc, mkc, qpn, 0xffffff);
-		MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
-		MLX5_SET(mkc, mkc, log_page_size, ent->page);
-
 		spin_lock_irq(&ent->lock);
 		if (ent->pending >= MAX_PENDING_REG_MR) {
 			err = -EAGAIN;
@@ -205,6 +211,44 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 	return err;
 }
 
+/* Synchronously create a MR in the cache */
+static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
+{
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	struct mlx5_ib_mr *mr;
+	void *mkc;
+	u32 *in;
+	int err;
+
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return ERR_PTR(-ENOMEM);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	mr = alloc_cache_mr(ent, mkc);
+	if (!mr) {
+		err = -ENOMEM;
+		goto free_in;
+	}
+
+	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
+	if (err)
+		goto free_mr;
+
+	mr->mmkey.type = MLX5_MKEY_MR;
+	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
+	spin_lock_irq(&ent->lock);
+	ent->total_mrs++;
+	spin_unlock_irq(&ent->lock);
+	kfree(in);
+	return mr;
+free_mr:
+	kfree(mr);
+free_in:
+	kfree(in);
+	return ERR_PTR(err);
+}
+
 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_mr *mr;
@@ -427,12 +471,12 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		if (ent->disabled)
 			goto out;
 		if (err) {
-			if (err == -EAGAIN) {
-				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
-					    ent->order);
-				queue_delayed_work(cache->wq, &ent->dwork,
-						   msecs_to_jiffies(3));
-			} else {
+			/*
+			 * EAGAIN only happens if pending is positive, so we
+			 * will be rescheduled from reg_mr_callback(). The only
+			 * failure path here is ENOMEM.
+			 */
+			if (err != -EAGAIN) {
 				mlx5_ib_warn(
 					dev,
 					"command failed order %d, err %d\n",
@@ -495,36 +539,30 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
-	int err;
 
 	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
 		    entry >= ARRAY_SIZE(cache->ent)))
 		return ERR_PTR(-EINVAL);
 
 	ent = &cache->ent[entry];
-	while (1) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-
-			err = add_keys(ent, 1);
-			if (err && err != -EAGAIN)
-				return ERR_PTR(err);
-
-			wait_for_completion(&ent->compl);
-		} else {
-			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
-					      list);
-			list_del(&mr->list);
-			ent->available_mrs--;
-			queue_adjust_cache_locked(ent);
-			spin_unlock_irq(&ent->lock);
+	spin_lock_irq(&ent->lock);
+	if (list_empty(&ent->head)) {
+		spin_unlock_irq(&ent->lock);
+		mr = create_cache_mr(ent);
+		if (IS_ERR(mr))
 			return mr;
-		}
+	} else {
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->available_mrs--;
+		queue_adjust_cache_locked(ent);
+		spin_unlock_irq(&ent->lock);
 	}
+	return mr;
 }
 
-static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_cache_ent *req_ent)
+/* Return a MR already available in the cache */
+static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
 {
 	struct mlx5_ib_dev *dev = req_ent->dev;
 	struct mlx5_ib_mr *mr = NULL;
@@ -676,7 +714,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		ent->dev = dev;
 		ent->limit = 0;
 
-		init_completion(&ent->compl);
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
@@ -939,26 +976,16 @@ alloc_mr_from_cache(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr,
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_cache_ent *ent = mr_cache_ent_from_order(dev, order);
 	struct mlx5_ib_mr *mr;
-	int err = 0;
-	int i;
 
 	if (!ent)
 		return ERR_PTR(-E2BIG);
-	for (i = 0; i < 1; i++) {
-		mr = alloc_cached_mr(ent);
-		if (mr)
-			break;
-
-		err = add_keys(ent, 1);
-		if (err && err != -EAGAIN) {
-			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
-			break;
-		}
+	mr = get_cache_mr(ent);
+	if (!mr) {
+		mr = create_cache_mr(ent);
+		if (IS_ERR(mr))
+			return mr;
 	}
 
-	if (!mr)
-		return ERR_PTR(-EAGAIN);
-
 	mr->ibmr.pd = pd;
 	mr->umem = umem;
 	mr->access_flags = access_flags;

From 9e57a9aa694cfc28f9549d2ce587309fdac15bfb Mon Sep 17 00:00:00 2001
From: Weihang Li <liweihang@huawei.com>
Date: Tue, 10 Mar 2020 21:06:09 +0800
Subject: [PATCH 120/327] RDMA/hns: Fix wrong judgments of udata->outlen

These judgments were used to keep the compatibility with older versions of
userspace that don't have the field named "cap_flags" in structure
hns_roce_ib_create_cq_resp. But it will be wrong to compare outlen with
the size of resp if another new field were added in resp. oulen should be
compared with the end offset of cap_flags in resp.

Fixes: 4f8f0d5e33dd ("RDMA/hns: Package the flow of creating cq")
Link: https://lore.kernel.org/r/1583845569-47257-1-git-send-email-liweihang@huawei.com
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_cq.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 5ffe4c996ed3..5bfb52ffd590 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -257,8 +257,8 @@ static int create_user_cq(struct hns_roce_dev *hr_dev,
 		return ret;
 	}
 
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(*resp))) {
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB &&
+	    udata->outlen >= offsetofend(typeof(*resp), cap_flags)) {
 		ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
 					   &hr_cq->db);
 		if (ret) {
@@ -321,8 +321,8 @@ static void destroy_user_cq(struct hns_roce_dev *hr_dev,
 	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
 				   udata, struct hns_roce_ucontext, ibucontext);
 
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(*resp)))
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB &&
+	    udata->outlen >= offsetofend(typeof(*resp), cap_flags))
 		hns_roce_db_unmap_user(context, &hr_cq->db);
 
 	hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt);

From 14f4283aa3e69a68e0a2e3bdb8a651a39bf18c52 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 12 Feb 2020 10:38:29 +0000
Subject: [PATCH 121/327] selftests/resctrl: fix spelling mistake "Errror" ->
 "Error"

There are two spelling mistakes in error messages. Fix these.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/resctrl/fill_buf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
index 84d2a8b9657a..79c611c99a3d 100644
--- a/tools/testing/selftests/resctrl/fill_buf.c
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -181,7 +181,7 @@ fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush,
 		ret = fill_cache_write(start_ptr, end_ptr, resctrl_val);
 
 	if (ret) {
-		printf("\n Errror in fill cache read/write...\n");
+		printf("\n Error in fill cache read/write...\n");
 		return -1;
 	}
 
@@ -205,7 +205,7 @@ int run_fill_buf(unsigned long span, int malloc_and_init_memory,
 	ret = fill_cache(cache_size, malloc_and_init_memory, memflush, op,
 			 resctrl_val);
 	if (ret) {
-		printf("\n Errror in fill cache\n");
+		printf("\n Error in fill cache\n");
 		return -1;
 	}
 

From 785c4e834f5f34dc00a398e935a89fd38416ff64 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Fri, 6 Mar 2020 11:17:05 +0800
Subject: [PATCH 122/327] selftests/timens: Remove duplicated include <time.h>

Remove duplicated include.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/timens/exec.c   | 1 -
 tools/testing/selftests/timens/procfs.c | 1 -
 tools/testing/selftests/timens/timens.c | 1 -
 tools/testing/selftests/timens/timer.c  | 1 -
 4 files changed, 4 deletions(-)

diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c
index 87b47b557a7a..e40dc5be2f66 100644
--- a/tools/testing/selftests/timens/exec.c
+++ b/tools/testing/selftests/timens/exec.c
@@ -11,7 +11,6 @@
 #include <sys/wait.h>
 #include <time.h>
 #include <unistd.h>
-#include <time.h>
 #include <string.h>
 
 #include "log.h"
diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c
index 43d93f4006b9..7f14f0fdac84 100644
--- a/tools/testing/selftests/timens/procfs.c
+++ b/tools/testing/selftests/timens/procfs.c
@@ -12,7 +12,6 @@
 #include <sys/types.h>
 #include <time.h>
 #include <unistd.h>
-#include <time.h>
 
 #include "log.h"
 #include "timens.h"
diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c
index 559d26e21ba0..098be7c83be3 100644
--- a/tools/testing/selftests/timens/timens.c
+++ b/tools/testing/selftests/timens/timens.c
@@ -10,7 +10,6 @@
 #include <sys/types.h>
 #include <time.h>
 #include <unistd.h>
-#include <time.h>
 #include <string.h>
 
 #include "log.h"
diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c
index 0cca7aafc4bd..96dba11ebe44 100644
--- a/tools/testing/selftests/timens/timer.c
+++ b/tools/testing/selftests/timens/timer.c
@@ -11,7 +11,6 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <signal.h>
-#include <time.h>
 
 #include "log.h"
 #include "timens.h"

From 9c249ec312dbdb3ef05f5b2672c8c1d1f7562269 Mon Sep 17 00:00:00 2001
From: Masanari Iida <standby24x7@gmail.com>
Date: Thu, 12 Mar 2020 13:03:37 +0900
Subject: [PATCH 123/327] selftests/ftrace: Fix typo in trigger-multihist.tc

This patch fix a spelling typo in trigger-multihist.tc

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .../selftests/ftrace/test.d/trigger/trigger-multihist.tc        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc
index 18fdaab9f570..68ff3f45c720 100644
--- a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc
@@ -23,7 +23,7 @@ if [ ! -f events/sched/sched_process_fork/hist ]; then
     exit_unsupported
 fi
 
-echo "Test histogram multiple tiggers"
+echo "Test histogram multiple triggers"
 
 echo 'hist:keys=parent_pid:vals=child_pid' > events/sched/sched_process_fork/trigger
 echo 'hist:keys=parent_comm:vals=child_pid' >> events/sched/sched_process_fork/trigger

From 1ae81d78a8b2f6314fb90dc4296202611e710b37 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 11 Mar 2020 12:21:28 -0700
Subject: [PATCH 124/327] selftests/seccomp: Adjust test fixture counts

The seccomp selftest reported the wrong test counts since it was using
slightly the wrong API for defining text fixtures. Adjust the API usage.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index ee1b727ede04..7bf82fb07f67 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -909,7 +909,7 @@ TEST(ERRNO_order)
 	EXPECT_EQ(12, errno);
 }
 
-FIXTURE_DATA(TRAP) {
+FIXTURE(TRAP) {
 	struct sock_fprog prog;
 };
 
@@ -1020,7 +1020,7 @@ TEST_F(TRAP, handler)
 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
 }
 
-FIXTURE_DATA(precedence) {
+FIXTURE(precedence) {
 	struct sock_fprog allow;
 	struct sock_fprog log;
 	struct sock_fprog trace;
@@ -1509,7 +1509,7 @@ void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
 	EXPECT_EQ(0, ret);
 }
 
-FIXTURE_DATA(TRACE_poke) {
+FIXTURE(TRACE_poke) {
 	struct sock_fprog prog;
 	pid_t tracer;
 	long poked;
@@ -1817,7 +1817,7 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
 		change_syscall(_metadata, tracee, -1, -ESRCH);
 }
 
-FIXTURE_DATA(TRACE_syscall) {
+FIXTURE(TRACE_syscall) {
 	struct sock_fprog prog;
 	pid_t tracer, mytid, mypid, parent;
 };
@@ -2321,7 +2321,7 @@ struct tsync_sibling {
 		}							\
 	} while (0)
 
-FIXTURE_DATA(TSYNC) {
+FIXTURE(TSYNC) {
 	struct sock_fprog root_prog, apply_prog;
 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
 	sem_t started;

From 29e911ef7b706215caf02a82b0d3076611d6abe8 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 26 Feb 2020 15:54:49 -0700
Subject: [PATCH 125/327] selftests: Fix kselftest O=objdir build from
 cluttering top level objdir

make kselftest-all O=objdir builds create generated objects in objdir.
This clutters the top level directory with kselftest objects. Fix it
to create sub-directory under objdir for kselftest objects.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 63430e2664c2..be22dbe94a4c 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -85,7 +85,7 @@ override LDFLAGS =
 override MAKEFLAGS =
 endif
 
-# Append kselftest to KBUILD_OUTPUT to avoid cluttering
+# Append kselftest to KBUILD_OUTPUT and O to avoid cluttering
 # KBUILD_OUTPUT with selftest objects and headers installed
 # by selftests Makefile or lib.mk.
 ifdef building_out_of_srctree
@@ -93,7 +93,7 @@ override LDFLAGS =
 endif
 
 ifneq ($(O),)
-	BUILD := $(O)
+	BUILD := $(O)/kselftest
 else
 	ifneq ($(KBUILD_OUTPUT),)
 		BUILD := $(KBUILD_OUTPUT)/kselftest

From 1dc74544edc6753d628fcd1059559fb0ddc1b422 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 2 Mar 2020 14:44:21 -0700
Subject: [PATCH 126/327] selftests: android: ion: Fix ionmap_test compile
 error

ionmap_test compile rule is missing ipcsocket.c dependency. Add it to
fix the following compile errors:

..android/ion/ionutils.c:221: undefined reference to `sendtosocket'
..android/ion/ionutils.c:243: undefined reference to `receivefromsocket'

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/android/ion/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile
index 0eb7ab626e1c..42b71f005332 100644
--- a/tools/testing/selftests/android/ion/Makefile
+++ b/tools/testing/selftests/android/ion/Makefile
@@ -17,4 +17,4 @@ include ../../lib.mk
 
 $(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c
 $(OUTPUT)/ionapp_import: ionapp_import.c ipcsocket.c ionutils.c
-$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c
+$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c ipcsocket.c

From fb0bb3952401eac4e5a39551239f4e3bc9aeee4a Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Mon, 2 Mar 2020 14:55:51 -0700
Subject: [PATCH 127/327] selftests: android: Fix custom install from skipping
 test progs

Update custom install rule to install all generated test programs. This
fixes android/ion tests to be installed correctly.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/android/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile
index 7c462714b418..9258306cafe9 100644
--- a/tools/testing/selftests/android/Makefile
+++ b/tools/testing/selftests/android/Makefile
@@ -21,7 +21,7 @@ all:
 
 override define INSTALL_RULE
 	mkdir -p $(INSTALL_PATH)
-	install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)  $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
 
 	@for SUBDIR in $(SUBDIRS); do \
 		BUILD_TARGET=$(OUTPUT)/$$SUBDIR;	\

From b023e1728bec6dbf0c6a7e2145e2cd025c89cd55 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 17 Jan 2020 22:04:43 -0500
Subject: [PATCH 128/327] lookup_fast(): consolidate the RCU success case

1) in case of __follow_mount_rcu() failure, lookup_fast() proceeds
to call unlazy_child() and, should it succeed, handle_mounts().
Note that we have status > 0 (or we wouldn't be calling
__follow_mount_rcu() at all), so all stuff conditional upon
non-positive status won't be even touched.

Consolidate just that sequence after the call of __follow_mount_rcu().

2) calling d_is_negative() and keeping its result is pointless -
we either don't get past checking ->d_seq (and don't use the results of
d_is_negative() at all), or we are guaranteed that ->d_inode and
type bits of ->d_flags had been consistent at the time of d_is_negative()
call.  IOW, we could only get to the use of its result if it's
equal to !inode.  The same ->d_seq check guarantees that after that point
this CPU won't observe ->d_flags values older than ->d_inode update.
So 'negative' variable is completely pointless these days.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index a613fa52a174..c34c27025cf0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1631,7 +1631,6 @@ static int lookup_fast(struct nameidata *nd,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-		bool negative;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
 			if (unlazy_walk(nd))
@@ -1644,7 +1643,6 @@ static int lookup_fast(struct nameidata *nd,
 		 * the dentry name information from lookup.
 		 */
 		*inode = d_backing_inode(dentry);
-		negative = d_is_negative(dentry);
 		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
 			return -ECHILD;
 
@@ -1665,12 +1663,15 @@ static int lookup_fast(struct nameidata *nd,
 			 * Note: do negative dentry check after revalidation in
 			 * case that drops it.
 			 */
-			if (unlikely(negative))
+			if (unlikely(!*inode))
 				return -ENOENT;
 			path->mnt = mnt;
 			path->dentry = dentry;
 			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
 				return 1;
+			if (unlazy_child(nd, dentry, seq))
+				return -ECHILD;
+			return handle_mounts(nd, dentry, path, inode, seqp);
 		}
 		if (unlazy_child(nd, dentry, seq))
 			return -ECHILD;

From c153007b7b7a65bb99c19eb1a72cebda5aa26a5e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:50:18 -0500
Subject: [PATCH 129/327] teach handle_mounts() to handle RCU mode

... and make the callers of __follow_mount_rcu() use handle_mounts().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c34c27025cf0..37b2f82e6f2c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1393,6 +1393,18 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 
 	path->mnt = nd->path.mnt;
 	path->dentry = dentry;
+	if (nd->flags & LOOKUP_RCU) {
+		unsigned int seq = *seqp;
+		if (unlikely(!*inode))
+			return -ENOENT;
+		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+			return 1;
+		if (unlazy_child(nd, dentry, seq))
+			return -ECHILD;
+		// *path might've been clobbered by __follow_mount_rcu()
+		path->mnt = nd->path.mnt;
+		path->dentry = dentry;
+	}
 	ret = follow_managed(path, nd);
 	if (likely(ret >= 0)) {
 		*inode = d_backing_inode(path->dentry);
@@ -1620,7 +1632,6 @@ static int lookup_fast(struct nameidata *nd,
 		       struct path *path, struct inode **inode,
 		       unsigned *seqp)
 {
-	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
 	int status = 1;
 
@@ -1658,21 +1669,8 @@ static int lookup_fast(struct nameidata *nd,
 
 		*seqp = seq;
 		status = d_revalidate(dentry, nd->flags);
-		if (likely(status > 0)) {
-			/*
-			 * Note: do negative dentry check after revalidation in
-			 * case that drops it.
-			 */
-			if (unlikely(!*inode))
-				return -ENOENT;
-			path->mnt = mnt;
-			path->dentry = dentry;
-			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
-				return 1;
-			if (unlazy_child(nd, dentry, seq))
-				return -ECHILD;
+		if (likely(status > 0))
 			return handle_mounts(nd, dentry, path, inode, seqp);
-		}
 		if (unlazy_child(nd, dentry, seq))
 			return -ECHILD;
 		if (unlikely(status == -ECHILD))
@@ -2361,21 +2359,11 @@ static int handle_lookup_down(struct nameidata *nd)
 	unsigned seq = nd->seq;
 	int err;
 
-	if (nd->flags & LOOKUP_RCU) {
-		/*
-		 * don't bother with unlazy_walk on failure - we are
-		 * at the very beginning of walk, so we lose nothing
-		 * if we simply redo everything in non-RCU mode
-		 */
-		path = nd->path;
-		if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
-			return -ECHILD;
-	} else {
+	if (!(nd->flags & LOOKUP_RCU))
 		dget(nd->path.dentry);
-		err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq);
-		if (unlikely(err < 0))
-			return err;
-	}
+	err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq);
+	if (unlikely(err < 0))
+		return err;
 	path_to_nameidata(&path, nd);
 	nd->inode = inode;
 	nd->seq = seq;

From 20e343571cefe31b3dba3e61c9b057781fbce40a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 14:58:31 -0500
Subject: [PATCH 130/327] lookup_fast(): take mount traversal into callers

Current calling conventions: -E... on error, 0 on cache miss,
result of handle_mounts(nd, dentry, path, inode, seqp) on
success.  Turn that into returning ERR_PTR(-E...), NULL and dentry
resp.; deal with handle_mounts() in the callers.  The thing
is, they already do that in cache miss handling case, so we
just need to supply dentry to them and unify the mount traversal
in those cases.  Fewer arguments that way, and we get closer
to merging handle_mounts() and step_into().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 50 ++++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 37b2f82e6f2c..84ce7ccd944e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1628,9 +1628,9 @@ static struct dentry *__lookup_hash(const struct qstr *name,
 	return dentry;
 }
 
-static int lookup_fast(struct nameidata *nd,
-		       struct path *path, struct inode **inode,
-		       unsigned *seqp)
+static struct dentry *lookup_fast(struct nameidata *nd,
+				  struct inode **inode,
+			          unsigned *seqp)
 {
 	struct dentry *dentry, *parent = nd->path.dentry;
 	int status = 1;
@@ -1645,8 +1645,8 @@ static int lookup_fast(struct nameidata *nd,
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
 			if (unlazy_walk(nd))
-				return -ECHILD;
-			return 0;
+				return ERR_PTR(-ECHILD);
+			return NULL;
 		}
 
 		/*
@@ -1655,7 +1655,7 @@ static int lookup_fast(struct nameidata *nd,
 		 */
 		*inode = d_backing_inode(dentry);
 		if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 
 		/*
 		 * This sequence count validates that the parent had no
@@ -1665,30 +1665,30 @@ static int lookup_fast(struct nameidata *nd,
 		 *  enough, we can use __read_seqcount_retry here.
 		 */
 		if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 
 		*seqp = seq;
 		status = d_revalidate(dentry, nd->flags);
 		if (likely(status > 0))
-			return handle_mounts(nd, dentry, path, inode, seqp);
+			return dentry;
 		if (unlazy_child(nd, dentry, seq))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 		if (unlikely(status == -ECHILD))
 			/* we'd been told to redo it in non-rcu mode */
 			status = d_revalidate(dentry, nd->flags);
 	} else {
 		dentry = __d_lookup(parent, &nd->last);
 		if (unlikely(!dentry))
-			return 0;
+			return NULL;
 		status = d_revalidate(dentry, nd->flags);
 	}
 	if (unlikely(status <= 0)) {
 		if (!status)
 			d_invalidate(dentry);
 		dput(dentry);
-		return status;
+		return ERR_PTR(status);
 	}
-	return handle_mounts(nd, dentry, path, inode, seqp);
+	return dentry;
 }
 
 /* Fast lookup failed, do it the slow way */
@@ -1874,19 +1874,18 @@ static int walk_component(struct nameidata *nd, int flags)
 			put_link(nd);
 		return err;
 	}
-	err = lookup_fast(nd, &path, &inode, &seq);
-	if (unlikely(err <= 0)) {
-		if (err < 0)
-			return err;
+	dentry = lookup_fast(nd, &inode, &seq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	if (unlikely(!dentry)) {
 		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
-
-		err = handle_mounts(nd, dentry, &path, &inode, &seq);
-		if (unlikely(err < 0))
-			return err;
 	}
 
+	err = handle_mounts(nd, dentry, &path, &inode, &seq);
+	if (unlikely(err < 0))
+			return err;
 	return step_into(nd, &path, flags, inode, seq);
 }
 
@@ -3302,13 +3301,12 @@ static int do_last(struct nameidata *nd,
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 		/* we _can_ be in RCU mode here */
-		error = lookup_fast(nd, &path, &inode, &seq);
-		if (likely(error > 0))
+		dentry = lookup_fast(nd, &inode, &seq);
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+		if (likely(dentry))
 			goto finish_lookup;
 
-		if (error < 0)
-			return error;
-
 		BUG_ON(nd->inode != dir->d_inode);
 		BUG_ON(nd->flags & LOOKUP_RCU);
 	} else {
@@ -3383,10 +3381,10 @@ static int do_last(struct nameidata *nd,
 		got_write = false;
 	}
 
+finish_lookup:
 	error = handle_mounts(nd, dentry, &path, &inode, &seq);
 	if (unlikely(error < 0))
 		return error;
-finish_lookup:
 	error = step_into(nd, &path, 0, inode, seq);
 	if (unlikely(error))
 		return error;

From 56676ec390194930f7504cd84d494c221861bd7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Mar 2020 21:54:54 -0400
Subject: [PATCH 131/327] step_into() callers: dismiss the symlink earlier

We need to dismiss a symlink when we are done traversing it;
currently that's done when we call step_into() for its last
component.  For the cases when we do not call step_into()
for that component (i.e. when it's . or ..) we do the same
symlink dismissal after the call of handle_dots().

What we need to guarantee is that the symlink won't be dismissed
while we are still using nd->last.name - it's pointing into the
body of said symlink.  step_into() is sufficiently late - by
the time it's called we'd already obtained the dentry, so the
name we'd been looking up is no longer needed.  However, it
turns out to be cleaner to have that ("we are done with that
component now, can dismiss the link") done explicitly - in the
callers of step_into().

In handle_dots() case we won't be using the component string
at all, so for . and .. the corresponding point is actually
_before_ the call of handle_dots(), not after it.

Fix a minor irregularity in do_last(), while we are at it -
if trailing symlink ended with . or .. we forgot to dismiss
it.  Not a problem, since nameidata is about to be done with
(neither . nor .. can be a trailing symlink, so this is the
last iteration through the loop) and terminate_walk() will
clean the stack anyway, but let's keep it more regular.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 84ce7ccd944e..3097edcb4a1a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1838,8 +1838,6 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2};
 static inline int step_into(struct nameidata *nd, struct path *path,
 			    int flags, struct inode *inode, unsigned seq)
 {
-	if (!(flags & WALK_MORE) && nd->depth)
-		put_link(nd);
 	if (likely(!d_is_symlink(path->dentry)) ||
 	   !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
 		/* not a symlink or should not follow */
@@ -1869,9 +1867,9 @@ static int walk_component(struct nameidata *nd, int flags)
 	 * parent relationships.
 	 */
 	if (unlikely(nd->last_type != LAST_NORM)) {
-		err = handle_dots(nd, nd->last_type);
 		if (!(flags & WALK_MORE) && nd->depth)
 			put_link(nd);
+		err = handle_dots(nd, nd->last_type);
 		return err;
 	}
 	dentry = lookup_fast(nd, &inode, &seq);
@@ -1882,6 +1880,8 @@ static int walk_component(struct nameidata *nd, int flags)
 		if (IS_ERR(dentry))
 			return PTR_ERR(dentry);
 	}
+	if (!(flags & WALK_MORE) && nd->depth)
+		put_link(nd);
 
 	err = handle_mounts(nd, dentry, &path, &inode, &seq);
 	if (unlikely(err < 0))
@@ -3291,6 +3291,8 @@ static int do_last(struct nameidata *nd,
 	nd->flags |= op->intent;
 
 	if (nd->last_type != LAST_NORM) {
+		if (nd->depth)
+			put_link(nd);
 		error = handle_dots(nd, nd->last_type);
 		if (unlikely(error))
 			return error;
@@ -3382,6 +3384,8 @@ static int do_last(struct nameidata *nd,
 	}
 
 finish_lookup:
+	if (nd->depth)
+		put_link(nd);
 	error = handle_mounts(nd, dentry, &path, &inode, &seq);
 	if (unlikely(error < 0))
 		return error;

From aca2903eefd0f8a3ba672f985182f899b425ca24 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 Jan 2020 15:17:57 -0500
Subject: [PATCH 132/327] new step_into() flag: WALK_NOFOLLOW

Tells step_into() not to follow symlinks, regardless of LOOKUP_FOLLOW.
Allows to switch handle_lookup_down() to of step_into(), getting
all follow_managed() and step_into() calls paired.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 3097edcb4a1a..79f06be7f5d4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1827,7 +1827,7 @@ static int pick_link(struct nameidata *nd, struct path *link,
 	return 1;
 }
 
-enum {WALK_FOLLOW = 1, WALK_MORE = 2};
+enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
 /*
  * Do we need to follow links? We _really_ want to be able
@@ -1839,7 +1839,8 @@ static inline int step_into(struct nameidata *nd, struct path *path,
 			    int flags, struct inode *inode, unsigned seq)
 {
 	if (likely(!d_is_symlink(path->dentry)) ||
-	   !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) {
+	   !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) ||
+	   (flags & WALK_NOFOLLOW)) {
 		/* not a symlink or should not follow */
 		path_to_nameidata(path, nd);
 		nd->inode = inode;
@@ -2363,10 +2364,7 @@ static int handle_lookup_down(struct nameidata *nd)
 	err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq);
 	if (unlikely(err < 0))
 		return err;
-	path_to_nameidata(&path, nd);
-	nd->inode = inode;
-	nd->seq = seq;
-	return 0;
+	return step_into(nd, &path, WALK_NOFOLLOW, inode, seq);
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */

From cbae4d12eeee6b002a1252c4c45213651e8f4b55 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Jan 2020 13:40:02 -0500
Subject: [PATCH 133/327] fold handle_mounts() into step_into()

The following is true:
	* calls of handle_mounts() and step_into() are always
paired in sequences like
	err = handle_mounts(nd, dentry, &path, &inode, &seq);
	if (unlikely(err < 0))
		return err;
	err = step_into(nd, &path, flags, inode, seq);
	* in all such sequences path is uninitialized before and
unused after this pair of calls
	* in all such sequences inode and seq are unused afterwards.

So the call of handle_mounts() can be shifted inside step_into(),
turning 'path' into a local variable in the combined function.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 79f06be7f5d4..8a673f44ffc2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1835,29 +1835,33 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
  * so we keep a cache of "no, this doesn't need follow_link"
  * for the common case.
  */
-static inline int step_into(struct nameidata *nd, struct path *path,
-			    int flags, struct inode *inode, unsigned seq)
+static int step_into(struct nameidata *nd, int flags,
+		     struct dentry *dentry, struct inode *inode, unsigned seq)
 {
-	if (likely(!d_is_symlink(path->dentry)) ||
+	struct path path;
+	int err = handle_mounts(nd, dentry, &path, &inode, &seq);
+
+	if (err < 0)
+		return err;
+	if (likely(!d_is_symlink(path.dentry)) ||
 	   !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) ||
 	   (flags & WALK_NOFOLLOW)) {
 		/* not a symlink or should not follow */
-		path_to_nameidata(path, nd);
+		path_to_nameidata(&path, nd);
 		nd->inode = inode;
 		nd->seq = seq;
 		return 0;
 	}
 	/* make sure that d_is_symlink above matches inode */
 	if (nd->flags & LOOKUP_RCU) {
-		if (read_seqcount_retry(&path->dentry->d_seq, seq))
+		if (read_seqcount_retry(&path.dentry->d_seq, seq))
 			return -ECHILD;
 	}
-	return pick_link(nd, path, inode, seq);
+	return pick_link(nd, &path, inode, seq);
 }
 
 static int walk_component(struct nameidata *nd, int flags)
 {
-	struct path path;
 	struct dentry *dentry;
 	struct inode *inode;
 	unsigned seq;
@@ -1883,11 +1887,7 @@ static int walk_component(struct nameidata *nd, int flags)
 	}
 	if (!(flags & WALK_MORE) && nd->depth)
 		put_link(nd);
-
-	err = handle_mounts(nd, dentry, &path, &inode, &seq);
-	if (unlikely(err < 0))
-			return err;
-	return step_into(nd, &path, flags, inode, seq);
+	return step_into(nd, flags, dentry, inode, seq);
 }
 
 /*
@@ -2354,17 +2354,10 @@ static inline int lookup_last(struct nameidata *nd)
 
 static int handle_lookup_down(struct nameidata *nd)
 {
-	struct path path;
-	struct inode *inode = nd->inode;
-	unsigned seq = nd->seq;
-	int err;
-
 	if (!(nd->flags & LOOKUP_RCU))
 		dget(nd->path.dentry);
-	err = handle_mounts(nd, nd->path.dentry, &path, &inode, &seq);
-	if (unlikely(err < 0))
-		return err;
-	return step_into(nd, &path, WALK_NOFOLLOW, inode, seq);
+	return step_into(nd, WALK_NOFOLLOW,
+			nd->path.dentry, nd->inode, nd->seq);
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -3281,7 +3274,6 @@ static int do_last(struct nameidata *nd,
 	int acc_mode = op->acc_mode;
 	unsigned seq;
 	struct inode *inode;
-	struct path path;
 	struct dentry *dentry;
 	int error;
 
@@ -3384,10 +3376,7 @@ static int do_last(struct nameidata *nd,
 finish_lookup:
 	if (nd->depth)
 		put_link(nd);
-	error = handle_mounts(nd, dentry, &path, &inode, &seq);
-	if (unlikely(error < 0))
-		return error;
-	error = step_into(nd, &path, 0, inode, seq);
+	error = step_into(nd, 0, dentry, inode, seq);
 	if (unlikely(error))
 		return error;
 

From 161aff1d93abf0e5b5e9dbca88928998c155f677 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 11 Jan 2020 22:52:26 -0500
Subject: [PATCH 134/327] LOOKUP_MOUNTPOINT: fold path_mountpointat() into
 path_lookupat()

New LOOKUP flag, telling path_lookupat() to act as path_mountpointat().
IOW, traverse mounts at the final point and skip revalidation of the
location where it ends up.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs/dev-ioctl.c |  6 +--
 fs/internal.h         |  1 -
 fs/namei.c            | 89 +++----------------------------------------
 fs/namespace.c        |  4 +-
 include/linux/namei.h |  2 +-
 5 files changed, 12 insertions(+), 90 deletions(-)

diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index a3cdb0036c5d..f3a0f412b43b 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -186,7 +186,7 @@ static int find_autofs_mount(const char *pathname,
 	struct path path;
 	int err;
 
-	err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
 	if (err)
 		return err;
 	err = -ENOENT;
@@ -519,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 	if (!fp || param->ioctlfd == -1) {
 		if (autofs_type_any(type))
-			err = kern_path_mountpoint(AT_FDCWD,
-						   name, &path, LOOKUP_FOLLOW);
+			err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT,
+					&path);
 		else
 			err = find_autofs_mount(name, &path,
 						test_by_type, &type);
diff --git a/fs/internal.h b/fs/internal.h
index f3f280b952a3..b108a8eb75ca 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,7 +61,6 @@ extern int finish_clean_context(struct fs_context *fc);
  */
 extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
 			   struct path *path, struct path *root);
-extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
 long do_mknodat(int dfd, const char __user *filename, umode_t mode,
diff --git a/fs/namei.c b/fs/namei.c
index 8a673f44ffc2..0fcd738b95b9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2382,6 +2382,10 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 	if (!err && nd->flags & LOOKUP_DIRECTORY)
 		if (!d_can_lookup(nd->path.dentry))
 			err = -ENOTDIR;
+	if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
+		err = handle_lookup_down(nd);
+		nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please...
+	}
 	if (!err) {
 		*path = nd->path;
 		nd->path.mnt = NULL;
@@ -2410,7 +2414,8 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
 		retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path);
 
 	if (likely(!retval))
-		audit_inode(name, path->dentry, 0);
+		audit_inode(name, path->dentry,
+			    flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0);
 	restore_nameidata();
 	putname(name);
 	return retval;
@@ -2688,88 +2693,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 }
 EXPORT_SYMBOL(user_path_at_empty);
 
-/**
- * path_mountpoint - look up a path to be umounted
- * @nd:		lookup context
- * @flags:	lookup flags
- * @path:	pointer to container for result
- *
- * Look up the given name, but don't attempt to revalidate the last component.
- * Returns 0 and "path" will be valid on success; Returns error otherwise.
- */
-static int
-path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path)
-{
-	const char *s = path_init(nd, flags);
-	int err;
-
-	while (!(err = link_path_walk(s, nd)) &&
-		(err = lookup_last(nd)) > 0) {
-		s = trailing_symlink(nd);
-	}
-	if (!err && (nd->flags & LOOKUP_RCU))
-		err = unlazy_walk(nd);
-	if (!err)
-		err = handle_lookup_down(nd);
-	if (!err) {
-		*path = nd->path;
-		nd->path.mnt = NULL;
-		nd->path.dentry = NULL;
-	}
-	terminate_walk(nd);
-	return err;
-}
-
-static int
-filename_mountpoint(int dfd, struct filename *name, struct path *path,
-			unsigned int flags)
-{
-	struct nameidata nd;
-	int error;
-	if (IS_ERR(name))
-		return PTR_ERR(name);
-	set_nameidata(&nd, dfd, name);
-	error = path_mountpoint(&nd, flags | LOOKUP_RCU, path);
-	if (unlikely(error == -ECHILD))
-		error = path_mountpoint(&nd, flags, path);
-	if (unlikely(error == -ESTALE))
-		error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path);
-	if (likely(!error))
-		audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
-	restore_nameidata();
-	putname(name);
-	return error;
-}
-
-/**
- * user_path_mountpoint_at - lookup a path from userland in order to umount it
- * @dfd:	directory file descriptor
- * @name:	pathname from userland
- * @flags:	lookup flags
- * @path:	pointer to container to hold result
- *
- * A umount is a special case for path walking. We're not actually interested
- * in the inode in this situation, and ESTALE errors can be a problem. We
- * simply want track down the dentry and vfsmount attached at the mountpoint
- * and avoid revalidating the last component.
- *
- * Returns 0 and populates "path" on success.
- */
-int
-user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
-			struct path *path)
-{
-	return filename_mountpoint(dfd, getname(name), path, flags);
-}
-
-int
-kern_path_mountpoint(int dfd, const char *name, struct path *path,
-			unsigned int flags)
-{
-	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
-}
-EXPORT_SYMBOL(kern_path_mountpoint);
-
 int __check_sticky(struct inode *dir, struct inode *inode)
 {
 	kuid_t fsuid = current_fsuid();
diff --git a/fs/namespace.c b/fs/namespace.c
index 743980380a8f..a28e4db075ed 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1669,7 +1669,7 @@ int ksys_umount(char __user *name, int flags)
 	struct path path;
 	struct mount *mnt;
 	int retval;
-	int lookup_flags = 0;
+	int lookup_flags = LOOKUP_MOUNTPOINT;
 
 	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
 		return -EINVAL;
@@ -1680,7 +1680,7 @@ int ksys_umount(char __user *name, int flags)
 	if (!(flags & UMOUNT_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 
-	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
+	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
 	if (retval)
 		goto out;
 	mnt = real_mount(path.mnt);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 0dd980d7318f..d9576a051808 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -23,6 +23,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_AUTOMOUNT	0x0004  /* force terminal automount */
 #define LOOKUP_EMPTY		0x4000	/* accept empty path [user_... only] */
 #define LOOKUP_DOWN		0x8000	/* follow mounts in the starting point */
+#define LOOKUP_MOUNTPOINT	0x0080	/* follow mounts in the end */
 
 #define LOOKUP_REVAL		0x0020	/* tell ->d_revalidate() to trust no cache */
 #define LOOKUP_RCU		0x0040	/* RCU pathwalk mode; semi-internal */
@@ -64,7 +65,6 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne
 extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
-extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);

From a9dc1494a78291bba3bfd427d90024659c942803 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Jan 2020 13:54:30 -0500
Subject: [PATCH 135/327] expand the only remaining call of
 path_lookup_conditional()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 0fcd738b95b9..e3393511d426 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -858,13 +858,6 @@ static int set_root(struct nameidata *nd)
 	return 0;
 }
 
-static void path_put_conditional(struct path *path, struct nameidata *nd)
-{
-	dput(path->dentry);
-	if (path->mnt != nd->path.mnt)
-		mntput(path->mnt);
-}
-
 static inline void path_to_nameidata(const struct path *path,
 					struct nameidata *nd)
 {
@@ -1312,8 +1305,11 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 		ret = 1;
 	if (ret > 0 && unlikely(d_flags_negative(flags)))
 		ret = -ENOENT;
-	if (unlikely(ret < 0))
-		path_put_conditional(path, nd);
+	if (unlikely(ret < 0)) {
+		dput(path->dentry);
+		if (path->mnt != nd->path.mnt)
+			mntput(path->mnt);
+	}
 	return ret;
 }
 

From 43679723d27f5173a01fc343fb8bf61be5ae04ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 09:55:16 -0500
Subject: [PATCH 136/327] merging pick_link() with get_link(), part 1

Move restoring LOOKUP_PARENT and zeroing nd->stack.name[0] past
the call of get_link() (nothing _currently_ uses them in there).
That allows to moved the call of may_follow_link() into get_link()
as well, since now the presence of LOOKUP_PARENT distinguishes
the callers from each other (link_path_walk() has it, trailing_symlink()
doesn't).

Preparations for folding trailing_symlink() into callers (lookup_last()
and do_last()) and changing the calling conventions of those.  Next
stage after that will have get_link() call migrate into walk_component(),
then - into step_into().  It's tricky enough to warrant doing that
in stages, unfortunately...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index e3393511d426..d39be45f91bc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1115,6 +1115,12 @@ const char *get_link(struct nameidata *nd)
 	int error;
 	const char *res;
 
+	if (!(nd->flags & LOOKUP_PARENT)) {
+		error = may_follow_link(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+	}
+
 	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
 		return ERR_PTR(-ELOOP);
 
@@ -2329,13 +2335,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 static const char *trailing_symlink(struct nameidata *nd)
 {
-	const char *s;
-	int error = may_follow_link(nd);
-	if (unlikely(error))
-		return ERR_PTR(error);
+	const char *s = get_link(nd);
 	nd->flags |= LOOKUP_PARENT;
 	nd->stack[0].name = NULL;
-	s = get_link(nd);
 	return s ? s : "";
 }
 

From 1ccac622f9da58a113f9c5b8c9d07d76b60bc555 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 10:13:40 -0500
Subject: [PATCH 137/327] merging pick_link() with get_link(), part 2

Fold trailing_symlink() into lookup_last() and do_last(), change
the calling conventions of those two.  Rules change:
	success, we are done => NULL instead of 0
	error	=> ERR_PTR(-E...) instead of -E...
	got a symlink to follow => return the path to be followed instead of 1

The loops calling those (in path_lookupat() and path_openat()) adjusted.

A subtle change of control flow here: originally a pure-jump trailing
symlink ("/" or procfs one) would've passed through the upper level
loop once more, with "" for path to traverse.  That would've brought
us back to the lookup_last/do_last entry and we would've hit LAST_BIND
case (LAST_BIND left from get_link() called by trailing_symlink())
and pretty much skip to the point right after where we'd left the
sucker back when we picked that trailing symlink.

Now we don't bother with that extra pass through the upper level
loop - if get_link() says "I've just done a pure jump, nothing
else to do", we just treat that as non-symlink case.

Boilerplate added on that step will go away shortly - it'll migrate
into walk_component() and then to step_into(), collapsing into the
change of calling conventions for those.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 68 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index d39be45f91bc..9678bb2179e6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2333,21 +2333,26 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	return s;
 }
 
-static const char *trailing_symlink(struct nameidata *nd)
-{
-	const char *s = get_link(nd);
-	nd->flags |= LOOKUP_PARENT;
-	nd->stack[0].name = NULL;
-	return s ? s : "";
-}
-
-static inline int lookup_last(struct nameidata *nd)
+static inline const char *lookup_last(struct nameidata *nd)
 {
+	int err;
 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
 	nd->flags &= ~LOOKUP_PARENT;
-	return walk_component(nd, 0);
+	err = walk_component(nd, 0);
+	if (unlikely(err)) {
+		const char *s;
+		if (err < 0)
+			return PTR_ERR(err);
+		s = get_link(nd);
+		if (s) {
+			nd->flags |= LOOKUP_PARENT;
+			nd->stack[0].name = NULL;
+			return s;
+		}
+	}
+	return NULL;
 }
 
 static int handle_lookup_down(struct nameidata *nd)
@@ -2370,10 +2375,9 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path
 			s = ERR_PTR(err);
 	}
 
-	while (!(err = link_path_walk(s, nd))
-		&& ((err = lookup_last(nd)) > 0)) {
-		s = trailing_symlink(nd);
-	}
+	while (!(err = link_path_walk(s, nd)) &&
+	       (s = lookup_last(nd)) != NULL)
+		;
 	if (!err)
 		err = complete_walk(nd);
 
@@ -3183,7 +3187,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 /*
  * Handle the last step of open()
  */
-static int do_last(struct nameidata *nd,
+static const char *do_last(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
 	struct dentry *dir = nd->path.dentry;
@@ -3206,7 +3210,7 @@ static int do_last(struct nameidata *nd,
 			put_link(nd);
 		error = handle_dots(nd, nd->last_type);
 		if (unlikely(error))
-			return error;
+			return ERR_PTR(error);
 		goto finish_open;
 	}
 
@@ -3216,7 +3220,7 @@ static int do_last(struct nameidata *nd,
 		/* we _can_ be in RCU mode here */
 		dentry = lookup_fast(nd, &inode, &seq);
 		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
+			return ERR_CAST(dentry);
 		if (likely(dentry))
 			goto finish_lookup;
 
@@ -3231,12 +3235,12 @@ static int do_last(struct nameidata *nd,
 		 */
 		error = complete_walk(nd);
 		if (error)
-			return error;
+			return ERR_PTR(error);
 
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */
 		if (unlikely(nd->last.name[nd->last.len]))
-			return -EISDIR;
+			return ERR_PTR(-EISDIR);
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
@@ -3298,18 +3302,28 @@ static int do_last(struct nameidata *nd,
 	if (nd->depth)
 		put_link(nd);
 	error = step_into(nd, 0, dentry, inode, seq);
-	if (unlikely(error))
-		return error;
+	if (unlikely(error)) {
+		const char *s;
+		if (error < 0)
+			return ERR_PTR(error);
+		s = get_link(nd);
+		if (s) {
+			nd->flags |= LOOKUP_PARENT;
+			nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+			nd->stack[0].name = NULL;
+			return s;
+		}
+	}
 
 	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
 		audit_inode(nd->name, nd->path.dentry, 0);
-		return -EEXIST;
+		return ERR_PTR(-EEXIST);
 	}
 finish_open:
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 	error = complete_walk(nd);
 	if (error)
-		return error;
+		return ERR_PTR(error);
 	audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {
 		error = -EISDIR;
@@ -3351,7 +3365,7 @@ static int do_last(struct nameidata *nd,
 	}
 	if (got_write)
 		mnt_drop_write(nd->path.mnt);
-	return error;
+	return ERR_PTR(error);
 }
 
 struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
@@ -3454,10 +3468,8 @@ static struct file *path_openat(struct nameidata *nd,
 	} else {
 		const char *s = path_init(nd, flags);
 		while (!(error = link_path_walk(s, nd)) &&
-			(error = do_last(nd, file, op)) > 0) {
-			nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-			s = trailing_symlink(nd);
-		}
+		       (s = do_last(nd, file, op)) != NULL)
+			;
 		terminate_walk(nd);
 	}
 	if (likely(!error)) {

From 40fcf5a931af901198fcfb23a50354e54e1fa7a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 13:07:57 -0500
Subject: [PATCH 138/327] merging pick_link() with get_link(), part 3

After a pure jump ("/" or procfs-style symlink) we don't need to
hold the link anymore.  link_path_walk() dropped it if such case
had been detected, lookup_last/do_last() (i.e. old trailing_symlink())
left it on the stack - it ended up calling terminate_walk() shortly
anyway, which would've purged the entire stack.

Do it in get_link() itself instead.  Simpler logics that way...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 9678bb2179e6..f844378ce908 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1154,7 +1154,9 @@ const char *get_link(struct nameidata *nd)
 		} else {
 			res = get(dentry, inode, &last->done);
 		}
-		if (IS_ERR_OR_NULL(res))
+		if (!res)
+			goto all_done;
+		if (IS_ERR(res))
 			return res;
 	}
 	if (*res == '/') {
@@ -1164,9 +1166,11 @@ const char *get_link(struct nameidata *nd)
 		while (unlikely(*++res == '/'))
 			;
 	}
-	if (!*res)
-		res = NULL;
-	return res;
+	if (*res)
+		return res;
+all_done: // pure jump
+	put_link(nd);
+	return NULL;
 }
 
 /*
@@ -2211,11 +2215,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 			if (IS_ERR(s))
 				return PTR_ERR(s);
-			err = 0;
-			if (unlikely(!s)) {
-				/* jumped */
-				put_link(nd);
-			} else {
+			if (likely(s)) {
 				nd->stack[nd->depth - 1].name = name;
 				name = s;
 				continue;

From 92d270165cffb57a91e036bea2f4d6f2f8522cf1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 13:24:17 -0500
Subject: [PATCH 139/327] merging pick_link() with get_link(), part 4

Move the call of get_link() into walk_component().  Change the
calling conventions for walk_component() to returning the link
body to follow (if any).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 59 ++++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index f844378ce908..c8a933c2f14a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1866,7 +1866,7 @@ static int step_into(struct nameidata *nd, int flags,
 	return pick_link(nd, &path, inode, seq);
 }
 
-static int walk_component(struct nameidata *nd, int flags)
+static const char *walk_component(struct nameidata *nd, int flags)
 {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -1881,19 +1881,24 @@ static int walk_component(struct nameidata *nd, int flags)
 		if (!(flags & WALK_MORE) && nd->depth)
 			put_link(nd);
 		err = handle_dots(nd, nd->last_type);
-		return err;
+		return ERR_PTR(err);
 	}
 	dentry = lookup_fast(nd, &inode, &seq);
 	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
+		return ERR_CAST(dentry);
 	if (unlikely(!dentry)) {
 		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
 		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
+			return ERR_CAST(dentry);
 	}
 	if (!(flags & WALK_MORE) && nd->depth)
 		put_link(nd);
-	return step_into(nd, flags, dentry, inode, seq);
+	err = step_into(nd, flags, dentry, inode, seq);
+	if (!err)
+		return NULL;
+	if (err > 0)
+		return get_link(nd);
+	return ERR_PTR(err);
 }
 
 /*
@@ -2145,6 +2150,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
+		const char *link;
 		u64 hash_len;
 		int type;
 
@@ -2202,24 +2208,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (!name)
 				return 0;
 			/* last component of nested symlink */
-			err = walk_component(nd, WALK_FOLLOW);
+			link = walk_component(nd, WALK_FOLLOW);
 		} else {
 			/* not the last component */
-			err = walk_component(nd, WALK_FOLLOW | WALK_MORE);
+			link = walk_component(nd, WALK_FOLLOW | WALK_MORE);
 		}
-		if (err < 0)
-			return err;
-
-		if (err) {
-			const char *s = get_link(nd);
-
-			if (IS_ERR(s))
-				return PTR_ERR(s);
-			if (likely(s)) {
-				nd->stack[nd->depth - 1].name = name;
-				name = s;
-				continue;
-			}
+		if (unlikely(link)) {
+			if (IS_ERR(link))
+				return PTR_ERR(link);
+			/* a symlink to follow */
+			nd->stack[nd->depth - 1].name = name;
+			name = link;
+			continue;
 		}
 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
 			if (nd->flags & LOOKUP_RCU) {
@@ -2335,24 +2335,17 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 static inline const char *lookup_last(struct nameidata *nd)
 {
-	int err;
+	const char *link;
 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
 	nd->flags &= ~LOOKUP_PARENT;
-	err = walk_component(nd, 0);
-	if (unlikely(err)) {
-		const char *s;
-		if (err < 0)
-			return PTR_ERR(err);
-		s = get_link(nd);
-		if (s) {
-			nd->flags |= LOOKUP_PARENT;
-			nd->stack[0].name = NULL;
-			return s;
-		}
+	link = walk_component(nd, 0);
+	if (link) {
+		nd->flags |= LOOKUP_PARENT;
+		nd->stack[0].name = NULL;
 	}
-	return NULL;
+	return link;
 }
 
 static int handle_lookup_down(struct nameidata *nd)

From b0417d2c729891eb2566d58a6d63b61a1ff38141 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 13:34:20 -0500
Subject: [PATCH 140/327] merging pick_link() with get_link(), part 5

move get_link() call into step_into().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 43 ++++++++++++++++++-------------------------
 1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c8a933c2f14a..2c8a814f3193 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1841,14 +1841,14 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
  * so we keep a cache of "no, this doesn't need follow_link"
  * for the common case.
  */
-static int step_into(struct nameidata *nd, int flags,
+static const char *step_into(struct nameidata *nd, int flags,
 		     struct dentry *dentry, struct inode *inode, unsigned seq)
 {
 	struct path path;
 	int err = handle_mounts(nd, dentry, &path, &inode, &seq);
 
 	if (err < 0)
-		return err;
+		return ERR_PTR(err);
 	if (likely(!d_is_symlink(path.dentry)) ||
 	   !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) ||
 	   (flags & WALK_NOFOLLOW)) {
@@ -1856,14 +1856,17 @@ static int step_into(struct nameidata *nd, int flags,
 		path_to_nameidata(&path, nd);
 		nd->inode = inode;
 		nd->seq = seq;
-		return 0;
+		return NULL;
 	}
 	/* make sure that d_is_symlink above matches inode */
 	if (nd->flags & LOOKUP_RCU) {
 		if (read_seqcount_retry(&path.dentry->d_seq, seq))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 	}
-	return pick_link(nd, &path, inode, seq);
+	err = pick_link(nd, &path, inode, seq);
+	if (err > 0)
+		return get_link(nd);
+	return ERR_PTR(err);
 }
 
 static const char *walk_component(struct nameidata *nd, int flags)
@@ -1893,12 +1896,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
 	}
 	if (!(flags & WALK_MORE) && nd->depth)
 		put_link(nd);
-	err = step_into(nd, flags, dentry, inode, seq);
-	if (!err)
-		return NULL;
-	if (err > 0)
-		return get_link(nd);
-	return ERR_PTR(err);
+	return step_into(nd, flags, dentry, inode, seq);
 }
 
 /*
@@ -2352,8 +2350,8 @@ static int handle_lookup_down(struct nameidata *nd)
 {
 	if (!(nd->flags & LOOKUP_RCU))
 		dget(nd->path.dentry);
-	return step_into(nd, WALK_NOFOLLOW,
-			nd->path.dentry, nd->inode, nd->seq);
+	return PTR_ERR(step_into(nd, WALK_NOFOLLOW,
+			nd->path.dentry, nd->inode, nd->seq));
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
@@ -3193,6 +3191,7 @@ static const char *do_last(struct nameidata *nd,
 	unsigned seq;
 	struct inode *inode;
 	struct dentry *dentry;
+	const char *res;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -3294,18 +3293,12 @@ static const char *do_last(struct nameidata *nd,
 finish_lookup:
 	if (nd->depth)
 		put_link(nd);
-	error = step_into(nd, 0, dentry, inode, seq);
-	if (unlikely(error)) {
-		const char *s;
-		if (error < 0)
-			return ERR_PTR(error);
-		s = get_link(nd);
-		if (s) {
-			nd->flags |= LOOKUP_PARENT;
-			nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-			nd->stack[0].name = NULL;
-			return s;
-		}
+	res = step_into(nd, 0, dentry, inode, seq);
+	if (unlikely(res)) {
+		nd->flags |= LOOKUP_PARENT;
+		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+		nd->stack[0].name = NULL;
+		return res;
 	}
 
 	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {

From 06708adb99e88d02556fb546b5c70ff8566bfce9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 14:26:57 -0500
Subject: [PATCH 141/327] merging pick_link() with get_link(), part 6

move the only remaining call of get_link() into pick_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2c8a814f3193..f5959f65a70e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1793,14 +1793,14 @@ static inline int handle_dots(struct nameidata *nd, int type)
 	return 0;
 }
 
-static int pick_link(struct nameidata *nd, struct path *link,
+static const char *pick_link(struct nameidata *nd, struct path *link,
 		     struct inode *inode, unsigned seq)
 {
 	int error;
 	struct saved *last;
 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
 		path_to_nameidata(link, nd);
-		return -ELOOP;
+		return ERR_PTR(-ELOOP);
 	}
 	if (!(nd->flags & LOOKUP_RCU)) {
 		if (link->mnt == nd->path.mnt)
@@ -1821,7 +1821,7 @@ static int pick_link(struct nameidata *nd, struct path *link,
 		}
 		if (error) {
 			path_put(link);
-			return error;
+			return ERR_PTR(error);
 		}
 	}
 
@@ -1830,7 +1830,7 @@ static int pick_link(struct nameidata *nd, struct path *link,
 	clear_delayed_call(&last->done);
 	nd->link_inode = inode;
 	last->seq = seq;
-	return 1;
+	return get_link(nd);
 }
 
 enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
@@ -1863,10 +1863,7 @@ static const char *step_into(struct nameidata *nd, int flags,
 		if (read_seqcount_retry(&path.dentry->d_seq, seq))
 			return ERR_PTR(-ECHILD);
 	}
-	err = pick_link(nd, &path, inode, seq);
-	if (err > 0)
-		return get_link(nd);
-	return ERR_PTR(err);
+	return pick_link(nd, &path, inode, seq);
 }
 
 static const char *walk_component(struct nameidata *nd, int flags)

From ad6cc4c338f4e5124c29971ea749073a697063a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 14 Jan 2020 14:41:39 -0500
Subject: [PATCH 142/327] finally fold get_link() into pick_link()

kill nd->link_inode, while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 135 ++++++++++++++++++++++++-----------------------------
 1 file changed, 61 insertions(+), 74 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index f5959f65a70e..02f148124831 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -503,7 +503,6 @@ struct nameidata {
 	} *stack, internal[EMBEDDED_LEVELS];
 	struct filename	*name;
 	struct nameidata *saved;
-	struct inode	*link_inode;
 	unsigned	root_seq;
 	int		dfd;
 } __randomize_layout;
@@ -962,9 +961,8 @@ int sysctl_protected_regular __read_mostly;
  *
  * Returns 0 if following the symlink is allowed, -ve on error.
  */
-static inline int may_follow_link(struct nameidata *nd)
+static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
 {
-	const struct inode *inode;
 	const struct inode *parent;
 	kuid_t puid;
 
@@ -972,7 +970,6 @@ static inline int may_follow_link(struct nameidata *nd)
 		return 0;
 
 	/* Allowed if owner and follower match. */
-	inode = nd->link_inode;
 	if (uid_eq(current_cred()->fsuid, inode->i_uid))
 		return 0;
 
@@ -1106,73 +1103,6 @@ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
 	return 0;
 }
 
-static __always_inline
-const char *get_link(struct nameidata *nd)
-{
-	struct saved *last = nd->stack + nd->depth - 1;
-	struct dentry *dentry = last->link.dentry;
-	struct inode *inode = nd->link_inode;
-	int error;
-	const char *res;
-
-	if (!(nd->flags & LOOKUP_PARENT)) {
-		error = may_follow_link(nd);
-		if (unlikely(error))
-			return ERR_PTR(error);
-	}
-
-	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
-		return ERR_PTR(-ELOOP);
-
-	if (!(nd->flags & LOOKUP_RCU)) {
-		touch_atime(&last->link);
-		cond_resched();
-	} else if (atime_needs_update(&last->link, inode)) {
-		if (unlikely(unlazy_walk(nd)))
-			return ERR_PTR(-ECHILD);
-		touch_atime(&last->link);
-	}
-
-	error = security_inode_follow_link(dentry, inode,
-					   nd->flags & LOOKUP_RCU);
-	if (unlikely(error))
-		return ERR_PTR(error);
-
-	nd->last_type = LAST_BIND;
-	res = READ_ONCE(inode->i_link);
-	if (!res) {
-		const char * (*get)(struct dentry *, struct inode *,
-				struct delayed_call *);
-		get = inode->i_op->get_link;
-		if (nd->flags & LOOKUP_RCU) {
-			res = get(NULL, inode, &last->done);
-			if (res == ERR_PTR(-ECHILD)) {
-				if (unlikely(unlazy_walk(nd)))
-					return ERR_PTR(-ECHILD);
-				res = get(dentry, inode, &last->done);
-			}
-		} else {
-			res = get(dentry, inode, &last->done);
-		}
-		if (!res)
-			goto all_done;
-		if (IS_ERR(res))
-			return res;
-	}
-	if (*res == '/') {
-		error = nd_jump_root(nd);
-		if (unlikely(error))
-			return ERR_PTR(error);
-		while (unlikely(*++res == '/'))
-			;
-	}
-	if (*res)
-		return res;
-all_done: // pure jump
-	put_link(nd);
-	return NULL;
-}
-
 /*
  * follow_up - Find the mountpoint of path's vfsmount
  *
@@ -1796,8 +1726,10 @@ static inline int handle_dots(struct nameidata *nd, int type)
 static const char *pick_link(struct nameidata *nd, struct path *link,
 		     struct inode *inode, unsigned seq)
 {
-	int error;
 	struct saved *last;
+	const char *res;
+	int error;
+
 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
 		path_to_nameidata(link, nd);
 		return ERR_PTR(-ELOOP);
@@ -1828,9 +1760,64 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	last = nd->stack + nd->depth++;
 	last->link = *link;
 	clear_delayed_call(&last->done);
-	nd->link_inode = inode;
 	last->seq = seq;
-	return get_link(nd);
+
+	if (!(nd->flags & LOOKUP_PARENT)) {
+		error = may_follow_link(nd, inode);
+		if (unlikely(error))
+			return ERR_PTR(error);
+	}
+
+	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+		return ERR_PTR(-ELOOP);
+
+	if (!(nd->flags & LOOKUP_RCU)) {
+		touch_atime(&last->link);
+		cond_resched();
+	} else if (atime_needs_update(&last->link, inode)) {
+		if (unlikely(unlazy_walk(nd)))
+			return ERR_PTR(-ECHILD);
+		touch_atime(&last->link);
+	}
+
+	error = security_inode_follow_link(link->dentry, inode,
+					   nd->flags & LOOKUP_RCU);
+	if (unlikely(error))
+		return ERR_PTR(error);
+
+	nd->last_type = LAST_BIND;
+	res = READ_ONCE(inode->i_link);
+	if (!res) {
+		const char * (*get)(struct dentry *, struct inode *,
+				struct delayed_call *);
+		get = inode->i_op->get_link;
+		if (nd->flags & LOOKUP_RCU) {
+			res = get(NULL, inode, &last->done);
+			if (res == ERR_PTR(-ECHILD)) {
+				if (unlikely(unlazy_walk(nd)))
+					return ERR_PTR(-ECHILD);
+				res = get(link->dentry, inode, &last->done);
+			}
+		} else {
+			res = get(link->dentry, inode, &last->done);
+		}
+		if (!res)
+			goto all_done;
+		if (IS_ERR(res))
+			return res;
+	}
+	if (*res == '/') {
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+		while (unlikely(*++res == '/'))
+			;
+	}
+	if (*res)
+		return res;
+all_done: // pure jump
+	put_link(nd);
+	return NULL;
 }
 
 enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

From b4c0353693d22f9dcfa4757262dab0aab8376d19 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jan 2020 11:44:51 -0500
Subject: [PATCH 143/327] sanitize handling of nd->last_type, kill LAST_BIND

->last_type values are set in 3 places: path_init() (sets to LAST_ROOT),
link_path_walk (LAST_NORM/DOT/DOTDOT) and pick_link (LAST_BIND).

The are checked in walk_component(), lookup_last() and do_last().
They also get copied to the caller by filename_parentat().  In the last
3 cases the value is what we had at the return from link_path_walk().
In case of walk_component() it's either directly downstream from
assignment in link_path_walk() or, when called by lookup_last(), the
value we have at the return from link_path_walk().

The value at the entry into link_path_walk() can survive to return only
if the pathname contains nothing but slashes.  Note that pick_link()
never returns such - pure jumps are handled directly.  So for the calls
of link_path_walk() for trailing symlinks it does not matter what value
had been there at the entry; the value at the return won't depend upon it.

There are 3 call chains that might have pick_link() storing LAST_BIND:

1) pick_link() from step_into() from walk_component() from
link_path_walk().  In that case we will either be parsing the next
component immediately after return into link_path_walk(), which will
overwrite the ->last_type before anyone has a chance to look at it,
or we'll fail, in which case nobody will be looking at ->last_type at all.

2) pick_link() from step_into() from walk_component() from lookup_last().
The value is never looked at due to the above; it won't affect the value
seen at return from any link_path_walk().

3) pick_link() from step_into() from do_last().  Ditto.

In other words, assignemnt in pick_link() is pointless, and so is
LAST_BIND itself; nothing ever looks at that value.  Kill it off.
And make link_path_walk() _always_ assign ->last_type - in the only
case when the value at the entry might survive to the return that value
is always LAST_ROOT, inherited from path_init().  Move that assignment
from path_init() into the beginning of link_path_walk(), to consolidate
the things.

Historical note: LAST_BIND used to be used for the kludge with trailing
pure jump symlinks (extra iteration through the top-level loop).
No point keeping it anymore...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/path-lookup.rst | 7 ++-----
 fs/namei.c                                | 3 +--
 include/linux/namei.h                     | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst
index a3216979298b..f46b05e9b96c 100644
--- a/Documentation/filesystems/path-lookup.rst
+++ b/Documentation/filesystems/path-lookup.rst
@@ -404,11 +404,8 @@ that is the "next" component in the pathname.
 ``int last_type``
 ~~~~~~~~~~~~~~~~~
 
-This is one of ``LAST_NORM``, ``LAST_ROOT``, ``LAST_DOT``, ``LAST_DOTDOT``, or
-``LAST_BIND``.  The ``last`` field is only valid if the type is
-``LAST_NORM``.  ``LAST_BIND`` is used when following a symlink and no
-components of the symlink have been processed yet.  Others should be
-fairly self-explanatory.
+This is one of ``LAST_NORM``, ``LAST_ROOT``, ``LAST_DOT`` or ``LAST_DOTDOT``.
+The ``last`` field is only valid if the type is ``LAST_NORM``.
 
 ``struct path root``
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/fs/namei.c b/fs/namei.c
index 02f148124831..1a83641e95e6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1785,7 +1785,6 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	if (unlikely(error))
 		return ERR_PTR(error);
 
-	nd->last_type = LAST_BIND;
 	res = READ_ONCE(inode->i_link);
 	if (!res) {
 		const char * (*get)(struct dentry *, struct inode *,
@@ -2123,6 +2122,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 {
 	int err;
 
+	nd->last_type = LAST_ROOT;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 	while (*name=='/')
@@ -2224,7 +2224,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	if (flags & LOOKUP_RCU)
 		rcu_read_lock();
 
-	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
 	nd->depth = 0;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d9576a051808..a4bb992623c4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -15,7 +15,7 @@ enum { MAX_NESTED_LINKS = 8 };
 /*
  * Type of the last component on LOOKUP_PARENT
  */
-enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 
 /* pathwalk mode */
 #define LOOKUP_FOLLOW		0x0001	/* follow links at the end */

From 8c4efe22e7c4de1d44f624120a979e31e3a584b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jan 2020 12:44:18 -0500
Subject: [PATCH 144/327] namei: invert the meaning of WALK_FOLLOW

old flags & WALK_FOLLOW <=> new !(flags & WALK_TRAILING)
That's what that flag had really been used for.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1a83641e95e6..5d25a3874a5b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1819,7 +1819,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	return NULL;
 }
 
-enum {WALK_FOLLOW = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
+enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
 /*
  * Do we need to follow links? We _really_ want to be able
@@ -1836,7 +1836,7 @@ static const char *step_into(struct nameidata *nd, int flags,
 	if (err < 0)
 		return ERR_PTR(err);
 	if (likely(!d_is_symlink(path.dentry)) ||
-	   !((flags & WALK_FOLLOW) || (nd->flags & LOOKUP_FOLLOW)) ||
+	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
 	   (flags & WALK_NOFOLLOW)) {
 		/* not a symlink or should not follow */
 		path_to_nameidata(&path, nd);
@@ -2190,10 +2190,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (!name)
 				return 0;
 			/* last component of nested symlink */
-			link = walk_component(nd, WALK_FOLLOW);
+			link = walk_component(nd, 0);
 		} else {
 			/* not the last component */
-			link = walk_component(nd, WALK_FOLLOW | WALK_MORE);
+			link = walk_component(nd, WALK_MORE);
 		}
 		if (unlikely(link)) {
 			if (IS_ERR(link))
@@ -2321,7 +2321,7 @@ static inline const char *lookup_last(struct nameidata *nd)
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
 	nd->flags &= ~LOOKUP_PARENT;
-	link = walk_component(nd, 0);
+	link = walk_component(nd, WALK_TRAILING);
 	if (link) {
 		nd->flags |= LOOKUP_PARENT;
 		nd->stack[0].name = NULL;
@@ -3276,7 +3276,7 @@ static const char *do_last(struct nameidata *nd,
 finish_lookup:
 	if (nd->depth)
 		put_link(nd);
-	res = step_into(nd, 0, dentry, inode, seq);
+	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
 	if (unlikely(res)) {
 		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);

From b1a819724074f96bfd41c8f7825c3974b91fe518 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Jan 2020 12:48:44 -0500
Subject: [PATCH 145/327] pick_link(): check for WALK_TRAILING, not
 LOOKUP_PARENT

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 5d25a3874a5b..ff028f12cb95 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1723,8 +1723,10 @@ static inline int handle_dots(struct nameidata *nd, int type)
 	return 0;
 }
 
+enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
+
 static const char *pick_link(struct nameidata *nd, struct path *link,
-		     struct inode *inode, unsigned seq)
+		     struct inode *inode, unsigned seq, int flags)
 {
 	struct saved *last;
 	const char *res;
@@ -1762,7 +1764,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	clear_delayed_call(&last->done);
 	last->seq = seq;
 
-	if (!(nd->flags & LOOKUP_PARENT)) {
+	if (flags & WALK_TRAILING) {
 		error = may_follow_link(nd, inode);
 		if (unlikely(error))
 			return ERR_PTR(error);
@@ -1819,8 +1821,6 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	return NULL;
 }
 
-enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
-
 /*
  * Do we need to follow links? We _really_ want to be able
  * to do this check without having to look at inode->i_op,
@@ -1849,7 +1849,7 @@ static const char *step_into(struct nameidata *nd, int flags,
 		if (read_seqcount_retry(&path.dentry->d_seq, seq))
 			return ERR_PTR(-ECHILD);
 	}
-	return pick_link(nd, &path, inode, seq);
+	return pick_link(nd, &path, inode, seq, flags);
 }
 
 static const char *walk_component(struct nameidata *nd, int flags)

From d8d4611a4f2dd6be8fe3ddb5cf8eea7f1dfe0787 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Feb 2020 22:04:15 -0500
Subject: [PATCH 146/327] link_path_walk(): simplify stack handling

We use nd->stack to store two things: pinning down the symlinks
we are resolving and resuming the name traversal when a nested
symlink is finished.

Currently, nd->depth is used to keep track of both.  It's 0 when
we call link_path_walk() for the first time (for the pathname
itself) and 1 on all subsequent calls (for trailing symlinks,
if any).  That's fine, as far as pinning symlinks goes - when
handling a trailing symlink, the string we are interpreting
is the body of symlink pinned down in nd->stack[0].  It's
rather inconvenient with respect to handling nested symlinks,
though - when we run out of a string we are currently interpreting,
we need to decide whether it's a nested symlink (in which case
we need to pick the string saved back when we started to interpret
that nested symlink and resume its traversal) or not (in which
case we are done with link_path_walk()).

Current solution is a bit of a kludge - in handling of trailing symlink
(in lookup_last() and open_last_lookups() we clear nd->stack[0].name.
That allows link_path_walk() to use the following rules when
running out of a string to interpret:
	* if nd->depth is zero, we are at the end of pathname itself.
	* if nd->depth is positive, check the saved string; for
nested symlink it will be non-NULL, for trailing symlink - NULL.

It works, but it's rather non-obvious.  Note that we have two sets:
the set of symlinks currently being traversed and the set of postponed
pathname tails.  The former is stored in nd->stack[0..nd->depth-1].link
and it's valid throught the pathname resolution; the latter is valid only
during an individual call of link_path_walk() and it occupies
nd->stack[0..nd->depth-1].name for the first call of link_path_walk() and
nd->stack[1..nd->depth-1].name for subsequent ones.  The kludge is basically
a way to recognize the second set becoming empty.

The things get simpler if we keep track of the second set's size
explicitly and always store it in nd->stack[0..depth-1].name.
We access the second set only inside link_path_walk(), so its
size can live in a local variable; that way the check becomes
trivial without the need of that kludge.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index ff028f12cb95..04c1d798013f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2120,6 +2120,7 @@ static inline u64 hash_name(const void *salt, const char *name)
  */
 static int link_path_walk(const char *name, struct nameidata *nd)
 {
+	int depth = 0; // depth <= nd->depth
 	int err;
 
 	nd->last_type = LAST_ROOT;
@@ -2182,14 +2183,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		} while (unlikely(*name == '/'));
 		if (unlikely(!*name)) {
 OK:
-			/* pathname body, done */
-			if (!nd->depth)
-				return 0;
-			name = nd->stack[nd->depth - 1].name;
-			/* trailing symlink, done */
-			if (!name)
+			/* pathname or trailing symlink, done */
+			if (!depth)
 				return 0;
 			/* last component of nested symlink */
+			name = nd->stack[--depth].name;
 			link = walk_component(nd, 0);
 		} else {
 			/* not the last component */
@@ -2199,7 +2197,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (IS_ERR(link))
 				return PTR_ERR(link);
 			/* a symlink to follow */
-			nd->stack[nd->depth - 1].name = name;
+			nd->stack[depth++].name = name;
 			name = link;
 			continue;
 		}
@@ -2324,7 +2322,6 @@ static inline const char *lookup_last(struct nameidata *nd)
 	link = walk_component(nd, WALK_TRAILING);
 	if (link) {
 		nd->flags |= LOOKUP_PARENT;
-		nd->stack[0].name = NULL;
 	}
 	return link;
 }
@@ -3280,7 +3277,6 @@ static const char *do_last(struct nameidata *nd,
 	if (unlikely(res)) {
 		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		nd->stack[0].name = NULL;
 		return res;
 	}
 

From c108837e06b659fd1dd144c415a25b7ac339926b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Mar 2020 15:48:44 -0500
Subject: [PATCH 147/327] namei: have link_path_walk() maintain LOOKUP_PARENT

set on entry, clear when we get to the last component.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 04c1d798013f..b45ec86dc7b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2124,6 +2124,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	int err;
 
 	nd->last_type = LAST_ROOT;
+	nd->flags |= LOOKUP_PARENT;
 	if (IS_ERR(name))
 		return PTR_ERR(name);
 	while (*name=='/')
@@ -2184,8 +2185,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (unlikely(!*name)) {
 OK:
 			/* pathname or trailing symlink, done */
-			if (!depth)
+			if (!depth) {
+				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
+			}
 			/* last component of nested symlink */
 			name = nd->stack[--depth].name;
 			link = walk_component(nd, 0);
@@ -2222,7 +2225,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	if (flags & LOOKUP_RCU)
 		rcu_read_lock();
 
-	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 
 	nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
@@ -2314,16 +2317,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 static inline const char *lookup_last(struct nameidata *nd)
 {
-	const char *link;
 	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
 		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 
-	nd->flags &= ~LOOKUP_PARENT;
-	link = walk_component(nd, WALK_TRAILING);
-	if (link) {
-		nd->flags |= LOOKUP_PARENT;
-	}
-	return link;
+	return walk_component(nd, WALK_TRAILING);
 }
 
 static int handle_lookup_down(struct nameidata *nd)
@@ -3174,7 +3171,6 @@ static const char *do_last(struct nameidata *nd,
 	const char *res;
 	int error;
 
-	nd->flags &= ~LOOKUP_PARENT;
 	nd->flags |= op->intent;
 
 	if (nd->last_type != LAST_NORM) {
@@ -3275,7 +3271,6 @@ static const char *do_last(struct nameidata *nd,
 		put_link(nd);
 	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
 	if (unlikely(res)) {
-		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
 		return res;
 	}

From ea936aeb3ead0551945c85ff7a7d77d9a7259664 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 16 Jan 2020 09:52:04 -0500
Subject: [PATCH 148/327] massage __follow_mount_rcu() a bit

make the loop more similar to that in follow_managed(), with
explicit tracking of flags, etc.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 70 +++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index b45ec86dc7b3..18c46b8db244 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1269,12 +1269,6 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
-static inline int managed_dentry_rcu(const struct path *path)
-{
-	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
-		path->dentry->d_op->d_manage(path, true) : 0;
-}
-
 /*
  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
  * we meet a managed dentry that would need blocking.
@@ -1282,43 +1276,49 @@ static inline int managed_dentry_rcu(const struct path *path)
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			       struct inode **inode, unsigned *seqp)
 {
+	struct dentry *dentry = path->dentry;
+	unsigned int flags = dentry->d_flags;
+
+	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
+		return true;
+
+	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+		return false;
+
 	for (;;) {
-		struct mount *mounted;
 		/*
 		 * Don't forget we might have a non-mountpoint managed dentry
 		 * that wants to block transit.
 		 */
-		switch (managed_dentry_rcu(path)) {
-		case -ECHILD:
-		default:
-			return false;
-		case -EISDIR:
-			return true;
-		case 0:
-			break;
+		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
+			int res = dentry->d_op->d_manage(path, true);
+			if (res)
+				return res == -EISDIR;
+			flags = dentry->d_flags;
 		}
 
-		if (!d_mountpoint(path->dentry))
-			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
-
-		mounted = __lookup_mnt(path->mnt, path->dentry);
-		if (!mounted)
-			break;
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return false;
-		path->mnt = &mounted->mnt;
-		path->dentry = mounted->mnt.mnt_root;
-		nd->flags |= LOOKUP_JUMPED;
-		*seqp = read_seqcount_begin(&path->dentry->d_seq);
-		/*
-		 * Update the inode too. We don't need to re-check the
-		 * dentry sequence number here after this d_inode read,
-		 * because a mount-point is always pinned.
-		 */
-		*inode = path->dentry->d_inode;
+		if (flags & DCACHE_MOUNTED) {
+			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
+			if (mounted) {
+				path->mnt = &mounted->mnt;
+				dentry = path->dentry = mounted->mnt.mnt_root;
+				nd->flags |= LOOKUP_JUMPED;
+				*seqp = read_seqcount_begin(&dentry->d_seq);
+				*inode = dentry->d_inode;
+				/*
+				 * We don't need to re-check ->d_seq after this
+				 * ->d_inode read - there will be an RCU delay
+				 * between mount hash removal and ->mnt_root
+				 * becoming unpinned.
+				 */
+				flags = dentry->d_flags;
+				continue;
+			}
+			if (read_seqretry(&mount_lock, nd->m_seq))
+				return false;
+		}
+		return !(flags & DCACHE_NEED_AUTOMOUNT);
 	}
-	return !read_seqretry(&mount_lock, nd->m_seq) &&
-		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
 static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,

From 9deed3ebca244663530782631834e706a86a8c8f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 17 Jan 2020 08:45:08 -0500
Subject: [PATCH 149/327] new helper: traverse_mounts()

common guts of follow_down() and follow_managed() taken to a new
helper - traverse_mounts().  The remnants of follow_managed()
are folded into its sole remaining caller (handle_mounts()).
Calling conventions of handle_mounts() slightly sanitized -
instead of the weird "1 for success, -E... for failure" that used
to be imposed by the calling conventions of walk_component() et.al.
we can use the normal "0 for success, -E... for failure".

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 177 ++++++++++++++++++++++-------------------------------
 1 file changed, 72 insertions(+), 105 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 18c46b8db244..40d5f7abfa54 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1168,91 +1168,79 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags
 }
 
 /*
- * Handle a dentry that is managed in some way.
- * - Flagged for transit management (autofs)
- * - Flagged as mountpoint
- * - Flagged as automount point
- *
- * This may only be called in refwalk mode.
- * On success path->dentry is known positive.
- *
- * Serialization is taken care of in namespace.c
+ * mount traversal - out-of-line part.  One note on ->d_flags accesses -
+ * dentries are pinned but not locked here, so negative dentry can go
+ * positive right under us.  Use of smp_load_acquire() provides a barrier
+ * sufficient for ->d_inode and ->d_flags consistency.
  */
-static int follow_managed(struct path *path, struct nameidata *nd)
+static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
+			     int *count, unsigned lookup_flags)
 {
-	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
-	unsigned flags;
+	struct vfsmount *mnt = path->mnt;
 	bool need_mntput = false;
 	int ret = 0;
 
-	/* Given that we're not holding a lock here, we retain the value in a
-	 * local variable for each dentry as we look at it so that we don't see
-	 * the components of that value change under us */
-	while (flags = smp_load_acquire(&path->dentry->d_flags),
-	       unlikely(flags & DCACHE_MANAGED_DENTRY)) {
+	while (flags & DCACHE_MANAGED_DENTRY) {
 		/* Allow the filesystem to manage the transit without i_mutex
 		 * being held. */
 		if (flags & DCACHE_MANAGE_TRANSIT) {
-			BUG_ON(!path->dentry->d_op);
-			BUG_ON(!path->dentry->d_op->d_manage);
 			ret = path->dentry->d_op->d_manage(path, false);
 			flags = smp_load_acquire(&path->dentry->d_flags);
 			if (ret < 0)
 				break;
 		}
 
-		/* Transit to a mounted filesystem. */
-		if (flags & DCACHE_MOUNTED) {
+		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
 			struct vfsmount *mounted = lookup_mnt(path);
-			if (mounted) {
+			if (mounted) {		// ... in our namespace
 				dput(path->dentry);
 				if (need_mntput)
 					mntput(path->mnt);
 				path->mnt = mounted;
 				path->dentry = dget(mounted->mnt_root);
+				// here we know it's positive
+				flags = path->dentry->d_flags;
 				need_mntput = true;
 				continue;
 			}
-
-			/* Something is mounted on this dentry in another
-			 * namespace and/or whatever was mounted there in this
-			 * namespace got unmounted before lookup_mnt() could
-			 * get it */
 		}
 
-		/* Handle an automount point */
-		if (flags & DCACHE_NEED_AUTOMOUNT) {
-			ret = follow_automount(path, &nd->total_link_count,
-						nd->flags);
-			if (ret < 0)
-				break;
-			continue;
-		}
+		if (!(flags & DCACHE_NEED_AUTOMOUNT))
+			break;
 
-		/* We didn't change the current path point */
-		break;
+		// uncovered automount point
+		ret = follow_automount(path, count, lookup_flags);
+		flags = smp_load_acquire(&path->dentry->d_flags);
+		if (ret < 0)
+			break;
 	}
 
-	if (need_mntput) {
-		if (path->mnt == mnt)
-			mntput(path->mnt);
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			ret = -EXDEV;
-		else
-			nd->flags |= LOOKUP_JUMPED;
-	}
-	if (ret == -EISDIR || !ret)
-		ret = 1;
-	if (ret > 0 && unlikely(d_flags_negative(flags)))
+	if (ret == -EISDIR)
+		ret = 0;
+	// possible if you race with several mount --move
+	if (need_mntput && path->mnt == mnt)
+		mntput(path->mnt);
+	if (!ret && unlikely(d_flags_negative(flags)))
 		ret = -ENOENT;
-	if (unlikely(ret < 0)) {
-		dput(path->dentry);
-		if (path->mnt != nd->path.mnt)
-			mntput(path->mnt);
-	}
+	*jumped = need_mntput;
 	return ret;
 }
 
+static inline int traverse_mounts(struct path *path, bool *jumped,
+				  int *count, unsigned lookup_flags)
+{
+	unsigned flags = smp_load_acquire(&path->dentry->d_flags);
+
+	/* fastpath */
+	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
+		*jumped = false;
+		if (unlikely(d_flags_negative(flags)))
+			return -ENOENT;
+		return 0;
+	}
+	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
+}
+
 int follow_down_one(struct path *path)
 {
 	struct vfsmount *mounted;
@@ -1269,6 +1257,23 @@ int follow_down_one(struct path *path)
 }
 EXPORT_SYMBOL(follow_down_one);
 
+/*
+ * Follow down to the covering mount currently visible to userspace.  At each
+ * point, the filesystem owning that dentry may be queried as to whether the
+ * caller is permitted to proceed or not.
+ */
+int follow_down(struct path *path)
+{
+	struct vfsmount *mnt = path->mnt;
+	bool jumped;
+	int ret = traverse_mounts(path, &jumped, NULL, 0);
+
+	if (path->mnt != mnt)
+		mntput(mnt);
+	return ret;
+}
+EXPORT_SYMBOL(follow_down);
+
 /*
  * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
  * we meet a managed dentry that would need blocking.
@@ -1325,6 +1330,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 			  struct path *path, struct inode **inode,
 			  unsigned int *seqp)
 {
+	bool jumped;
 	int ret;
 
 	path->mnt = nd->path.mnt;
@@ -1334,15 +1340,25 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 		if (unlikely(!*inode))
 			return -ENOENT;
 		if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
-			return 1;
+			return 0;
 		if (unlazy_child(nd, dentry, seq))
 			return -ECHILD;
 		// *path might've been clobbered by __follow_mount_rcu()
 		path->mnt = nd->path.mnt;
 		path->dentry = dentry;
 	}
-	ret = follow_managed(path, nd);
-	if (likely(ret >= 0)) {
+	ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
+	if (jumped) {
+		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+			ret = -EXDEV;
+		else
+			nd->flags |= LOOKUP_JUMPED;
+	}
+	if (unlikely(ret)) {
+		dput(path->dentry);
+		if (path->mnt != nd->path.mnt)
+			mntput(path->mnt);
+	} else {
 		*inode = d_backing_inode(path->dentry);
 		*seqp = 0; /* out of RCU mode, so the value doesn't matter */
 	}
@@ -1410,55 +1426,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	return 0;
 }
 
-/*
- * Follow down to the covering mount currently visible to userspace.  At each
- * point, the filesystem owning that dentry may be queried as to whether the
- * caller is permitted to proceed or not.
- */
-int follow_down(struct path *path)
-{
-	unsigned managed;
-	int ret;
-
-	while (managed = READ_ONCE(path->dentry->d_flags),
-	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
-		/* Allow the filesystem to manage the transit without i_mutex
-		 * being held.
-		 *
-		 * We indicate to the filesystem if someone is trying to mount
-		 * something here.  This gives autofs the chance to deny anyone
-		 * other than its daemon the right to mount on its
-		 * superstructure.
-		 *
-		 * The filesystem may sleep at this point.
-		 */
-		if (managed & DCACHE_MANAGE_TRANSIT) {
-			BUG_ON(!path->dentry->d_op);
-			BUG_ON(!path->dentry->d_op->d_manage);
-			ret = path->dentry->d_op->d_manage(path, false);
-			if (ret < 0)
-				return ret == -EISDIR ? 0 : ret;
-		}
-
-		/* Transit to a mounted filesystem. */
-		if (managed & DCACHE_MOUNTED) {
-			struct vfsmount *mounted = lookup_mnt(path);
-			if (!mounted)
-				break;
-			dput(path->dentry);
-			mntput(path->mnt);
-			path->mnt = mounted;
-			path->dentry = dget(mounted->mnt_root);
-			continue;
-		}
-
-		/* Don't handle automount points here */
-		break;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(follow_down);
-
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
  */

From 6fb968cdf9d0bd66742ac73f3db268dcb0278ada Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 09:53:19 -0500
Subject: [PATCH 150/327] atomic_open(): return the right dentry in
 FMODE_OPENED case

->atomic_open() might have used a different alias than the one we'd
passed to it; in "not opened" case we take care of that, in "opened"
one we don't.  Currently we don't care downstream of "opened" case
which alias to return; however, that will change shortly when we
get to unifying may_open() calls.

It's not hard to get right in all cases, anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 40d5f7abfa54..88f985aff4f8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2949,11 +2949,15 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 	d_lookup_done(dentry);
 	if (!error) {
 		if (file->f_mode & FMODE_OPENED) {
+			int acc_mode = op->acc_mode;
+			if (unlikely(dentry != file->f_path.dentry)) {
+				dput(dentry);
+				dentry = dget(file->f_path.dentry);
+			}
 			/*
 			 * We didn't have the inode before the open, so check open
 			 * permission here.
 			 */
-			int acc_mode = op->acc_mode;
 			if (file->f_mode & FMODE_CREATED) {
 				WARN_ON(!(open_flag & O_CREAT));
 				fsnotify_create(dir, dentry);

From 7be219b4dcd9c698a120726e48a52b96f554fd2c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:02:29 -0500
Subject: [PATCH 151/327] atomic_open(): lift the call of may_open() into
 do_last()

there we'll be able to merge it with its counterparts in other
cases, and there's no reason to do it before the parent has
been unlocked

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 88f985aff4f8..7e932d9a71a9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2949,23 +2949,12 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 	d_lookup_done(dentry);
 	if (!error) {
 		if (file->f_mode & FMODE_OPENED) {
-			int acc_mode = op->acc_mode;
 			if (unlikely(dentry != file->f_path.dentry)) {
 				dput(dentry);
 				dentry = dget(file->f_path.dentry);
 			}
-			/*
-			 * We didn't have the inode before the open, so check open
-			 * permission here.
-			 */
-			if (file->f_mode & FMODE_CREATED) {
-				WARN_ON(!(open_flag & O_CREAT));
+			if (file->f_mode & FMODE_CREATED)
 				fsnotify_create(dir, dentry);
-				acc_mode = 0;
-			}
-			error = may_open(&file->f_path, acc_mode, open_flag);
-			if (WARN_ON(error > 0))
-				error = -EINVAL;
 		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
 			error = -EIO;
 		} else {
@@ -3208,12 +3197,19 @@ static const char *do_last(struct nameidata *nd,
 	}
 
 	if (file->f_mode & FMODE_OPENED) {
-		if ((file->f_mode & FMODE_CREATED) ||
-		    !S_ISREG(file_inode(file)->i_mode))
+		if (file->f_mode & FMODE_CREATED) {
+			open_flag &= ~O_TRUNC;
+			will_truncate = false;
+			acc_mode = 0;
+		} else if (!S_ISREG(file_inode(file)->i_mode))
 			will_truncate = false;
 
 		audit_inode(nd->name, file->f_path.dentry, 0);
-		dput(dentry);
+		dput(nd->path.dentry);
+		nd->path.dentry = dentry;
+		error = may_open(&nd->path, acc_mode, open_flag);
+		if (error)
+			goto out;
 		goto opened;
 	}
 

From 3ad5615a071f7c1c41b2e23b9ffde73b09499eb0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:06:13 -0500
Subject: [PATCH 152/327] do_last(): merge the may_open() calls

have FMODE_OPENED case rejoin the main path at earlier point

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 7e932d9a71a9..2f8a5d3be784 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3207,10 +3207,7 @@ static const char *do_last(struct nameidata *nd,
 		audit_inode(nd->name, file->f_path.dentry, 0);
 		dput(nd->path.dentry);
 		nd->path.dentry = dentry;
-		error = may_open(&nd->path, acc_mode, open_flag);
-		if (error)
-			goto out;
-		goto opened;
+		goto finish_open_created;
 	}
 
 	if (file->f_mode & FMODE_CREATED) {
@@ -3277,11 +3274,10 @@ static const char *do_last(struct nameidata *nd,
 	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto out;
-	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
-	error = vfs_open(&nd->path, file);
+	if (!(file->f_mode & FMODE_OPENED))
+		error = vfs_open(&nd->path, file);
 	if (error)
 		goto out;
-opened:
 	error = ima_file_check(file, op->acc_mode);
 	if (!error && will_truncate)
 		error = handle_truncate(file);

From 59e96e65833e05a9410327627d91ccef39392a25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:22:24 -0500
Subject: [PATCH 153/327] do_last(): don't bother with keeping got_write in
 FMODE_OPENED case

it's easier to drop it right after lookup_open() and regain if
needed (i.e. if we will need to truncate).  On the non-FMODE_OPENED
path we do that anyway.  In case of FMODE_CREATED we won't be
needing it.  And it's easier to prove correctness that way,
especially since the initial failure to get write access is not
always fatal; proving that we'll never end up truncating in that
case is rather convoluted.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2f8a5d3be784..2eef41f505db 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3191,11 +3191,14 @@ static const char *do_last(struct nameidata *nd,
 	else
 		inode_unlock_shared(dir->d_inode);
 
-	if (IS_ERR(dentry)) {
-		error = PTR_ERR(dentry);
-		goto out;
+	if (got_write) {
+		mnt_drop_write(nd->path.mnt);
+		got_write = false;
 	}
 
+	if (IS_ERR(dentry))
+		return ERR_CAST(dentry);
+
 	if (file->f_mode & FMODE_OPENED) {
 		if (file->f_mode & FMODE_CREATED) {
 			open_flag &= ~O_TRUNC;
@@ -3220,16 +3223,6 @@ static const char *do_last(struct nameidata *nd,
 		goto finish_open_created;
 	}
 
-	/*
-	 * If atomic_open() acquired write access it is dropped now due to
-	 * possible mount and symlink following (this might be optimized away if
-	 * necessary...)
-	 */
-	if (got_write) {
-		mnt_drop_write(nd->path.mnt);
-		got_write = false;
-	}
-
 finish_lookup:
 	if (nd->depth)
 		put_link(nd);
@@ -3250,27 +3243,25 @@ static const char *do_last(struct nameidata *nd,
 		return ERR_PTR(error);
 	audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {
-		error = -EISDIR;
 		if (d_is_dir(nd->path.dentry))
-			goto out;
+			return ERR_PTR(-EISDIR);
 		error = may_create_in_sticky(dir_mode, dir_uid,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
-			goto out;
+			return ERR_PTR(error);
 	}
-	error = -ENOTDIR;
 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
-		goto out;
+		return ERR_PTR(-ENOTDIR);
 	if (!d_is_reg(nd->path.dentry))
 		will_truncate = false;
 
+finish_open_created:
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
-			goto out;
+			return ERR_PTR(error);
 		got_write = true;
 	}
-finish_open_created:
 	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto out;

From 5a2d3edd8dad8c8870419c614e3a7449effe87cb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:32:22 -0500
Subject: [PATCH 154/327] do_last(): rejoing the common path earlier in
 FMODE_{OPENED,CREATED} case

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2eef41f505db..c85fdfd6b33d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3200,13 +3200,6 @@ static const char *do_last(struct nameidata *nd,
 		return ERR_CAST(dentry);
 
 	if (file->f_mode & FMODE_OPENED) {
-		if (file->f_mode & FMODE_CREATED) {
-			open_flag &= ~O_TRUNC;
-			will_truncate = false;
-			acc_mode = 0;
-		} else if (!S_ISREG(file_inode(file)->i_mode))
-			will_truncate = false;
-
 		audit_inode(nd->name, file->f_path.dentry, 0);
 		dput(nd->path.dentry);
 		nd->path.dentry = dentry;
@@ -3214,10 +3207,6 @@ static const char *do_last(struct nameidata *nd,
 	}
 
 	if (file->f_mode & FMODE_CREATED) {
-		/* Don't check for write permission, don't truncate */
-		open_flag &= ~O_TRUNC;
-		will_truncate = false;
-		acc_mode = 0;
 		dput(nd->path.dentry);
 		nd->path.dentry = dentry;
 		goto finish_open_created;
@@ -3252,10 +3241,16 @@ static const char *do_last(struct nameidata *nd,
 	}
 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
 		return ERR_PTR(-ENOTDIR);
-	if (!d_is_reg(nd->path.dentry))
-		will_truncate = false;
 
 finish_open_created:
+	if (file->f_mode & FMODE_CREATED) {
+		/* Don't check for write permission, don't truncate */
+		open_flag &= ~O_TRUNC;
+		will_truncate = false;
+		acc_mode = 0;
+	} else if (!d_is_reg(nd->path.dentry)) {
+		will_truncate = false;
+	}
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)

From 8795e7d48288163ce29eb927240a0c32c40b4d54 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:38:17 -0500
Subject: [PATCH 155/327] do_last(): simplify the liveness analysis past
 finish_open_created

Don't mess with got_write there - it is guaranteed to be false on
entry and it will be set true if and only if we decide to go for
truncation and manage to get write access for that.

Don't carry acc_mode through the entire thing - it's only used
in that part.  And don't bother with gotos in there - compiler is
quite capable of optimizing that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c85fdfd6b33d..8cdf8ef41194 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3122,9 +3122,9 @@ static const char *do_last(struct nameidata *nd,
 	kuid_t dir_uid = nd->inode->i_uid;
 	umode_t dir_mode = nd->inode->i_mode;
 	int open_flag = op->open_flag;
-	bool will_truncate = (open_flag & O_TRUNC) != 0;
+	bool do_truncate;
 	bool got_write = false;
-	int acc_mode = op->acc_mode;
+	int acc_mode;
 	unsigned seq;
 	struct inode *inode;
 	struct dentry *dentry;
@@ -3243,36 +3243,30 @@ static const char *do_last(struct nameidata *nd,
 		return ERR_PTR(-ENOTDIR);
 
 finish_open_created:
+	do_truncate = false;
+	acc_mode = op->acc_mode;
 	if (file->f_mode & FMODE_CREATED) {
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
-		will_truncate = false;
 		acc_mode = 0;
-	} else if (!d_is_reg(nd->path.dentry)) {
-		will_truncate = false;
-	}
-	if (will_truncate) {
+	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			return ERR_PTR(error);
-		got_write = true;
+		do_truncate = true;
 	}
 	error = may_open(&nd->path, acc_mode, open_flag);
-	if (error)
-		goto out;
-	if (!(file->f_mode & FMODE_OPENED))
+	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
-	if (error)
-		goto out;
-	error = ima_file_check(file, op->acc_mode);
-	if (!error && will_truncate)
+	if (!error)
+		error = ima_file_check(file, op->acc_mode);
+	if (!error && do_truncate)
 		error = handle_truncate(file);
-out:
 	if (unlikely(error > 0)) {
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	if (got_write)
+	if (do_truncate)
 		mnt_drop_write(nd->path.mnt);
 	return ERR_PTR(error);
 }

From 973d4b73fbaf705f8fa5208234175606df0999e4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 10:48:16 -0500
Subject: [PATCH 156/327] do_last(): rejoin the common path even earlier in
 FMODE_{OPENED,CREATED} case

... getting may_create_in_sticky() checks in FMODE_OPENED case as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 8cdf8ef41194..798672577367 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3199,14 +3199,7 @@ static const char *do_last(struct nameidata *nd,
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
 
-	if (file->f_mode & FMODE_OPENED) {
-		audit_inode(nd->name, file->f_path.dentry, 0);
-		dput(nd->path.dentry);
-		nd->path.dentry = dentry;
-		goto finish_open_created;
-	}
-
-	if (file->f_mode & FMODE_CREATED) {
+	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
 		dput(nd->path.dentry);
 		nd->path.dentry = dentry;
 		goto finish_open_created;
@@ -3230,7 +3223,9 @@ static const char *do_last(struct nameidata *nd,
 	error = complete_walk(nd);
 	if (error)
 		return ERR_PTR(error);
-	audit_inode(nd->name, nd->path.dentry, 0);
+finish_open_created:
+	if (!(file->f_mode & FMODE_CREATED))
+		audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {
 		if (d_is_dir(nd->path.dentry))
 			return ERR_PTR(-EISDIR);
@@ -3242,7 +3237,6 @@ static const char *do_last(struct nameidata *nd,
 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
 		return ERR_PTR(-ENOTDIR);
 
-finish_open_created:
 	do_truncate = false;
 	acc_mode = op->acc_mode;
 	if (file->f_mode & FMODE_CREATED) {

From c981a4828125e4a5832a726761343cbff8810674 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Jan 2020 11:06:21 -0500
Subject: [PATCH 157/327] split the lookup-related parts of do_last() into a
 separate helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 51 +++++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 798672577367..6530e3cbd486 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3112,19 +3112,12 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	return ERR_PTR(error);
 }
 
-/*
- * Handle the last step of open()
- */
-static const char *do_last(struct nameidata *nd,
+static const char *open_last_lookups(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
 	struct dentry *dir = nd->path.dentry;
-	kuid_t dir_uid = nd->inode->i_uid;
-	umode_t dir_mode = nd->inode->i_mode;
 	int open_flag = op->open_flag;
-	bool do_truncate;
 	bool got_write = false;
-	int acc_mode;
 	unsigned seq;
 	struct inode *inode;
 	struct dentry *dentry;
@@ -3137,9 +3130,9 @@ static const char *do_last(struct nameidata *nd,
 		if (nd->depth)
 			put_link(nd);
 		error = handle_dots(nd, nd->last_type);
-		if (unlikely(error))
-			return ERR_PTR(error);
-		goto finish_open;
+		if (likely(!error))
+			error = complete_walk(nd);
+		return ERR_PTR(error);
 	}
 
 	if (!(open_flag & O_CREAT)) {
@@ -3152,7 +3145,6 @@ static const char *do_last(struct nameidata *nd,
 		if (likely(dentry))
 			goto finish_lookup;
 
-		BUG_ON(nd->inode != dir->d_inode);
 		BUG_ON(nd->flags & LOOKUP_RCU);
 	} else {
 		/* create side of things */
@@ -3162,7 +3154,7 @@ static const char *do_last(struct nameidata *nd,
 		 * about to look up
 		 */
 		error = complete_walk(nd);
-		if (error)
+		if (unlikely(error))
 			return ERR_PTR(error);
 
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
@@ -3191,10 +3183,8 @@ static const char *do_last(struct nameidata *nd,
 	else
 		inode_unlock_shared(dir->d_inode);
 
-	if (got_write) {
+	if (got_write)
 		mnt_drop_write(nd->path.mnt);
-		got_write = false;
-	}
 
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
@@ -3202,7 +3192,7 @@ static const char *do_last(struct nameidata *nd,
 	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
 		dput(nd->path.dentry);
 		nd->path.dentry = dentry;
-		goto finish_open_created;
+		return NULL;
 	}
 
 finish_lookup:
@@ -3218,12 +3208,29 @@ static const char *do_last(struct nameidata *nd,
 		audit_inode(nd->name, nd->path.dentry, 0);
 		return ERR_PTR(-EEXIST);
 	}
-finish_open:
+
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
-	error = complete_walk(nd);
-	if (error)
-		return ERR_PTR(error);
-finish_open_created:
+	return ERR_PTR(complete_walk(nd));
+}
+
+/*
+ * Handle the last step of open()
+ */
+static const char *do_last(struct nameidata *nd,
+		   struct file *file, const struct open_flags *op)
+{
+	kuid_t dir_uid = nd->inode->i_uid;
+	umode_t dir_mode = nd->inode->i_mode;
+	int open_flag = op->open_flag;
+	bool do_truncate;
+	int acc_mode;
+	const char *link;
+	int error;
+
+	link = open_last_lookups(nd, file, op);
+	if (unlikely(link))
+		return link;
+
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {

From 6b03f7edf43e3b284c023ab03aba6e5e7c5a7559 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Feb 2020 15:53:19 -0500
Subject: [PATCH 158/327] path_connected(): pass mount and dentry separately

eventually we'll want to do that check *before* mangling
nd->path.dentry...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6530e3cbd486..2bf9f605c46f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -550,22 +550,20 @@ static int __nd_alloc_stack(struct nameidata *nd)
 }
 
 /**
- * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
- * @path: nameidate to verify
+ * path_connected - Verify that a dentry is below mnt.mnt_root
  *
  * Rename can sometimes move a file or directory outside of a bind
  * mount, path_connected allows those cases to be detected.
  */
-static bool path_connected(const struct path *path)
+static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct vfsmount *mnt = path->mnt;
 	struct super_block *sb = mnt->mnt_sb;
 
 	/* Bind mounts and multi-root filesystems can have disconnected paths */
 	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
 		return true;
 
-	return is_subdir(path->dentry, mnt->mnt_root);
+	return is_subdir(dentry, mnt->mnt_root);
 }
 
 static inline int nd_alloc_stack(struct nameidata *nd)
@@ -1386,7 +1384,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 				return -ECHILD;
 			nd->path.dentry = parent;
 			nd->seq = seq;
-			if (unlikely(!path_connected(&nd->path)))
+			if (unlikely(!path_connected(nd->path.mnt, parent)))
 				return -ECHILD;
 			break;
 		} else {
@@ -1448,7 +1446,7 @@ static int path_parent_directory(struct path *path)
 	/* rare case of legitimate dget_parent()... */
 	path->dentry = dget_parent(path->dentry);
 	dput(old);
-	if (unlikely(!path_connected(path)))
+	if (unlikely(!path_connected(path->mnt, path->dentry)))
 		return -ENOENT;
 	return 0;
 }

From 63b27720a476e99d3eaceee9d2246459ae0f6286 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Feb 2020 16:01:19 -0500
Subject: [PATCH 159/327] path_parent_directory(): leave changing path->dentry
 to callers

Instead of returning 0, return new dentry; instead of returning
-ENOENT, return NULL.  Adjust the callers accordingly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2bf9f605c46f..49b2a08105c7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1440,19 +1440,21 @@ static void follow_mount(struct path *path)
 	}
 }
 
-static int path_parent_directory(struct path *path)
+static struct dentry *path_parent_directory(struct path *path)
 {
-	struct dentry *old = path->dentry;
 	/* rare case of legitimate dget_parent()... */
-	path->dentry = dget_parent(path->dentry);
-	dput(old);
-	if (unlikely(!path_connected(path->mnt, path->dentry)))
-		return -ENOENT;
-	return 0;
+	struct dentry *parent = dget_parent(path->dentry);
+
+	if (unlikely(!path_connected(path->mnt, parent))) {
+		dput(parent);
+		parent = NULL;
+	}
+	return parent;
 }
 
 static int follow_dotdot(struct nameidata *nd)
 {
+	struct dentry *parent;
 	while (1) {
 		if (path_equal(&nd->path, &nd->root)) {
 			if (unlikely(nd->flags & LOOKUP_BENEATH))
@@ -1460,9 +1462,11 @@ static int follow_dotdot(struct nameidata *nd)
 			break;
 		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			int ret = path_parent_directory(&nd->path);
-			if (ret)
-				return ret;
+			parent = path_parent_directory(&nd->path);
+			if (!parent)
+				return -ENOENT;
+			dput(nd->path.dentry);
+			nd->path.dentry = parent;
 			break;
 		}
 		if (!follow_up(&nd->path))
@@ -2600,13 +2604,13 @@ int path_pts(struct path *path)
 	 */
 	struct dentry *child, *parent;
 	struct qstr this;
-	int ret;
 
-	ret = path_parent_directory(path);
-	if (ret)
-		return ret;
+	parent = path_parent_directory(path);
+	if (!parent)
+		return -ENOENT;
 
-	parent = path->dentry;
+	dput(path->dentry);
+	path->dentry = parent;
 	this.name = "pts";
 	this.len = 3;
 	child = d_hash_and_lookup(parent, &this);

From a6a7eb7628cfa7e0e8f62e185502facc43bd2ce3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 11 Mar 2020 13:05:03 -0400
Subject: [PATCH 160/327] expand path_parent_directory() in its callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 49b2a08105c7..88e5d7920540 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1440,18 +1440,6 @@ static void follow_mount(struct path *path)
 	}
 }
 
-static struct dentry *path_parent_directory(struct path *path)
-{
-	/* rare case of legitimate dget_parent()... */
-	struct dentry *parent = dget_parent(path->dentry);
-
-	if (unlikely(!path_connected(path->mnt, parent))) {
-		dput(parent);
-		parent = NULL;
-	}
-	return parent;
-}
-
 static int follow_dotdot(struct nameidata *nd)
 {
 	struct dentry *parent;
@@ -1462,9 +1450,13 @@ static int follow_dotdot(struct nameidata *nd)
 			break;
 		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			parent = path_parent_directory(&nd->path);
-			if (!parent)
+			/* rare case of legitimate dget_parent()... */
+			parent = dget_parent(nd->path.dentry);
+
+			if (unlikely(!path_connected(nd->path.mnt, parent))) {
+				dput(parent);
 				return -ENOENT;
+			}
 			dput(nd->path.dentry);
 			nd->path.dentry = parent;
 			break;
@@ -2602,13 +2594,14 @@ int path_pts(struct path *path)
 	/* Find something mounted on "pts" in the same directory as
 	 * the input path.
 	 */
-	struct dentry *child, *parent;
+	struct dentry *parent = dget_parent(path->dentry);
+	struct dentry *child;
 	struct qstr this;
 
-	parent = path_parent_directory(path);
-	if (!parent)
+	if (unlikely(!path_connected(path->mnt, parent))) {
+		dput(parent);
 		return -ENOENT;
-
+	}
 	dput(path->dentry);
 	path->dentry = parent;
 	this.name = "pts";

From abc2c632e0cec6f76883d008c298e31cb3e2d242 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 11:55:02 -0500
Subject: [PATCH 161/327] follow_dotdot{,_rcu}(): lift switching nd->path to
 parent out of loop

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 88e5d7920540..772d82daf3b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1365,7 +1365,9 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
+	struct dentry *parent = NULL;
 	struct inode *inode = nd->inode;
+	unsigned seq;
 
 	while (1) {
 		if (path_equal(&nd->path, &nd->root)) {
@@ -1375,15 +1377,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
-			struct dentry *parent = old->d_parent;
-			unsigned seq;
 
+			parent = old->d_parent;
 			inode = parent->d_inode;
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
 				return -ECHILD;
-			nd->path.dentry = parent;
-			nd->seq = seq;
 			if (unlikely(!path_connected(nd->path.mnt, parent)))
 				return -ECHILD;
 			break;
@@ -1406,6 +1405,10 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			nd->seq = seq;
 		}
 	}
+	if (likely(parent)) {
+		nd->path.dentry = parent;
+		nd->seq = seq;
+	}
 	while (unlikely(d_mountpoint(nd->path.dentry))) {
 		struct mount *mounted;
 		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
@@ -1442,7 +1445,7 @@ static void follow_mount(struct path *path)
 
 static int follow_dotdot(struct nameidata *nd)
 {
-	struct dentry *parent;
+	struct dentry *parent = NULL;
 	while (1) {
 		if (path_equal(&nd->path, &nd->root)) {
 			if (unlikely(nd->flags & LOOKUP_BENEATH))
@@ -1452,13 +1455,10 @@ static int follow_dotdot(struct nameidata *nd)
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			/* rare case of legitimate dget_parent()... */
 			parent = dget_parent(nd->path.dentry);
-
 			if (unlikely(!path_connected(nd->path.mnt, parent))) {
 				dput(parent);
 				return -ENOENT;
 			}
-			dput(nd->path.dentry);
-			nd->path.dentry = parent;
 			break;
 		}
 		if (!follow_up(&nd->path))
@@ -1466,6 +1466,10 @@ static int follow_dotdot(struct nameidata *nd)
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			return -EXDEV;
 	}
+	if (likely(parent)) {
+		dput(nd->path.dentry);
+		nd->path.dentry = parent;
+	}
 	follow_mount(&nd->path);
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;

From c9a0f75d81e3ffccb885f0069d45e065acc0349a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 11:59:02 -0500
Subject: [PATCH 162/327] follow_dotdot{,_rcu}(): lift LOOKUP_BENEATH checks
 out of loop

Behaviour change: LOOKUP_BENEATH lookup of .. in absolute root
yields an error even if it's not the process' root.  That's
possible only if you'd managed to escape chroot jail by way of
procfs symlinks, but IMO the resulting behaviour is not worse -
more consistent and easier to describe:
	".." in root is "stay where you are", uness LOOKUP_BENEATH
	has been given, in which case it's "fail with EXDEV".

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 772d82daf3b4..577dc541a4d4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1370,11 +1370,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	unsigned seq;
 
 	while (1) {
-		if (path_equal(&nd->path, &nd->root)) {
-			if (unlikely(nd->flags & LOOKUP_BENEATH))
-				return -ECHILD;
+		if (path_equal(&nd->path, &nd->root))
 			break;
-		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
 
@@ -1405,7 +1402,10 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			nd->seq = seq;
 		}
 	}
-	if (likely(parent)) {
+	if (unlikely(!parent)) {
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -ECHILD;
+	} else {
 		nd->path.dentry = parent;
 		nd->seq = seq;
 	}
@@ -1447,11 +1447,8 @@ static int follow_dotdot(struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
 	while (1) {
-		if (path_equal(&nd->path, &nd->root)) {
-			if (unlikely(nd->flags & LOOKUP_BENEATH))
-				return -EXDEV;
+		if (path_equal(&nd->path, &nd->root))
 			break;
-		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			/* rare case of legitimate dget_parent()... */
 			parent = dget_parent(nd->path.dentry);
@@ -1466,7 +1463,10 @@ static int follow_dotdot(struct nameidata *nd)
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			return -EXDEV;
 	}
-	if (likely(parent)) {
+	if (unlikely(!parent)) {
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -EXDEV;
+	} else {
 		dput(nd->path.dentry);
 		nd->path.dentry = parent;
 	}

From 957dd41d88427f1f592d2287b179904087518021 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 01:40:04 -0500
Subject: [PATCH 163/327] move handle_dots(), follow_dotdot() and
 follow_dotdot_rcu() past step_into()

pure move; we are going to have step_into() called by that bunch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 260 ++++++++++++++++++++++++++---------------------------
 1 file changed, 130 insertions(+), 130 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 577dc541a4d4..3ee3b8719505 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1363,70 +1363,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 	return ret;
 }
 
-static int follow_dotdot_rcu(struct nameidata *nd)
-{
-	struct dentry *parent = NULL;
-	struct inode *inode = nd->inode;
-	unsigned seq;
-
-	while (1) {
-		if (path_equal(&nd->path, &nd->root))
-			break;
-		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			struct dentry *old = nd->path.dentry;
-
-			parent = old->d_parent;
-			inode = parent->d_inode;
-			seq = read_seqcount_begin(&parent->d_seq);
-			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
-				return -ECHILD;
-			if (unlikely(!path_connected(nd->path.mnt, parent)))
-				return -ECHILD;
-			break;
-		} else {
-			struct mount *mnt = real_mount(nd->path.mnt);
-			struct mount *mparent = mnt->mnt_parent;
-			struct dentry *mountpoint = mnt->mnt_mountpoint;
-			struct inode *inode2 = mountpoint->d_inode;
-			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
-			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-				return -ECHILD;
-			if (&mparent->mnt == nd->path.mnt)
-				break;
-			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-				return -ECHILD;
-			/* we know that mountpoint was pinned */
-			nd->path.dentry = mountpoint;
-			nd->path.mnt = &mparent->mnt;
-			inode = inode2;
-			nd->seq = seq;
-		}
-	}
-	if (unlikely(!parent)) {
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return -ECHILD;
-	} else {
-		nd->path.dentry = parent;
-		nd->seq = seq;
-	}
-	while (unlikely(d_mountpoint(nd->path.dentry))) {
-		struct mount *mounted;
-		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
-		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-			return -ECHILD;
-		if (!mounted)
-			break;
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return -ECHILD;
-		nd->path.mnt = &mounted->mnt;
-		nd->path.dentry = mounted->mnt.mnt_root;
-		inode = nd->path.dentry->d_inode;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-	}
-	nd->inode = inode;
-	return 0;
-}
-
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
  */
@@ -1443,38 +1379,6 @@ static void follow_mount(struct path *path)
 	}
 }
 
-static int follow_dotdot(struct nameidata *nd)
-{
-	struct dentry *parent = NULL;
-	while (1) {
-		if (path_equal(&nd->path, &nd->root))
-			break;
-		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			/* rare case of legitimate dget_parent()... */
-			parent = dget_parent(nd->path.dentry);
-			if (unlikely(!path_connected(nd->path.mnt, parent))) {
-				dput(parent);
-				return -ENOENT;
-			}
-			break;
-		}
-		if (!follow_up(&nd->path))
-			break;
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return -EXDEV;
-	}
-	if (unlikely(!parent)) {
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return -EXDEV;
-	} else {
-		dput(nd->path.dentry);
-		nd->path.dentry = parent;
-	}
-	follow_mount(&nd->path);
-	nd->inode = nd->path.dentry->d_inode;
-	return 0;
-}
-
 /*
  * This looks up the name in dcache and possibly revalidates the found dentry.
  * NULL is returned if the dentry does not exist in the cache.
@@ -1654,40 +1558,6 @@ static inline int may_lookup(struct nameidata *nd)
 	return inode_permission(nd->inode, MAY_EXEC);
 }
 
-static inline int handle_dots(struct nameidata *nd, int type)
-{
-	if (type == LAST_DOTDOT) {
-		int error = 0;
-
-		if (!nd->root.mnt) {
-			error = set_root(nd);
-			if (error)
-				return error;
-		}
-		if (nd->flags & LOOKUP_RCU)
-			error = follow_dotdot_rcu(nd);
-		else
-			error = follow_dotdot(nd);
-		if (error)
-			return error;
-
-		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
-			/*
-			 * If there was a racing rename or mount along our
-			 * path, then we can't be sure that ".." hasn't jumped
-			 * above nd->root (and so userspace should retry or use
-			 * some fallback).
-			 */
-			smp_rmb();
-			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
-				return -EAGAIN;
-			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
-				return -EAGAIN;
-		}
-	}
-	return 0;
-}
-
 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
 static const char *pick_link(struct nameidata *nd, struct path *link,
@@ -1817,6 +1687,136 @@ static const char *step_into(struct nameidata *nd, int flags,
 	return pick_link(nd, &path, inode, seq, flags);
 }
 
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
+	struct dentry *parent = NULL;
+	struct inode *inode = nd->inode;
+	unsigned seq;
+
+	while (1) {
+		if (path_equal(&nd->path, &nd->root))
+			break;
+		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+			struct dentry *old = nd->path.dentry;
+
+			parent = old->d_parent;
+			inode = parent->d_inode;
+			seq = read_seqcount_begin(&parent->d_seq);
+			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+				return -ECHILD;
+			if (unlikely(!path_connected(nd->path.mnt, parent)))
+				return -ECHILD;
+			break;
+		} else {
+			struct mount *mnt = real_mount(nd->path.mnt);
+			struct mount *mparent = mnt->mnt_parent;
+			struct dentry *mountpoint = mnt->mnt_mountpoint;
+			struct inode *inode2 = mountpoint->d_inode;
+			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
+			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+				return -ECHILD;
+			if (&mparent->mnt == nd->path.mnt)
+				break;
+			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+				return -ECHILD;
+			/* we know that mountpoint was pinned */
+			nd->path.dentry = mountpoint;
+			nd->path.mnt = &mparent->mnt;
+			inode = inode2;
+			nd->seq = seq;
+		}
+	}
+	if (unlikely(!parent)) {
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -ECHILD;
+	} else {
+		nd->path.dentry = parent;
+		nd->seq = seq;
+	}
+	while (unlikely(d_mountpoint(nd->path.dentry))) {
+		struct mount *mounted;
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+			return -ECHILD;
+		if (!mounted)
+			break;
+		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+			return -ECHILD;
+		nd->path.mnt = &mounted->mnt;
+		nd->path.dentry = mounted->mnt.mnt_root;
+		inode = nd->path.dentry->d_inode;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+	}
+	nd->inode = inode;
+	return 0;
+}
+
+static int follow_dotdot(struct nameidata *nd)
+{
+	struct dentry *parent = NULL;
+	while (1) {
+		if (path_equal(&nd->path, &nd->root))
+			break;
+		if (nd->path.dentry != nd->path.mnt->mnt_root) {
+			/* rare case of legitimate dget_parent()... */
+			parent = dget_parent(nd->path.dentry);
+			if (unlikely(!path_connected(nd->path.mnt, parent))) {
+				dput(parent);
+				return -ENOENT;
+			}
+			break;
+		}
+		if (!follow_up(&nd->path))
+			break;
+		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+			return -EXDEV;
+	}
+	if (unlikely(!parent)) {
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -EXDEV;
+	} else {
+		dput(nd->path.dentry);
+		nd->path.dentry = parent;
+	}
+	follow_mount(&nd->path);
+	nd->inode = nd->path.dentry->d_inode;
+	return 0;
+}
+
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+	if (type == LAST_DOTDOT) {
+		int error = 0;
+
+		if (!nd->root.mnt) {
+			error = set_root(nd);
+			if (error)
+				return error;
+		}
+		if (nd->flags & LOOKUP_RCU)
+			error = follow_dotdot_rcu(nd);
+		else
+			error = follow_dotdot(nd);
+		if (error)
+			return error;
+
+		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
+			/*
+			 * If there was a racing rename or mount along our
+			 * path, then we can't be sure that ".." hasn't jumped
+			 * above nd->root (and so userspace should retry or use
+			 * some fallback).
+			 */
+			smp_rmb();
+			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
+				return -EAGAIN;
+			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
+				return -EAGAIN;
+		}
+	}
+	return 0;
+}
+
 static const char *walk_component(struct nameidata *nd, int flags)
 {
 	struct dentry *dentry;

From 7521f22b3ce2fa25e6301f5c7612fe1d698424b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 12:22:58 -0500
Subject: [PATCH 164/327] handle_dots(), follow_dotdot{,_rcu}(): preparation to
 switch to step_into()

Right now the tail ends of follow_dotdot{,_rcu}() are pretty
much the open-coded analogues of step_into().  The differences:
	* the lack of proper LOOKUP_NO_XDEV handling in non-RCU case
(arguably a bug)
	* the lack of ->d_manage() handling (again, arguably a bug)

Adjust the calling conventions so that on the next step with could
just switch those functions to returning step_into().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 3ee3b8719505..1749e435edc7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1687,7 +1687,7 @@ static const char *step_into(struct nameidata *nd, int flags,
 	return pick_link(nd, &path, inode, seq, flags);
 }
 
-static int follow_dotdot_rcu(struct nameidata *nd)
+static const char *follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
 	struct inode *inode = nd->inode;
@@ -1703,9 +1703,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			inode = parent->d_inode;
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
-				return -ECHILD;
+				return ERR_PTR(-ECHILD);
 			if (unlikely(!path_connected(nd->path.mnt, parent)))
-				return -ECHILD;
+				return ERR_PTR(-ECHILD);
 			break;
 		} else {
 			struct mount *mnt = real_mount(nd->path.mnt);
@@ -1714,11 +1714,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			struct inode *inode2 = mountpoint->d_inode;
 			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
 			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-				return -ECHILD;
+				return ERR_PTR(-ECHILD);
 			if (&mparent->mnt == nd->path.mnt)
 				break;
 			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-				return -ECHILD;
+				return ERR_PTR(-ECHILD);
 			/* we know that mountpoint was pinned */
 			nd->path.dentry = mountpoint;
 			nd->path.mnt = &mparent->mnt;
@@ -1728,7 +1728,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	}
 	if (unlikely(!parent)) {
 		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 	} else {
 		nd->path.dentry = parent;
 		nd->seq = seq;
@@ -1737,21 +1737,21 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		struct mount *mounted;
 		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
 		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 		if (!mounted)
 			break;
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return -ECHILD;
+			return ERR_PTR(-ECHILD);
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
 		inode = nd->path.dentry->d_inode;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
 	nd->inode = inode;
-	return 0;
+	return NULL;
 }
 
-static int follow_dotdot(struct nameidata *nd)
+static const char *follow_dotdot(struct nameidata *nd)
 {
 	struct dentry *parent = NULL;
 	while (1) {
@@ -1762,34 +1762,34 @@ static int follow_dotdot(struct nameidata *nd)
 			parent = dget_parent(nd->path.dentry);
 			if (unlikely(!path_connected(nd->path.mnt, parent))) {
 				dput(parent);
-				return -ENOENT;
+				return ERR_PTR(-ENOENT);
 			}
 			break;
 		}
 		if (!follow_up(&nd->path))
 			break;
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return -EXDEV;
+			return ERR_PTR(-EXDEV);
 	}
 	if (unlikely(!parent)) {
 		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return -EXDEV;
+			return ERR_PTR(-EXDEV);
 	} else {
 		dput(nd->path.dentry);
 		nd->path.dentry = parent;
 	}
 	follow_mount(&nd->path);
 	nd->inode = nd->path.dentry->d_inode;
-	return 0;
+	return NULL;
 }
 
-static inline int handle_dots(struct nameidata *nd, int type)
+static const char *handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
-		int error = 0;
+		const char *error = NULL;
 
 		if (!nd->root.mnt) {
-			error = set_root(nd);
+			error = ERR_PTR(set_root(nd));
 			if (error)
 				return error;
 		}
@@ -1809,12 +1809,12 @@ static inline int handle_dots(struct nameidata *nd, int type)
 			 */
 			smp_rmb();
 			if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)))
-				return -EAGAIN;
+				return ERR_PTR(-EAGAIN);
 			if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)))
-				return -EAGAIN;
+				return ERR_PTR(-EAGAIN);
 		}
 	}
-	return 0;
+	return NULL;
 }
 
 static const char *walk_component(struct nameidata *nd, int flags)
@@ -1822,7 +1822,6 @@ static const char *walk_component(struct nameidata *nd, int flags)
 	struct dentry *dentry;
 	struct inode *inode;
 	unsigned seq;
-	int err;
 	/*
 	 * "." and ".." are special - ".." especially so because it has
 	 * to be able to know about the current root directory and
@@ -1831,8 +1830,7 @@ static const char *walk_component(struct nameidata *nd, int flags)
 	if (unlikely(nd->last_type != LAST_NORM)) {
 		if (!(flags & WALK_MORE) && nd->depth)
 			put_link(nd);
-		err = handle_dots(nd, nd->last_type);
-		return ERR_PTR(err);
+		return handle_dots(nd, nd->last_type);
 	}
 	dentry = lookup_fast(nd, &inode, &seq);
 	if (IS_ERR(dentry))
@@ -3128,10 +3126,10 @@ static const char *open_last_lookups(struct nameidata *nd,
 	if (nd->last_type != LAST_NORM) {
 		if (nd->depth)
 			put_link(nd);
-		error = handle_dots(nd, nd->last_type);
-		if (likely(!error))
-			error = complete_walk(nd);
-		return ERR_PTR(error);
+		res = handle_dots(nd, nd->last_type);
+		if (likely(!res))
+			res = ERR_PTR(complete_walk(nd));
+		return res;
 	}
 
 	if (!(open_flag & O_CREAT)) {

From 6dfd9fe54dfd615eb1dcb6a1281f3b5fddef190c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 12:33:48 -0500
Subject: [PATCH 165/327] follow_dotdot{,_rcu}(): switch to use of step_into()

gets the regular mount crossing on result of ..

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1749e435edc7..9c775013368a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1722,33 +1722,18 @@ static const char *follow_dotdot_rcu(struct nameidata *nd)
 			/* we know that mountpoint was pinned */
 			nd->path.dentry = mountpoint;
 			nd->path.mnt = &mparent->mnt;
-			inode = inode2;
+			inode = nd->inode = inode2;
 			nd->seq = seq;
 		}
 	}
 	if (unlikely(!parent)) {
 		if (unlikely(nd->flags & LOOKUP_BENEATH))
 			return ERR_PTR(-ECHILD);
+		return step_into(nd, WALK_NOFOLLOW,
+				 nd->path.dentry, nd->inode, nd->seq);
 	} else {
-		nd->path.dentry = parent;
-		nd->seq = seq;
+		return step_into(nd, WALK_NOFOLLOW, parent, inode, seq);
 	}
-	while (unlikely(d_mountpoint(nd->path.dentry))) {
-		struct mount *mounted;
-		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
-		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-			return ERR_PTR(-ECHILD);
-		if (!mounted)
-			break;
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return ERR_PTR(-ECHILD);
-		nd->path.mnt = &mounted->mnt;
-		nd->path.dentry = mounted->mnt.mnt_root;
-		inode = nd->path.dentry->d_inode;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-	}
-	nd->inode = inode;
-	return NULL;
 }
 
 static const char *follow_dotdot(struct nameidata *nd)
@@ -1774,13 +1759,11 @@ static const char *follow_dotdot(struct nameidata *nd)
 	if (unlikely(!parent)) {
 		if (unlikely(nd->flags & LOOKUP_BENEATH))
 			return ERR_PTR(-EXDEV);
+		return step_into(nd, WALK_NOFOLLOW,
+				 dget(nd->path.dentry), nd->inode, nd->seq);
 	} else {
-		dput(nd->path.dentry);
-		nd->path.dentry = parent;
+		return step_into(nd, WALK_NOFOLLOW, parent, parent->d_inode, 0);
 	}
-	follow_mount(&nd->path);
-	nd->inode = nd->path.dentry->d_inode;
-	return NULL;
 }
 
 static const char *handle_dots(struct nameidata *nd, int type)

From e8dc4e885c459343970b25acd9320fe9ee5492e7 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:31 +0200
Subject: [PATCH 166/327] RDMA/cm: Fix ordering of xa_alloc_cyclic() in
 ib_create_cm_id()

xa_alloc_cyclic() is a SMP release to be paired with some later acquire
during xa_load() as part of cm_acquire_id().

As such, xa_alloc_cyclic() must be done after the cm_id is fully
initialized, in particular, it absolutely must be after the
refcount_set(), otherwise the refcount_inc() in cm_acquire_id() may not
see the set.

As there are several cases where a reader will be able to use the
id.local_id after cm_acquire_id in the IB_CM_IDLE state there needs to be
an unfortunate split into a NULL allocate and a finalizing xa_store.

Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation")
Link: https://lore.kernel.org/r/20200310092545.251365-2-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 21ede1c08eea..db627fa3cd39 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -571,18 +571,6 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 	return 0;
 }
 
-static int cm_alloc_id(struct cm_id_private *cm_id_priv)
-{
-	int err;
-	u32 id;
-
-	err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv,
-			xa_limit_32b, &cm.local_id_next, GFP_KERNEL);
-
-	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
-	return err;
-}
-
 static u32 cm_local_id(__be32 local_id)
 {
 	return (__force u32) (local_id ^ cm.random_id_operand);
@@ -824,6 +812,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 				 void *context)
 {
 	struct cm_id_private *cm_id_priv;
+	u32 id;
 	int ret;
 
 	cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
@@ -835,9 +824,6 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 	cm_id_priv->id.cm_handler = cm_handler;
 	cm_id_priv->id.context = context;
 	cm_id_priv->id.remote_cm_qpn = 1;
-	ret = cm_alloc_id(cm_id_priv);
-	if (ret)
-		goto error;
 
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
@@ -846,11 +832,20 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 	INIT_LIST_HEAD(&cm_id_priv->altr_list);
 	atomic_set(&cm_id_priv->work_count, -1);
 	refcount_set(&cm_id_priv->refcount, 1);
+
+	ret = xa_alloc_cyclic_irq(&cm.local_id_table, &id, NULL, xa_limit_32b,
+				  &cm.local_id_next, GFP_KERNEL);
+	if (ret)
+		goto error;
+	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+	xa_store_irq(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
+		     cm_id_priv, GFP_KERNEL);
+
 	return &cm_id_priv->id;
 
 error:
 	kfree(cm_id_priv);
-	return ERR_PTR(-ENOMEM);
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(ib_create_cm_id);
 

From ca21cb7fb127fcba013acc66f9ebe2e3a9ae1e49 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:32 +0200
Subject: [PATCH 167/327] RDMA/cm: Fix checking for allowed duplicate listens

The test here typod the cm_id_priv to use, it used the one that was
freshly allocated. By definition the allocated one has the matching
cm_handler and zero context, so the condition was always true.

Instead check that the existing listening ID is compatible with the
proposed handler so that it can be shared, as was originally intended.

Fixes: 067b171b8679 ("IB/cm: Share listening CM IDs")
Link: https://lore.kernel.org/r/20200310092545.251365-3-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index db627fa3cd39..ba35cad951e6 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1181,7 +1181,8 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 	/* Find an existing ID */
 	cm_id_priv = cm_find_listen(device, service_id);
 	if (cm_id_priv) {
-		if (cm_id->cm_handler != cm_handler || cm_id->context) {
+		if (cm_id_priv->id.cm_handler != cm_handler ||
+		    cm_id_priv->id.context) {
 			/* Sharing an ib_cm_id with different handlers is not
 			 * supported */
 			spin_unlock_irqrestore(&cm.lock, flags);

From bede86a39d9dc3387ac00dcb8e1ac221676b2f25 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:33 +0200
Subject: [PATCH 168/327] RDMA/cm: Remove a race freeing timewait_info

When creating a cm_id during REQ the id immediately becomes visible to the
other MAD handlers, and shortly after the state is moved to IB_CM_REQ_RCVD

This allows cm_rej_handler() to run concurrently and free the work:

        CPU 0                                CPU1
 cm_req_handler()
  ib_create_cm_id()
  cm_match_req()
    id_priv->state = IB_CM_REQ_RCVD
                                       cm_rej_handler()
                                         cm_acquire_id()
                                         spin_lock(&id_priv->lock)
                                         switch (id_priv->state)
  					   case IB_CM_REQ_RCVD:
                                            cm_reset_to_idle()
                                             kfree(id_priv->timewait_info);
   goto destroy
  destroy:
    kfree(id_priv->timewait_info);
                                             id_priv->timewait_info = NULL

Causing a double free or worse.

Do not free the timewait_info without also holding the
id_priv->lock. Simplify this entire flow by making the free unconditional
during cm_destroy_id() and removing the confusing special case error
unwind during creation of the timewait_info.

This also fixes a leak of the timewait if cm_destroy_id() is called in
IB_CM_ESTABLISHED with an XRC TGT QP. The state machine will be left in
ESTABLISHED while it needed to transition through IB_CM_TIMEWAIT to
release the timewait pointer.

Also fix a leak of the timewait_info if the caller mis-uses the API and
does ib_send_cm_reqs().

Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation")
Link: https://lore.kernel.org/r/20200310092545.251365-4-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index ba35cad951e6..0685cb2a88c0 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1054,14 +1054,22 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		break;
 	}
 
-	spin_lock_irq(&cm.lock);
+	spin_lock_irq(&cm_id_priv->lock);
+	spin_lock(&cm.lock);
+	/* Required for cleanup paths related cm_req_handler() */
+	if (cm_id_priv->timewait_info) {
+		cm_cleanup_timewait(cm_id_priv->timewait_info);
+		kfree(cm_id_priv->timewait_info);
+		cm_id_priv->timewait_info = NULL;
+	}
 	if (!list_empty(&cm_id_priv->altr_list) &&
 	    (!cm_id_priv->altr_send_port_not_ready))
 		list_del(&cm_id_priv->altr_list);
 	if (!list_empty(&cm_id_priv->prim_list) &&
 	    (!cm_id_priv->prim_send_port_not_ready))
 		list_del(&cm_id_priv->prim_list);
-	spin_unlock_irq(&cm.lock);
+	spin_unlock(&cm.lock);
+	spin_unlock_irq(&cm_id_priv->lock);
 
 	cm_free_id(cm_id->local_id);
 	cm_deref_id(cm_id_priv);
@@ -1410,7 +1418,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 	/* Verify that we're not in timewait. */
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	if (cm_id->state != IB_CM_IDLE) {
+	if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) {
 		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		ret = -EINVAL;
 		goto out;
@@ -1428,12 +1436,12 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 				 param->ppath_sgid_attr, &cm_id_priv->av,
 				 cm_id_priv);
 	if (ret)
-		goto error1;
+		goto out;
 	if (param->alternate_path) {
 		ret = cm_init_av_by_path(param->alternate_path, NULL,
 					 &cm_id_priv->alt_av, cm_id_priv);
 		if (ret)
-			goto error1;
+			goto out;
 	}
 	cm_id->service_id = param->service_id;
 	cm_id->service_mask = ~cpu_to_be64(0);
@@ -1451,7 +1459,7 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 
 	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
 	if (ret)
-		goto error1;
+		goto out;
 
 	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
 	cm_format_req(req_msg, cm_id_priv, param);
@@ -1474,7 +1482,6 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
 	return 0;
 
 error2:	cm_free_msg(cm_id_priv->msg);
-error1:	kfree(cm_id_priv->timewait_info);
 out:	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_req);
@@ -2003,7 +2010,7 @@ static int cm_req_handler(struct cm_work *work)
 		pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__,
 			 be32_to_cpu(cm_id->local_id));
 		ret = -EINVAL;
-		goto free_timeinfo;
+		goto destroy;
 	}
 
 	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
@@ -2093,8 +2100,6 @@ static int cm_req_handler(struct cm_work *work)
 rejected:
 	refcount_dec(&cm_id_priv->refcount);
 	cm_deref_id(listen_cm_id_priv);
-free_timeinfo:
-	kfree(cm_id_priv->timewait_info);
 destroy:
 	ib_destroy_cm_id(cm_id);
 	return ret;

From 2305d6864aa54c1887563d9096e4b746bc776d04 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:34 +0200
Subject: [PATCH 169/327] RDMA/cm: Make the destroy_id flow more robust

Too much of the destruction is very carefully sensitive to the state
and various other things. Move more code to the unconditional path and
add several WARN_ONs to check consistency.

Link: https://lore.kernel.org/r/20200310092545.251365-5-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 0685cb2a88c0..cad3c58e660c 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -825,6 +825,8 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 	cm_id_priv->id.context = context;
 	cm_id_priv->id.remote_cm_qpn = 1;
 
+	RB_CLEAR_NODE(&cm_id_priv->service_node);
+	RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
 	spin_lock_init(&cm_id_priv->lock);
 	init_completion(&cm_id_priv->comp);
 	INIT_LIST_HEAD(&cm_id_priv->work_list);
@@ -982,11 +984,13 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		spin_lock_irq(&cm.lock);
 		if (--cm_id_priv->listen_sharecount > 0) {
 			/* The id is still shared. */
+			WARN_ON(refcount_read(&cm_id_priv->refcount) == 1);
 			cm_deref_id(cm_id_priv);
 			spin_unlock_irq(&cm.lock);
 			return;
 		}
 		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+		RB_CLEAR_NODE(&cm_id_priv->service_node);
 		spin_unlock_irq(&cm.lock);
 		break;
 	case IB_CM_SIDR_REQ_SENT:
@@ -997,11 +1001,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	case IB_CM_SIDR_REQ_RCVD:
 		spin_unlock_irq(&cm_id_priv->lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
-		spin_lock_irq(&cm.lock);
-		if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
-			rb_erase(&cm_id_priv->sidr_id_node,
-				 &cm.remote_sidr_table);
-		spin_unlock_irq(&cm.lock);
 		break;
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
@@ -1068,6 +1067,10 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	if (!list_empty(&cm_id_priv->prim_list) &&
 	    (!cm_id_priv->prim_send_port_not_ready))
 		list_del(&cm_id_priv->prim_list);
+	WARN_ON(cm_id_priv->listen_sharecount);
+	WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
+	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+		rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
 	spin_unlock(&cm.lock);
 	spin_unlock_irq(&cm_id_priv->lock);
 

From 98f67156a80f37db70ec64787020b1f9bc8aea8c Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:35 +0200
Subject: [PATCH 170/327] RDMA/cm: Simplify establishing a listen cm_id

Any manipulation of cm_id->state must be done under the cm_id_priv->lock,
the two routines that added listens did not follow this rule, because they
never participate in any concurrent access around the state.

However, since this exception makes the code hard to understand, simplify
the flow so that it can be fully locked:
 - Move manipulation of listen_sharecount into cm_insert_listen() so it is
   trivially under the cm.lock without having to expose the cm.lock to the
   caller.
 - Push the cm.lock down into cm_insert_listen() and have the function
   increment the reference count before returning an existing pointer.
 - Split ib_cm_listen() into an cm_init_listen() and do not call
   ib_cm_listen() from ib_cm_insert_listen()
 - Make both ib_cm_listen() and ib_cm_insert_listen() directly call
   cm_insert_listen() under their cm_id_priv->lock which does both a
   collision detect and, if needed, the insert (atomically)
 - Enclose all state manipulation within the cm_id_priv->lock, notice this
   set can be done safely after cm_insert_listen() as no reader is allowed
   to read the state without holding the lock.
 - Do not set the listen cm_id in the xarray, as it is never correct to
   look it up. This makes the concurrency simpler to understand.

Many needless error unwinds are removed in the process.

Link: https://lore.kernel.org/r/20200310092545.251365-6-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 209 ++++++++++++++++++++---------------
 1 file changed, 121 insertions(+), 88 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index cad3c58e660c..d2783d5dd4dc 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -620,22 +620,44 @@ static int be64_gt(__be64 a, __be64 b)
 	return (__force u64) a > (__force u64) b;
 }
 
-static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+/*
+ * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv
+ * if the new ID was inserted, NULL if it could not be inserted due to a
+ * collision, or the existing cm_id_priv ready for shared usage.
+ */
+static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
+					      ib_cm_handler shared_handler)
 {
 	struct rb_node **link = &cm.listen_service_table.rb_node;
 	struct rb_node *parent = NULL;
 	struct cm_id_private *cur_cm_id_priv;
 	__be64 service_id = cm_id_priv->id.service_id;
 	__be64 service_mask = cm_id_priv->id.service_mask;
+	unsigned long flags;
 
+	spin_lock_irqsave(&cm.lock, flags);
 	while (*link) {
 		parent = *link;
 		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
 					  service_node);
 		if ((cur_cm_id_priv->id.service_mask & service_id) ==
 		    (service_mask & cur_cm_id_priv->id.service_id) &&
-		    (cm_id_priv->id.device == cur_cm_id_priv->id.device))
+		    (cm_id_priv->id.device == cur_cm_id_priv->id.device)) {
+			/*
+			 * Sharing an ib_cm_id with different handlers is not
+			 * supported
+			 */
+			if (cur_cm_id_priv->id.cm_handler != shared_handler ||
+			    cur_cm_id_priv->id.context ||
+			    WARN_ON(!cur_cm_id_priv->id.cm_handler)) {
+				spin_unlock_irqrestore(&cm.lock, flags);
+				return NULL;
+			}
+			refcount_inc(&cur_cm_id_priv->refcount);
+			cur_cm_id_priv->listen_sharecount++;
+			spin_unlock_irqrestore(&cm.lock, flags);
 			return cur_cm_id_priv;
+		}
 
 		if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
 			link = &(*link)->rb_left;
@@ -648,9 +670,11 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
 		else
 			link = &(*link)->rb_right;
 	}
+	cm_id_priv->listen_sharecount++;
 	rb_link_node(&cm_id_priv->service_node, parent, link);
 	rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
-	return NULL;
+	spin_unlock_irqrestore(&cm.lock, flags);
+	return cm_id_priv;
 }
 
 static struct cm_id_private * cm_find_listen(struct ib_device *device,
@@ -807,9 +831,9 @@ static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
 	ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
 }
 
-struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
-				 ib_cm_handler cm_handler,
-				 void *context)
+static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
+					      ib_cm_handler cm_handler,
+					      void *context)
 {
 	struct cm_id_private *cm_id_priv;
 	u32 id;
@@ -840,15 +864,37 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
 	if (ret)
 		goto error;
 	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
-	xa_store_irq(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
-		     cm_id_priv, GFP_KERNEL);
 
-	return &cm_id_priv->id;
+	return cm_id_priv;
 
 error:
 	kfree(cm_id_priv);
 	return ERR_PTR(ret);
 }
+
+/*
+ * Make the ID visible to the MAD handlers and other threads that use the
+ * xarray.
+ */
+static void cm_finalize_id(struct cm_id_private *cm_id_priv)
+{
+	xa_store_irq(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
+		     cm_id_priv, GFP_KERNEL);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = cm_alloc_id_priv(device, cm_handler, context);
+	if (IS_ERR(cm_id_priv))
+		return ERR_CAST(cm_id_priv);
+
+	cm_finalize_id(cm_id_priv);
+	return &cm_id_priv->id;
+}
 EXPORT_SYMBOL(ib_create_cm_id);
 
 static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
@@ -1092,8 +1138,27 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
 }
 EXPORT_SYMBOL(ib_destroy_cm_id);
 
+static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id,
+			  __be64 service_mask)
+{
+	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+	service_id &= service_mask;
+	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
+		return -EINVAL;
+
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+	} else {
+		cm_id_priv->id.service_id = service_id;
+		cm_id_priv->id.service_mask = service_mask;
+	}
+	return 0;
+}
+
 /**
- * __ib_cm_listen - Initiates listening on the specified service ID for
+ * ib_cm_listen - Initiates listening on the specified service ID for
  *   connection and service ID resolution requests.
  * @cm_id: Connection identifier associated with the listen request.
  * @service_id: Service identifier matched against incoming connection
@@ -1105,51 +1170,33 @@ EXPORT_SYMBOL(ib_destroy_cm_id);
  *   exactly.  This parameter is ignored if %service_id is set to
  *   IB_CM_ASSIGN_SERVICE_ID.
  */
-static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
-			  __be64 service_mask)
-{
-	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
-	int ret = 0;
-
-	service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
-	service_id &= service_mask;
-	if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
-	    (service_id != IB_CM_ASSIGN_SERVICE_ID))
-		return -EINVAL;
-
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-	if (cm_id->state != IB_CM_IDLE)
-		return -EINVAL;
-
-	cm_id->state = IB_CM_LISTEN;
-	++cm_id_priv->listen_sharecount;
-
-	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
-		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
-		cm_id->service_mask = ~cpu_to_be64(0);
-	} else {
-		cm_id->service_id = service_id;
-		cm_id->service_mask = service_mask;
-	}
-	cur_cm_id_priv = cm_insert_listen(cm_id_priv);
-
-	if (cur_cm_id_priv) {
-		cm_id->state = IB_CM_IDLE;
-		--cm_id_priv->listen_sharecount;
-		ret = -EBUSY;
-	}
-	return ret;
-}
-
 int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
 {
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&cm.lock, flags);
-	ret = __ib_cm_listen(cm_id, service_id, service_mask);
-	spin_unlock_irqrestore(&cm.lock, flags);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->id.state != IB_CM_IDLE) {
+		ret = -EINVAL;
+		goto out;
+	}
 
+	ret = cm_init_listen(cm_id_priv, service_id, service_mask);
+	if (ret)
+		goto out;
+
+	if (!cm_insert_listen(cm_id_priv, NULL)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	cm_id_priv->id.state = IB_CM_LISTEN;
+	ret = 0;
+
+out:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(ib_cm_listen);
@@ -1174,52 +1221,38 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 				     ib_cm_handler cm_handler,
 				     __be64 service_id)
 {
+	struct cm_id_private *listen_id_priv;
 	struct cm_id_private *cm_id_priv;
-	struct ib_cm_id *cm_id;
-	unsigned long flags;
 	int err = 0;
 
 	/* Create an ID in advance, since the creation may sleep */
-	cm_id = ib_create_cm_id(device, cm_handler, NULL);
-	if (IS_ERR(cm_id))
-		return cm_id;
+	cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL);
+	if (IS_ERR(cm_id_priv))
+		return ERR_CAST(cm_id_priv);
 
-	spin_lock_irqsave(&cm.lock, flags);
-
-	if (service_id == IB_CM_ASSIGN_SERVICE_ID)
-		goto new_id;
-
-	/* Find an existing ID */
-	cm_id_priv = cm_find_listen(device, service_id);
-	if (cm_id_priv) {
-		if (cm_id_priv->id.cm_handler != cm_handler ||
-		    cm_id_priv->id.context) {
-			/* Sharing an ib_cm_id with different handlers is not
-			 * supported */
-			spin_unlock_irqrestore(&cm.lock, flags);
-			ib_destroy_cm_id(cm_id);
-			return ERR_PTR(-EINVAL);
-		}
-		refcount_inc(&cm_id_priv->refcount);
-		++cm_id_priv->listen_sharecount;
-		spin_unlock_irqrestore(&cm.lock, flags);
-
-		ib_destroy_cm_id(cm_id);
-		cm_id = &cm_id_priv->id;
-		return cm_id;
-	}
-
-new_id:
-	/* Use newly created ID */
-	err = __ib_cm_listen(cm_id, service_id, 0);
-
-	spin_unlock_irqrestore(&cm.lock, flags);
-
-	if (err) {
-		ib_destroy_cm_id(cm_id);
+	err = cm_init_listen(cm_id_priv, service_id, 0);
+	if (err)
 		return ERR_PTR(err);
+
+	spin_lock_irq(&cm_id_priv->lock);
+	listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler);
+	if (listen_id_priv != cm_id_priv) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		ib_destroy_cm_id(&cm_id_priv->id);
+		if (!listen_id_priv)
+			return ERR_PTR(-EINVAL);
+		return &listen_id_priv->id;
 	}
-	return cm_id;
+	cm_id_priv->id.state = IB_CM_LISTEN;
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	/*
+	 * A listen ID does not need to be in the xarray since it does not
+	 * receive mads, is not placed in the remote_id or remote_qpn rbtree,
+	 * and does not enter timewait.
+	 */
+
+	return &cm_id_priv->id;
 }
 EXPORT_SYMBOL(ib_cm_insert_listen);
 

From 153a2e432e3d323b633155633429dcaf71ea7c53 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:36 +0200
Subject: [PATCH 171/327] RDMA/cm: Read id.state under lock when doing
 pr_debug()

The lock should not be dropped before doing the pr_debug() print as it is
accessing data protected by the lock, such as id.state.

Fixes: 119bf81793ea ("IB/cm: Add debug prints to ib_cm")
Link: https://lore.kernel.org/r/20200310092545.251365-7-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index d2783d5dd4dc..79ad3d64837e 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -2394,13 +2394,13 @@ static int cm_rep_handler(struct cm_work *work)
 	case IB_CM_MRA_REQ_RCVD:
 		break;
 	default:
-		spin_unlock_irq(&cm_id_priv->lock);
 		ret = -EINVAL;
 		pr_debug(
 			"%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n",
 			__func__, cm_id_priv->id.state,
 			IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg),
 			IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
+		spin_unlock_irq(&cm_id_priv->lock);
 		goto error;
 	}
 
@@ -2666,10 +2666,10 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id,
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	spin_lock_irqsave(&cm_id_priv->lock, flags);
 	if (cm_id->state != IB_CM_DREQ_RCVD) {
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-		kfree(data);
 		pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n",
 			 __func__, be32_to_cpu(cm_id->local_id), cm_id->state);
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		kfree(data);
 		return -EINVAL;
 	}
 
@@ -3005,10 +3005,10 @@ static int cm_rej_handler(struct cm_work *work)
 		}
 		/* fall through */
 	default:
-		spin_unlock_irq(&cm_id_priv->lock);
 		pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n",
 			 __func__, be32_to_cpu(cm_id_priv->id.local_id),
 			 cm_id_priv->id.state);
+		spin_unlock_irq(&cm_id_priv->lock);
 		ret = -EINVAL;
 		goto out;
 	}

From 083bfdbfd5ee2d008d256975d962ef75ddbbc353 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:37 +0200
Subject: [PATCH 172/327] RDMA/cm: Make it clear that there is no concurrency
 in cm_sidr_req_handler()

ib_create_cm_id() immediately places the id in the xarray, so it is visible
to network traffic.

The state is initially set to IB_CM_IDLE and all the MAD handlers will
test this state under lock and refuse to advance from IDLE, so adding to
the xarray is harmless.

Further, the set to IB_CM_SIDR_REQ_RCVD also excludes all MAD handlers.

However, the local_id isn't even used for SIDR mode, and there will be no
input MADs related to the newly created ID.

So, make the whole flow simpler so it can be understood:
 - Do not put the SIDR cm_id in the xarray. This directly shows that there
   is no concurrency
 - Delete the confusing work_count and pending_list manipulations. This
   mechanism is only used by MAD handlers and timewait, neither of which
   apply to SIDR.
 - Add a few comments and rename 'cur_cm_id_priv' to 'listen_cm_id_priv'
 - Move other loose sets up to immediately after cm_id creation so that
   the cm_id is fully configured right away. This fixes an oversight where
   the service_id will not be returned back on a IB_SIDR_UNSUPPORTED
   reject.

Link: https://lore.kernel.org/r/20200310092545.251365-8-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 64 +++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 79ad3d64837e..36e7ce522d04 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3535,20 +3535,27 @@ static void cm_format_sidr_req_event(struct cm_work *work,
 
 static int cm_sidr_req_handler(struct cm_work *work)
 {
-	struct ib_cm_id *cm_id;
-	struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
 	struct cm_sidr_req_msg *sidr_req_msg;
 	struct ib_wc *wc;
 	int ret;
 
-	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
-	if (IS_ERR(cm_id))
-		return PTR_ERR(cm_id);
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	cm_id_priv =
+		cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id_priv))
+		return PTR_ERR(cm_id_priv);
 
 	/* Record SGID/SLID and request ID for lookup. */
 	sidr_req_msg = (struct cm_sidr_req_msg *)
 				work->mad_recv_wc->recv_buf.mad;
+
+	cm_id_priv->id.remote_id =
+		cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg));
+	cm_id_priv->id.service_id =
+		cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+	cm_id_priv->tid = sidr_req_msg->hdr.tid;
+
 	wc = work->mad_recv_wc->wc;
 	cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
 	cm_id_priv->av.dgid.global.interface_id = 0;
@@ -3558,41 +3565,44 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	if (ret)
 		goto out;
 
-	cm_id_priv->id.remote_id =
-		cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg));
-	cm_id_priv->tid = sidr_req_msg->hdr.tid;
-	atomic_inc(&cm_id_priv->work_count);
-
 	spin_lock_irq(&cm.lock);
-	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
-	if (cur_cm_id_priv) {
+	listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+	if (listen_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
 		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
 				counter[CM_SIDR_REQ_COUNTER]);
 		goto out; /* Duplicate message. */
 	}
 	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
-	cur_cm_id_priv = cm_find_listen(
-		cm_id->device,
-		cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg)));
-	if (!cur_cm_id_priv) {
+	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+					   cm_id_priv->id.service_id);
+	if (!listen_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
 		goto out; /* No match. */
 	}
-	refcount_inc(&cur_cm_id_priv->refcount);
-	refcount_inc(&cm_id_priv->refcount);
+	refcount_inc(&listen_cm_id_priv->refcount);
 	spin_unlock_irq(&cm.lock);
 
-	cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
-	cm_id_priv->id.context = cur_cm_id_priv->id.context;
-	cm_id_priv->id.service_id =
-		cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
-	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
 
-	cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id);
-	cm_process_work(cm_id_priv, work);
-	cm_deref_id(cur_cm_id_priv);
+	/*
+	 * A SIDR ID does not need to be in the xarray since it does not receive
+	 * mads, is not placed in the remote_id or remote_qpn rbtree, and does
+	 * not enter timewait.
+	 */
+
+	cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+	ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+	cm_free_work(work);
+	/*
+	 * A pointer to the listen_cm_id is held in the event, so this deref
+	 * must be after the event is delivered above.
+	 */
+	cm_deref_id(listen_cm_id_priv);
+	if (ret)
+		cm_destroy_id(&cm_id_priv->id, ret);
 	return 0;
 out:
 	ib_destroy_cm_id(&cm_id_priv->id);

From c206f8bad15d30f1e35821c21a2fb146e4668ebf Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:38 +0200
Subject: [PATCH 173/327] RDMA/cm: Make it clearer how concurrency works in
 cm_req_handler()

ib_crate_cm_id() immediately places the id in the xarray, and publishes it
into the remote_id and remote_qpn rbtrees. This makes it visible to other
threads before it is fully set up.

It appears the thinking here was that the states IB_CM_IDLE and
IB_CM_REQ_RCVD do not allow any MAD handler or lookup in the remote_id and
remote_qpn rbtrees to advance.

However, cm_rej_handler() does take an action on IB_CM_REQ_RCVD, which is
not really expected by the design.

Make the whole thing clearer:
 - Keep the new cm_id out of the xarray until it is completely set up.
   This directly prevents MAD handlers and all rbtree lookups from seeing
   the pointer.
 - Move all the trivial setup right to the top so it is obviously done
   before any concurrency begins
 - Move the mutation of the cm_id_priv out of cm_match_id() and into the
   caller so the state transition is obvious
 - Place the manipulation of the work_list at the end, under lock, after
   the cm_id is placed in the xarray. The work_count cannot change on an
   ID outside the xarray.
 - Add some comments

Link: https://lore.kernel.org/r/20200310092545.251365-9-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 99 +++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 42 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 36e7ce522d04..3f125044d35e 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1958,14 +1958,10 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
 		cm_issue_rej(work->port, work->mad_recv_wc,
 			     IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
 			     NULL, 0);
-		goto out;
+		return NULL;
 	}
 	refcount_inc(&listen_cm_id_priv->refcount);
-	refcount_inc(&cm_id_priv->refcount);
-	cm_id_priv->id.state = IB_CM_REQ_RCVD;
-	atomic_inc(&cm_id_priv->work_count);
 	spin_unlock_irq(&cm.lock);
-out:
 	return listen_cm_id_priv;
 }
 
@@ -2007,7 +2003,6 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
 
 static int cm_req_handler(struct cm_work *work)
 {
-	struct ib_cm_id *cm_id;
 	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
 	struct cm_req_msg *req_msg;
 	const struct ib_global_route *grh;
@@ -2016,13 +2011,33 @@ static int cm_req_handler(struct cm_work *work)
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
 
-	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
-	if (IS_ERR(cm_id))
-		return PTR_ERR(cm_id);
+	cm_id_priv =
+		cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id_priv))
+		return PTR_ERR(cm_id_priv);
 
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
 	cm_id_priv->id.remote_id =
 		cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg));
+	cm_id_priv->id.service_id =
+		cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+		IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg));
+	cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg);
+	cm_id_priv->remote_qpn =
+		cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
+	cm_id_priv->initiator_depth =
+		IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
+	cm_id_priv->responder_resources =
+		IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
+	cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
+	cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
+	cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
+	cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
+	cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
+	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
 	ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
 				      work->mad_recv_wc->recv_buf.grh,
 				      &cm_id_priv->av);
@@ -2034,27 +2049,26 @@ static int cm_req_handler(struct cm_work *work)
 		ret = PTR_ERR(cm_id_priv->timewait_info);
 		goto destroy;
 	}
-	cm_id_priv->timewait_info->work.remote_id =
-		cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg));
+	cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id;
 	cm_id_priv->timewait_info->remote_ca_guid =
 		cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg));
-	cm_id_priv->timewait_info->remote_qpn =
-		cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
+	cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn;
+
+	/*
+	 * Note that the ID pointer is not in the xarray at this point,
+	 * so this set is only visible to the local thread.
+	 */
+	cm_id_priv->id.state = IB_CM_REQ_RCVD;
 
 	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
 	if (!listen_cm_id_priv) {
 		pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__,
-			 be32_to_cpu(cm_id->local_id));
+			 be32_to_cpu(cm_id_priv->id.local_id));
+		cm_id_priv->id.state = IB_CM_IDLE;
 		ret = -EINVAL;
 		goto destroy;
 	}
 
-	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
-	cm_id_priv->id.context = listen_cm_id_priv->id.context;
-	cm_id_priv->id.service_id =
-		cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
-	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
-
 	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
 
 	memset(&work->path[0], 0, sizeof(work->path[0]));
@@ -2092,10 +2106,10 @@ static int cm_req_handler(struct cm_work *work)
 				     work->port->port_num, 0,
 				     &work->path[0].sgid);
 		if (err)
-			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
 				       NULL, 0, NULL, 0);
 		else
-			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
 				       &work->path[0].sgid,
 				       sizeof(work->path[0].sgid),
 				       NULL, 0);
@@ -2105,39 +2119,40 @@ static int cm_req_handler(struct cm_work *work)
 		ret = cm_init_av_by_path(&work->path[1], NULL,
 					 &cm_id_priv->alt_av, cm_id_priv);
 		if (ret) {
-			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+			ib_send_cm_rej(&cm_id_priv->id,
+				       IB_CM_REJ_INVALID_ALT_GID,
 				       &work->path[0].sgid,
 				       sizeof(work->path[0].sgid), NULL, 0);
 			goto rejected;
 		}
 	}
-	cm_id_priv->tid = req_msg->hdr.tid;
-	cm_id_priv->timeout_ms = cm_convert_to_ms(
-		IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg));
-	cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg);
-	cm_id_priv->remote_qpn =
-		cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
-	cm_id_priv->initiator_depth =
-		IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
-	cm_id_priv->responder_resources =
-		IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
-	cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
-	cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
-	cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
-	cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
-	cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
-	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
 
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
 	cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+
+	/* Now MAD handlers can see the new ID */
+	spin_lock_irq(&cm_id_priv->lock);
+	cm_finalize_id(cm_id_priv);
+
+	/* Refcount belongs to the event, pairs with cm_process_work() */
+	refcount_inc(&cm_id_priv->refcount);
+	atomic_inc(&cm_id_priv->work_count);
+	spin_unlock_irq(&cm_id_priv->lock);
 	cm_process_work(cm_id_priv, work);
+	/*
+	 * Since this ID was just created and was not made visible to other MAD
+	 * handlers until the cm_finalize_id() above we know that the
+	 * cm_process_work() will deliver the event and the listen_cm_id
+	 * embedded in the event can be derefed here.
+	 */
 	cm_deref_id(listen_cm_id_priv);
 	return 0;
 
 rejected:
-	refcount_dec(&cm_id_priv->refcount);
 	cm_deref_id(listen_cm_id_priv);
 destroy:
-	ib_destroy_cm_id(cm_id);
+	ib_destroy_cm_id(&cm_id_priv->id);
 	return ret;
 }
 

From d1de9a88074b66482443f0cd91618d7b51a7c9b6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:39 +0200
Subject: [PATCH 174/327] RDMA/cm: Add missing locking around id.state in
 cm_dup_req_handler

All accesses to id.state must be done under the spinlock.

Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation")
Link: https://lore.kernel.org/r/20200310092545.251365-10-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3f125044d35e..3ce735527938 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1872,8 +1872,12 @@ static void cm_dup_req_handler(struct cm_work *work,
 			counter[CM_REQ_COUNTER]);
 
 	/* Quick state check to discard duplicate REQs. */
-	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state == IB_CM_REQ_RCVD) {
+		spin_unlock_irq(&cm_id_priv->lock);
 		return;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
 
 	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
 	if (ret)

From 00777a68ae98931d2d10f0aa8bfa60734cc09bfd Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:40 +0200
Subject: [PATCH 175/327] RDMA/cm: Add some lockdep assertions for
 cm_id_priv->lock

These functions all touch state, so must be called under the lock.
Inspection shows this is currently true.

Link: https://lore.kernel.org/r/20200310092545.251365-11-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3ce735527938..cc3e90e77eec 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -974,6 +974,8 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
 	unsigned long flags;
 	struct cm_device *cm_dev;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
 	if (!cm_dev)
 		return;
@@ -1005,6 +1007,8 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
 {
 	unsigned long flags;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	cm_id_priv->id.state = IB_CM_IDLE;
 	if (cm_id_priv->timewait_info) {
 		spin_lock_irqsave(&cm.lock, flags);
@@ -1823,6 +1827,8 @@ static void cm_format_rej(struct cm_rej_msg *rej_msg,
 			  const void *private_data,
 			  u8 private_data_len)
 {
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
 	IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg,
 		be32_to_cpu(cm_id_priv->id.remote_id));

From e029fdc0684785ffdeca758a08bd860bae8c344d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:41 +0200
Subject: [PATCH 176/327] RDMA/cm: Allow ib_send_cm_dreq() to be done under
 lock

The first thing ib_send_cm_dreq() does is obtain the lock, so use the
usual unlocked wrapper, locked actor pattern here.

This avoids a sketchy lock/unlock sequence (which could allow state to
change) during cm_destroy_id().

Link: https://lore.kernel.org/r/20200310092545.251365-12-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 54 +++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index cc3e90e77eec..00bbfa244afb 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -80,8 +80,11 @@ const char *__attribute_const__ ibcm_reject_msg(int reason)
 }
 EXPORT_SYMBOL(ibcm_reject_msg);
 
+struct cm_id_private;
 static void cm_add_one(struct ib_device *device);
 static void cm_remove_one(struct ib_device *device, void *client_data);
+static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
+			       const void *private_data, u8 private_data_len);
 
 static struct ib_client cm_client = {
 	.name   = "cm",
@@ -1084,10 +1087,12 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 			       NULL, 0, NULL, 0);
 		break;
 	case IB_CM_ESTABLISHED:
-		spin_unlock_irq(&cm_id_priv->lock);
-		if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+		if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+			spin_unlock_irq(&cm_id_priv->lock);
 			break;
-		ib_send_cm_dreq(cm_id, NULL, 0);
+		}
+		cm_send_dreq_locked(cm_id_priv, NULL, 0);
+		spin_unlock_irq(&cm_id_priv->lock);
 		goto retest;
 	case IB_CM_DREQ_SENT:
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
@@ -2604,35 +2609,32 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
 			    private_data_len);
 }
 
-int ib_send_cm_dreq(struct ib_cm_id *cm_id,
-		    const void *private_data,
-		    u8 private_data_len)
+static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
+			       const void *private_data, u8 private_data_len)
 {
-	struct cm_id_private *cm_id_priv;
 	struct ib_mad_send_buf *msg;
-	unsigned long flags;
 	int ret;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
 		return -EINVAL;
 
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	if (cm_id->state != IB_CM_ESTABLISHED) {
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
 		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
-			 be32_to_cpu(cm_id->local_id), cm_id->state);
-		ret = -EINVAL;
-		goto out;
+			 be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
+		return -EINVAL;
 	}
 
-	if (cm_id->lap_state == IB_CM_LAP_SENT ||
-	    cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+	if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+	    cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
 
 	ret = cm_alloc_msg(cm_id_priv, &msg);
 	if (ret) {
 		cm_enter_timewait(cm_id_priv);
-		goto out;
+		return ret;
 	}
 
 	cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
@@ -2643,14 +2645,26 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id,
 	ret = ib_post_send_mad(msg, NULL);
 	if (ret) {
 		cm_enter_timewait(cm_id_priv);
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		cm_free_msg(msg);
 		return ret;
 	}
 
-	cm_id->state = IB_CM_DREQ_SENT;
+	cm_id_priv->id.state = IB_CM_DREQ_SENT;
 	cm_id_priv->msg = msg;
-out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_dreq);

From 87cabf3e09fff9960743801a046bb2676427f00a Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:42 +0200
Subject: [PATCH 177/327] RDMA/cm: Allow ib_send_cm_drep() to be done under
 lock

The first thing ib_send_cm_drep() does is obtain the lock, so use the
usual unlocked wrapper, locked actor pattern here.

This avoids a sketchy lock/unlock sequence (which could allow state to
change) during cm_destroy_id().

Link: https://lore.kernel.org/r/20200310092545.251365-13-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 55 +++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 00bbfa244afb..8ed25341edd7 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -85,6 +85,8 @@ static void cm_add_one(struct ib_device *device);
 static void cm_remove_one(struct ib_device *device, void *client_data);
 static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
 			       const void *private_data, u8 private_data_len);
+static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
+			       void *private_data, u8 private_data_len);
 
 static struct ib_client cm_client = {
 	.name   = "cm",
@@ -1100,8 +1102,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		spin_unlock_irq(&cm_id_priv->lock);
 		break;
 	case IB_CM_DREQ_RCVD:
+		cm_send_drep_locked(cm_id_priv, NULL, 0);
 		spin_unlock_irq(&cm_id_priv->lock);
-		ib_send_cm_drep(cm_id, NULL, 0);
 		break;
 	default:
 		spin_unlock_irq(&cm_id_priv->lock);
@@ -2685,51 +2687,60 @@ static void cm_format_drep(struct cm_drep_msg *drep_msg,
 			    private_data_len);
 }
 
-int ib_send_cm_drep(struct ib_cm_id *cm_id,
-		    const void *private_data,
-		    u8 private_data_len)
+static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
+			       void *private_data, u8 private_data_len)
 {
-	struct cm_id_private *cm_id_priv;
 	struct ib_mad_send_buf *msg;
-	unsigned long flags;
-	void *data;
 	int ret;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
 		return -EINVAL;
 
-	data = cm_copy_private_data(private_data, private_data_len);
-	if (IS_ERR(data))
-		return PTR_ERR(data);
-
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	if (cm_id->state != IB_CM_DREQ_RCVD) {
-		pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n",
-			 __func__, be32_to_cpu(cm_id->local_id), cm_id->state);
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-		kfree(data);
+	if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+		pr_debug(
+			"%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n",
+			__func__, be32_to_cpu(cm_id_priv->id.local_id),
+			cm_id_priv->id.state);
+		kfree(private_data);
 		return -EINVAL;
 	}
 
-	cm_set_private_data(cm_id_priv, data, private_data_len);
+	cm_set_private_data(cm_id_priv, private_data, private_data_len);
 	cm_enter_timewait(cm_id_priv);
 
 	ret = cm_alloc_msg(cm_id_priv, &msg);
 	if (ret)
-		goto out;
+		return ret;
 
 	cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
 		       private_data, private_data_len);
 
 	ret = ib_post_send_mad(msg, NULL);
 	if (ret) {
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		cm_free_msg(msg);
 		return ret;
 	}
+	return 0;
+}
 
-out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = cm_send_drep_locked(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_drep);

From 81ddb41f876d488416e8aa5a740a4dbd01a83705 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:43 +0200
Subject: [PATCH 178/327] RDMA/cm: Allow ib_send_cm_rej() to be done under lock

The first thing ib_send_cm_rej() does is obtain the lock, so use the usual
unlocked wrapper, locked actor pattern here.

This avoids a sketchy lock/unlock sequence (which could allow state to
change) during cm_destroy_id().

While here simplify some of the logic in the implementation.

Link: https://lore.kernel.org/r/20200310092545.251365-14-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 92 ++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 8ed25341edd7..651e7c391762 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -87,6 +87,10 @@ static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
 			       const void *private_data, u8 private_data_len);
 static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
 			       void *private_data, u8 private_data_len);
+static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
+			      enum ib_cm_rej_reason reason, void *ari,
+			      u8 ari_length, const void *private_data,
+			      u8 private_data_len);
 
 static struct ib_client cm_client = {
 	.name   = "cm",
@@ -1060,11 +1064,11 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT,
+				   &cm_id_priv->id.device->node_guid,
+				   sizeof(cm_id_priv->id.device->node_guid),
+				   NULL, 0);
 		spin_unlock_irq(&cm_id_priv->lock);
-		ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
-			       &cm_id_priv->id.device->node_guid,
-			       sizeof cm_id_priv->id.device->node_guid,
-			       NULL, 0);
 		break;
 	case IB_CM_REQ_RCVD:
 		if (err == -ENOMEM) {
@@ -1072,9 +1076,10 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 			cm_reset_to_idle(cm_id_priv);
 			spin_unlock_irq(&cm_id_priv->lock);
 		} else {
+			cm_send_rej_locked(cm_id_priv,
+					   IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
+					   NULL, 0);
 			spin_unlock_irq(&cm_id_priv->lock);
-			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
-				       NULL, 0, NULL, 0);
 		}
 		break;
 	case IB_CM_REP_SENT:
@@ -1084,9 +1089,9 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	case IB_CM_MRA_REQ_SENT:
 	case IB_CM_REP_RCVD:
 	case IB_CM_MRA_REP_SENT:
+		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
+				   0, NULL, 0);
 		spin_unlock_irq(&cm_id_priv->lock);
-		ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
-			       NULL, 0, NULL, 0);
 		break;
 	case IB_CM_ESTABLISHED:
 		if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
@@ -2899,65 +2904,72 @@ static int cm_drep_handler(struct cm_work *work)
 	return -EINVAL;
 }
 
-int ib_send_cm_rej(struct ib_cm_id *cm_id,
-		   enum ib_cm_rej_reason reason,
-		   void *ari,
-		   u8 ari_length,
-		   const void *private_data,
-		   u8 private_data_len)
+static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
+			      enum ib_cm_rej_reason reason, void *ari,
+			      u8 ari_length, const void *private_data,
+			      u8 private_data_len)
 {
-	struct cm_id_private *cm_id_priv;
 	struct ib_mad_send_buf *msg;
-	unsigned long flags;
 	int ret;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
 	    (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
 		return -EINVAL;
 
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-
-	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	switch (cm_id->state) {
+	switch (cm_id_priv->id.state) {
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
 	case IB_CM_REQ_RCVD:
 	case IB_CM_MRA_REQ_SENT:
 	case IB_CM_REP_RCVD:
 	case IB_CM_MRA_REP_SENT:
-		ret = cm_alloc_msg(cm_id_priv, &msg);
-		if (!ret)
-			cm_format_rej((struct cm_rej_msg *) msg->mad,
-				      cm_id_priv, reason, ari, ari_length,
-				      private_data, private_data_len);
-
 		cm_reset_to_idle(cm_id_priv);
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			return ret;
+		cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
+			      ari, ari_length, private_data, private_data_len);
 		break;
 	case IB_CM_REP_SENT:
 	case IB_CM_MRA_REP_RCVD:
-		ret = cm_alloc_msg(cm_id_priv, &msg);
-		if (!ret)
-			cm_format_rej((struct cm_rej_msg *) msg->mad,
-				      cm_id_priv, reason, ari, ari_length,
-				      private_data, private_data_len);
-
 		cm_enter_timewait(cm_id_priv);
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			return ret;
+		cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
+			      ari, ari_length, private_data, private_data_len);
 		break;
 	default:
 		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
-			 be32_to_cpu(cm_id_priv->id.local_id), cm_id->state);
-		ret = -EINVAL;
-		goto out;
+			 be32_to_cpu(cm_id_priv->id.local_id),
+			 cm_id_priv->id.state);
+		return -EINVAL;
 	}
 
-	if (ret)
-		goto out;
-
 	ret = ib_post_send_mad(msg, NULL);
-	if (ret)
+	if (ret) {
 		cm_free_msg(msg);
+		return ret;
+	}
 
-out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason,
+		   void *ari, u8 ari_length, const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length,
+				 private_data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_rej);

From 6a8824a74bc9dccb2cae5caa993d2ec09f4694f2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:44 +0200
Subject: [PATCH 179/327] RDMA/cm: Allow ib_send_cm_sidr_rep() to be done under
 lock

The first thing ib_send_cm_sidr_rep() does is obtain the lock, so use the
usual unlocked wrapper, locked actor pattern here.

Get rid of the cm_reject_sidr_req() wrapper so each call site can call the
locked or unlocked version as required.

This avoids a sketchy lock/unlock sequence (which could allow state to
change) during cm_destroy_id().

Link: https://lore.kernel.org/r/20200310092545.251365-15-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 58 +++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 651e7c391762..9505b9ef4330 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL(ibcm_reject_msg);
 struct cm_id_private;
 static void cm_add_one(struct ib_device *device);
 static void cm_remove_one(struct ib_device *device, void *client_data);
+static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
+				   struct ib_cm_sidr_rep_param *param);
 static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
 			       const void *private_data, u8 private_data_len);
 static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
@@ -830,16 +832,6 @@ static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
 	return NULL;
 }
 
-static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
-			       enum ib_cm_sidr_status status)
-{
-	struct ib_cm_sidr_rep_param param;
-
-	memset(&param, 0, sizeof param);
-	param.status = status;
-	ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
-}
-
 static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
 					      ib_cm_handler cm_handler,
 					      void *context)
@@ -1058,8 +1050,10 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 		spin_unlock_irq(&cm_id_priv->lock);
 		break;
 	case IB_CM_SIDR_REQ_RCVD:
+		cm_send_sidr_rep_locked(cm_id_priv,
+					&(struct ib_cm_sidr_rep_param){
+						.status = IB_SIDR_REJECT });
 		spin_unlock_irq(&cm_id_priv->lock);
-		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
 		break;
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
@@ -3640,7 +3634,9 @@ static int cm_sidr_req_handler(struct cm_work *work)
 					   cm_id_priv->id.service_id);
 	if (!listen_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
-		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+		ib_send_cm_sidr_rep(&cm_id_priv->id,
+				    &(struct ib_cm_sidr_rep_param){
+					    .status = IB_SIDR_UNSUPPORTED });
 		goto out; /* No match. */
 	}
 	refcount_inc(&listen_cm_id_priv->refcount);
@@ -3694,50 +3690,52 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
 			    param->private_data, param->private_data_len);
 }
 
-int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
-			struct ib_cm_sidr_rep_param *param)
+static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
+				   struct ib_cm_sidr_rep_param *param)
 {
-	struct cm_id_private *cm_id_priv;
 	struct ib_mad_send_buf *msg;
-	unsigned long flags;
 	int ret;
 
+	lockdep_assert_held(&cm_id_priv->lock);
+
 	if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
 	    (param->private_data &&
 	     param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
 		return -EINVAL;
 
-	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-	spin_lock_irqsave(&cm_id_priv->lock, flags);
-	if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
-		ret = -EINVAL;
-		goto error;
-	}
+	if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD)
+		return -EINVAL;
 
 	ret = cm_alloc_msg(cm_id_priv, &msg);
 	if (ret)
-		goto error;
+		return ret;
 
 	cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
 			   param);
 	ret = ib_post_send_mad(msg, NULL);
 	if (ret) {
-		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 		cm_free_msg(msg);
 		return ret;
 	}
-	cm_id->state = IB_CM_IDLE;
-	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-
-	spin_lock_irqsave(&cm.lock, flags);
+	cm_id_priv->id.state = IB_CM_IDLE;
 	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
 		rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
 		RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
 	}
-	spin_unlock_irqrestore(&cm.lock, flags);
 	return 0;
+}
 
-error:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = cm_send_sidr_rep_locked(cm_id_priv, param);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 	return ret;
 }
 EXPORT_SYMBOL(ib_send_cm_sidr_rep);

From 67b3c8dceac6644322d5707687449f6dcbdff417 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Tue, 10 Mar 2020 11:25:45 +0200
Subject: [PATCH 180/327] RDMA/cm: Make sure the cm_id is in the IB_CM_IDLE
 state in destroy

The first switch statement in cm_destroy_id() tries to move the ID to
either IB_CM_IDLE or IB_CM_TIMEWAIT. Both states will block concurrent
MAD handlers from progressing.

Previous patches removed the unreliably lock/unlock sequences in this
flow, this patch removes the extra locking steps and adds the missing
parts to guarantee that destroy reaches IB_CM_IDLE. There is no point in
leaving the ID in the IB_CM_TIMEWAIT state the memory about to be kfreed.

Rework things to hold the lock across all the state transitions and
directly assert when done that it ended up in IB_CM_IDLE as expected.

This was accompanied by a careful audit of all the state transitions here,
which generally did end up in IDLE on their success and non-racy paths.

Link: https://lore.kernel.org/r/20200310092545.251365-16-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 41 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 9505b9ef4330..4794113ecd59 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1026,34 +1026,34 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	struct cm_work *work;
 
 	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-retest:
 	spin_lock_irq(&cm_id_priv->lock);
+retest:
 	switch (cm_id->state) {
 	case IB_CM_LISTEN:
-		spin_unlock_irq(&cm_id_priv->lock);
-
-		spin_lock_irq(&cm.lock);
+		spin_lock(&cm.lock);
 		if (--cm_id_priv->listen_sharecount > 0) {
 			/* The id is still shared. */
 			WARN_ON(refcount_read(&cm_id_priv->refcount) == 1);
+			spin_unlock(&cm.lock);
+			spin_unlock_irq(&cm_id_priv->lock);
 			cm_deref_id(cm_id_priv);
-			spin_unlock_irq(&cm.lock);
 			return;
 		}
+		cm_id->state = IB_CM_IDLE;
 		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
 		RB_CLEAR_NODE(&cm_id_priv->service_node);
-		spin_unlock_irq(&cm.lock);
+		spin_unlock(&cm.lock);
 		break;
 	case IB_CM_SIDR_REQ_SENT:
 		cm_id->state = IB_CM_IDLE;
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
-		spin_unlock_irq(&cm_id_priv->lock);
 		break;
 	case IB_CM_SIDR_REQ_RCVD:
 		cm_send_sidr_rep_locked(cm_id_priv,
 					&(struct ib_cm_sidr_rep_param){
 						.status = IB_SIDR_REJECT });
-		spin_unlock_irq(&cm_id_priv->lock);
+		/* cm_send_sidr_rep_locked will not move to IDLE if it fails */
+		cm_id->state = IB_CM_IDLE;
 		break;
 	case IB_CM_REQ_SENT:
 	case IB_CM_MRA_REQ_RCVD:
@@ -1062,18 +1062,15 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 				   &cm_id_priv->id.device->node_guid,
 				   sizeof(cm_id_priv->id.device->node_guid),
 				   NULL, 0);
-		spin_unlock_irq(&cm_id_priv->lock);
 		break;
 	case IB_CM_REQ_RCVD:
 		if (err == -ENOMEM) {
 			/* Do not reject to allow future retries. */
 			cm_reset_to_idle(cm_id_priv);
-			spin_unlock_irq(&cm_id_priv->lock);
 		} else {
 			cm_send_rej_locked(cm_id_priv,
 					   IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
 					   NULL, 0);
-			spin_unlock_irq(&cm_id_priv->lock);
 		}
 		break;
 	case IB_CM_REP_SENT:
@@ -1085,31 +1082,35 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	case IB_CM_MRA_REP_SENT:
 		cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
 				   0, NULL, 0);
-		spin_unlock_irq(&cm_id_priv->lock);
 		break;
 	case IB_CM_ESTABLISHED:
 		if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
-			spin_unlock_irq(&cm_id_priv->lock);
+			cm_id->state = IB_CM_IDLE;
 			break;
 		}
 		cm_send_dreq_locked(cm_id_priv, NULL, 0);
-		spin_unlock_irq(&cm_id_priv->lock);
 		goto retest;
 	case IB_CM_DREQ_SENT:
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
 		cm_enter_timewait(cm_id_priv);
-		spin_unlock_irq(&cm_id_priv->lock);
-		break;
+		goto retest;
 	case IB_CM_DREQ_RCVD:
 		cm_send_drep_locked(cm_id_priv, NULL, 0);
-		spin_unlock_irq(&cm_id_priv->lock);
+		WARN_ON(cm_id->state != IB_CM_TIMEWAIT);
+		goto retest;
+	case IB_CM_TIMEWAIT:
+		/*
+		 * The cm_acquire_id in cm_timewait_handler will stop working
+		 * once we do cm_free_id() below, so just move to idle here for
+		 * consistency.
+		 */
+		cm_id->state = IB_CM_IDLE;
 		break;
-	default:
-		spin_unlock_irq(&cm_id_priv->lock);
+	case IB_CM_IDLE:
 		break;
 	}
+	WARN_ON(cm_id->state != IB_CM_IDLE);
 
-	spin_lock_irq(&cm_id_priv->lock);
 	spin_lock(&cm.lock);
 	/* Required for cleanup paths related cm_req_handler() */
 	if (cm_id_priv->timewait_info) {

From 3cae58047c1343949fb20c0e142845133ce0a074 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 13 Mar 2020 09:33:25 -0700
Subject: [PATCH 181/327] RDMA/bnxt_re: Use ib_device_try_get()

There are a couple places in this driver running from a work queue that
need the ib_device to be registered. Instead of using a broken internal
bit rely on the new core code to guarantee device registration.

Link: https://lore.kernel.org/r/1584117207-2664-2-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h |  1 -
 drivers/infiniband/hw/bnxt_re/main.c    | 27 +++++++++++++------------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index c736e8254e75..407141ea0242 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -132,7 +132,6 @@ struct bnxt_re_dev {
 	struct list_head		list;
 	unsigned long			flags;
 #define BNXT_RE_FLAG_NETDEV_REGISTERED		0
-#define BNXT_RE_FLAG_IBDEV_REGISTERED		1
 #define BNXT_RE_FLAG_GOT_MSIX			2
 #define BNXT_RE_FLAG_HAVE_L2_REF		3
 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN		4
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 415693f8015a..885127c1e81b 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1171,12 +1171,13 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev)
 	u16 gid_idx, index;
 	int rc = 0;
 
-	if (!test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
+	if (!ib_device_try_get(&rdev->ibdev))
 		return 0;
 
 	if (!sgid_tbl) {
 		ibdev_err(&rdev->ibdev, "QPLIB: SGID table not allocated");
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	for (index = 0; index < sgid_tbl->active; index++) {
@@ -1196,7 +1197,8 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev)
 		rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx,
 					    rdev->qplib_res.netdev->dev_addr);
 	}
-
+out:
+	ib_device_put(&rdev->ibdev);
 	return rc;
 }
 
@@ -1319,7 +1321,6 @@ int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
 		pr_err("Failed to register with IB: %#x\n", rc);
 		return rc;
 	}
-	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	dev_info(rdev_to_dev(rdev), "Device registered successfully");
 	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
 			 &rdev->active_width);
@@ -1612,7 +1613,6 @@ static void bnxt_re_dealloc_driver(struct ib_device *ib_dev)
 	struct bnxt_re_dev *rdev =
 		container_of(ib_dev, struct bnxt_re_dev, ibdev);
 
-	clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
 	dev_info(rdev_to_dev(rdev), "Unregistering Device");
 
 	rtnl_lock();
@@ -1630,12 +1630,7 @@ static void bnxt_re_task(struct work_struct *work)
 	re_work = container_of(work, struct bnxt_re_work, work);
 	rdev = re_work->rdev;
 
-	if (re_work->event != NETDEV_REGISTER &&
-	    !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
-		goto done;
-
-	switch (re_work->event) {
-	case NETDEV_REGISTER:
+	if (re_work->event == NETDEV_REGISTER) {
 		rc = bnxt_re_ib_init(rdev);
 		if (rc) {
 			ibdev_err(&rdev->ibdev,
@@ -1645,7 +1640,13 @@ static void bnxt_re_task(struct work_struct *work)
 			rtnl_unlock();
 			goto exit;
 		}
-		break;
+		goto exit;
+	}
+
+	if (!ib_device_try_get(&rdev->ibdev))
+		goto exit;
+
+	switch (re_work->event) {
 	case NETDEV_UP:
 		bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1,
 				       IB_EVENT_PORT_ACTIVE);
@@ -1665,7 +1666,7 @@ static void bnxt_re_task(struct work_struct *work)
 	default:
 		break;
 	}
-done:
+	ib_device_put(&rdev->ibdev);
 	smp_mb__before_atomic();
 	atomic_dec(&rdev->sched_count);
 exit:

From 8a6c61704746d3a1e004e054504ae8d98ed95697 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 13 Mar 2020 09:33:26 -0700
Subject: [PATCH 182/327] RDMA/bnxt_re: Fix lifetimes in bnxt_re_task

A work queue cannot just rely on the ib_device not being freed, it must
hold a kref on the memory so that the BNXT_RE_FLAG_IBDEV_REGISTERED check
works.

Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver")
Link: https://lore.kernel.org/r/1584117207-2664-3-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 885127c1e81b..c494e11c3fe3 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1670,6 +1670,7 @@ static void bnxt_re_task(struct work_struct *work)
 	smp_mb__before_atomic();
 	atomic_dec(&rdev->sched_count);
 exit:
+	put_device(&rdev->ibdev.dev);
 	kfree(re_work);
 }
 
@@ -1735,6 +1736,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		/* Allocate for the deferred task */
 		re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC);
 		if (re_work) {
+			get_device(&rdev->ibdev.dev);
 			re_work->rdev = rdev;
 			re_work->event = event;
 			re_work->vlan_dev = (real_dev == netdev ?

From 4e88cef11d1913d1875c7870df4facbd096a062b Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Fri, 13 Mar 2020 09:33:27 -0700
Subject: [PATCH 183/327] RDMA/bnxt_re: Remove unnecessary sched count

Since the lifetime of bnxt_re_task is controlled by the kref of device,
sched_count is no longer required.  Remove it.

Link: https://lore.kernel.org/r/1584117207-2664-4-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 -
 drivers/infiniband/hw/bnxt_re/main.c    | 8 --------
 2 files changed, 9 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 407141ea0242..a300588634c5 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -176,7 +176,6 @@ struct bnxt_re_dev {
 	atomic_t			srq_count;
 	atomic_t			mr_count;
 	atomic_t			mw_count;
-	atomic_t			sched_count;
 	/* Max of 2 lossless traffic class supported per port */
 	u16				cosq[2];
 
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index c494e11c3fe3..4a8fb1ad74a8 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1667,8 +1667,6 @@ static void bnxt_re_task(struct work_struct *work)
 		break;
 	}
 	ib_device_put(&rdev->ibdev);
-	smp_mb__before_atomic();
-	atomic_dec(&rdev->sched_count);
 exit:
 	put_device(&rdev->ibdev.dev);
 	kfree(re_work);
@@ -1720,11 +1718,6 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 		break;
 
 	case NETDEV_UNREGISTER:
-		/* netdev notifier will call NETDEV_UNREGISTER again later since
-		 * we are still holding the reference to the netdev
-		 */
-		if (atomic_read(&rdev->sched_count) > 0)
-			goto exit;
 		ib_unregister_device_queued(&rdev->ibdev);
 		break;
 
@@ -1742,7 +1735,6 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier,
 			re_work->vlan_dev = (real_dev == netdev ?
 					     NULL : netdev);
 			INIT_WORK(&re_work->work, bnxt_re_task);
-			atomic_inc(&rdev->sched_count);
 			queue_work(bnxt_re_wq, &re_work->work);
 		}
 	}

From 6c6e39212b18bffd806423a45359c70221ce0d77 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 10 Mar 2020 19:18:00 +0800
Subject: [PATCH 184/327] RDMA/hns: Rename wqe buffer related functions

There are serval global functions related to wqe buffer in the hns driver
and are called in different files. These symbols cannot directly represent
the namespace they belong to. So add prefix 'hns_roce_' to 3 wqe buffer
related global functions: get_recv_wqe(), get_send_wqe(), and
get_send_extend_sge().

Link: https://lore.kernel.org/r/1583839084-31579-2-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  6 +++---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  |  9 +++++----
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 10 +++++-----
 drivers/infiniband/hw/hns/hns_roce_qp.c     |  6 +++---
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index d7dcf6ebc526..b6ae12dd4024 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1238,9 +1238,9 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
 int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		       int attr_mask, struct ib_udata *udata);
 void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
-void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n);
-void *get_send_wqe(struct hns_roce_qp *hr_qp, int n);
-void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n);
+void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, int n);
+void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, int n);
+void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, int n);
 bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
 			  struct ib_cq *ib_cq);
 enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index c05a905c518e..2e5304502a08 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -106,7 +106,7 @@ static int hns_roce_v1_post_send(struct ib_qp *ibqp,
 			goto out;
 		}
 
-		wqe = get_send_wqe(qp, wqe_idx);
+		wqe = hns_roce_get_send_wqe(qp, wqe_idx);
 		qp->sq.wrid[wqe_idx] = wr->wr_id;
 
 		/* Corresponding to the RC and RD type wqe process separately */
@@ -378,7 +378,7 @@ static int hns_roce_v1_post_recv(struct ib_qp *ibqp,
 			goto out;
 		}
 
-		ctrl = get_recv_wqe(hr_qp, wqe_idx);
+		ctrl = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
 
 		roce_set_field(ctrl->rwqe_byte_12,
 			       RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_M,
@@ -2284,9 +2284,10 @@ static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq,
 
 	if (is_send) {
 		/* SQ conrespond to CQE */
-		sq_wqe = get_send_wqe(*cur_qp, roce_get_field(cqe->cqe_byte_4,
+		sq_wqe = hns_roce_get_send_wqe(*cur_qp,
+						roce_get_field(cqe->cqe_byte_4,
 						CQE_BYTE_4_WQE_INDEX_M,
-						CQE_BYTE_4_WQE_INDEX_S)&
+						CQE_BYTE_4_WQE_INDEX_S) &
 						((*cur_qp)->sq.wqe_cnt-1));
 		switch (le32_to_cpu(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) {
 		case HNS_ROCE_WQE_OPCODE_SEND:
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 82021fa46d9d..88d671a976d3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -127,7 +127,7 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
 	 * should calculate how many sges in the first page and the second
 	 * page.
 	 */
-	dseg = get_send_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1));
+	dseg = hns_roce_get_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1));
 	fi_sge_num = (round_up((uintptr_t)dseg, 1 << shift) -
 		      (uintptr_t)dseg) /
 		      sizeof(struct hns_roce_v2_wqe_data_seg);
@@ -137,7 +137,7 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
 			set_data_seg_v2(dseg++, sg + i);
 			(*sge_ind)++;
 		}
-		dseg = get_send_extend_sge(qp,
+		dseg = hns_roce_get_extend_sge(qp,
 					   (*sge_ind) & (qp->sge.sge_cnt - 1));
 		for (i = 0; i < se_sge_num; i++) {
 			set_data_seg_v2(dseg++, sg + fi_sge_num + i);
@@ -329,7 +329,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 			goto out;
 		}
 
-		wqe = get_send_wqe(qp, wqe_idx);
+		wqe = hns_roce_get_send_wqe(qp, wqe_idx);
 		qp->sq.wrid[wqe_idx] = wr->wr_id;
 		owner_bit =
 		       ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1);
@@ -676,7 +676,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 			goto out;
 		}
 
-		wqe = get_recv_wqe(hr_qp, wqe_idx);
+		wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
 		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
 		for (i = 0; i < wr->num_sge; i++) {
 			if (!wr->sg_list[i].length)
@@ -2935,7 +2935,7 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe,
 
 	sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list;
 	sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt;
-	wqe_buf = get_recv_wqe(*cur_qp, wr_cnt);
+	wqe_buf = hns_roce_get_recv_wqe(*cur_qp, wr_cnt);
 	data_len = wc->byte_len;
 
 	for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) {
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 5a28d62008e4..c2ea489b0df5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1470,17 +1470,17 @@ static void *get_wqe(struct hns_roce_qp *hr_qp, int offset)
 	return hns_roce_buf_offset(&hr_qp->hr_buf, offset);
 }
 
-void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, int n)
 {
 	return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift));
 }
 
-void *get_send_wqe(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, int n)
 {
 	return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift));
 }
 
-void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, int n)
 {
 	return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset +
 					(n << hr_qp->sge.sge_shift));

From 00a59d30f3f953b4dbfffcae96e927e62533e6cf Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 10 Mar 2020 19:18:01 +0800
Subject: [PATCH 185/327] RDMA/hns: Optimize wqe buffer filling process for
 post send

Encapsulates the wqe buffer process details for datagram seg, fast mr seg
and atomic seg.

Link: https://lore.kernel.org/r/1583839084-31579-3-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 63 +++++++++++-----------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 88d671a976d3..c8c345f84c9e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -57,10 +57,10 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 }
 
 static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
-			 struct hns_roce_wqe_frmr_seg *fseg,
-			 const struct ib_reg_wr *wr)
+			 void *wqe, const struct ib_reg_wr *wr)
 {
 	struct hns_roce_mr *mr = to_hr_mr(wr->mr);
+	struct hns_roce_wqe_frmr_seg *fseg = wqe;
 
 	/* use ib_access_flags */
 	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S,
@@ -92,16 +92,26 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
 		     V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0);
 }
 
-static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg,
-			   const struct ib_atomic_wr *wr)
+static void set_atomic_seg(const struct ib_send_wr *wr, void *wqe,
+			   struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			   int valid_num_sge)
 {
-	if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-		aseg->fetchadd_swap_data = cpu_to_le64(wr->swap);
-		aseg->cmp_data  = cpu_to_le64(wr->compare_add);
+	struct hns_roce_wqe_atomic_seg *aseg;
+
+	set_data_seg_v2(wqe, wr->sg_list);
+	aseg = wqe + sizeof(struct hns_roce_v2_wqe_data_seg);
+
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->fetchadd_swap_data = cpu_to_le64(atomic_wr(wr)->swap);
+		aseg->cmp_data = cpu_to_le64(atomic_wr(wr)->compare_add);
 	} else {
-		aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add);
+		aseg->fetchadd_swap_data =
+			cpu_to_le64(atomic_wr(wr)->compare_add);
 		aseg->cmp_data  = 0;
 	}
+
+	roce_set_field(rc_sq_wqe->byte_16, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+		       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
 }
 
 static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
@@ -154,11 +164,11 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
 static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			     struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
 			     void *wqe, unsigned int *sge_ind,
-			     int valid_num_sge,
-			     const struct ib_send_wr **bad_wr)
+			     int valid_num_sge)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_v2_wqe_data_seg *dseg = wqe;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	int j = 0;
 	int i;
@@ -166,15 +176,14 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	if (wr->send_flags & IB_SEND_INLINE && valid_num_sge) {
 		if (le32_to_cpu(rc_sq_wqe->msg_len) >
 		    hr_dev->caps.max_sq_inline) {
-			*bad_wr = wr;
-			dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal",
-				rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline);
+			ibdev_err(ibdev, "inline len(1-%d)=%d, illegal",
+				  rc_sq_wqe->msg_len,
+				  hr_dev->caps.max_sq_inline);
 			return -EINVAL;
 		}
 
 		if (wr->opcode == IB_WR_RDMA_READ) {
-			*bad_wr =  wr;
-			dev_err(hr_dev->dev, "Not support inline data!\n");
+			ibdev_err(ibdev, "Not support inline data!\n");
 			return -EINVAL;
 		}
 
@@ -285,7 +294,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
 	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
-	struct hns_roce_wqe_frmr_seg *fseg;
 	struct device *dev = hr_dev->dev;
 	unsigned int owner_bit;
 	unsigned int sge_idx;
@@ -547,8 +555,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				break;
 			case IB_WR_REG_MR:
 				hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR;
-				fseg = wqe;
-				set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr));
+				set_frmr_seg(rc_sq_wqe, wqe, reg_wr(wr));
 				break;
 			case IB_WR_ATOMIC_CMP_AND_SWP:
 				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP;
@@ -582,23 +589,17 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op);
 
 			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
-				struct hns_roce_v2_wqe_data_seg *dseg;
-
-				dseg = wqe;
-				set_data_seg_v2(dseg, wr->sg_list);
-				wqe += sizeof(struct hns_roce_v2_wqe_data_seg);
-				set_atomic_seg(wqe, atomic_wr(wr));
-				roce_set_field(rc_sq_wqe->byte_16,
-					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
-					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
+			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+				set_atomic_seg(wr, wqe, rc_sq_wqe,
 					       valid_num_sge);
-			} else if (wr->opcode != IB_WR_REG_MR) {
+			else if (wr->opcode != IB_WR_REG_MR) {
 				ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe,
 							wqe, &sge_idx,
-							valid_num_sge, bad_wr);
-				if (ret)
+							valid_num_sge);
+				if (ret) {
+					*bad_wr = wr;
 					goto out;
+				}
 			}
 		} else {
 			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);

From e363f7de4e60eedbf7af126ed14d10ffb110c697 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 10 Mar 2020 19:18:02 +0800
Subject: [PATCH 186/327] RDMA/hns: Optimize the wr opcode conversion from ib
 to hns

Simplify the wr opcode conversion from ib to hns by using a map table
instead of the switch-case statement.

Link: https://lore.kernel.org/r/1583839084-31579-4-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 63 ++++++++++++----------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index c8c345f84c9e..c813c7419932 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -56,6 +56,40 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 	dseg->len  = cpu_to_le32(sg->length);
 }
 
+/*
+ * mapped-value = 1 + real-value
+ * The hns wr opcode real value is start from 0, In order to distinguish between
+ * initialized and uninitialized map values, we plus 1 to the actual value when
+ * defining the mapping, so that the validity can be identified by checking the
+ * mapped value is greater than 0.
+ */
+#define HR_OPC_MAP(ib_key, hr_key) \
+		[IB_WR_ ## ib_key] = 1 + HNS_ROCE_V2_WQE_OP_ ## hr_key
+
+static const u32 hns_roce_op_code[] = {
+	HR_OPC_MAP(RDMA_WRITE,			RDMA_WRITE),
+	HR_OPC_MAP(RDMA_WRITE_WITH_IMM,		RDMA_WRITE_WITH_IMM),
+	HR_OPC_MAP(SEND,			SEND),
+	HR_OPC_MAP(SEND_WITH_IMM,		SEND_WITH_IMM),
+	HR_OPC_MAP(RDMA_READ,			RDMA_READ),
+	HR_OPC_MAP(ATOMIC_CMP_AND_SWP,		ATOM_CMP_AND_SWAP),
+	HR_OPC_MAP(ATOMIC_FETCH_AND_ADD,	ATOM_FETCH_AND_ADD),
+	HR_OPC_MAP(SEND_WITH_INV,		SEND_WITH_INV),
+	HR_OPC_MAP(LOCAL_INV,			LOCAL_INV),
+	HR_OPC_MAP(MASKED_ATOMIC_CMP_AND_SWP,	ATOM_MSK_CMP_AND_SWAP),
+	HR_OPC_MAP(MASKED_ATOMIC_FETCH_AND_ADD,	ATOM_MSK_FETCH_AND_ADD),
+	HR_OPC_MAP(REG_MR,			FAST_REG_PMR),
+};
+
+static u32 to_hr_opcode(u32 ib_opcode)
+{
+	if (ib_opcode >= ARRAY_SIZE(hns_roce_op_code))
+		return HNS_ROCE_V2_WQE_OP_MASK;
+
+	return hns_roce_op_code[ib_opcode] ? hns_roce_op_code[ib_opcode] - 1 :
+					     HNS_ROCE_V2_WQE_OP_MASK;
+}
+
 static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
 			 void *wqe, const struct ib_reg_wr *wr)
 {
@@ -303,7 +337,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 	void *wqe = NULL;
 	bool loopback;
 	u32 tmp_len;
-	u32 hr_op;
 	u8 *smac;
 	int nreq;
 	int ret;
@@ -517,76 +550,52 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
 			switch (wr->opcode) {
 			case IB_WR_RDMA_READ:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
 			case IB_WR_RDMA_WRITE_WITH_IMM:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(rdma_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(rdma_wr(wr)->remote_addr);
 				break;
-			case IB_WR_SEND:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND;
-				break;
-			case IB_WR_SEND_WITH_INV:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV;
-				break;
-			case IB_WR_SEND_WITH_IMM:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM;
-				break;
 			case IB_WR_LOCAL_INV:
-				hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV;
 				roce_set_bit(rc_sq_wqe->byte_4,
 					       V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
 				rc_sq_wqe->inv_key =
 					    cpu_to_le32(wr->ex.invalidate_rkey);
 				break;
 			case IB_WR_REG_MR:
-				hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR;
 				set_frmr_seg(rc_sq_wqe, wqe, reg_wr(wr));
 				break;
 			case IB_WR_ATOMIC_CMP_AND_SWP:
-				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(atomic_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
 			case IB_WR_ATOMIC_FETCH_AND_ADD:
-				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD;
 				rc_sq_wqe->rkey =
 					cpu_to_le32(atomic_wr(wr)->rkey);
 				rc_sq_wqe->va =
 					cpu_to_le64(atomic_wr(wr)->remote_addr);
 				break;
-			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-				hr_op =
-				       HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP;
-				break;
-			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
-				hr_op =
-				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD;
-				break;
 			default:
-				hr_op = HNS_ROCE_V2_WQE_OP_MASK;
 				break;
 			}
 
 			roce_set_field(rc_sq_wqe->byte_4,
 				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op);
+				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+				       to_hr_opcode(wr->opcode));
 
 			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
 			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)

From 1133401412a9c05cbc2b697e692c578419c14a7c Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 10 Mar 2020 19:18:03 +0800
Subject: [PATCH 187/327] RDMA/hns: Optimize base address table config flow for
 qp buffer

Currently, before the qp is created, a page size needs to be calculated
for the base address table to store all base addresses in the mtr. As a
result, the parameter configuration of the mtr is complex. So integrate
the process of calculating the base table page size into the hem related
interface to simplify the process of using mtr.

Link: https://lore.kernel.org/r/1583839084-31579-5-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_device.h |  4 --
 drivers/infiniband/hw/hns/hns_roce_hem.c    | 16 +++++---
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 42 +++++----------------
 3 files changed, 21 insertions(+), 41 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index b6ae12dd4024..f6b3cf6b95d6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -669,10 +669,6 @@ struct hns_roce_qp {
 	struct ib_umem		*umem;
 	struct hns_roce_mtt	mtt;
 	struct hns_roce_mtr	mtr;
-
-	/* this define must less than HNS_ROCE_MAX_BT_REGION */
-#define HNS_ROCE_WQE_REGION_MAX	 3
-	struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX];
 	int                     wqe_bt_pg_shift;
 
 	u32			buff_size;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index e82215774032..8380d7187494 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -1383,6 +1383,7 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
 	void *cpu_base;
 	u64 phy_base;
 	int ret = 0;
+	int ba_num;
 	int offset;
 	int total;
 	int step;
@@ -1393,12 +1394,16 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
 	if (root_hem)
 		return 0;
 
+	ba_num = hns_roce_hem_list_calc_root_ba(regions, region_cnt, unit);
+	if (ba_num < 1)
+		return -ENOMEM;
+
 	INIT_LIST_HEAD(&temp_root);
-	total = r->offset;
+	offset = r->offset;
 	/* indicate to last region */
 	r = &regions[region_cnt - 1];
-	root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1,
-				       unit, true, 0);
+	root_hem = hem_list_alloc_item(hr_dev, offset, r->offset + r->count - 1,
+				       ba_num, true, 0);
 	if (!root_hem)
 		return -ENOMEM;
 	list_add(&root_hem->list, &temp_root);
@@ -1410,7 +1415,7 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
 		INIT_LIST_HEAD(&temp_list[i]);
 
 	total = 0;
-	for (i = 0; i < region_cnt && total < unit; i++) {
+	for (i = 0; i < region_cnt && total < ba_num; i++) {
 		r = &regions[i];
 		if (!r->count)
 			continue;
@@ -1443,7 +1448,8 @@ static int hem_list_alloc_root_bt(struct hns_roce_dev *hr_dev,
 			/* if exist mid bt, link L1 to L0 */
 			list_for_each_entry_safe(hem, temp_hem,
 					  &hem_list->mid_bt[i][1], list) {
-				offset = hem->start / step * BA_BYTE_LEN;
+				offset = (hem->start - r->offset) / step *
+					  BA_BYTE_LEN;
 				hem_list_link_bt(hr_dev, cpu_base + offset,
 						 hem->dma_addr);
 				total++;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index c2ea489b0df5..7ba3db5a6192 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -579,30 +579,6 @@ static int split_wqe_buf_region(struct hns_roce_dev *hr_dev,
 	return region_cnt;
 }
 
-static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_buf_region *regions,
-				  int region_cnt)
-{
-	int bt_pg_shift;
-	int ba_num;
-	int ret;
-
-	bt_pg_shift = PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz;
-
-	/* all root ba entries must in one bt page */
-	do {
-		ba_num = (1 << bt_pg_shift) / BA_BYTE_LEN;
-		ret = hns_roce_hem_list_calc_root_ba(regions, region_cnt,
-						     ba_num);
-		if (ret <= ba_num)
-			break;
-
-		bt_pg_shift++;
-	} while (ret > ba_num);
-
-	return bt_pg_shift - PAGE_SHIFT;
-}
-
 static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
 				struct hns_roce_qp *hr_qp)
 {
@@ -768,7 +744,10 @@ static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
 static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		       u32 page_shift, bool is_user)
 {
-	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
+/* WQE buffer include 3 parts: SQ, extend SGE and RQ. */
+#define HNS_ROCE_WQE_REGION_MAX	 3
+	struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX] = {};
+	dma_addr_t *buf_list[HNS_ROCE_WQE_REGION_MAX] = {};
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_buf_region *r;
 	int region_count;
@@ -776,18 +755,18 @@ static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 	int ret;
 	int i;
 
-	region_count = split_wqe_buf_region(hr_dev, hr_qp, hr_qp->regions,
-					ARRAY_SIZE(hr_qp->regions), page_shift);
+	region_count = split_wqe_buf_region(hr_dev, hr_qp, regions,
+					    ARRAY_SIZE(regions), page_shift);
 
 	/* alloc a tmp list to store WQE buffers address */
-	ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list, region_count);
+	ret = hns_roce_alloc_buf_list(regions, buf_list, region_count);
 	if (ret) {
 		ibdev_err(ibdev, "Failed to alloc WQE buffer list\n");
 		return ret;
 	}
 
 	for (i = 0; i < region_count; i++) {
-		r = &hr_qp->regions[i];
+		r = &regions[i];
 		if (is_user)
 			buf_count = hns_roce_get_umem_bufs(hr_dev, buf_list[i],
 					r->count, r->offset, hr_qp->umem,
@@ -805,11 +784,10 @@ static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 		}
 	}
 
-	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
-							region_count);
+	hr_qp->wqe_bt_pg_shift = hr_dev->caps.mtt_ba_pg_sz;
 	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
 			  page_shift);
-	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, hr_qp->regions,
+	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, regions,
 				  region_count);
 	if (ret)
 		ibdev_err(ibdev, "Failed to attach WQE's mtr\n");

From d6a3627e311c5488539150614fedaea5e3103235 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 10 Mar 2020 19:18:04 +0800
Subject: [PATCH 188/327] RDMA/hns: Optimize wqe buffer set flow for post send

Splits hns_roce_v2_post_send() into three sub-functions: set_rc_wqe(),
set_ud_wqe() and update_sq_db() to simplify the code.

Link: https://lore.kernel.org/r/1583839084-31579-6-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 472 ++++++++++-----------
 1 file changed, 224 insertions(+), 248 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index c813c7419932..9bd8fbf2e96b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -287,6 +287,214 @@ static int check_send_valid(struct hns_roce_dev *hr_dev,
 	return 0;
 }
 
+static inline int calc_wr_sge_num(const struct ib_send_wr *wr, u32 *sge_len)
+{
+	int valid_num = 0;
+	u32 len = 0;
+	int i;
+
+	for (i = 0; i < wr->num_sge; i++) {
+		if (likely(wr->sg_list[i].length)) {
+			len += wr->sg_list[i].length;
+			valid_num++;
+		}
+	}
+
+	*sge_len = len;
+	return valid_num;
+}
+
+static inline int set_ud_wqe(struct hns_roce_qp *qp,
+			     const struct ib_send_wr *wr,
+			     void *wqe, unsigned int *sge_idx,
+			     unsigned int owner_bit)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(qp->ibqp.device);
+	struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah);
+	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe = wqe;
+	unsigned int curr_idx = *sge_idx;
+	int valid_num_sge;
+	u32 msg_len = 0;
+	bool loopback;
+	u8 *smac;
+
+	valid_num_sge = calc_wr_sge_num(wr, &msg_len);
+	memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe));
+
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M,
+		       V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M,
+		       V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M,
+		       V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M,
+		       V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]);
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_DMAC_4_M,
+		       V2_UD_SEND_WQE_BYTE_48_DMAC_4_S, ah->av.mac[4]);
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_DMAC_5_M,
+		       V2_UD_SEND_WQE_BYTE_48_DMAC_5_S, ah->av.mac[5]);
+
+	/* MAC loopback */
+	smac = (u8 *)hr_dev->dev_addr[qp->port];
+	loopback = ether_addr_equal_unaligned(ah->av.mac, smac) ? 1 : 0;
+
+	roce_set_bit(ud_sq_wqe->byte_40,
+		     V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback);
+
+	roce_set_field(ud_sq_wqe->byte_4,
+		       V2_UD_SEND_WQE_BYTE_4_OPCODE_M,
+		       V2_UD_SEND_WQE_BYTE_4_OPCODE_S,
+		       HNS_ROCE_V2_WQE_OP_SEND);
+
+	ud_sq_wqe->msg_len = cpu_to_le32(msg_len);
+
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		ud_sq_wqe->immtdata = cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
+		break;
+	default:
+		ud_sq_wqe->immtdata = 0;
+		break;
+	}
+
+	/* Set sig attr */
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_CQE_S,
+		     (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+
+	/* Set se attr */
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_SE_S,
+		     (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_OWNER_S,
+		     owner_bit);
+
+	roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_PD_M,
+		       V2_UD_SEND_WQE_BYTE_16_PD_S, to_hr_pd(qp->ibqp.pd)->pdn);
+
+	roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M,
+		       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
+
+	roce_set_field(ud_sq_wqe->byte_20,
+		       V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
+		       V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
+		       curr_idx & (qp->sge.sge_cnt - 1));
+
+	roce_set_field(ud_sq_wqe->byte_24, V2_UD_SEND_WQE_BYTE_24_UDPSPN_M,
+		       V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0);
+	ud_sq_wqe->qkey = cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ?
+			  qp->qkey : ud_wr(wr)->remote_qkey);
+	roce_set_field(ud_sq_wqe->byte_32, V2_UD_SEND_WQE_BYTE_32_DQPN_M,
+		       V2_UD_SEND_WQE_BYTE_32_DQPN_S, ud_wr(wr)->remote_qpn);
+
+	roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_VLAN_M,
+		       V2_UD_SEND_WQE_BYTE_36_VLAN_S, ah->av.vlan_id);
+	roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M,
+		       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, ah->av.hop_limit);
+	roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
+		       V2_UD_SEND_WQE_BYTE_36_TCLASS_S, ah->av.tclass);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
+		       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, ah->av.flowlabel);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_SL_M,
+		       V2_UD_SEND_WQE_BYTE_40_SL_S, ah->av.sl);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_PORTN_M,
+		       V2_UD_SEND_WQE_BYTE_40_PORTN_S, qp->port);
+
+	roce_set_bit(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
+		     ah->av.vlan_en ? 1 : 0);
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
+		       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, ah->av.gid_index);
+
+	memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN_V2);
+
+	set_extend_sge(qp, wr, &curr_idx, valid_num_sge);
+
+	*sge_idx = curr_idx;
+
+	return 0;
+}
+
+static inline int set_rc_wqe(struct hns_roce_qp *qp,
+			     const struct ib_send_wr *wr,
+			     void *wqe, unsigned int *sge_idx,
+			     unsigned int owner_bit)
+{
+	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe;
+	unsigned int curr_idx = *sge_idx;
+	int valid_num_sge;
+	u32 msg_len = 0;
+	int ret = 0;
+
+	valid_num_sge = calc_wr_sge_num(wr, &msg_len);
+	memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
+
+	rc_sq_wqe->msg_len = cpu_to_le32(msg_len);
+
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		rc_sq_wqe->immtdata = cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
+		break;
+	case IB_WR_SEND_WITH_INV:
+		rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey);
+		break;
+	default:
+		rc_sq_wqe->immtdata = 0;
+		break;
+	}
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S,
+		     (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S,
+		     (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S,
+		     (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S,
+		     owner_bit);
+
+	wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
+	switch (wr->opcode) {
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey);
+		rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr);
+		break;
+	case IB_WR_LOCAL_INV:
+		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
+		rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey);
+		break;
+	case IB_WR_REG_MR:
+		set_frmr_seg(rc_sq_wqe, wqe, reg_wr(wr));
+		break;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		rc_sq_wqe->rkey = cpu_to_le32(atomic_wr(wr)->rkey);
+		rc_sq_wqe->va = cpu_to_le64(atomic_wr(wr)->remote_addr);
+		break;
+	default:
+		break;
+	}
+
+	roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+		       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
+		       to_hr_opcode(wr->opcode));
+
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+	    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+		set_atomic_seg(wr, wqe, rc_sq_wqe, valid_num_sge);
+	else if (wr->opcode != IB_WR_REG_MR)
+		ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe,
+					wqe, &curr_idx, valid_num_sge);
+
+	*sge_idx = curr_idx;
+
+	return ret;
+}
+
 static inline void update_sq_db(struct hns_roce_dev *hr_dev,
 				struct hns_roce_qp *qp)
 {
@@ -324,23 +532,15 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				 const struct ib_send_wr **bad_wr)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
-	struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah);
-	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
-	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
-	struct device *dev = hr_dev->dev;
+	unsigned long flags = 0;
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
-	unsigned long flags;
-	int valid_num_sge;
 	void *wqe = NULL;
-	bool loopback;
-	u32 tmp_len;
-	u8 *smac;
 	int nreq;
 	int ret;
-	int i;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
@@ -363,8 +563,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 		wqe_idx = (qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1);
 
 		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
-			dev_err(dev, "num_sge=%d > qp->sq.max_gs=%d\n",
-				wr->num_sge, qp->sq.max_gs);
+			ibdev_err(ibdev, "num_sge=%d > qp->sq.max_gs=%d\n",
+				  wr->num_sge, qp->sq.max_gs);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			goto out;
@@ -374,248 +574,24 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 		qp->sq.wrid[wqe_idx] = wr->wr_id;
 		owner_bit =
 		       ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1);
-		valid_num_sge = 0;
-		tmp_len = 0;
-
-		for (i = 0; i < wr->num_sge; i++) {
-			if (likely(wr->sg_list[i].length)) {
-				tmp_len += wr->sg_list[i].length;
-				valid_num_sge++;
-			}
-		}
 
 		/* Corresponding to the QP type, wqe process separately */
-		if (ibqp->qp_type == IB_QPT_GSI) {
-			ud_sq_wqe = wqe;
-			memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe));
-
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M,
-				       V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M,
-				       V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M,
-				       V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M,
-				       V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_M,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_S,
-				       ah->av.mac[4]);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_M,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_S,
-				       ah->av.mac[5]);
-
-			/* MAC loopback */
-			smac = (u8 *)hr_dev->dev_addr[qp->port];
-			loopback = ether_addr_equal_unaligned(ah->av.mac,
-							      smac) ? 1 : 0;
-
-			roce_set_bit(ud_sq_wqe->byte_40,
-				     V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback);
-
-			roce_set_field(ud_sq_wqe->byte_4,
-				       V2_UD_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_UD_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND);
-
-			ud_sq_wqe->msg_len =
-			 cpu_to_le32(le32_to_cpu(ud_sq_wqe->msg_len) + tmp_len);
-
-			switch (wr->opcode) {
-			case IB_WR_SEND_WITH_IMM:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				ud_sq_wqe->immtdata =
-				      cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
-				break;
-			default:
-				ud_sq_wqe->immtdata = 0;
-				break;
-			}
-
-			/* Set sig attr */
-			roce_set_bit(ud_sq_wqe->byte_4,
-				   V2_UD_SEND_WQE_BYTE_4_CQE_S,
-				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
-
-			/* Set se attr */
-			roce_set_bit(ud_sq_wqe->byte_4,
-				  V2_UD_SEND_WQE_BYTE_4_SE_S,
-				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
-
-			roce_set_bit(ud_sq_wqe->byte_4,
-				     V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
-
-			roce_set_field(ud_sq_wqe->byte_16,
-				       V2_UD_SEND_WQE_BYTE_16_PD_M,
-				       V2_UD_SEND_WQE_BYTE_16_PD_S,
-				       to_hr_pd(ibqp->pd)->pdn);
-
-			roce_set_field(ud_sq_wqe->byte_16,
-				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M,
-				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S,
-				       valid_num_sge);
-
-			roce_set_field(ud_sq_wqe->byte_20,
-				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
-				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
-				     sge_idx & (qp->sge.sge_cnt - 1));
-
-			roce_set_field(ud_sq_wqe->byte_24,
-				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_M,
-				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0);
-			ud_sq_wqe->qkey =
-			     cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ?
-			     qp->qkey : ud_wr(wr)->remote_qkey);
-			roce_set_field(ud_sq_wqe->byte_32,
-				       V2_UD_SEND_WQE_BYTE_32_DQPN_M,
-				       V2_UD_SEND_WQE_BYTE_32_DQPN_S,
-				       ud_wr(wr)->remote_qpn);
-
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_VLAN_M,
-				       V2_UD_SEND_WQE_BYTE_36_VLAN_S,
-				       ah->av.vlan_id);
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M,
-				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S,
-				       ah->av.hop_limit);
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
-				       V2_UD_SEND_WQE_BYTE_36_TCLASS_S,
-				       ah->av.tclass);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
-				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S,
-				       ah->av.flowlabel);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_SL_M,
-				       V2_UD_SEND_WQE_BYTE_40_SL_S,
-				       ah->av.sl);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_PORTN_M,
-				       V2_UD_SEND_WQE_BYTE_40_PORTN_S,
-				       qp->port);
-
-			roce_set_bit(ud_sq_wqe->byte_40,
-				     V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
-				     ah->av.vlan_en ? 1 : 0);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
-				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S,
-				       hns_get_gid_index(hr_dev, qp->phy_port,
-							 ah->av.gid_index));
-
-			memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0],
-			       GID_LEN_V2);
-
-			set_extend_sge(qp, wr, &sge_idx, valid_num_sge);
-		} else if (ibqp->qp_type == IB_QPT_RC) {
-			rc_sq_wqe = wqe;
-			memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
-
-			rc_sq_wqe->msg_len =
-			 cpu_to_le32(le32_to_cpu(rc_sq_wqe->msg_len) + tmp_len);
-
-			switch (wr->opcode) {
-			case IB_WR_SEND_WITH_IMM:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				rc_sq_wqe->immtdata =
-				      cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
-				break;
-			case IB_WR_SEND_WITH_INV:
-				rc_sq_wqe->inv_key =
-					cpu_to_le32(wr->ex.invalidate_rkey);
-				break;
-			default:
-				rc_sq_wqe->immtdata = 0;
-				break;
-			}
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_FENCE_S,
-				     (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				  V2_RC_SEND_WQE_BYTE_4_SE_S,
-				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				   V2_RC_SEND_WQE_BYTE_4_CQE_S,
-				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
-
-			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
-			switch (wr->opcode) {
-			case IB_WR_RDMA_READ:
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_RDMA_WRITE:
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_LOCAL_INV:
-				roce_set_bit(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
-				rc_sq_wqe->inv_key =
-					    cpu_to_le32(wr->ex.invalidate_rkey);
-				break;
-			case IB_WR_REG_MR:
-				set_frmr_seg(rc_sq_wqe, wqe, reg_wr(wr));
-				break;
-			case IB_WR_ATOMIC_CMP_AND_SWP:
-				rc_sq_wqe->rkey =
-					cpu_to_le32(atomic_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(atomic_wr(wr)->remote_addr);
-				break;
-			case IB_WR_ATOMIC_FETCH_AND_ADD:
-				rc_sq_wqe->rkey =
-					cpu_to_le32(atomic_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(atomic_wr(wr)->remote_addr);
-				break;
-			default:
-				break;
-			}
-
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S,
-				       to_hr_opcode(wr->opcode));
-
-			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
-				set_atomic_seg(wr, wqe, rc_sq_wqe,
-					       valid_num_sge);
-			else if (wr->opcode != IB_WR_REG_MR) {
-				ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe,
-							wqe, &sge_idx,
-							valid_num_sge);
-				if (ret) {
-					*bad_wr = wr;
-					goto out;
-				}
-			}
-		} else {
-			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);
+		if (ibqp->qp_type == IB_QPT_GSI)
+			ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit);
+		else if (ibqp->qp_type == IB_QPT_RC)
+			ret = set_rc_wqe(qp, wr, wqe, &sge_idx, owner_bit);
+		else {
+			ibdev_err(ibdev, "Illegal qp_type(0x%x)\n",
+				  ibqp->qp_type);
 			spin_unlock_irqrestore(&qp->sq.lock, flags);
 			*bad_wr = wr;
 			return -EOPNOTSUPP;
 		}
+
+		if (ret) {
+			*bad_wr = wr;
+			goto out;
+		}
 	}
 
 out:

From 4b34e23f4eaa236b918886fb90f468a6aa04997f Mon Sep 17 00:00:00 2001
From: "Sindhu, Devale" <sindhu.devale@intel.com>
Date: Fri, 13 Mar 2020 16:44:06 -0500
Subject: [PATCH 189/327] i40iw: Report correct firmware version

The driver uses a hard-coded value for FW version and reports an
inconsistent FW version between ibv_devinfo and
/sys/class/infiniband/i40iw/fw_ver.

Retrieve the FW version via a Control QP (CQP) operation and report it
consistently across sysfs and query device.

Fixes: d37498417947 ("i40iw: add files for iwarp interface")
Link: https://lore.kernel.org/r/20200313214406.2159-1-shiraz.saleem@intel.com
Reported-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Sindhu, Devale <sindhu.devale@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/i40iw/i40iw.h        | 22 ++++-
 drivers/infiniband/hw/i40iw/i40iw_ctrl.c   | 96 ++++++++++++++++++++++
 drivers/infiniband/hw/i40iw/i40iw_d.h      | 26 +++++-
 drivers/infiniband/hw/i40iw/i40iw_main.c   |  6 ++
 drivers/infiniband/hw/i40iw/i40iw_p.h      |  1 +
 drivers/infiniband/hw/i40iw/i40iw_status.h |  3 +-
 drivers/infiniband/hw/i40iw/i40iw_type.h   | 12 +++
 drivers/infiniband/hw/i40iw/i40iw_verbs.c  | 10 ++-
 8 files changed, 167 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
index 8feec35f95a7..3c62c9327a9c 100644
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -67,7 +67,7 @@
 #include "i40iw_user.h"
 #include "i40iw_puda.h"
 
-#define I40IW_FW_VERSION  2
+#define I40IW_FW_VER_DEFAULT 2
 #define I40IW_HW_VERSION  2
 
 #define I40IW_ARP_ADD     1
@@ -325,6 +325,26 @@ struct i40iw_handler {
 	struct i40e_info ldev;
 };
 
+/**
+ * i40iw_fw_major_ver - get firmware major version
+ * @dev: iwarp device
+ **/
+static inline u64 i40iw_fw_major_ver(struct i40iw_sc_dev *dev)
+{
+	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
+		     I40IW_FW_VER_MAJOR);
+}
+
+/**
+ * i40iw_fw_minor_ver - get firmware minor version
+ * @dev: iwarp device
+ **/
+static inline u64 i40iw_fw_minor_ver(struct i40iw_sc_dev *dev)
+{
+	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
+		     I40IW_FW_VER_MINOR);
+}
+
 /**
  * to_iwdev - get device
  * @ibdev: ib device
diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
index 4d841a3c68f3..e8b4b3743661 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -1021,6 +1021,95 @@ static enum i40iw_status_code i40iw_sc_commit_fpm_values(
 	return ret_code;
 }
 
+/**
+ * i40iw_sc_query_rdma_features_done - poll cqp for query features done
+ * @cqp: struct for cqp hw
+ */
+static enum i40iw_status_code
+i40iw_sc_query_rdma_features_done(struct i40iw_sc_cqp *cqp)
+{
+	return i40iw_sc_poll_for_cqp_op_done(
+		cqp, I40IW_CQP_OP_QUERY_RDMA_FEATURES, NULL);
+}
+
+/**
+ * i40iw_sc_query_rdma_features - query rdma features
+ * @cqp: struct for cqp hw
+ * @feat_mem: holds PA for HW to use
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum i40iw_status_code
+i40iw_sc_query_rdma_features(struct i40iw_sc_cqp *cqp,
+			     struct i40iw_dma_mem *feat_mem, u64 scratch)
+{
+	u64 *wqe;
+	u64 header;
+
+	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
+	if (wqe)
+		return I40IW_ERR_RING_FULL;
+
+	set_64bit_val(wqe, 32, feat_mem->pa);
+
+	header = LS_64(I40IW_CQP_OP_QUERY_RDMA_FEATURES, I40IW_CQPSQ_OPCODE) |
+		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | feat_mem->size;
+
+	i40iw_insert_wqe_hdr(wqe, header);
+
+	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY RDMA FEATURES WQE",
+			wqe, I40IW_CQP_WQE_SIZE * 8);
+
+	i40iw_sc_cqp_post_sq(cqp);
+
+	return 0;
+}
+
+/**
+ * i40iw_get_rdma_features - get RDMA features
+ * @dev - sc device struct
+ */
+enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev)
+{
+	enum i40iw_status_code ret_code;
+	struct i40iw_dma_mem feat_buf;
+	u64 temp;
+	u16 byte_idx, feat_type, feat_cnt;
+
+	ret_code = i40iw_allocate_dma_mem(dev->hw, &feat_buf,
+					  I40IW_FEATURE_BUF_SIZE,
+					  I40IW_FEATURE_BUF_ALIGNMENT);
+
+	if (ret_code)
+		return I40IW_ERR_NO_MEMORY;
+
+	ret_code = i40iw_sc_query_rdma_features(dev->cqp, &feat_buf, 0);
+	if (!ret_code)
+		ret_code = i40iw_sc_query_rdma_features_done(dev->cqp);
+
+	if (ret_code)
+		goto exit;
+
+	get_64bit_val(feat_buf.va, 0, &temp);
+	feat_cnt = RS_64(temp, I40IW_FEATURE_CNT);
+	if (feat_cnt < I40IW_MAX_FEATURES) {
+		ret_code = I40IW_ERR_INVALID_FEAT_CNT;
+		goto exit;
+	} else if (feat_cnt > I40IW_MAX_FEATURES) {
+		i40iw_debug(dev, I40IW_DEBUG_CQP,
+			    "features buf size insufficient\n");
+	}
+
+	for (byte_idx = 0, feat_type = 0; feat_type < I40IW_MAX_FEATURES;
+	     feat_type++, byte_idx += 8) {
+		get_64bit_val((u64 *)feat_buf.va, byte_idx, &temp);
+		dev->feature_info[feat_type] = RS_64(temp, I40IW_FEATURE_INFO);
+	}
+exit:
+	i40iw_free_dma_mem(dev->hw, &feat_buf);
+
+	return ret_code;
+}
+
 /**
  * i40iw_sc_query_fpm_values_done - poll for cqp wqe completion for query fpm
  * @cqp: struct for cqp hw
@@ -4265,6 +4354,13 @@ static enum i40iw_status_code i40iw_exec_cqp_cmd(struct i40iw_sc_dev *dev,
 				true,
 				I40IW_CQP_WAIT_EVENT);
 		break;
+	case OP_QUERY_RDMA_FEATURES:
+		values_mem.pa = pcmdinfo->in.u.query_rdma_features.cap_pa;
+		values_mem.va = pcmdinfo->in.u.query_rdma_features.cap_va;
+		status = i40iw_sc_query_rdma_features(
+			pcmdinfo->in.u.query_rdma_features.cqp, &values_mem,
+			pcmdinfo->in.u.query_rdma_features.scratch);
+		break;
 	default:
 		status = I40IW_NOT_SUPPORTED;
 		break;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h
index 6ddaeec87d2f..e8367d67575d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -403,7 +403,7 @@
 #define I40IW_CQP_OP_MANAGE_ARP                 0x0f
 #define I40IW_CQP_OP_MANAGE_VF_PBLE_BP          0x10
 #define I40IW_CQP_OP_MANAGE_PUSH_PAGES          0x11
-#define I40IW_CQP_OP_MANAGE_PE_TEAM             0x12
+#define I40IW_CQP_OP_QUERY_RDMA_FEATURES	0x12
 #define I40IW_CQP_OP_UPLOAD_CONTEXT             0x13
 #define I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY 0x14
 #define I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE   0x15
@@ -431,6 +431,24 @@
 #define I40IW_CQP_OP_SHMC_PAGES_ALLOCATED       0x2b
 #define I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE   0x2d
 
+#define I40IW_FEATURE_BUF_SIZE                  (8 * I40IW_MAX_FEATURES)
+
+#define I40IW_FW_VER_MINOR_SHIFT        0
+#define I40IW_FW_VER_MINOR_MASK         \
+	(0xffffULL << I40IW_FW_VER_MINOR_SHIFT)
+
+#define I40IW_FW_VER_MAJOR_SHIFT        16
+#define I40IW_FW_VER_MAJOR_MASK	        \
+	(0xffffULL << I40IW_FW_VER_MAJOR_SHIFT)
+
+#define I40IW_FEATURE_INFO_SHIFT        0
+#define I40IW_FEATURE_INFO_MASK         \
+	(0xffffULL << I40IW_FEATURE_INFO_SHIFT)
+
+#define I40IW_FEATURE_CNT_SHIFT         32
+#define I40IW_FEATURE_CNT_MASK          \
+	(0xffffULL << I40IW_FEATURE_CNT_SHIFT)
+
 #define I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT 16
 #define I40IW_UDA_QPSQ_NEXT_HEADER_MASK ((u64)0xff << I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT)
 
@@ -1529,7 +1547,8 @@ enum i40iw_alignment {
 	I40IW_AEQ_ALIGNMENT =		0x100,
 	I40IW_CEQ_ALIGNMENT =		0x100,
 	I40IW_CQ0_ALIGNMENT =		0x100,
-	I40IW_SD_BUF_ALIGNMENT =	0x80
+	I40IW_SD_BUF_ALIGNMENT =	0x80,
+	I40IW_FEATURE_BUF_ALIGNMENT =	0x8
 };
 
 #define I40IW_WQE_SIZE_64	64
@@ -1732,6 +1751,7 @@ enum i40iw_alignment {
 #define OP_REQUESTED_COMMANDS                   31
 #define OP_COMPLETED_COMMANDS                   32
 #define OP_GEN_AE                               33
-#define OP_SIZE_CQP_STAT_ARRAY                  34
+#define OP_QUERY_RDMA_FEATURES                  34
+#define OP_SIZE_CQP_STAT_ARRAY			35
 
 #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index 84e1b52af15e..9c96ece5e7f3 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1683,6 +1683,12 @@ static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client)
 		status = i40iw_setup_ceqs(iwdev, ldev);
 		if (status)
 			break;
+
+		status = i40iw_get_rdma_features(dev);
+		if (status)
+			dev->feature_info[I40IW_FEATURE_FW_INFO] =
+				I40IW_FW_VER_DEFAULT;
+
 		iwdev->init_state = CEQ_CREATED;
 		status = i40iw_initialize_hw_resources(iwdev);
 		if (status)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h
index 11d3a2a72100..4c429567bbb4 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_p.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_p.h
@@ -105,6 +105,7 @@ enum i40iw_status_code i40iw_sc_static_hmc_pages_allocated(struct i40iw_sc_cqp *
 							   bool poll_registers);
 
 enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count);
+enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev);
 
 void free_sd_mem(struct i40iw_sc_dev *dev);
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h
index f7013f11d808..d1c5855bd8c3 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_status.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_status.h
@@ -95,7 +95,8 @@ enum i40iw_status_code {
 	I40IW_ERR_INVALID_MAC_ADDR = -65,
 	I40IW_ERR_BAD_STAG      = -66,
 	I40IW_ERR_CQ_COMPL_ERROR = -67,
-	I40IW_ERR_QUEUE_DESTROYED = -68
+	I40IW_ERR_QUEUE_DESTROYED = -68,
+	I40IW_ERR_INVALID_FEAT_CNT = -69
 
 };
 #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h
index adc8d2ec523d..54c323c40d96 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -234,6 +234,11 @@ enum i40iw_hw_stats_index_64b {
 	I40IW_HW_STAT_INDEX_MAX_64
 };
 
+enum i40iw_feature_type {
+	I40IW_FEATURE_FW_INFO = 0,
+	I40IW_MAX_FEATURES
+};
+
 struct i40iw_dev_hw_stats_offsets {
 	u32 stats_offset_32[I40IW_HW_STAT_INDEX_MAX_32];
 	u32 stats_offset_64[I40IW_HW_STAT_INDEX_MAX_64];
@@ -501,6 +506,7 @@ struct i40iw_sc_dev {
 	const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
 
 	struct i40iw_hmc_fpm_misc hmc_fpm_misc;
+	u64 feature_info[I40IW_MAX_FEATURES];
 	u32 debug_mask;
 	u8 hmc_fn_id;
 	bool is_pf;
@@ -1340,6 +1346,12 @@ struct cqp_info {
 			struct i40iw_sc_qp *qp;
 			u64 scratch;
 		} suspend_resume;
+		struct {
+			struct i40iw_sc_cqp *cqp;
+			void *cap_va;
+			u64 cap_pa;
+			u64 scratch;
+		} query_rdma_features;
 	} u;
 };
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index fa1292932b88..1b6fb1380961 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -64,7 +64,8 @@ static int i40iw_query_device(struct ib_device *ibdev,
 		return -EINVAL;
 	memset(props, 0, sizeof(*props));
 	ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr);
-	props->fw_ver = I40IW_FW_VERSION;
+	props->fw_ver = i40iw_fw_major_ver(&iwdev->sc_dev) << 32 |
+			i40iw_fw_minor_ver(&iwdev->sc_dev);
 	props->device_cap_flags = iwdev->device_cap_flags;
 	props->vendor_id = iwdev->ldev->pcidev->vendor;
 	props->vendor_part_id = iwdev->ldev->pcidev->device;
@@ -2534,10 +2535,11 @@ static const char * const i40iw_hw_stat_names[] = {
 
 static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str)
 {
-	u32 firmware_version = I40IW_FW_VERSION;
+	struct i40iw_device *iwdev = to_iwdev(dev);
 
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version,
-		 (firmware_version & 0x000000ff));
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%llu.%llu",
+		 i40iw_fw_major_ver(&iwdev->sc_dev),
+		 i40iw_fw_minor_ver(&iwdev->sc_dev));
 }
 
 /**

From 026ded373483c07983a6a30b70034ad0f3667a44 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Thu, 12 Mar 2020 17:50:24 +0800
Subject: [PATCH 190/327] RDMA/hns: Check if depth of qp is 0 before configure

Depth of qp shouldn't be allowed to be set to zero, after ensuring that,
subsequent process can be simplified. And when qp is changed from reset to
reset, the capability of minimum qp depth was used to identify hardware of
hip06, it should be changed into a more readable form.

Link: https://lore.kernel.org/r/1584006624-11846-1-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_qp.c | 83 +++++++++++--------------
 1 file changed, 36 insertions(+), 47 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 7ba3db5a6192..6317901c4b4f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -359,52 +359,44 @@ static int set_rq_size(struct hns_roce_dev *hr_dev,
 				struct ib_qp_cap *cap, bool is_user, int has_rq,
 				struct hns_roce_qp *hr_qp)
 {
-	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u32 max_cnt;
 
-	/* Check the validity of QP support capacity */
-	if (cap->max_recv_wr > hr_dev->caps.max_wqes ||
-	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
-		ibdev_err(ibdev, "Failed to check max recv WR %d and SGE %d\n",
-			  cap->max_recv_wr, cap->max_recv_sge);
-		return -EINVAL;
-	}
-
 	/* If srq exist, set zero for relative number of rq */
 	if (!has_rq) {
 		hr_qp->rq.wqe_cnt = 0;
 		hr_qp->rq.max_gs = 0;
 		cap->max_recv_wr = 0;
 		cap->max_recv_sge = 0;
-	} else {
-		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
-			ibdev_err(ibdev, "Failed to check user max recv WR and SGE\n");
-			return -EINVAL;
-		}
 
-		if (hr_dev->caps.min_wqes)
-			max_cnt = max(cap->max_recv_wr, hr_dev->caps.min_wqes);
-		else
-			max_cnt = cap->max_recv_wr;
-
-		hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt);
-
-		if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) {
-			ibdev_err(ibdev, "Failed to check RQ WQE count limit\n");
-			return -EINVAL;
-		}
-
-		max_cnt = max(1U, cap->max_recv_sge);
-		hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt);
-		if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE)
-			hr_qp->rq.wqe_shift =
-					ilog2(hr_dev->caps.max_rq_desc_sz);
-		else
-			hr_qp->rq.wqe_shift =
-					ilog2(hr_dev->caps.max_rq_desc_sz
-					      * hr_qp->rq.max_gs);
+		return 0;
 	}
 
+	/* Check the validity of QP support capacity */
+	if (!cap->max_recv_wr || cap->max_recv_wr > hr_dev->caps.max_wqes ||
+	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
+		ibdev_err(&hr_dev->ib_dev, "RQ config error, depth=%u, sge=%d\n",
+			  cap->max_recv_wr, cap->max_recv_sge);
+		return -EINVAL;
+	}
+
+	max_cnt = max(cap->max_recv_wr, hr_dev->caps.min_wqes);
+
+	hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt);
+	if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) {
+		ibdev_err(&hr_dev->ib_dev, "rq depth %u too large\n",
+			  cap->max_recv_wr);
+		return -EINVAL;
+	}
+
+	max_cnt = max(1U, cap->max_recv_sge);
+	hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt);
+
+	if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE)
+		hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz);
+	else
+		hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz *
+					    hr_qp->rq.max_gs);
+
 	cap->max_recv_wr = hr_qp->rq.wqe_cnt;
 	cap->max_recv_sge = hr_qp->rq.max_gs;
 
@@ -613,29 +605,27 @@ static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
 static int set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 			      struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp)
 {
-	struct device *dev = hr_dev->dev;
 	u32 page_size;
 	u32 max_cnt;
 	int size;
 	int ret;
 
-	if (cap->max_send_wr  > hr_dev->caps.max_wqes  ||
+	if (!cap->max_send_wr || cap->max_send_wr > hr_dev->caps.max_wqes ||
 	    cap->max_send_sge > hr_dev->caps.max_sq_sg ||
 	    cap->max_inline_data > hr_dev->caps.max_sq_inline) {
-		dev_err(dev, "SQ WR or sge or inline data error!\n");
+		ibdev_err(&hr_dev->ib_dev,
+			  "SQ WR or sge or inline data error!\n");
 		return -EINVAL;
 	}
 
 	hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz);
 
-	if (hr_dev->caps.min_wqes)
-		max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes);
-	else
-		max_cnt = cap->max_send_wr;
+	max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes);
 
 	hr_qp->sq.wqe_cnt = roundup_pow_of_two(max_cnt);
 	if ((u32)hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes) {
-		dev_err(dev, "while setting kernel sq size, sq.wqe_cnt too large\n");
+		ibdev_err(&hr_dev->ib_dev,
+			  "while setting kernel sq size, sq.wqe_cnt too large\n");
 		return -EINVAL;
 	}
 
@@ -648,7 +638,7 @@ static int set_kernel_sq_size(struct hns_roce_dev *hr_dev,
 
 	ret = set_extend_sge_param(hr_dev, hr_qp);
 	if (ret) {
-		dev_err(dev, "set extend sge parameters fail\n");
+		ibdev_err(&hr_dev->ib_dev, "set extend sge parameters fail\n");
 		return ret;
 	}
 
@@ -1372,11 +1362,10 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto out;
 
 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
-		if (hr_dev->caps.min_wqes) {
+		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
 			ret = -EPERM;
 			ibdev_err(&hr_dev->ib_dev,
-				"cur_state=%d new_state=%d\n", cur_state,
-				new_state);
+				  "RST2RST state is not supported\n");
 		} else {
 			ret = 0;
 		}

From d61ba1b9aefe88f0c296e7c627d4946ac2a6c324 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Mon, 16 Mar 2020 17:04:54 -0400
Subject: [PATCH 191/327] IB/rdmavt: Delete unused routine

This routine was obsoleted by the patch below.

Delete it.

Fixes: a2a074ef396f ("RDMA: Handle ucontext allocations by IB/core")
Link: https://lore.kernel.org/r/20200316210454.7753.94689.stgit@awfm-01.aw.intel.com
Reviewed-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rdmavt/vt.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 986265ad6e79..72b031ab7092 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -284,12 +284,6 @@ static int rvt_query_gid(struct ib_device *ibdev, u8 port_num,
 					 &gid->global.interface_id);
 }
 
-static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext
-						*ibucontext)
-{
-	return container_of(ibucontext, struct rvt_ucontext, ibucontext);
-}
-
 /**
  * rvt_alloc_ucontext - Allocate a user context
  * @uctx: Verbs context

From 5ab17a24cb82a636e80c62386dcf1676cbcabaf1 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Mon, 16 Mar 2020 17:05:00 -0400
Subject: [PATCH 192/327] IB/hfi1: Remove kobj from hfi1_devdata

The field kobj was added to hfi1_devdata structure to manage the life time
of the hfi1_devdata structure for PSM accesses:

commit e11ffbd57520 ("IB/hfi1: Do not free hfi1 cdev parent structure early")

Later another mechanism user_refcount/user_comp was introduced to provide
the same functionality:

commit acd7c8fe1493 ("IB/hfi1: Fix an Oops on pci device force remove")

This patch will remove this kobj field, as it is no longer needed.

Link: https://lore.kernel.org/r/20200316210500.7753.4145.stgit@awfm-01.aw.intel.com
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/file_ops.c |  4 +---
 drivers/infiniband/hw/hfi1/hfi.h      |  2 --
 drivers/infiniband/hw/hfi1/init.c     | 26 ++++----------------------
 3 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 259115886d35..e7fdd70c6e78 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -209,7 +209,6 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
 	fd->mm = current->mm;
 	mmgrab(fd->mm);
 	fd->dd = dd;
-	kobject_get(&fd->dd->kobj);
 	fp->private_data = fd;
 	return 0;
 nomem:
@@ -713,7 +712,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 	deallocate_ctxt(uctxt);
 done:
 	mmdrop(fdata->mm);
-	kobject_put(&dd->kobj);
 
 	if (atomic_dec_and_test(&dd->user_refcount))
 		complete(&dd->user_comp);
@@ -1696,7 +1694,7 @@ static int user_add(struct hfi1_devdata *dd)
 	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
 	ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
 			     &dd->user_cdev, &dd->user_device,
-			     true, &dd->kobj);
+			     true, &dd->verbs_dev.rdi.ibdev.dev.kobj);
 	if (ret)
 		user_remove(dd);
 
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index cae12f416ca0..b06c2594105a 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1413,8 +1413,6 @@ struct hfi1_devdata {
 	bool aspm_enabled;	/* ASPM state: enabled/disabled */
 	struct rhashtable *sdma_rht;
 
-	struct kobject kobj;
-
 	/* vnic data */
 	struct hfi1_vnic_data vnic;
 	/* Lock to protect IRQ SRC register access */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index e3acda7a0800..3759d9233a1c 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -1198,13 +1198,13 @@ static void finalize_asic_data(struct hfi1_devdata *dd,
 }
 
 /**
- * hfi1_clean_devdata - cleans up per-unit data structure
+ * hfi1_free_devdata - cleans up and frees per-unit data structure
  * @dd: pointer to a valid devdata structure
  *
- * It cleans up all data structures set up by
+ * It cleans up and frees all data structures set up by
  * by hfi1_alloc_devdata().
  */
-static void hfi1_clean_devdata(struct hfi1_devdata *dd)
+void hfi1_free_devdata(struct hfi1_devdata *dd)
 {
 	struct hfi1_asic_data *ad;
 	unsigned long flags;
@@ -1231,23 +1231,6 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd)
 	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }
 
-static void __hfi1_free_devdata(struct kobject *kobj)
-{
-	struct hfi1_devdata *dd =
-		container_of(kobj, struct hfi1_devdata, kobj);
-
-	hfi1_clean_devdata(dd);
-}
-
-static struct kobj_type hfi1_devdata_type = {
-	.release = __hfi1_free_devdata,
-};
-
-void hfi1_free_devdata(struct hfi1_devdata *dd)
-{
-	kobject_put(&dd->kobj);
-}
-
 /**
  * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
  * @pdev: Valid PCI device
@@ -1333,11 +1316,10 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
 		goto bail;
 	}
 
-	kobject_init(&dd->kobj, &hfi1_devdata_type);
 	return dd;
 
 bail:
-	hfi1_clean_devdata(dd);
+	hfi1_free_devdata(dd);
 	return ERR_PTR(ret);
 }
 

From fa8a44f6b245ffa9d39d667ebbf81bebc61ca657 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 10 Mar 2020 11:14:30 +0200
Subject: [PATCH 193/327] RDMA/efa: Use in-kernel offsetofend() to check field
 availability

Remove custom and duplicated variant of offsetofend().

Link: https://lore.kernel.org/r/20200310091438.248429-4-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Acked-by: Gal Pressman <galpress@amazon.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/efa/efa_verbs.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index bf3120f140f7..5c57098a4aee 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -144,9 +144,6 @@ static inline bool is_rdma_read_cap(struct efa_dev *dev)
 	return dev->dev_attr.device_caps & EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK;
 }
 
-#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \
-				 sizeof_field(typeof(x), fld) <= (sz))
-
 #define is_reserved_cleared(reserved) \
 	!memchr_inv(reserved, 0, sizeof(reserved))
 
@@ -609,7 +606,7 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
 	if (err)
 		goto err_out;
 
-	if (!field_avail(cmd, driver_qp_type, udata->inlen)) {
+	if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, no input udata\n");
 		err = -EINVAL;
@@ -896,7 +893,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		goto err_out;
 	}
 
-	if (!field_avail(cmd, num_sub_cqs, udata->inlen)) {
+	if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) {
 		ibdev_dbg(ibdev,
 			  "Incompatible ABI params, no input udata\n");
 		err = -EINVAL;

From afc63da64f1e5e41875c98707020e85050f8a0c5 Mon Sep 17 00:00:00 2001
From: Heidi Fahim <heidifahim@google.com>
Date: Mon, 16 Mar 2020 13:21:24 -0700
Subject: [PATCH 194/327] kunit: kunit_parser: make parser more robust

Previously, kunit_parser did not properly handle kunit TAP output that
- had any prefixes (generated from different configs e.g.
CONFIG_PRINTK_TIME)
- had unrelated kernel output mixed in the middle of
it, which has shown up when testing with allyesconfig
To remove prefixes, the parser looks for the first line that includes
TAP output, "TAP version 14".  It then determines the length of the
string before this sequence, and strips that number of characters off
the beginning of the following lines until the last KUnit output line is
reached.
These fixes have been tested with additional tests in the
KUnitParseTest and their associated logs have also been added.

Signed-off-by: Heidi Fahim <heidifahim@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/.gitattributes            |  1 +
 tools/testing/kunit/kunit_parser.py           | 40 +++++------
 tools/testing/kunit/kunit_tool_test.py        | 69 +++++++++++++++++++
 .../test_data/test_config_printk_time.log     | 31 +++++++++
 .../test_data/test_interrupted_tap_output.log | 37 ++++++++++
 .../test_data/test_kernel_panic_interrupt.log | 25 +++++++
 .../test_data/test_multiple_prefixes.log      | 31 +++++++++
 ..._output_with_prefix_isolated_correctly.log | 33 +++++++++
 .../kunit/test_data/test_pound_no_prefix.log  | 33 +++++++++
 .../kunit/test_data/test_pound_sign.log       |  0
 10 files changed, 280 insertions(+), 20 deletions(-)
 create mode 100644 tools/testing/kunit/.gitattributes
 create mode 100644 tools/testing/kunit/test_data/test_config_printk_time.log
 create mode 100644 tools/testing/kunit/test_data/test_interrupted_tap_output.log
 create mode 100644 tools/testing/kunit/test_data/test_kernel_panic_interrupt.log
 create mode 100644 tools/testing/kunit/test_data/test_multiple_prefixes.log
 create mode 100644 tools/testing/kunit/test_data/test_output_with_prefix_isolated_correctly.log
 create mode 100644 tools/testing/kunit/test_data/test_pound_no_prefix.log
 create mode 100644 tools/testing/kunit/test_data/test_pound_sign.log

diff --git a/tools/testing/kunit/.gitattributes b/tools/testing/kunit/.gitattributes
new file mode 100644
index 000000000000..5b7da1fc3b8f
--- /dev/null
+++ b/tools/testing/kunit/.gitattributes
@@ -0,0 +1 @@
+test_data/* binary
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 4ffbae0f6732..adf86747b07f 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -46,19 +46,21 @@ class TestStatus(Enum):
 	TEST_CRASHED = auto()
 	NO_TESTS = auto()
 
-kunit_start_re = re.compile(r'^TAP version [0-9]+$')
-kunit_end_re = re.compile('List of all partitions:')
+kunit_start_re = re.compile(r'TAP version [0-9]+$')
+kunit_end_re = re.compile('(List of all partitions:|'
+			  'Kernel panic - not syncing: VFS:|reboot: System halted)')
 
 def isolate_kunit_output(kernel_output):
 	started = False
 	for line in kernel_output:
-		if kunit_start_re.match(line):
+		if kunit_start_re.search(line):
+			prefix_len = len(line.split('TAP version')[0])
 			started = True
-			yield line
-		elif kunit_end_re.match(line):
+			yield line[prefix_len:] if prefix_len > 0 else line
+		elif kunit_end_re.search(line):
 			break
 		elif started:
-			yield line
+			yield line[prefix_len:] if prefix_len > 0 else line
 
 def raw_output(kernel_output):
 	for line in kernel_output:
@@ -108,18 +110,16 @@ OK_NOT_OK_SUBTEST = re.compile(r'^\t(ok|not ok) [0-9]+ - (.*)$')
 
 OK_NOT_OK_MODULE = re.compile(r'^(ok|not ok) [0-9]+ - (.*)$')
 
-def parse_ok_not_ok_test_case(lines: List[str],
-			      test_case: TestCase,
-			      expecting_test_case: bool) -> bool:
+def parse_ok_not_ok_test_case(lines: List[str], test_case: TestCase) -> bool:
 	save_non_diagnositic(lines, test_case)
 	if not lines:
-		if expecting_test_case:
-			test_case.status = TestStatus.TEST_CRASHED
-			return True
-		else:
-			return False
+		test_case.status = TestStatus.TEST_CRASHED
+		return True
 	line = lines[0]
 	match = OK_NOT_OK_SUBTEST.match(line)
+	while not match and lines:
+		line = lines.pop(0)
+		match = OK_NOT_OK_SUBTEST.match(line)
 	if match:
 		test_case.log.append(lines.pop(0))
 		test_case.name = match.group(2)
@@ -150,12 +150,12 @@ def parse_diagnostic(lines: List[str], test_case: TestCase) -> bool:
 	else:
 		return False
 
-def parse_test_case(lines: List[str], expecting_test_case: bool) -> TestCase:
+def parse_test_case(lines: List[str]) -> TestCase:
 	test_case = TestCase()
 	save_non_diagnositic(lines, test_case)
 	while parse_diagnostic(lines, test_case):
 		pass
-	if parse_ok_not_ok_test_case(lines, test_case, expecting_test_case):
+	if parse_ok_not_ok_test_case(lines, test_case):
 		return test_case
 	else:
 		return None
@@ -234,11 +234,11 @@ def parse_test_suite(lines: List[str]) -> TestSuite:
 	expected_test_case_num = parse_subtest_plan(lines)
 	if not expected_test_case_num:
 		return None
-	test_case = parse_test_case(lines, expected_test_case_num > 0)
-	expected_test_case_num -= 1
-	while test_case:
+	while expected_test_case_num > 0:
+		test_case = parse_test_case(lines)
+		if not test_case:
+			break
 		test_suite.cases.append(test_case)
-		test_case = parse_test_case(lines, expected_test_case_num > 0)
 		expected_test_case_num -= 1
 	if parse_ok_not_ok_test_suite(lines, test_suite):
 		test_suite.status = bubble_up_test_case_errors(test_suite)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index cba97756ac4a..0efae697f396 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -108,6 +108,36 @@ class KUnitParserTest(unittest.TestCase):
 		self.assertContains('ok 1 - example', result)
 		file.close()
 
+	def test_output_with_prefix_isolated_correctly(self):
+		log_path = get_absolute_path(
+			'test_data/test_pound_sign.log')
+		with open(log_path) as file:
+			result = kunit_parser.isolate_kunit_output(file.readlines())
+		self.assertContains('TAP version 14\n', result)
+		self.assertContains('	# Subtest: kunit-resource-test', result)
+		self.assertContains('	1..5', result)
+		self.assertContains('	ok 1 - kunit_resource_test_init_resources', result)
+		self.assertContains('	ok 2 - kunit_resource_test_alloc_resource', result)
+		self.assertContains('	ok 3 - kunit_resource_test_destroy_resource', result)
+		self.assertContains(' foo bar 	#', result)
+		self.assertContains('	ok 4 - kunit_resource_test_cleanup_resources', result)
+		self.assertContains('	ok 5 - kunit_resource_test_proper_free_ordering', result)
+		self.assertContains('ok 1 - kunit-resource-test', result)
+		self.assertContains(' foo bar 	# non-kunit output', result)
+		self.assertContains('	# Subtest: kunit-try-catch-test', result)
+		self.assertContains('	1..2', result)
+		self.assertContains('	ok 1 - kunit_test_try_catch_successful_try_no_catch',
+				    result)
+		self.assertContains('	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch',
+				    result)
+		self.assertContains('ok 2 - kunit-try-catch-test', result)
+		self.assertContains('	# Subtest: string-stream-test', result)
+		self.assertContains('	1..3', result)
+		self.assertContains('	ok 1 - string_stream_test_empty_on_creation', result)
+		self.assertContains('	ok 2 - string_stream_test_not_empty_after_add', result)
+		self.assertContains('	ok 3 - string_stream_test_get_string', result)
+		self.assertContains('ok 3 - string-stream-test', result)
+
 	def test_parse_successful_test_log(self):
 		all_passed_log = get_absolute_path(
 			'test_data/test_is_test_passed-all_passed.log')
@@ -150,6 +180,45 @@ class KUnitParserTest(unittest.TestCase):
 			result.status)
 		file.close()
 
+	def test_ignores_prefix_printk_time(self):
+		prefix_log = get_absolute_path(
+			'test_data/test_config_printk_time.log')
+		with open(prefix_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
+	def test_ignores_multiple_prefixes(self):
+		prefix_log = get_absolute_path(
+			'test_data/test_multiple_prefixes.log')
+		with open(prefix_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
+	def test_prefix_mixed_kernel_output(self):
+		mixed_prefix_log = get_absolute_path(
+			'test_data/test_interrupted_tap_output.log')
+		with open(mixed_prefix_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
+	def test_prefix_poundsign(self):
+		pound_log = get_absolute_path('test_data/test_pound_sign.log')
+		with open(pound_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
+	def test_kernel_panic_end(self):
+		panic_log = get_absolute_path('test_data/test_kernel_panic_interrupt.log')
+		with open(panic_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
+	def test_pound_no_prefix(self):
+		pound_log = get_absolute_path('test_data/test_pound_no_prefix.log')
+		with open(pound_log) as file:
+			result = kunit_parser.parse_run_tests(file.readlines())
+		self.assertEqual('kunit-resource-test', result.suites[0].name)
+
 class StrContains(str):
 	def __eq__(self, other):
 		return self in other
diff --git a/tools/testing/kunit/test_data/test_config_printk_time.log b/tools/testing/kunit/test_data/test_config_printk_time.log
new file mode 100644
index 000000000000..c02ca773946d
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_config_printk_time.log
@@ -0,0 +1,31 @@
+[    0.060000] printk: console [mc-1] enabled
+[    0.060000] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000] TAP version 14
+[    0.060000] 	# Subtest: kunit-resource-test
+[    0.060000] 	1..5
+[    0.060000] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000] ok 1 - kunit-resource-test
+[    0.060000] 	# Subtest: kunit-try-catch-test
+[    0.060000] 	1..2
+[    0.060000] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000] ok 2 - kunit-try-catch-test
+[    0.060000] 	# Subtest: string-stream-test
+[    0.060000] 	1..3
+[    0.060000] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000] 	ok 2 - string_stream_test_not_empty_after_add
+[    0.060000] 	ok 3 - string_stream_test_get_string
+[    0.060000] ok 3 - string-stream-test
+[    0.060000] List of all partitions:
+[    0.060000] No filesystem could mount root, tried:
+[    0.060000]
+[    0.060000] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000] Stack:
+[    0.060000]  602086f8 601bc260 705c0000 705c0000
+[    0.060000]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000]  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_interrupted_tap_output.log b/tools/testing/kunit/test_data/test_interrupted_tap_output.log
new file mode 100644
index 000000000000..5c73fb3a1c6f
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_interrupted_tap_output.log
@@ -0,0 +1,37 @@
+[    0.060000] printk: console [mc-1] enabled
+[    0.060000] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000] TAP version 14
+[    0.060000] 	# Subtest: kunit-resource-test
+[    0.060000] 	1..5
+[    0.060000] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000] kAFS: Red Hat AFS client v0.1 registering.
+[    0.060000] FS-Cache: Netfs 'afs' registered for caching
+[    0.060000] *** VALIDATE kAFS ***
+[    0.060000] Btrfs loaded, crc32c=crc32c-generic, debug=on, assert=on, integrity-checker=on, ref-verify=on
+[    0.060000] BTRFS: selftest: sectorsize: 4096  nodesize: 4096
+[    0.060000] BTRFS: selftest: running btrfs free space cache tests
+[    0.060000] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000] ok 1 - kunit-resource-test
+[    0.060000] 	# Subtest: kunit-try-catch-test
+[    0.060000] 	1..2
+[    0.060000] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000] ok 2 - kunit-try-catch-test
+[    0.060000] 	# Subtest: string-stream-test
+[    0.060000] 	1..3
+[    0.060000] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000] 	ok 2 - string_stream_test_not_empty_after_add
+[    0.060000] 	ok 3 - string_stream_test_get_string
+[    0.060000] ok 3 - string-stream-test
+[    0.060000] List of all partitions:
+[    0.060000] No filesystem could mount root, tried:
+[    0.060000]
+[    0.060000] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000] Stack:
+[    0.060000]  602086f8 601bc260 705c0000 705c0000
+[    0.060000]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000]  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_kernel_panic_interrupt.log b/tools/testing/kunit/test_data/test_kernel_panic_interrupt.log
new file mode 100644
index 000000000000..c045eee75f27
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_kernel_panic_interrupt.log
@@ -0,0 +1,25 @@
+[    0.060000] printk: console [mc-1] enabled
+[    0.060000] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000] TAP version 14
+[    0.060000] 	# Subtest: kunit-resource-test
+[    0.060000] 	1..5
+[    0.060000] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000] ok 1 - kunit-resource-test
+[    0.060000] 	# Subtest: kunit-try-catch-test
+[    0.060000] 	1..2
+[    0.060000] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000] ok 2 - kunit-try-catch-test
+[    0.060000] 	# Subtest: string-stream-test
+[    0.060000] 	1..3
+[    0.060000] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000] 	Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000] Stack:
+[    0.060000]  602086f8 601bc260 705c0000 705c0000
+[    0.060000]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000]  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_multiple_prefixes.log b/tools/testing/kunit/test_data/test_multiple_prefixes.log
new file mode 100644
index 000000000000..bc48407dcc36
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_multiple_prefixes.log
@@ -0,0 +1,31 @@
+[    0.060000][    T1] printk: console [mc-1] enabled
+[    0.060000][    T1] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000][    T1] TAP version 14
+[    0.060000][    T1] 	# Subtest: kunit-resource-test
+[    0.060000][    T1] 	1..5
+[    0.060000][    T1] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000][    T1] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000][    T1] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000][    T1] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000][    T1] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000][    T1] ok 1 - kunit-resource-test
+[    0.060000][    T1] 	# Subtest: kunit-try-catch-test
+[    0.060000][    T1] 	1..2
+[    0.060000][    T1] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000][    T1] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000][    T1] ok 2 - kunit-try-catch-test
+[    0.060000][    T1] 	# Subtest: string-stream-test
+[    0.060000][    T1] 	1..3
+[    0.060000][    T1] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000][    T1] 	ok 2 - string_stream_test_not_empty_after_add
+[    0.060000][    T1] 	ok 3 - string_stream_test_get_string
+[    0.060000][    T1] ok 3 - string-stream-test
+[    0.060000][    T1] List of all partitions:
+[    0.060000][    T1] No filesystem could mount root, tried:
+[    0.060000][    T1]
+[    0.060000][    T1] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000][    T1] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000][    T1] Stack:
+[    0.060000][    T1]  602086f8 601bc260 705c0000 705c0000
+[    0.060000][    T1]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000][    T1]  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_output_with_prefix_isolated_correctly.log b/tools/testing/kunit/test_data/test_output_with_prefix_isolated_correctly.log
new file mode 100644
index 000000000000..0f87cdabebb0
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_output_with_prefix_isolated_correctly.log
@@ -0,0 +1,33 @@
+[    0.060000] printk: console [mc-1] enabled
+[    0.060000] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000] TAP version 14
+[    0.060000] 	# Subtest: kunit-resource-test
+[    0.060000] 	1..5
+[    0.060000] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000]  foo bar 	#
+[    0.060000] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000] ok 1 - kunit-resource-test
+[    0.060000]  foo bar 	# non-kunit output
+[    0.060000] 	# Subtest: kunit-try-catch-test
+[    0.060000] 	1..2
+[    0.060000] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000] ok 2 - kunit-try-catch-test
+[    0.060000] 	# Subtest: string-stream-test
+[    0.060000] 	1..3
+[    0.060000] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000] 	ok 2 - string_stream_test_not_empty_after_add
+[    0.060000] 	ok 3 - string_stream_test_get_string
+[    0.060000] ok 3 - string-stream-test
+[    0.060000] List of all partitions:
+[    0.060000] No filesystem could mount root, tried:
+[    0.060000]
+[    0.060000] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000] Stack:
+[    0.060000]  602086f8 601bc260 705c0000 705c0000
+[    0.060000]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000]  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_pound_no_prefix.log b/tools/testing/kunit/test_data/test_pound_no_prefix.log
new file mode 100644
index 000000000000..2ceb360be7d5
--- /dev/null
+++ b/tools/testing/kunit/test_data/test_pound_no_prefix.log
@@ -0,0 +1,33 @@
+ printk: console [mc-1] enabled
+ random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+ TAP version 14
+ 	# Subtest: kunit-resource-test
+ 	1..5
+ 	ok 1 - kunit_resource_test_init_resources
+ 	ok 2 - kunit_resource_test_alloc_resource
+ 	ok 3 - kunit_resource_test_destroy_resource
+  foo bar 	#
+ 	ok 4 - kunit_resource_test_cleanup_resources
+ 	ok 5 - kunit_resource_test_proper_free_ordering
+ ok 1 - kunit-resource-test
+  foo bar 	# non-kunit output
+ 	# Subtest: kunit-try-catch-test
+ 	1..2
+ 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+ 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+ ok 2 - kunit-try-catch-test
+ 	# Subtest: string-stream-test
+ 	1..3
+ 	ok 1 - string_stream_test_empty_on_creation
+ 	ok 2 - string_stream_test_not_empty_after_add
+ 	ok 3 - string_stream_test_get_string
+ ok 3 - string-stream-test
+ List of all partitions:
+ No filesystem could mount root, tried:
+
+ Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+ CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+ Stack:
+  602086f8 601bc260 705c0000 705c0000
+  602086f8 6005fcec 705c0000 6002c6ab
+  6005fcec 601bc260 705c0000 3000000010
\ No newline at end of file
diff --git a/tools/testing/kunit/test_data/test_pound_sign.log b/tools/testing/kunit/test_data/test_pound_sign.log
new file mode 100644
index 000000000000..e69de29bb2d1

From 021ed9f551da33449a5238e45e849913422671d7 Mon Sep 17 00:00:00 2001
From: Heidi Fahim <heidifahim@google.com>
Date: Mon, 16 Mar 2020 13:21:25 -0700
Subject: [PATCH 195/327] kunit: Run all KUnit tests through allyesconfig

Implemented the functionality to run all KUnit tests through kunit_tool
by specifying an --alltests flag, which builds UML with allyesconfig
enabled, and consequently runs every KUnit test. A new function was
added to kunit_kernel: make_allyesconfig.
Firstly, if --alltests is specified, kunit.py triggers build_um_kernel
which call make_allyesconfig. This function calls the make command,
disables the broken configs that would otherwise prevent UML from
building, then starts the kernel with all possible configurations
enabled. All stdout and stderr is sent to test.log and read from there
then fed through kunit_parser to parse the tests to the user. Also added
a signal_handler in case kunit is interrupted while running.
Tested: Run under different conditions such as testing with
--raw_output, testing program interrupt then immediately running kunit
again without --alltests and making sure to clean the console.

Signed-off-by: Heidi Fahim <heidifahim@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .../kunit/configs/broken_on_uml.config        | 41 ++++++++++++
 tools/testing/kunit/kunit.py                  | 30 +++++----
 tools/testing/kunit/kunit_kernel.py           | 62 ++++++++++++++-----
 tools/testing/kunit/kunit_parser.py           |  1 +
 tools/testing/kunit/kunit_tool_test.py        | 17 ++---
 5 files changed, 115 insertions(+), 36 deletions(-)
 create mode 100644 tools/testing/kunit/configs/broken_on_uml.config

diff --git a/tools/testing/kunit/configs/broken_on_uml.config b/tools/testing/kunit/configs/broken_on_uml.config
new file mode 100644
index 000000000000..239b9f03da2c
--- /dev/null
+++ b/tools/testing/kunit/configs/broken_on_uml.config
@@ -0,0 +1,41 @@
+# These are currently broken on UML and prevent allyesconfig from building
+# CONFIG_STATIC_LINK is not set
+# CONFIG_UML_NET_VECTOR is not set
+# CONFIG_UML_NET_VDE is not set
+# CONFIG_UML_NET_PCAP is not set
+# CONFIG_NET_PTP_CLASSIFY is not set
+# CONFIG_IP_VS is not set
+# CONFIG_BRIDGE_EBT_BROUTE is not set
+# CONFIG_BRIDGE_EBT_T_FILTER is not set
+# CONFIG_BRIDGE_EBT_T_NAT is not set
+# CONFIG_MTD_NAND_CADENCE is not set
+# CONFIG_MTD_NAND_NANDSIM is not set
+# CONFIG_BLK_DEV_NULL_BLK is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_SCSI_DEBUG is not set
+# CONFIG_NET_VENDOR_XILINX is not set
+# CONFIG_NULL_TTY is not set
+# CONFIG_PTP_1588_CLOCK is not set
+# CONFIG_PINCTRL_EQUILIBRIUM is not set
+# CONFIG_DMABUF_SELFTESTS is not set
+# CONFIG_COMEDI is not set
+# CONFIG_XIL_AXIS_FIFO is not set
+# CONFIG_EXFAT_FS is not set
+# CONFIG_STM_DUMMY is not set
+# CONFIG_FSI_MASTER_ASPEED is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_UBIFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_CRYPTO_DEV_SAFEXCEL is not set
+# CONFIG_CRYPTO_DEV_AMLOGIC_GXL is not set
+# CONFIG_KCOV is not set
+# CONFIG_LKDTM is not set
+# CONFIG_REED_SOLOMON_TEST is not set
+# CONFIG_TEST_RHASHTABLE is not set
+# CONFIG_TEST_MEMINIT is not set
+# CONFIG_NETWORK_PHY_TIMESTAMPING is not set
+# CONFIG_DEBUG_INFO_BTF is not set
+# CONFIG_PTP_1588_CLOCK_INES is not set
+# CONFIG_QCOM_CPR is not set
+# CONFIG_RESET_BRCMSTB_RESCAL is not set
+# CONFIG_RESET_INTEL_GW is not set
diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 180ad1e1b04f..650bb4cfc544 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -22,7 +22,9 @@ import kunit_parser
 
 KunitResult = namedtuple('KunitResult', ['status','result'])
 
-KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs', 'build_dir', 'defconfig'])
+KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs',
+					   'build_dir', 'defconfig',
+					   'alltests'])
 
 KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0]
 
@@ -55,24 +57,23 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree,
 	kunit_parser.print_with_timestamp('Building KUnit Kernel ...')
 
 	build_start = time.time()
-	success = linux.build_um_kernel(request.jobs, request.build_dir)
+	success = linux.build_um_kernel(request.alltests,
+					request.jobs,
+					request.build_dir)
 	build_end = time.time()
 	if not success:
 		return KunitResult(KunitStatus.BUILD_FAILURE, 'could not build kernel')
 
 	kunit_parser.print_with_timestamp('Starting KUnit Kernel ...')
 	test_start = time.time()
-
-	test_result = kunit_parser.TestResult(kunit_parser.TestStatus.SUCCESS,
-					      [],
-					      'Tests not Parsed.')
+	kunit_output = linux.run_kernel(
+		timeout=None if request.alltests else request.timeout,
+		build_dir=request.build_dir)
 	if request.raw_output:
-		kunit_parser.raw_output(
-			linux.run_kernel(timeout=request.timeout,
-					 build_dir=request.build_dir))
+		raw_output = kunit_parser.raw_output(kunit_output)
+		isolated = list(kunit_parser.isolate_kunit_output(raw_output))
+		test_result = kunit_parser.parse_test_result(isolated)
 	else:
-		kunit_output = linux.run_kernel(timeout=request.timeout,
-						build_dir=request.build_dir)
 		test_result = kunit_parser.parse_run_tests(kunit_output)
 	test_end = time.time()
 
@@ -120,6 +121,10 @@ def main(argv, linux=None):
 				help='Uses a default .kunitconfig.',
 				action='store_true')
 
+	run_parser.add_argument('--alltests',
+				help='Run all KUnit tests through allyesconfig',
+				action='store_true')
+
 	cli_args = parser.parse_args(argv)
 
 	if cli_args.subcommand == 'run':
@@ -143,7 +148,8 @@ def main(argv, linux=None):
 				       cli_args.timeout,
 				       cli_args.jobs,
 				       cli_args.build_dir,
-				       cli_args.defconfig)
+				       cli_args.defconfig,
+				       cli_args.alltests)
 		result = run_tests(linux, request)
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index d99ae75ef72f..415d8f3c4fe4 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -10,11 +10,16 @@
 import logging
 import subprocess
 import os
+import signal
+
+from contextlib import ExitStack
 
 import kunit_config
+import kunit_parser
 
 KCONFIG_PATH = '.config'
 kunitconfig_path = '.kunitconfig'
+BROKEN_ALLCONFIG_PATH = 'tools/testing/kunit/configs/broken_on_uml.config'
 
 class ConfigError(Exception):
 	"""Represents an error trying to configure the Linux kernel."""
@@ -40,12 +45,29 @@ class LinuxSourceTreeOperations(object):
 		if build_dir:
 			command += ['O=' + build_dir]
 		try:
-			subprocess.check_output(command)
+			subprocess.check_output(command, stderr=subprocess.PIPE)
 		except OSError as e:
 			raise ConfigError('Could not call make command: ' + e)
 		except subprocess.CalledProcessError as e:
 			raise ConfigError(e.output)
 
+	def make_allyesconfig(self):
+		kunit_parser.print_with_timestamp(
+			'Enabling all CONFIGs for UML...')
+		process = subprocess.Popen(
+			['make', 'ARCH=um', 'allyesconfig'],
+			stdout=subprocess.DEVNULL,
+			stderr=subprocess.STDOUT)
+		process.wait()
+		kunit_parser.print_with_timestamp(
+			'Disabling broken configs to run KUnit tests...')
+		with ExitStack() as es:
+			config = open(KCONFIG_PATH, 'a')
+			disable = open(BROKEN_ALLCONFIG_PATH, 'r').read()
+			config.write(disable)
+		kunit_parser.print_with_timestamp(
+			'Starting Kernel with all configs takes a few minutes...')
+
 	def make(self, jobs, build_dir):
 		command = ['make', 'ARCH=um', '--jobs=' + str(jobs)]
 		if build_dir:
@@ -57,18 +79,16 @@ class LinuxSourceTreeOperations(object):
 		except subprocess.CalledProcessError as e:
 			raise BuildError(e.output)
 
-	def linux_bin(self, params, timeout, build_dir):
+	def linux_bin(self, params, timeout, build_dir, outfile):
 		"""Runs the Linux UML binary. Must be named 'linux'."""
 		linux_bin = './linux'
 		if build_dir:
 			linux_bin = os.path.join(build_dir, 'linux')
-		process = subprocess.Popen(
-			[linux_bin] + params,
-			stdin=subprocess.PIPE,
-			stdout=subprocess.PIPE,
-			stderr=subprocess.PIPE)
-		process.wait(timeout=timeout)
-		return process
+		with open(outfile, 'w') as output:
+			process = subprocess.Popen([linux_bin] + params,
+						   stdout=output,
+						   stderr=subprocess.STDOUT)
+			process.wait(timeout)
 
 
 def get_kconfig_path(build_dir):
@@ -84,6 +104,7 @@ class LinuxSourceTree(object):
 		self._kconfig = kunit_config.Kconfig()
 		self._kconfig.read_from_file(kunitconfig_path)
 		self._ops = LinuxSourceTreeOperations()
+		signal.signal(signal.SIGINT, self.signal_handler)
 
 	def clean(self):
 		try:
@@ -135,7 +156,9 @@ class LinuxSourceTree(object):
 			print('Generating .config ...')
 			return self.build_config(build_dir)
 
-	def build_um_kernel(self, jobs, build_dir):
+	def build_um_kernel(self, alltests, jobs, build_dir):
+		if alltests:
+			self._ops.make_allyesconfig()
 		try:
 			self._ops.make_olddefconfig(build_dir)
 			self._ops.make(jobs, build_dir)
@@ -144,10 +167,15 @@ class LinuxSourceTree(object):
 			return False
 		return self.validate_config(build_dir)
 
-	def run_kernel(self, args=[], timeout=None, build_dir=''):
-		args.extend(['mem=256M'])
-		process = self._ops.linux_bin(args, timeout, build_dir)
-		with open(os.path.join(build_dir, 'test.log'), 'w') as f:
-			for line in process.stdout:
-				f.write(line.rstrip().decode('ascii') + '\n')
-				yield line.rstrip().decode('ascii')
+	def run_kernel(self, args=[], build_dir='', timeout=None):
+		args.extend(['mem=1G'])
+		outfile = 'test.log'
+		self._ops.linux_bin(args, timeout, build_dir, outfile)
+		subprocess.call(['stty', 'sane'])
+		with open(outfile, 'r') as file:
+			for line in file:
+				yield line
+
+	def signal_handler(self, sig, frame):
+		logging.error('Build interruption occurred. Cleaning console.')
+		subprocess.call(['stty', 'sane'])
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index adf86747b07f..02815062d5a6 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -65,6 +65,7 @@ def isolate_kunit_output(kernel_output):
 def raw_output(kernel_output):
 	for line in kernel_output:
 		print(line)
+		yield line
 
 DIVIDER = '=' * 60
 
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index 0efae697f396..ce47e87b633a 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -243,7 +243,8 @@ class KUnitMainTest(unittest.TestCase):
 		kunit.main(['run'], self.linux_source_mock)
 		assert self.linux_source_mock.build_reconfig.call_count == 1
 		assert self.linux_source_mock.run_kernel.call_count == 1
-		self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=300)
+		self.linux_source_mock.run_kernel.assert_called_once_with(
+			build_dir='', timeout=300)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
 	def test_run_passes_args_fail(self):
@@ -258,25 +259,27 @@ class KUnitMainTest(unittest.TestCase):
 
 	def test_run_raw_output(self):
 		self.linux_source_mock.run_kernel = mock.Mock(return_value=[])
-		kunit.main(['run', '--raw_output'], self.linux_source_mock)
+		with self.assertRaises(SystemExit) as e:
+			kunit.main(['run', '--raw_output'], self.linux_source_mock)
+		assert type(e.exception) == SystemExit
+		assert e.exception.code == 1
 		assert self.linux_source_mock.build_reconfig.call_count == 1
 		assert self.linux_source_mock.run_kernel.call_count == 1
-		for kall in self.print_mock.call_args_list:
-			assert kall != mock.call(StrContains('Testing complete.'))
-			assert kall != mock.call(StrContains(' 0 tests run'))
 
 	def test_run_timeout(self):
 		timeout = 3453
 		kunit.main(['run', '--timeout', str(timeout)], self.linux_source_mock)
 		assert self.linux_source_mock.build_reconfig.call_count == 1
-		self.linux_source_mock.run_kernel.assert_called_once_with(build_dir='', timeout=timeout)
+		self.linux_source_mock.run_kernel.assert_called_once_with(
+			build_dir='', timeout=timeout)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
 	def test_run_builddir(self):
 		build_dir = '.kunit'
 		kunit.main(['run', '--build_dir', build_dir], self.linux_source_mock)
 		assert self.linux_source_mock.build_reconfig.call_count == 1
-		self.linux_source_mock.run_kernel.assert_called_once_with(build_dir=build_dir, timeout=300)
+		self.linux_source_mock.run_kernel.assert_called_once_with(
+			build_dir=build_dir, timeout=300)
 		self.print_mock.assert_any_call(StrContains('Testing complete.'))
 
 if __name__ == '__main__':

From 0476e69f39377192d638c459d11400c6e9a6ffb0 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Mon, 23 Mar 2020 12:04:59 -0700
Subject: [PATCH 196/327] kunit: add --make_options

The kunit.py utility builds an ARCH=um kernel and then runs it.  Add
optional --make_options flag to kunit.py allowing for the operator to
specify extra build options.

This allows use of the clang compiler for kunit:
  tools/testing/kunit/kunit.py run --defconfig \
    --make_options CC=clang --make_options HOSTCC=clang

Signed-off-by: Greg Thelen <gthelen@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Reviewed-by: Nathan Chancellor <natechancellor@gmail.com>
Tested-by: David Gow <davidgow@google.com>
Signed-off-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py        | 14 ++++++++++----
 tools/testing/kunit/kunit_kernel.py | 24 ++++++++++++++----------
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index 650bb4cfc544..7dca74774dd2 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -24,7 +24,7 @@ KunitResult = namedtuple('KunitResult', ['status','result'])
 
 KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs',
 					   'build_dir', 'defconfig',
-					   'alltests'])
+					   'alltests', 'make_options'])
 
 KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0]
 
@@ -49,7 +49,7 @@ def get_kernel_root_path():
 def run_tests(linux: kunit_kernel.LinuxSourceTree,
 	      request: KunitRequest) -> KunitResult:
 	config_start = time.time()
-	success = linux.build_reconfig(request.build_dir)
+	success = linux.build_reconfig(request.build_dir, request.make_options)
 	config_end = time.time()
 	if not success:
 		return KunitResult(KunitStatus.CONFIG_FAILURE, 'could not configure kernel')
@@ -59,7 +59,8 @@ def run_tests(linux: kunit_kernel.LinuxSourceTree,
 	build_start = time.time()
 	success = linux.build_um_kernel(request.alltests,
 					request.jobs,
-					request.build_dir)
+					request.build_dir,
+					request.make_options)
 	build_end = time.time()
 	if not success:
 		return KunitResult(KunitStatus.BUILD_FAILURE, 'could not build kernel')
@@ -125,6 +126,10 @@ def main(argv, linux=None):
 				help='Run all KUnit tests through allyesconfig',
 				action='store_true')
 
+	run_parser.add_argument('--make_options',
+				help='X=Y make option, can be repeated.',
+				action='append')
+
 	cli_args = parser.parse_args(argv)
 
 	if cli_args.subcommand == 'run':
@@ -149,7 +154,8 @@ def main(argv, linux=None):
 				       cli_args.jobs,
 				       cli_args.build_dir,
 				       cli_args.defconfig,
-				       cli_args.alltests)
+				       cli_args.alltests,
+				       cli_args.make_options)
 		result = run_tests(linux, request)
 		if result.status != KunitStatus.SUCCESS:
 			sys.exit(1)
diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index 415d8f3c4fe4..63dbda2d029f 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -40,8 +40,10 @@ class LinuxSourceTreeOperations(object):
 		except subprocess.CalledProcessError as e:
 			raise ConfigError(e.output)
 
-	def make_olddefconfig(self, build_dir):
+	def make_olddefconfig(self, build_dir, make_options):
 		command = ['make', 'ARCH=um', 'olddefconfig']
+		if make_options:
+			command.extend(make_options)
 		if build_dir:
 			command += ['O=' + build_dir]
 		try:
@@ -68,8 +70,10 @@ class LinuxSourceTreeOperations(object):
 		kunit_parser.print_with_timestamp(
 			'Starting Kernel with all configs takes a few minutes...')
 
-	def make(self, jobs, build_dir):
+	def make(self, jobs, build_dir, make_options):
 		command = ['make', 'ARCH=um', '--jobs=' + str(jobs)]
+		if make_options:
+			command.extend(make_options)
 		if build_dir:
 			command += ['O=' + build_dir]
 		try:
@@ -128,19 +132,19 @@ class LinuxSourceTree(object):
 			return False
 		return True
 
-	def build_config(self, build_dir):
+	def build_config(self, build_dir, make_options):
 		kconfig_path = get_kconfig_path(build_dir)
 		if build_dir and not os.path.exists(build_dir):
 			os.mkdir(build_dir)
 		self._kconfig.write_to_file(kconfig_path)
 		try:
-			self._ops.make_olddefconfig(build_dir)
+			self._ops.make_olddefconfig(build_dir, make_options)
 		except ConfigError as e:
 			logging.error(e)
 			return False
 		return self.validate_config(build_dir)
 
-	def build_reconfig(self, build_dir):
+	def build_reconfig(self, build_dir, make_options):
 		"""Creates a new .config if it is not a subset of the .kunitconfig."""
 		kconfig_path = get_kconfig_path(build_dir)
 		if os.path.exists(kconfig_path):
@@ -149,19 +153,19 @@ class LinuxSourceTree(object):
 			if not self._kconfig.is_subset_of(existing_kconfig):
 				print('Regenerating .config ...')
 				os.remove(kconfig_path)
-				return self.build_config(build_dir)
+				return self.build_config(build_dir, make_options)
 			else:
 				return True
 		else:
 			print('Generating .config ...')
-			return self.build_config(build_dir)
+			return self.build_config(build_dir, make_options)
 
-	def build_um_kernel(self, alltests, jobs, build_dir):
+	def build_um_kernel(self, alltests, jobs, build_dir, make_options):
 		if alltests:
 			self._ops.make_allyesconfig()
 		try:
-			self._ops.make_olddefconfig(build_dir)
-			self._ops.make(jobs, build_dir)
+			self._ops.make_olddefconfig(build_dir, make_options)
+			self._ops.make(jobs, build_dir, make_options)
 		except (ConfigError, BuildError) as e:
 			logging.error(e)
 			return False

From 5fb5186383bba84937887d73eb2f7ab8819e7a3e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 23 Mar 2020 11:46:27 -0700
Subject: [PATCH 197/327] RDMA/siw: Suppress uninitialized var warning

drivers/infiniband/sw/siw/siw_qp_rx.c: In function siw_proc_send:
./include/linux/spinlock.h:288:3: warning: flags may be used uninitialized in this function [-Wmaybe-uninitialized]
   _raw_spin_unlock_irqrestore(lock, flags); \
   ^~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/infiniband/sw/siw/siw_qp_rx.c:335:16: note: flags was declared here
  unsigned long flags;

Link: https://lore.kernel.org/r/20200323184627.ZWPg91uin%akpm@linux-foundation.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_qp_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
index 9ccce2909ac4..650520244ed0 100644
--- a/drivers/infiniband/sw/siw/siw_qp_rx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -332,7 +332,7 @@ static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
 	struct siw_srq *srq;
 	struct siw_wqe *wqe = NULL;
 	bool srq_event = false;
-	unsigned long flags;
+	unsigned long uninitialized_var(flags);
 
 	srq = qp->srq;
 	if (srq) {

From 3fd53533a8bcc5a7f1fa275e28dfb6b05f28a941 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Thu, 13 Feb 2020 12:11:26 +0800
Subject: [PATCH 198/327] dm crypt: use crypt_integrity_aead() helper

Replace test_bit(CRYPT_MODE_INTEGRITY_AEAD, XXX) with
crypt_integrity_aead().

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c6a529873d0f..3df90daba89e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -230,6 +230,8 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
 					     struct scatterlist *sg);
 
+static bool crypt_integrity_aead(struct crypt_config *cc);
+
 /*
  * Use this to access cipher attributes that are independent of the key.
  */
@@ -346,7 +348,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 	unsigned bs;
 	int log;
 
-	if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+	if (crypt_integrity_aead(cc))
 		bs = crypto_aead_blocksize(any_tfm_aead(cc));
 	else
 		bs = crypto_skcipher_blocksize(any_tfm(cc));
@@ -712,7 +714,7 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
 static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			    const char *opts)
 {
-	if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) {
+	if (crypt_integrity_aead(cc)) {
 		ti->error = "AEAD transforms not supported for EBOIV";
 		return -EINVAL;
 	}

From eaab4bde6e645a37febba9a4126dc71e5994e3a1 Mon Sep 17 00:00:00 2001
From: Erich Eckner <git@eckner.net>
Date: Wed, 12 Feb 2020 11:43:10 +0100
Subject: [PATCH 199/327] dm integrity: print device name in
 integrity_metadata() error message

Similar to f710126cfc89c8df477002a26dee8407eb0b4acd ("dm crypt: print
device name in integrity error message"), this message should also
better identify the device with the integrity failure.

Signed-off-by: Erich Eckner <git@eckner.net>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 2f03fecd312d..66b8cfb83087 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1558,7 +1558,8 @@ static void integrity_metadata(struct work_struct *w)
 						checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
 			if (unlikely(r)) {
 				if (r > 0) {
-					DMERR_LIMIT("Checksum failed at sector 0x%llx",
+					char b[BDEVNAME_SIZE];
+					DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b),
 						    (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
 					r = -EILSEQ;
 					atomic64_inc(&ic->number_of_mismatches);

From d53f1fafec9d086f1c5166436abefdaef30e0363 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 24 Feb 2020 10:20:31 +0100
Subject: [PATCH 200/327] dm writecache: do direct write if the cache is full

If the cache device is full, we do a direct write to the origin device.
Note that we must not do it if the written block is already in the cache.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-writecache.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index a09bdc000e64..dc1e10c6ecd7 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -1194,6 +1194,7 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
 		}
 	} else {
 		do {
+			bool found_entry = false;
 			if (writecache_has_error(wc))
 				goto unlock_error;
 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
@@ -1204,9 +1205,21 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
 					wc->overwrote_committed = true;
 					goto bio_copy;
 				}
+				found_entry = true;
 			}
 			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
+				if (!found_entry) {
+					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+					if (e) {
+						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+						BUG_ON(!next_boundary);
+						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+							dm_accept_partial_bio(bio, next_boundary);
+						}
+					}
+					goto unlock_remap_origin;
+				}
 				writecache_wait_on_freelist(wc);
 				continue;
 			}

From 93de44eb3fc8c3566f5315b0210630cc361526a7 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 24 Feb 2020 10:20:32 +0100
Subject: [PATCH 201/327] dm writecache: implement the "cleaner" policy

The "flush" or "flush_on_suspend" messages flush the whole cache. However,
these flushing methods can take some time and the process is left in
an interruptible state during the flush.

Implement a "cleaner" option that offers an alternate flushing method.
When this option is activated (either by a message or in the constructor
arguments), the cache will not promote new writes (however, writes to
already cached blocks are promoted, to avoid data corruption due to
misordered writes) and it will gradually writeback any cached data. The
userspace can then monitor the cleaning process with "dmsetup status".
When the number of cached bloks drops to zero, the userspace can unload
the dm-writecache target and replace it with dm-linear or other targets.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-writecache.c | 48 ++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index dc1e10c6ecd7..3f17dcc6b73e 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -160,6 +160,7 @@ struct dm_writecache {
 	bool autocommit_time_set:1;
 	bool writeback_fua_set:1;
 	bool flush_on_suspend:1;
+	bool cleaner:1;
 
 	unsigned writeback_all;
 	struct workqueue_struct *writeback_wq;
@@ -1021,6 +1022,28 @@ static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_w
 	return 0;
 }
 
+static void activate_cleaner(struct dm_writecache *wc)
+{
+	wc->flush_on_suspend = true;
+	wc->cleaner = true;
+	wc->freelist_high_watermark = wc->n_blocks;
+	wc->freelist_low_watermark = wc->n_blocks;
+}
+
+static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	activate_cleaner(wc);
+	if (!dm_suspended(wc->ti))
+		writecache_verify_watermark(wc);
+	wc_unlock(wc);
+
+	return 0;
+}
+
 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 			      char *result, unsigned maxlen)
 {
@@ -1031,6 +1054,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 		r = process_flush_mesg(argc, argv, wc);
 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
 		r = process_flush_on_suspend_mesg(argc, argv, wc);
+	else if (!strcasecmp(argv[0], "cleaner"))
+		r = process_cleaner_mesg(argc, argv, wc);
 	else
 		DMERR("unrecognised message received: %s", argv[0]);
 
@@ -1206,10 +1231,14 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
 					goto bio_copy;
 				}
 				found_entry = true;
+			} else {
+				if (unlikely(wc->cleaner))
+					goto direct_write;
 			}
 			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
 				if (!found_entry) {
+direct_write:
 					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
 					if (e) {
 						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
@@ -2071,6 +2100,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 				goto invalid_optional;
 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
 			wc->autocommit_time_set = true;
+		} else if (!strcasecmp(string, "cleaner")) {
+			wc->cleaner = true;
 		} else if (!strcasecmp(string, "fua")) {
 			if (WC_MODE_PMEM(wc)) {
 				wc->writeback_fua = true;
@@ -2248,6 +2279,9 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	do_div(x, 100);
 	wc->freelist_low_watermark = x;
 
+	if (wc->cleaner)
+		activate_cleaner(wc);
+
 	r = writecache_alloc_entries(wc);
 	if (r) {
 		ti->error = "Cannot allocate memory";
@@ -2291,9 +2325,9 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 		extra_args = 0;
 		if (wc->start_sector)
 			extra_args += 2;
-		if (wc->high_wm_percent_set)
+		if (wc->high_wm_percent_set && !wc->cleaner)
 			extra_args += 2;
-		if (wc->low_wm_percent_set)
+		if (wc->low_wm_percent_set && !wc->cleaner)
 			extra_args += 2;
 		if (wc->max_writeback_jobs_set)
 			extra_args += 2;
@@ -2301,19 +2335,21 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			extra_args += 2;
 		if (wc->autocommit_time_set)
 			extra_args += 2;
+		if (wc->cleaner)
+			extra_args++;
 		if (wc->writeback_fua_set)
 			extra_args++;
 
 		DMEMIT("%u", extra_args);
 		if (wc->start_sector)
 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
-		if (wc->high_wm_percent_set) {
+		if (wc->high_wm_percent_set && !wc->cleaner) {
 			x = (uint64_t)wc->freelist_high_watermark * 100;
 			x += wc->n_blocks / 2;
 			do_div(x, (size_t)wc->n_blocks);
 			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
 		}
-		if (wc->low_wm_percent_set) {
+		if (wc->low_wm_percent_set && !wc->cleaner) {
 			x = (uint64_t)wc->freelist_low_watermark * 100;
 			x += wc->n_blocks / 2;
 			do_div(x, (size_t)wc->n_blocks);
@@ -2325,6 +2361,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
 		if (wc->autocommit_time_set)
 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+		if (wc->cleaner)
+			DMEMIT(" cleaner");
 		if (wc->writeback_fua_set)
 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
 		break;
@@ -2333,7 +2371,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 
 static struct target_type writecache_target = {
 	.name			= "writecache",
-	.version		= {1, 2, 0},
+	.version		= {1, 3, 0},
 	.module			= THIS_MODULE,
 	.ctr			= writecache_ctr,
 	.dtr			= writecache_dtr,

From 3923d4854e189d84c6ec22e66d536d3498f2747c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 24 Feb 2020 10:20:33 +0100
Subject: [PATCH 202/327] dm writecache: implement gradual cleanup

If a block is stored in the cache for too long, it will now be
written to the underlying device and cleaned up.

Add a new option "max_age" that specifies the maximum age of a block
in milliseconds.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-writecache.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 3f17dcc6b73e..e5c7b9072dd0 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -26,6 +26,8 @@
 #define AUTOCOMMIT_BLOCKS_SSD		65536
 #define AUTOCOMMIT_BLOCKS_PMEM		64
 #define AUTOCOMMIT_MSEC			1000
+#define MAX_AGE_DIV			16
+#define MAX_AGE_UNSPECIFIED		-1UL
 
 #define BITMAP_GRANULARITY	65536
 #if BITMAP_GRANULARITY < PAGE_SIZE
@@ -88,6 +90,7 @@ struct wc_entry {
 		:47
 #endif
 	;
+	unsigned long age;
 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 	uint64_t original_sector;
 	uint64_t seq_count;
@@ -119,6 +122,7 @@ struct dm_writecache {
 	size_t writeback_size;
 	size_t freelist_high_watermark;
 	size_t freelist_low_watermark;
+	unsigned long max_age;
 
 	unsigned uncommitted_blocks;
 	unsigned autocommit_blocks;
@@ -130,6 +134,8 @@ struct dm_writecache {
 	struct timer_list autocommit_timer;
 	struct wait_queue_head freelist_wait;
 
+	struct timer_list max_age_timer;
+
 	atomic_t bio_in_progress[2];
 	struct wait_queue_head bio_in_progress_wait[2];
 
@@ -597,6 +603,7 @@ static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *i
 	rb_link_node(&ins->rb_node, parent, node);
 	rb_insert_color(&ins->rb_node, &wc->tree);
 	list_add(&ins->lru, &wc->lru);
+	ins->age = jiffies;
 }
 
 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
@@ -632,6 +639,16 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
 		queue_work(wc->writeback_wq, &wc->writeback_work);
 }
 
+static void writecache_max_age_timer(struct timer_list *t)
+{
+	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+
+	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+	}
+}
+
 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
 	struct wc_entry *e;
@@ -838,6 +855,7 @@ static void writecache_suspend(struct dm_target *ti)
 	bool flush_on_suspend;
 
 	del_timer_sync(&wc->autocommit_timer);
+	del_timer_sync(&wc->max_age_timer);
 
 	wc_lock(wc);
 	writecache_flush(wc);
@@ -974,6 +992,9 @@ static void writecache_resume(struct dm_target *ti)
 
 	writecache_verify_watermark(wc);
 
+	if (wc->max_age != MAX_AGE_UNSPECIFIED)
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+
 	wc_unlock(wc);
 }
 
@@ -1661,7 +1682,9 @@ static void writecache_writeback(struct work_struct *work)
 	wbl.size = 0;
 	while (!list_empty(&wc->lru) &&
 	       (wc->writeback_all ||
-		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
+		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
+		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
+		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
 
 		n_walked++;
 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
@@ -1924,9 +1947,11 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	wc->ti = ti;
 
 	mutex_init(&wc->lock);
+	wc->max_age = MAX_AGE_UNSPECIFIED;
 	writecache_poison_lists(wc);
 	init_waitqueue_head(&wc->freelist_wait);
 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
+	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
 
 	for (i = 0; i < 2; i++) {
 		atomic_set(&wc->bio_in_progress[i], 0);
@@ -2100,6 +2125,14 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 				goto invalid_optional;
 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
 			wc->autocommit_time_set = true;
+		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
+			unsigned max_age_msecs;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
+				goto invalid_optional;
+			if (max_age_msecs > 86400000)
+				goto invalid_optional;
+			wc->max_age = msecs_to_jiffies(max_age_msecs);
 		} else if (!strcasecmp(string, "cleaner")) {
 			wc->cleaner = true;
 		} else if (!strcasecmp(string, "fua")) {
@@ -2361,6 +2394,8 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
 		if (wc->autocommit_time_set)
 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+		if (wc->max_age != MAX_AGE_UNSPECIFIED)
+			DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
 		if (wc->cleaner)
 			DMEMIT(" cleaner");
 		if (wc->writeback_fua_set)

From dc8a01ae1dbd7bac98368da4d8f81632512429f5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 24 Feb 2020 10:20:34 +0100
Subject: [PATCH 203/327] dm writecache: optimize superblock write

If we write a superblock in writecache_flush, we don't need to set bit and
scan the bitmap for it - we can just write the superblock directly. Also,
we can set the flag REQ_FUA on the write bio, so that we don't need to
submit a flush bio afterwards.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-writecache.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index e5c7b9072dd0..e274e5a4d425 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -509,6 +509,34 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 }
 
+static void ssd_commit_superblock(struct dm_writecache *wc)
+{
+	int r;
+	struct dm_io_region region;
+	struct dm_io_request req;
+
+	region.bdev = wc->ssd_dev->bdev;
+	region.sector = 0;
+	region.count = PAGE_SIZE;
+
+	if (unlikely(region.sector + region.count > wc->metadata_sectors))
+		region.count = wc->metadata_sectors - region.sector;
+
+	region.sector += wc->start_sector;
+
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = REQ_SYNC | REQ_FUA;
+	req.mem.type = DM_IO_VMA;
+	req.mem.ptr.vma = (char *)wc->memory_map;
+	req.client = wc->dm_io;
+	req.notify.fn = NULL;
+	req.notify.context = NULL;
+
+	r = dm_io(&req, 1, &region, NULL);
+	if (unlikely(r))
+		writecache_error(wc, r, "error writing superblock");
+}
+
 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 {
 	if (WC_MODE_PMEM(wc))
@@ -759,8 +787,10 @@ static void writecache_flush(struct dm_writecache *wc)
 
 	wc->seq_count++;
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
-	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
-	writecache_commit_flushed(wc, false);
+	if (WC_MODE_PMEM(wc))
+		writecache_commit_flushed(wc, false);
+	else
+		ssd_commit_superblock(wc);
 
 	wc->overwrote_committed = false;
 

From 75fa601934fda23d2f15bf44b09c2401942d8e15 Mon Sep 17 00:00:00 2001
From: "Shetty, Harshini X (EXT-Sony Mobile)" <Harshini.X.Shetty@sony.com>
Date: Tue, 17 Mar 2020 09:15:45 +0000
Subject: [PATCH 204/327] dm verity fec: fix memory leak in verity_fec_dtr

Fix below kmemleak detected in verity_fec_ctr. output_pool is
allocated for each dm-verity-fec device. But it is not freed when
dm-table for the verity target is removed. Hence free the output
mempool in destructor function verity_fec_dtr.

unreferenced object 0xffffffffa574d000 (size 4096):
  comm "init", pid 1667, jiffies 4294894890 (age 307.168s)
  hex dump (first 32 bytes):
    8e 36 00 98 66 a8 0b 9b 00 00 00 00 00 00 00 00  .6..f...........
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<0000000060e82407>] __kmalloc+0x2b4/0x340
    [<00000000dd99488f>] mempool_kmalloc+0x18/0x20
    [<000000002560172b>] mempool_init_node+0x98/0x118
    [<000000006c3574d2>] mempool_init+0x14/0x20
    [<0000000008cb266e>] verity_fec_ctr+0x388/0x3b0
    [<000000000887261b>] verity_ctr+0x87c/0x8d0
    [<000000002b1e1c62>] dm_table_add_target+0x174/0x348
    [<000000002ad89eda>] table_load+0xe4/0x328
    [<000000001f06f5e9>] dm_ctl_ioctl+0x3b4/0x5a0
    [<00000000bee5fbb7>] do_vfs_ioctl+0x5dc/0x928
    [<00000000b475b8f5>] __arm64_sys_ioctl+0x70/0x98
    [<000000005361e2e8>] el0_svc_common+0xa0/0x158
    [<000000001374818f>] el0_svc_handler+0x6c/0x88
    [<000000003364e9f4>] el0_svc+0x8/0xc
    [<000000009d84cec9>] 0xffffffffffffffff

Fixes: a739ff3f543af ("dm verity: add support for forward error correction")
Depends-on: 6f1c819c219f7 ("dm: convert to bioset_init()/mempool_init()")
Cc: stable@vger.kernel.org
Signed-off-by: Harshini Shetty <harshini.x.shetty@sony.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-verity-fec.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 3ceeb6b404ed..49147e634046 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -551,6 +551,7 @@ void verity_fec_dtr(struct dm_verity *v)
 	mempool_exit(&f->rs_pool);
 	mempool_exit(&f->prealloc_pool);
 	mempool_exit(&f->extra_pool);
+	mempool_exit(&f->output_pool);
 	kmem_cache_destroy(f->cache);
 
 	if (f->data_bufio)

From b8fdd090376a7a46d17db316638fe54b965c2fb0 Mon Sep 17 00:00:00 2001
From: Bob Liu <bob.liu@oracle.com>
Date: Tue, 24 Mar 2020 21:22:45 +0800
Subject: [PATCH 205/327] dm zoned: remove duplicate nr_rnd_zones increase in
 dmz_init_zone()

zmd->nr_rnd_zones was increased twice by mistake. The other place it
is increased in dmz_init_zone() is the only one needed:

1131                 zmd->nr_useable_zones++;
1132                 if (dmz_is_rnd(zone)) {
1133                         zmd->nr_rnd_zones++;
					^^^
Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
Cc: stable@vger.kernel.org
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-zoned-metadata.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 516c7b671d25..369de15c4e80 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1109,7 +1109,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
 	switch (blkz->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
 		set_bit(DMZ_RND, &zone->flags);
-		zmd->nr_rnd_zones++;
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:

From b93b6643e9b5a7f260b931e97f56ffa3fa65e26d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:21 +0100
Subject: [PATCH 206/327] dm integrity: fix a crash with unusually large tag
 size

If the user specifies tag size larger than HASH_MAX_DIGESTSIZE,
there's a crash in integrity_metadata().

Cc: stable@vger.kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 66b8cfb83087..3cc12b55c34f 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1519,7 +1519,7 @@ static void integrity_metadata(struct work_struct *w)
 		struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 		char *checksums;
 		unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
-		char checksums_onstack[HASH_MAX_DIGESTSIZE];
+		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 		unsigned sectors_to_process = dio->range.n_sectors;
 		sector_t sector = dio->range.logical_sector;
 
@@ -1749,7 +1749,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 				} while (++s < ic->sectors_per_block);
 #ifdef INTERNAL_VERIFY
 				if (ic->internal_hash) {
-					char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+					char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 
 					integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
 					if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {

From 7649194a1636ab5876e7c18337d7ddd63e1d4376 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:22 +0100
Subject: [PATCH 207/327] dm integrity: remove sector type casts

Since the commit 72deb455b5ec619ff043c30bc90025aa3de3cdda ("block:
remove CONFIG_LBDAF") sector_t is always defined as unsigned long
long.

Delete the needless type casts in printk and avoids some warnings if
DEBUG_PRINT is defined.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 3cc12b55c34f..8ce69355b8dc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -510,8 +510,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
 
 	if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
 		DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
-			(unsigned long long)sector,
-			(unsigned long long)n_sectors,
+			sector,
+			n_sectors,
 			ic->sb->log2_sectors_per_block,
 			ic->log2_blocks_per_bitmap_bit,
 			mode);
@@ -1560,7 +1560,7 @@ static void integrity_metadata(struct work_struct *w)
 				if (r > 0) {
 					char b[BDEVNAME_SIZE];
 					DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b),
-						    (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+						    (sector - ((r + ic->tag_size - 1) / ic->tag_size)));
 					r = -EILSEQ;
 					atomic64_inc(&ic->number_of_mismatches);
 				}
@@ -1644,14 +1644,14 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	}
 	if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
 		DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
-		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
-		      (unsigned long long)ic->provided_data_sectors);
+		      dio->range.logical_sector, bio_sectors(bio),
+		      ic->provided_data_sectors);
 		return DM_MAPIO_KILL;
 	}
 	if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
 		DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
 		      ic->sectors_per_block,
-		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
+		      dio->range.logical_sector, bio_sectors(bio));
 		return DM_MAPIO_KILL;
 	}
 
@@ -1754,7 +1754,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 					integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
 					if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
 						DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
-							    (unsigned long long)logical_sector);
+							    logical_sector);
 					}
 				}
 #endif
@@ -2405,7 +2405,7 @@ static void integrity_recalc(struct work_struct *w)
 		get_area_and_offset(ic, logical_sector, &area, &offset);
 	}
 
-	DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
+	DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors);
 
 	if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
 		recalc_write_super(ic);
@@ -2899,7 +2899,7 @@ static void dm_integrity_resume(struct dm_target *ti)
 	DEBUG_print("testing recalc: %x\n", ic->sb->flags);
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
 		__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
-		DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
+		DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors);
 		if (recalc_pos < ic->provided_data_sectors) {
 			queue_work(ic->recalc_wq, &ic->recalc_work);
 		} else if (recalc_pos > ic->provided_data_sectors) {
@@ -2929,10 +2929,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 	switch (type) {
 	case STATUSTYPE_INFO:
 		DMEMIT("%llu %llu",
-			(unsigned long long)atomic64_read(&ic->number_of_mismatches),
-			(unsigned long long)ic->provided_data_sectors);
+			atomic64_read(&ic->number_of_mismatches),
+			ic->provided_data_sectors);
 		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
-			DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector));
+			DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector));
 		else
 			DMEMIT(" -");
 		break;
@@ -2953,7 +2953,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 		arg_count += !!ic->journal_crypt_alg.alg_string;
 		arg_count += !!ic->journal_mac_alg.alg_string;
 		arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
-		DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
+		DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
 		       ic->tag_size, ic->mode, arg_count);
 		if (ic->meta_dev)
 			DMEMIT(" meta_device:%s", ic->meta_dev->name);
@@ -2969,7 +2969,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" commit_time:%u", ic->autocommit_msec);
 		}
 		if (ic->mode == 'B') {
-			DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+			DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
 			DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
 		}
 		if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
@@ -3995,10 +3995,9 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	DEBUG_print("	initial_sectors 0x%x\n", ic->initial_sectors);
 	DEBUG_print("	metadata_run 0x%x\n", ic->metadata_run);
 	DEBUG_print("	log2_metadata_run %d\n", ic->log2_metadata_run);
-	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
-		    (unsigned long long)ic->provided_data_sectors);
+	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors);
 	DEBUG_print("	log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
-	DEBUG_print("	bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
+	DEBUG_print("	bits_in_journal %llu\n", bits_in_journal);
 
 	if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
 		ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);

From f6f72f32c22c0ba7b714685b13a257be981888f3 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:23 +0100
Subject: [PATCH 208/327] dm integrity: don't replay journal data past the end
 of the device

Following commits will make it possible to shrink or extend the device. If
the device was shrunk, we don't want to replay journal data pointing past
the end of the device.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 8ce69355b8dc..3c10a672322f 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2194,6 +2194,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 					sec &= ~(sector_t)(ic->sectors_per_block - 1);
 				}
 			}
+			if (unlikely(sec >= ic->provided_data_sectors))
+				continue;
 			get_area_and_offset(ic, sec, &area, &offset);
 			restore_last_bytes(ic, access_journal_data(ic, i, j), je);
 			for (k = j + 1; k < ic->journal_section_entries; k++) {
@@ -2203,6 +2205,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 					break;
 				BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
 				sec2 = journal_entry_get_sector(je2);
+				if (unlikely(sec2 >= ic->provided_data_sectors))
+					break;
 				get_area_and_offset(ic, sec2, &area2, &offset2);
 				if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
 					break;

From 87fb177b4cab154bc13efa14149e152628aeb37c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:24 +0100
Subject: [PATCH 209/327] dm integrity: factor out get_provided_data_sectors()

Move code to a new function get_provided_data_sectors().

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 3c10a672322f..90a9a5e62623 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3078,6 +3078,24 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
 	return 0;
 }
 
+static void get_provided_data_sectors(struct dm_integrity_c *ic)
+{
+	if (!ic->meta_dev) {
+		int test_bit;
+		ic->provided_data_sectors = 0;
+		for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
+			__u64 prev_data_sectors = ic->provided_data_sectors;
+
+			ic->provided_data_sectors |= (sector_t)1 << test_bit;
+			if (calculate_device_limits(ic))
+				ic->provided_data_sectors = prev_data_sectors;
+		}
+	} else {
+		ic->provided_data_sectors = ic->data_device_sectors;
+		ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+	}
+}
+
 static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
 {
 	unsigned journal_sections;
@@ -3105,20 +3123,15 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
 		ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
 		ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
 
-		ic->provided_data_sectors = 0;
-		for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
-			__u64 prev_data_sectors = ic->provided_data_sectors;
-
-			ic->provided_data_sectors |= (sector_t)1 << test_bit;
-			if (calculate_device_limits(ic))
-				ic->provided_data_sectors = prev_data_sectors;
-		}
+		get_provided_data_sectors(ic);
 		if (!ic->provided_data_sectors)
 			return -EINVAL;
 	} else {
 		ic->sb->log2_interleave_sectors = 0;
-		ic->provided_data_sectors = ic->data_device_sectors;
-		ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+
+		get_provided_data_sectors(ic);
+		if (!ic->provided_data_sectors)
+			return -EINVAL;
 
 try_smaller_buffer:
 		ic->sb->journal_sections = cpu_to_le32(0);

From 1ac2c15a7bf859b92de269ba58597c94ce10811f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:25 +0100
Subject: [PATCH 210/327] dm integrity: allow resize of the integrity device

If the size of the underlying device changes, change the size of the
integrity device too.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 90a9a5e62623..fafd9ec1d56c 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2833,9 +2833,29 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
 static void dm_integrity_resume(struct dm_target *ti)
 {
 	struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+	__u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
 	int r;
+
 	DEBUG_print("resume\n");
 
+	if (ic->provided_data_sectors != old_provided_data_sectors) {
+		if (ic->provided_data_sectors > old_provided_data_sectors &&
+		    ic->mode == 'B' &&
+		    ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+			rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+			block_bitmap_op(ic, ic->journal, old_provided_data_sectors,
+					ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET);
+			rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+		}
+
+		ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
+		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+		if (unlikely(r))
+			dm_integrity_io_error(ic, "writing superblock", r);
+	}
+
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
 		DEBUG_print("resume dirty_bitmap\n");
 		rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
@@ -3938,19 +3958,19 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			goto bad;
 		}
 	}
-	ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
-	if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
-		/* test for overflow */
-		r = -EINVAL;
-		ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
-		goto bad;
-	}
 	if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
 		r = -EINVAL;
 		ti->error = "Journal mac mismatch";
 		goto bad;
 	}
 
+	get_provided_data_sectors(ic);
+	if (!ic->provided_data_sectors) {
+		r = -EINVAL;
+		ti->error = "The device is too small";
+		goto bad;
+	}
+
 try_smaller_buffer:
 	r = calculate_device_limits(ic);
 	if (r) {
@@ -4219,7 +4239,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
 	.name			= "integrity",
-	.version		= {1, 5, 0},
+	.version		= {1, 6, 0},
 	.module			= THIS_MODULE,
 	.features		= DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
 	.ctr			= dm_integrity_ctr,

From 84597a44a9d86ac949900441cea7da0af0f2f473 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:26 +0100
Subject: [PATCH 211/327] dm integrity: add optional discard support

Add an argument "allow_discards" that enables discard processing on
dm-integrity device. Discards are only allowed to devices using
internal hash.

When a block is discarded the integrity tag is filled with
DISCARD_FILLER (0xf6) bytes.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 177 +++++++++++++++++++++++++++++++-------
 1 file changed, 145 insertions(+), 32 deletions(-)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index fafd9ec1d56c..21eb35c606be 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -39,6 +39,7 @@
 #define RECALC_WRITE_SUPER		16
 #define BITMAP_BLOCK_SIZE		4096	/* don't change it */
 #define BITMAP_FLUSH_INTERVAL		(10 * HZ)
+#define DISCARD_FILLER			0xf6
 
 /*
  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -257,6 +258,7 @@ struct dm_integrity_c {
 	bool just_formatted;
 	bool recalculate_flag;
 	bool fix_padding;
+	bool discard;
 
 	struct alg_spec internal_hash_alg;
 	struct alg_spec journal_crypt_alg;
@@ -284,7 +286,7 @@ struct dm_integrity_io {
 	struct work_struct work;
 
 	struct dm_integrity_c *ic;
-	bool write;
+	enum req_opf op;
 	bool fua;
 
 	struct dm_integrity_range range;
@@ -1299,6 +1301,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
 			       unsigned *metadata_offset, unsigned total_size, int op)
 {
+#define MAY_BE_FILLER		1
+#define MAY_BE_HASH		2
+	unsigned hash_offset = 0;
+	unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+
 	do {
 		unsigned char *data, *dp;
 		struct dm_buffer *b;
@@ -1320,18 +1327,35 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
 		} else if (op == TAG_WRITE) {
 			memcpy(dp, tag, to_copy);
 			dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
-		} else  {
+		} else {
 			/* e.g.: op == TAG_CMP */
-			if (unlikely(memcmp(dp, tag, to_copy))) {
-				unsigned i;
 
-				for (i = 0; i < to_copy; i++) {
-					if (dp[i] != tag[i])
-						break;
-					total_size--;
+			if (likely(is_power_of_2(ic->tag_size))) {
+				if (unlikely(memcmp(dp, tag, to_copy)))
+					if (unlikely(!ic->discard) ||
+					    unlikely(!memchr_inv(dp, DISCARD_FILLER, to_copy))) {
+						goto thorough_test;
+				}
+			} else {
+				unsigned i, ts;
+thorough_test:
+				ts = total_size;
+
+				for (i = 0; i < to_copy; i++, ts--) {
+					if (unlikely(dp[i] != tag[i]))
+						may_be &= ~MAY_BE_HASH;
+					if (likely(dp[i] != DISCARD_FILLER))
+						may_be &= ~MAY_BE_FILLER;
+					hash_offset++;
+					if (unlikely(hash_offset == ic->tag_size)) {
+						if (unlikely(!may_be)) {
+							dm_bufio_release(b);
+							return ts;
+						}
+						hash_offset = 0;
+						may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+					}
 				}
-				dm_bufio_release(b);
-				return total_size;
 			}
 		}
 		dm_bufio_release(b);
@@ -1342,10 +1366,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
 			(*metadata_block)++;
 			*metadata_offset = 0;
 		}
+
+		if (unlikely(!is_power_of_2(ic->tag_size))) {
+			hash_offset = (hash_offset + to_copy) % ic->tag_size;
+		}
+
 		total_size -= to_copy;
 	} while (unlikely(total_size));
 
 	return 0;
+#undef MAY_BE_FILLER
+#undef MAY_BE_HASH
 }
 
 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
@@ -1428,7 +1459,7 @@ static void dec_in_flight(struct dm_integrity_io *dio)
 
 		remove_range(ic, &dio->range);
 
-		if (unlikely(dio->write))
+		if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
 			schedule_autocommit(ic);
 
 		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
@@ -1520,14 +1551,19 @@ static void integrity_metadata(struct work_struct *w)
 		char *checksums;
 		unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
 		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
-		unsigned sectors_to_process = dio->range.n_sectors;
-		sector_t sector = dio->range.logical_sector;
+		sector_t sector;
+		unsigned sectors_to_process;
+		sector_t save_metadata_block;
+		unsigned save_metadata_offset;
 
 		if (unlikely(ic->mode == 'R'))
 			goto skip_io;
 
-		checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
-				    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+		if (likely(dio->op != REQ_OP_DISCARD))
+			checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
+					    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+		else
+			checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
 		if (!checksums) {
 			checksums = checksums_onstack;
 			if (WARN_ON(extra_space &&
@@ -1537,6 +1573,43 @@ static void integrity_metadata(struct work_struct *w)
 			}
 		}
 
+		if (unlikely(dio->op == REQ_OP_DISCARD)) {
+			sector_t bi_sector = dio->bio_details.bi_iter.bi_sector;
+			unsigned bi_size = dio->bio_details.bi_iter.bi_size;
+			unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
+			unsigned max_blocks = max_size / ic->tag_size;
+			memset(checksums, DISCARD_FILLER, max_size);
+
+			while (bi_size) {
+				unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+				this_step_blocks = min(this_step_blocks, max_blocks);
+				r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+							this_step_blocks * ic->tag_size, TAG_WRITE);
+				if (unlikely(r)) {
+					if (likely(checksums != checksums_onstack))
+						kfree(checksums);
+					goto error;
+				}
+
+				/*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) {
+					printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size);
+					printk("BUGG: this_step_blocks: %u\n", this_step_blocks);
+					BUG();
+				}*/
+				bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+				bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block;
+			}
+
+			if (likely(checksums != checksums_onstack))
+				kfree(checksums);
+			goto skip_io;
+		}
+
+		save_metadata_block = dio->metadata_block;
+		save_metadata_offset = dio->metadata_offset;
+		sector = dio->range.logical_sector;
+		sectors_to_process = dio->range.n_sectors;
+
 		__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
 			unsigned pos;
 			char *mem, *checksums_ptr;
@@ -1555,7 +1628,7 @@ static void integrity_metadata(struct work_struct *w)
 			kunmap_atomic(mem);
 
 			r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
-						checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
+						checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
 			if (unlikely(r)) {
 				if (r > 0) {
 					char b[BDEVNAME_SIZE];
@@ -1599,7 +1672,7 @@ static void integrity_metadata(struct work_struct *w)
 				tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
 				this_len = min(biv.bv_len, data_to_process);
 				r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
-							this_len, !dio->write ? TAG_READ : TAG_WRITE);
+							this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
 				if (unlikely(r))
 					goto error;
 				data_to_process -= this_len;
@@ -1626,6 +1699,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 
 	dio->ic = ic;
 	dio->bi_status = 0;
+	dio->op = bio_op(bio);
+
+	if (unlikely(dio->op == REQ_OP_DISCARD)) {
+		if (ti->max_io_len) {
+			sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
+			unsigned log2_max_io_len = __fls(ti->max_io_len);
+			sector_t start_boundary = sec >> log2_max_io_len;
+			sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
+			if (start_boundary < end_boundary) {
+				sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
+				dm_accept_partial_bio(bio, len);
+			}
+		}
+	}
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		submit_flush_bio(ic, dio);
@@ -1633,8 +1720,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	}
 
 	dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
-	dio->write = bio_op(bio) == REQ_OP_WRITE;
-	dio->fua = dio->write && bio->bi_opf & REQ_FUA;
+	dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
 	if (unlikely(dio->fua)) {
 		/*
 		 * Don't pass down the FUA flag because we have to flush
@@ -1655,7 +1741,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_KILL;
 	}
 
-	if (ic->sectors_per_block > 1) {
+	if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
 		struct bvec_iter iter;
 		struct bio_vec bv;
 		bio_for_each_segment(bv, bio, iter) {
@@ -1688,7 +1774,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 		}
 	}
 
-	if (unlikely(ic->mode == 'R') && unlikely(dio->write))
+	if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
 		return DM_MAPIO_KILL;
 
 	get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
@@ -1718,13 +1804,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 		bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
 retry_kmap:
 		mem = kmap_atomic(bv.bv_page);
-		if (likely(dio->write))
+		if (likely(dio->op == REQ_OP_WRITE))
 			flush_dcache_page(bv.bv_page);
 
 		do {
 			struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
 
-			if (unlikely(!dio->write)) {
+			if (unlikely(dio->op == REQ_OP_READ)) {
 				struct journal_sector *js;
 				char *mem_ptr;
 				unsigned s;
@@ -1771,7 +1857,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 					char *tag_addr;
 					BUG_ON(PageHighMem(biv.bv_page));
 					tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
-					if (likely(dio->write))
+					if (likely(dio->op == REQ_OP_WRITE))
 						memcpy(tag_ptr, tag_addr, tag_now);
 					else
 						memcpy(tag_addr, tag_ptr, tag_now);
@@ -1779,12 +1865,12 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 					tag_ptr += tag_now;
 					tag_todo -= tag_now;
 				} while (unlikely(tag_todo)); else {
-					if (likely(dio->write))
+					if (likely(dio->op == REQ_OP_WRITE))
 						memset(tag_ptr, 0, tag_todo);
 				}
 			}
 
-			if (likely(dio->write)) {
+			if (likely(dio->op == REQ_OP_WRITE)) {
 				struct journal_sector *js;
 				unsigned s;
 
@@ -1820,12 +1906,12 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 			bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
 		} while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
 
-		if (unlikely(!dio->write))
+		if (unlikely(dio->op == REQ_OP_READ))
 			flush_dcache_page(bv.bv_page);
 		kunmap_atomic(mem);
 	} while (n_sectors);
 
-	if (likely(dio->write)) {
+	if (likely(dio->op == REQ_OP_WRITE)) {
 		smp_mb();
 		if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
 			wake_up(&ic->copy_to_journal_wait);
@@ -1857,7 +1943,9 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 	unsigned journal_section, journal_entry;
 	unsigned journal_read_pos;
 	struct completion read_comp;
-	bool need_sync_io = ic->internal_hash && !dio->write;
+	bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
+	if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
+		need_sync_io = true;
 
 	if (need_sync_io && from_map) {
 		INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1875,8 +1963,8 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 	}
 	dio->range.n_sectors = bio_sectors(bio);
 	journal_read_pos = NOT_FOUND;
-	if (likely(ic->mode == 'J')) {
-		if (dio->write) {
+	if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
+		if (dio->op == REQ_OP_WRITE) {
 			unsigned next_entry, i, pos;
 			unsigned ws, we, range_sectors;
 
@@ -1979,7 +2067,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 		goto journal_read_write;
 	}
 
-	if (ic->mode == 'B' && dio->write) {
+	if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
 		if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
 				     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
 			struct bitmap_block_status *bbs;
@@ -2008,6 +2096,18 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 	bio->bi_end_io = integrity_end_io;
 	bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
 
+	if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
+		integrity_metadata(&dio->work);
+		dm_integrity_flush_buffers(ic);
+
+		dio->in_flight = (atomic_t)ATOMIC_INIT(1);
+		dio->completion = NULL;
+
+		generic_make_request(bio);
+
+		return;
+	}
+
 	generic_make_request(bio);
 
 	if (need_sync_io) {
@@ -2969,6 +3069,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 		arg_count += !!ic->meta_dev;
 		arg_count += ic->sectors_per_block != 1;
 		arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+		arg_count += ic->discard;
 		arg_count += ic->mode == 'J';
 		arg_count += ic->mode == 'J';
 		arg_count += ic->mode == 'B';
@@ -2985,6 +3086,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
 		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
 			DMEMIT(" recalculate");
+		if (ic->discard)
+			DMEMIT(" allow_discards");
 		DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
 		DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
 		DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
@@ -3771,6 +3874,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 				goto bad;
 		} else if (!strcmp(opt_string, "recalculate")) {
 			ic->recalculate_flag = true;
+		} else if (!strcmp(opt_string, "allow_discards")) {
+			ic->discard = true;
 		} else if (!strcmp(opt_string, "fix_padding")) {
 			ic->fix_padding = true;
 		} else {
@@ -3829,6 +3934,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
+	if (ic->discard && !ic->internal_hash) {
+		r = -EINVAL;
+		ti->error = "Discard can be only used with internal hash";
+		goto bad;
+	}
+
 	ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
 	ic->autocommit_msec = sync_msec;
 	timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -4158,6 +4269,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
+	if (ic->discard)
+		ti->num_discard_bios = 1;
 
 	return 0;
 

From 31843edab7cb3924006544b901c9cab33941b684 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Sun, 22 Mar 2020 20:42:27 +0100
Subject: [PATCH 212/327] dm integrity: improve discard in journal mode

When we discard something that is present in the journal, we flush the
journal first, so that discarded blocks are not overwritten by the journal
content.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-integrity.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 21eb35c606be..b989d109d55d 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1943,6 +1943,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 	unsigned journal_section, journal_entry;
 	unsigned journal_read_pos;
 	struct completion read_comp;
+	bool discard_retried = false;
 	bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
 	if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
 		need_sync_io = true;
@@ -2059,6 +2060,21 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 			}
 		}
 	}
+	if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) {
+		sector_t next_sector;
+		unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+		if (unlikely(new_pos != NOT_FOUND) ||
+		    unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) {
+			remove_range_unlocked(ic, &dio->range);
+			spin_unlock_irq(&ic->endio_wait.lock);
+			queue_work(ic->commit_wq, &ic->commit_work);
+			flush_workqueue(ic->commit_wq);
+			queue_work(ic->writer_wq, &ic->writer_work);
+			flush_workqueue(ic->writer_wq);
+			discard_retried = true;
+			goto lock_retry;
+		}
+	}
 	spin_unlock_irq(&ic->endio_wait.lock);
 
 	if (unlikely(journal_read_pos != NOT_FOUND)) {

From a766fa84738f52f8227eb96aed4362725a82ccf2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 20 Mar 2020 16:26:41 +0300
Subject: [PATCH 213/327] IB/mlx5: Fix a NULL vs IS_ERR() check

The kzalloc() function returns NULL, not error pointers.

Fixes: 30f2fe40c72b ("IB/mlx5: Introduce UAPIs to manage packet pacing")
Link: https://lore.kernel.org/r/20200320132641.GF95012@mwanda
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qos.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qos.c b/drivers/infiniband/hw/mlx5/qos.c
index f822b06e7c9e..cac878a70edb 100644
--- a/drivers/infiniband/hw/mlx5/qos.c
+++ b/drivers/infiniband/hw/mlx5/qos.c
@@ -46,8 +46,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)(
 
 	dev = to_mdev(c->ibucontext.device);
 	pp_entry = kzalloc(sizeof(*pp_entry), GFP_KERNEL);
-	if (IS_ERR(pp_entry))
-		return PTR_ERR(pp_entry);
+	if (!pp_entry)
+		return -ENOMEM;
 
 	in_ctx = uverbs_attr_get_alloced_ptr(attrs,
 					     MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX);

From b1d56fdcb66ebe6604166d71a26744d3cd03fecb Mon Sep 17 00:00:00 2001
From: Selvin Xavier <selvin.xavier@broadcom.com>
Date: Fri, 13 Mar 2020 10:34:02 -0700
Subject: [PATCH 214/327] RDMA/bnxt_re: Wait for all the CQ events before
 freeing CQ data structures

Destroy CQ command to firmware returns the num_cnq_events as a
response. This indicates the driver about the number of CQ events
generated for this CQ. Driver should wait for all these events before
freeing the CQ host structures.  Also, add routine to clean all the
pending notification for the CQs getting destroyed. This avoids the
possibility of accessing the CQ data structures after its freed.

Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver")
Link: https://lore.kernel.org/r/1584120842-3200-1-git-send-email-selvin.xavier@broadcom.com
Signed-off-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/qplib_fp.c | 73 ++++++++++++++++++++++++
 drivers/infiniband/hw/bnxt_re/qplib_fp.h |  1 +
 2 files changed, 74 insertions(+)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 2ccf1c3708d1..899a5d2c100e 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -43,6 +43,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
+#include <linux/delay.h>
 #include <linux/prefetch.h>
 #include <linux/if_ether.h>
 
@@ -231,6 +232,70 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res,
 	return rc;
 }
 
+static void clean_nq(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *cq)
+{
+	struct bnxt_qplib_hwq *hwq = &nq->hwq;
+	struct nq_base *nqe, **nq_ptr;
+	int budget = nq->budget;
+	u32 sw_cons, raw_cons;
+	uintptr_t q_handle;
+	u16 type;
+
+	spin_lock_bh(&hwq->lock);
+	/* Service the NQ until empty */
+	raw_cons = hwq->cons;
+	while (budget--) {
+		sw_cons = HWQ_CMP(raw_cons, hwq);
+		nq_ptr = (struct nq_base **)hwq->pbl_ptr;
+		nqe = &nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)];
+		if (!NQE_CMP_VALID(nqe, raw_cons, hwq->max_elements))
+			break;
+
+		/*
+		 * The valid test of the entry must be done first before
+		 * reading any further.
+		 */
+		dma_rmb();
+
+		type = le16_to_cpu(nqe->info10_type) & NQ_BASE_TYPE_MASK;
+		switch (type) {
+		case NQ_BASE_TYPE_CQ_NOTIFICATION:
+		{
+			struct nq_cn *nqcne = (struct nq_cn *)nqe;
+
+			q_handle = le32_to_cpu(nqcne->cq_handle_low);
+			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
+						     << 32;
+			if ((unsigned long)cq == q_handle) {
+				nqcne->cq_handle_low = 0;
+				nqcne->cq_handle_high = 0;
+				cq->cnq_events++;
+			}
+			break;
+		}
+		default:
+			break;
+		}
+		raw_cons++;
+	}
+	spin_unlock_bh(&hwq->lock);
+}
+
+/* Wait for receiving all NQEs for this CQ and clean the NQEs associated with
+ * this CQ.
+ */
+static void __wait_for_all_nqes(struct bnxt_qplib_cq *cq, u16 cnq_events)
+{
+	u32 retry_cnt = 100;
+
+	while (retry_cnt--) {
+		if (cnq_events == cq->cnq_events)
+			return;
+		usleep_range(50, 100);
+		clean_nq(cq->nq, cq);
+	}
+}
+
 static void bnxt_qplib_service_nq(unsigned long data)
 {
 	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
@@ -244,6 +309,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 	uintptr_t q_handle;
 	u16 type;
 
+	spin_lock_bh(&hwq->lock);
 	/* Service the NQ until empty */
 	raw_cons = hwq->cons;
 	while (budget--) {
@@ -269,6 +335,8 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
 						     << 32;
 			cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle;
+			if (!cq)
+				break;
 			bnxt_qplib_armen_db(&cq->dbinfo,
 					    DBC_DBC_TYPE_CQ_ARMENA);
 			spin_lock_bh(&cq->compl_lock);
@@ -278,6 +346,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 			else
 				dev_warn(&nq->pdev->dev,
 					 "cqn - type 0x%x not handled\n", type);
+			cq->cnq_events++;
 			spin_unlock_bh(&cq->compl_lock);
 			break;
 		}
@@ -316,6 +385,7 @@ static void bnxt_qplib_service_nq(unsigned long data)
 		hwq->cons = raw_cons;
 		bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true);
 	}
+	spin_unlock_bh(&hwq->lock);
 }
 
 static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
@@ -2003,6 +2073,7 @@ int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
 	struct cmdq_destroy_cq req;
 	struct creq_destroy_cq_resp resp;
+	u16 total_cnq_events;
 	u16 cmd_flags = 0;
 	int rc;
 
@@ -2013,6 +2084,8 @@ int bnxt_qplib_destroy_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 					  (void *)&resp, NULL, 0);
 	if (rc)
 		return rc;
+	total_cnq_events = le16_to_cpu(resp.total_cnq_events);
+	__wait_for_all_nqes(cq, total_cnq_events);
 	bnxt_qplib_free_hwq(res, &cq->hwq);
 	return 0;
 }
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 9e8d1c5c3f4a..7edb70b6bb16 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -402,6 +402,7 @@ struct bnxt_qplib_cq {
  * of the same QP while manipulating the flush list.
  */
 	spinlock_t			flush_lock; /* QP flush management */
+	u16				cnq_events;
 };
 
 #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE	sizeof(struct xrrq_irrq)

From 2f49de21f3e96d869d00659581394f106fa66371 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 17 Mar 2020 11:55:23 +0800
Subject: [PATCH 215/327] RDMA/hns: Optimize mhop get flow for multi-hop
 addressing

Splits hns_roce_table_mhop_get() into 4 sub-functions to make the code flow
clearer.

Link: https://lore.kernel.org/r/1584417324-2255-2-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.c | 311 ++++++++++++++---------
 1 file changed, 189 insertions(+), 122 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index 8380d7187494..cc557f126c82 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -39,6 +39,16 @@
 #define DMA_ADDR_T_SHIFT		12
 #define BT_BA_SHIFT			32
 
+#define HEM_INDEX_BUF			BIT(0)
+#define HEM_INDEX_L0			BIT(1)
+#define HEM_INDEX_L1			BIT(2)
+struct hns_roce_hem_index {
+	u64 buf;
+	u64 l0;
+	u64 l1;
+	u32 inited; /* indicate which index is available */
+};
+
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 {
 	int hop_num = 0;
@@ -434,178 +444,235 @@ static int hns_roce_set_hem(struct hns_roce_dev *hr_dev,
 	return ret;
 }
 
-static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_hem_table *table,
-				   unsigned long obj)
+static int calc_hem_config(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long obj,
+			   struct hns_roce_hem_mhop *mhop,
+			   struct hns_roce_hem_index *index)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_hem_mhop mhop;
-	struct hns_roce_hem_iter iter;
-	u32 buf_chunk_size;
-	u32 bt_chunk_size;
-	u32 chunk_ba_num;
-	u32 hop_num;
-	u32 size;
-	u32 bt_num;
-	u64 hem_idx;
-	u64 bt_l1_idx = 0;
-	u64 bt_l0_idx = 0;
-	u64 bt_ba;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	unsigned long mhop_obj = obj;
-	int bt_l1_allocated = 0;
-	int bt_l0_allocated = 0;
-	int step_idx;
+	u32 l0_idx, l1_idx, l2_idx;
+	u32 chunk_ba_num;
+	u32 bt_num;
 	int ret;
 
-	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
+	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, mhop);
 	if (ret)
 		return ret;
 
-	buf_chunk_size = mhop.buf_chunk_size;
-	bt_chunk_size = mhop.bt_chunk_size;
-	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
-
-	bt_num = hns_roce_get_bt_num(table->type, hop_num);
+	l0_idx = mhop->l0_idx;
+	l1_idx = mhop->l1_idx;
+	l2_idx = mhop->l2_idx;
+	chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+	bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num);
 	switch (bt_num) {
 	case 3:
-		hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			  mhop.l1_idx * chunk_ba_num + mhop.l2_idx;
-		bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		bt_l0_idx = mhop.l0_idx;
+		index->l1 = l0_idx * chunk_ba_num + l1_idx;
+		index->l0 = l0_idx;
+		index->buf = l0_idx * chunk_ba_num * chunk_ba_num +
+			     l1_idx * chunk_ba_num + l2_idx;
 		break;
 	case 2:
-		hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		bt_l0_idx = mhop.l0_idx;
+		index->l0 = l0_idx;
+		index->buf = l0_idx * chunk_ba_num + l1_idx;
 		break;
 	case 1:
-		hem_idx = mhop.l0_idx;
+		index->buf = l0_idx;
 		break;
 	default:
-		dev_err(dev, "Table %d not support hop_num = %d!\n",
-			     table->type, hop_num);
+		ibdev_err(ibdev, "Table %d not support mhop.hop_num = %d!\n",
+			  table->type, mhop->hop_num);
 		return -EINVAL;
 	}
 
-	if (unlikely(hem_idx >= table->num_hem)) {
-		dev_err(dev, "Table %d exceed hem limt idx = %llu,max = %lu!\n",
-			     table->type, hem_idx, table->num_hem);
+	if (unlikely(index->buf >= table->num_hem)) {
+		ibdev_err(ibdev, "Table %d exceed hem limt idx %llu,max %lu!\n",
+			  table->type, index->buf, table->num_hem);
 		return -EINVAL;
 	}
 
-	mutex_lock(&table->mutex);
+	return 0;
+}
 
-	if (table->hem[hem_idx]) {
-		++table->hem[hem_idx]->refcount;
-		goto out;
+static void free_mhop_hem(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_hem_table *table,
+			  struct hns_roce_hem_mhop *mhop,
+			  struct hns_roce_hem_index *index)
+{
+	u32 bt_size = mhop->bt_chunk_size;
+	struct device *dev = hr_dev->dev;
+
+	if (index->inited & HEM_INDEX_BUF) {
+		hns_roce_free_hem(hr_dev, table->hem[index->buf]);
+		table->hem[index->buf] = NULL;
 	}
 
+	if (index->inited & HEM_INDEX_L1) {
+		dma_free_coherent(dev, bt_size, table->bt_l1[index->l1],
+				  table->bt_l1_dma_addr[index->l1]);
+		table->bt_l1[index->l1] = NULL;
+	}
+
+	if (index->inited & HEM_INDEX_L0) {
+		dma_free_coherent(dev, bt_size, table->bt_l0[index->l0],
+				  table->bt_l0_dma_addr[index->l0]);
+		table->bt_l0[index->l0] = NULL;
+	}
+}
+
+static int alloc_mhop_hem(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_hem_table *table,
+			  struct hns_roce_hem_mhop *mhop,
+			  struct hns_roce_hem_index *index)
+{
+	u32 bt_size = mhop->bt_chunk_size;
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_hem_iter iter;
+	gfp_t flag;
+	u64 bt_ba;
+	u32 size;
+	int ret;
+
 	/* alloc L1 BA's chunk */
-	if ((check_whether_bt_num_3(table->type, hop_num) ||
-		check_whether_bt_num_2(table->type, hop_num)) &&
-		!table->bt_l0[bt_l0_idx]) {
-		table->bt_l0[bt_l0_idx] = dma_alloc_coherent(dev, bt_chunk_size,
-					    &(table->bt_l0_dma_addr[bt_l0_idx]),
+	if ((check_whether_bt_num_3(table->type, mhop->hop_num) ||
+	     check_whether_bt_num_2(table->type, mhop->hop_num)) &&
+	     !table->bt_l0[index->l0]) {
+		table->bt_l0[index->l0] = dma_alloc_coherent(dev, bt_size,
+					    &table->bt_l0_dma_addr[index->l0],
 					    GFP_KERNEL);
-		if (!table->bt_l0[bt_l0_idx]) {
+		if (!table->bt_l0[index->l0]) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		bt_l0_allocated = 1;
-
-		/* set base address to hardware */
-		if (table->type < HEM_TYPE_MTT) {
-			step_idx = 0;
-			if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-				ret = -ENODEV;
-				dev_err(dev, "set HEM base address to HW failed!\n");
-				goto err_dma_alloc_l1;
-			}
-		}
+		index->inited |= HEM_INDEX_L0;
 	}
 
 	/* alloc L2 BA's chunk */
-	if (check_whether_bt_num_3(table->type, hop_num) &&
-	    !table->bt_l1[bt_l1_idx])  {
-		table->bt_l1[bt_l1_idx] = dma_alloc_coherent(dev, bt_chunk_size,
-					    &(table->bt_l1_dma_addr[bt_l1_idx]),
+	if (check_whether_bt_num_3(table->type, mhop->hop_num) &&
+	    !table->bt_l1[index->l1])  {
+		table->bt_l1[index->l1] = dma_alloc_coherent(dev, bt_size,
+					    &table->bt_l1_dma_addr[index->l1],
 					    GFP_KERNEL);
-		if (!table->bt_l1[bt_l1_idx]) {
+		if (!table->bt_l1[index->l1]) {
 			ret = -ENOMEM;
-			goto err_dma_alloc_l1;
-		}
-		bt_l1_allocated = 1;
-		*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) =
-					       table->bt_l1_dma_addr[bt_l1_idx];
-
-		/* set base address to hardware */
-		step_idx = 1;
-		if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-			ret = -ENODEV;
-			dev_err(dev, "set HEM base address to HW failed!\n");
-			goto err_alloc_hem_buf;
+			goto err_alloc_hem;
 		}
+		index->inited |= HEM_INDEX_L1;
+		*(table->bt_l0[index->l0] + mhop->l1_idx) =
+					       table->bt_l1_dma_addr[index->l1];
 	}
 
 	/*
 	 * alloc buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
 	 * alloc bt space chunk for MTT/CQE.
 	 */
-	size = table->type < HEM_TYPE_MTT ? buf_chunk_size : bt_chunk_size;
-	table->hem[hem_idx] = hns_roce_alloc_hem(hr_dev,
-						size >> PAGE_SHIFT,
-						size,
-						(table->lowmem ? GFP_KERNEL :
-						GFP_HIGHUSER) | __GFP_NOWARN);
-	if (!table->hem[hem_idx]) {
+	size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : bt_size;
+	flag = (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN;
+	table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size >> PAGE_SHIFT,
+						    size, flag);
+	if (!table->hem[index->buf]) {
 		ret = -ENOMEM;
-		goto err_alloc_hem_buf;
+		goto err_alloc_hem;
 	}
 
-	hns_roce_hem_first(table->hem[hem_idx], &iter);
+	index->inited |= HEM_INDEX_BUF;
+	hns_roce_hem_first(table->hem[index->buf], &iter);
 	bt_ba = hns_roce_hem_addr(&iter);
-
 	if (table->type < HEM_TYPE_MTT) {
-		if (hop_num == 2) {
-			*(table->bt_l1[bt_l1_idx] + mhop.l2_idx) = bt_ba;
-			step_idx = 2;
-		} else if (hop_num == 1) {
-			*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba;
-			step_idx = 1;
-		} else if (hop_num == HNS_ROCE_HOP_NUM_0) {
-			step_idx = 0;
-		} else {
-			ret = -EINVAL;
-			goto err_dma_alloc_l1;
-		}
-
-		/* set HEM base address to hardware */
-		if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-			ret = -ENODEV;
-			dev_err(dev, "set HEM base address to HW failed!\n");
-			goto err_alloc_hem_buf;
-		}
-	} else if (hop_num == 2) {
-		*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba;
+		if (mhop->hop_num == 2)
+			*(table->bt_l1[index->l1] + mhop->l2_idx) = bt_ba;
+		else if (mhop->hop_num == 1)
+			*(table->bt_l0[index->l0] + mhop->l1_idx) = bt_ba;
+	} else if (mhop->hop_num == 2) {
+		*(table->bt_l0[index->l0] + mhop->l1_idx) = bt_ba;
 	}
 
-	++table->hem[hem_idx]->refcount;
+	return 0;
+err_alloc_hem:
+	free_mhop_hem(hr_dev, table, mhop, index);
+out:
+	return ret;
+}
+
+static int set_mhop_hem(struct hns_roce_dev *hr_dev,
+			struct hns_roce_hem_table *table, unsigned long obj,
+			struct hns_roce_hem_mhop *mhop,
+			struct hns_roce_hem_index *index)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int step_idx;
+	int ret;
+
+	if (index->inited & HEM_INDEX_L0) {
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM step 0 failed!\n");
+			goto out;
+		}
+	}
+
+	if (index->inited & HEM_INDEX_L1) {
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM step 1 failed!\n");
+			goto out;
+		}
+	}
+
+	if (index->inited & HEM_INDEX_BUF) {
+		if (mhop->hop_num == HNS_ROCE_HOP_NUM_0)
+			step_idx = 0;
+		else
+			step_idx = mhop->hop_num;
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx);
+		if (ret)
+			ibdev_err(ibdev, "set HEM step last failed!\n");
+	}
+out:
+	return ret;
+}
+
+static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_hem_table *table,
+				   unsigned long obj)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_hem_index index = {};
+	struct hns_roce_hem_mhop mhop = {};
+	int ret;
+
+	ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "calc hem config failed!\n");
+		return ret;
+	}
+
+	mutex_lock(&table->mutex);
+	if (table->hem[index.buf]) {
+		++table->hem[index.buf]->refcount;
+		goto out;
+	}
+
+	ret = alloc_mhop_hem(hr_dev, table, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "alloc mhop hem failed!\n");
+		goto out;
+	}
+
+	/* set HEM base address to hardware */
+	if (table->type < HEM_TYPE_MTT) {
+		ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM address to HW failed!\n");
+			goto err_alloc;
+		}
+	}
+
+	++table->hem[index.buf]->refcount;
 	goto out;
 
-err_alloc_hem_buf:
-	if (bt_l1_allocated) {
-		dma_free_coherent(dev, bt_chunk_size, table->bt_l1[bt_l1_idx],
-				  table->bt_l1_dma_addr[bt_l1_idx]);
-		table->bt_l1[bt_l1_idx] = NULL;
-	}
-
-err_dma_alloc_l1:
-	if (bt_l0_allocated) {
-		dma_free_coherent(dev, bt_chunk_size, table->bt_l0[bt_l0_idx],
-				  table->bt_l0_dma_addr[bt_l0_idx]);
-		table->bt_l0[bt_l0_idx] = NULL;
-	}
-
+err_alloc:
+	free_mhop_hem(hr_dev, table, &mhop, &index);
 out:
 	mutex_unlock(&table->mutex);
 	return ret;

From 38dcb35048fdc9b11530b23c092b250b2e8135a1 Mon Sep 17 00:00:00 2001
From: Xi Wang <wangxi11@huawei.com>
Date: Tue, 17 Mar 2020 11:55:24 +0800
Subject: [PATCH 216/327] RDMA/hns: Optimize mhop put flow for multi-hop
 addressing

Optimizes hns_roce_table_mhop_get() by encapsulating code about clearing
hem into clear_mhop_hem(), which will make the code flow clearer.

Link: https://lore.kernel.org/r/1584417324-2255-3-git-send-email-liweihang@huawei.com
Signed-off-by: Xi Wang <wangxi11@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.c | 161 +++++++++--------------
 1 file changed, 61 insertions(+), 100 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index cc557f126c82..c96378718f88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -94,25 +94,27 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 	return hop_num ? true : false;
 }
 
-static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx,
-			    u32 bt_chunk_num, u64 hem_max_num)
+static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 hem_idx,
+				    u32 bt_chunk_num, u64 hem_max_num)
 {
+	u64 start_idx = round_down(hem_idx, bt_chunk_num);
 	u64 check_max_num = start_idx + bt_chunk_num;
 	u64 i;
 
 	for (i = start_idx; (i < check_max_num) && (i < hem_max_num); i++)
-		if (hem[i])
+		if (i != hem_idx && hem[i])
 			return false;
 
 	return true;
 }
 
-static bool hns_roce_check_bt_null(u64 **bt, u64 start_idx, u32 bt_chunk_num)
+static bool hns_roce_check_bt_null(u64 **bt, u64 ba_idx, u32 bt_chunk_num)
 {
+	u64 start_idx = round_down(ba_idx, bt_chunk_num);
 	int i;
 
 	for (i = 0; i < bt_chunk_num; i++)
-		if (bt[start_idx + i])
+		if (i != ba_idx && bt[start_idx + i])
 			return false;
 
 	return true;
@@ -723,116 +725,75 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev,
 	return ret;
 }
 
+static void clear_mhop_hem(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long obj,
+			   struct hns_roce_hem_mhop *mhop,
+			   struct hns_roce_hem_index *index)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u32 hop_num = mhop->hop_num;
+	u32 chunk_ba_num;
+	int step_idx;
+
+	index->inited = HEM_INDEX_BUF;
+	chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+	if (check_whether_bt_num_2(table->type, hop_num)) {
+		if (hns_roce_check_hem_null(table->hem, index->buf,
+					    chunk_ba_num, table->num_hem))
+			index->inited |= HEM_INDEX_L0;
+	} else if (check_whether_bt_num_3(table->type, hop_num)) {
+		if (hns_roce_check_hem_null(table->hem, index->buf,
+					    chunk_ba_num, table->num_hem)) {
+			index->inited |= HEM_INDEX_L1;
+			if (hns_roce_check_bt_null(table->bt_l1, index->l1,
+						   chunk_ba_num))
+				index->inited |= HEM_INDEX_L0;
+		}
+	}
+
+	if (table->type < HEM_TYPE_MTT) {
+		if (hop_num == HNS_ROCE_HOP_NUM_0)
+			step_idx = 0;
+		else
+			step_idx = hop_num;
+
+		if (hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx))
+			ibdev_warn(ibdev, "Clear hop%d HEM failed.\n", hop_num);
+
+		if (index->inited & HEM_INDEX_L1)
+			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
+				ibdev_warn(ibdev, "Clear HEM step 1 failed.\n");
+
+		if (index->inited & HEM_INDEX_L0)
+			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
+				ibdev_warn(ibdev, "Clear HEM step 0 failed.\n");
+	}
+}
+
 static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
 				    struct hns_roce_hem_table *table,
 				    unsigned long obj,
 				    int check_refcount)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_hem_mhop mhop;
-	unsigned long mhop_obj = obj;
-	u32 bt_chunk_size;
-	u32 chunk_ba_num;
-	u32 hop_num;
-	u32 start_idx;
-	u32 bt_num;
-	u64 hem_idx;
-	u64 bt_l1_idx = 0;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_hem_index index = {};
+	struct hns_roce_hem_mhop mhop = {};
 	int ret;
 
-	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
-	if (ret)
-		return;
-
-	bt_chunk_size = mhop.bt_chunk_size;
-	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
-
-	bt_num = hns_roce_get_bt_num(table->type, hop_num);
-	switch (bt_num) {
-	case 3:
-		hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			  mhop.l1_idx * chunk_ba_num + mhop.l2_idx;
-		bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		break;
-	case 2:
-		hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		break;
-	case 1:
-		hem_idx = mhop.l0_idx;
-		break;
-	default:
-		dev_err(dev, "Table %d not support hop_num = %d!\n",
-			     table->type, hop_num);
+	ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "calc hem config failed!\n");
 		return;
 	}
 
 	mutex_lock(&table->mutex);
-
-	if (check_refcount && (--table->hem[hem_idx]->refcount > 0)) {
+	if (check_refcount && (--table->hem[index.buf]->refcount > 0)) {
 		mutex_unlock(&table->mutex);
 		return;
 	}
 
-	if (table->type < HEM_TYPE_MTT && hop_num == 1) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	} else if (table->type < HEM_TYPE_MTT && hop_num == 2) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 2))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	} else if (table->type < HEM_TYPE_MTT &&
-		   hop_num == HNS_ROCE_HOP_NUM_0) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	}
-
-	/*
-	 * free buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
-	 * free bt space chunk for MTT/CQE.
-	 */
-	hns_roce_free_hem(hr_dev, table->hem[hem_idx]);
-	table->hem[hem_idx] = NULL;
-
-	if (check_whether_bt_num_2(table->type, hop_num)) {
-		start_idx = mhop.l0_idx * chunk_ba_num;
-		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num, table->num_hem)) {
-			if (table->type < HEM_TYPE_MTT &&
-			    hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
-				dev_warn(dev, "Clear HEM base address failed.\n");
-
-			dma_free_coherent(dev, bt_chunk_size,
-					  table->bt_l0[mhop.l0_idx],
-					  table->bt_l0_dma_addr[mhop.l0_idx]);
-			table->bt_l0[mhop.l0_idx] = NULL;
-		}
-	} else if (check_whether_bt_num_3(table->type, hop_num)) {
-		start_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			    mhop.l1_idx * chunk_ba_num;
-		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num, table->num_hem)) {
-			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
-				dev_warn(dev, "Clear HEM base address failed.\n");
-
-			dma_free_coherent(dev, bt_chunk_size,
-					  table->bt_l1[bt_l1_idx],
-					  table->bt_l1_dma_addr[bt_l1_idx]);
-			table->bt_l1[bt_l1_idx] = NULL;
-
-			start_idx = mhop.l0_idx * chunk_ba_num;
-			if (hns_roce_check_bt_null(table->bt_l1, start_idx,
-						   chunk_ba_num)) {
-				if (hr_dev->hw->clear_hem(hr_dev, table, obj,
-							  0))
-					dev_warn(dev, "Clear HEM base address failed.\n");
-
-				dma_free_coherent(dev, bt_chunk_size,
-					    table->bt_l0[mhop.l0_idx],
-					    table->bt_l0_dma_addr[mhop.l0_idx]);
-				table->bt_l0[mhop.l0_idx] = NULL;
-			}
-		}
-	}
+	clear_mhop_hem(hr_dev, table, obj, &mhop, &index);
+	free_mhop_hem(hr_dev, table, &mhop, &index);
 
 	mutex_unlock(&table->mutex);
 }

From 1f3db161881b7e21efb149e0ae8152b79a571a8f Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Wed, 18 Mar 2020 12:03:23 +0200
Subject: [PATCH 217/327] IB/mlx5: Generally use the WC auto detection test
 result

Now that we have direct and reliable detection of WC support by the
system, use is broadly. The only case we have to worry about is when the
WC autodetector cannot run.

For this fringe case generally assume that that WC is available, except in
the well defined case of no PAT support on x86 which is tested by calling
arch_can_pci_mmap_wc().

If WC is wrongly assumed to be available then it causes a small
performance hit on paths in userspace that are tuned to the assumption
that WC is available. There is no functional loss.

It is very unlikely that any platforms exist that lack WC and also care
about the micro optimization of WC in the fringe case where autodetection
does not work.

By removing the fairly bogus CONFIG tests this makes WC work broadly on
all arches and all platforms.

Link: https://lore.kernel.org/r/20200318100323.46659-1-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c | 16 ++++------------
 drivers/infiniband/hw/mlx5/mem.c  |  2 +-
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b7151f8f8fa5..1f91d9e543e2 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -39,9 +39,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/bitmap.h>
-#if defined(CONFIG_X86)
-#include <asm/memtype.h>
-#endif
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task.h>
@@ -2145,14 +2142,6 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 	switch (cmd) {
 	case MLX5_IB_MMAP_WC_PAGE:
 	case MLX5_IB_MMAP_ALLOC_WC:
-/* Some architectures don't support WC memory */
-#if defined(CONFIG_X86)
-		if (!pat_enabled())
-			return -EPERM;
-#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
-			return -EPERM;
-#endif
-	/* fall through */
 	case MLX5_IB_MMAP_REGULAR_PAGE:
 		/* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
 		prot = pgprot_writecombine(vma->vm_page_prot);
@@ -2298,9 +2287,12 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
 	command = get_command(vma->vm_pgoff);
 	switch (command) {
 	case MLX5_IB_MMAP_WC_PAGE:
+	case MLX5_IB_MMAP_ALLOC_WC:
+		if (!dev->wc_support)
+			return -EPERM;
+		fallthrough;
 	case MLX5_IB_MMAP_NC_PAGE:
 	case MLX5_IB_MMAP_REGULAR_PAGE:
-	case MLX5_IB_MMAP_ALLOC_WC:
 		return uar_mmap(dev, command, vma, context);
 
 	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index b90a3649e7d1..c19ec9fd8a63 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -316,7 +316,7 @@ int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
 	if (!dev->mdev->roce.roce_en &&
 	    port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
 		if (mlx5_core_is_pf(dev->mdev))
-			dev->wc_support = true;
+			dev->wc_support = arch_can_pci_mmap_wc();
 		return 0;
 	}
 

From 2ca7be7d55ad84fb0f7c2a23fb700a28fd76b19a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Mar 2020 10:00:07 -0500
Subject: [PATCH 218/327] exec: Only compute current once in flush_old_exec

Make it clear that current only needs to be computed once in
flush_old_exec.  This may have some efficiency improvements and it
makes the code easier to change.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index db17be51b112..c3f34791f2f0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1260,13 +1260,14 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
  */
 int flush_old_exec(struct linux_binprm * bprm)
 {
+	struct task_struct *me = current;
 	int retval;
 
 	/*
 	 * Make sure we have a private signal table and that
 	 * we are unassociated from the previous thread group.
 	 */
-	retval = de_thread(current);
+	retval = de_thread(me);
 	if (retval)
 		goto out;
 
@@ -1294,10 +1295,10 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;
 
 	set_fs(USER_DS);
-	current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);
 	flush_thread();
-	current->personality &= ~bprm->per_clear;
+	me->personality &= ~bprm->per_clear;
 
 	/*
 	 * We have to apply CLOEXEC before we change whether the process is
@@ -1305,7 +1306,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 * trying to access the should-be-closed file descriptors of a process
 	 * undergoing exec(2).
 	 */
-	do_close_on_exec(current->files);
+	do_close_on_exec(me->files);
 	return 0;
 
 out:

From 021691559245498dfa15454c9fc4351f367d0b7f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Mar 2020 10:00:21 -0500
Subject: [PATCH 219/327] exec: Factor unshare_sighand out of de_thread and
 call it separately

This makes the code clearer and makes it easier to implement a mutex
that is not taken over any locations that may block indefinitely waiting
for userspace.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index c3f34791f2f0..ff74b9a74d34 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1194,6 +1194,23 @@ static int de_thread(struct task_struct *tsk)
 	flush_itimer_signals();
 #endif
 
+	BUG_ON(!thread_group_leader(tsk));
+	return 0;
+
+killed:
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exit_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+	return -EAGAIN;
+}
+
+
+static int unshare_sighand(struct task_struct *me)
+{
+	struct sighand_struct *oldsighand = me->sighand;
+
 	if (refcount_read(&oldsighand->count) != 1) {
 		struct sighand_struct *newsighand;
 		/*
@@ -1210,23 +1227,13 @@ static int de_thread(struct task_struct *tsk)
 
 		write_lock_irq(&tasklist_lock);
 		spin_lock(&oldsighand->siglock);
-		rcu_assign_pointer(tsk->sighand, newsighand);
+		rcu_assign_pointer(me->sighand, newsighand);
 		spin_unlock(&oldsighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 
 		__cleanup_sighand(oldsighand);
 	}
-
-	BUG_ON(!thread_group_leader(tsk));
 	return 0;
-
-killed:
-	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
-	sig->group_exit_task = NULL;
-	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
-	return -EAGAIN;
 }
 
 char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
@@ -1264,13 +1271,19 @@ int flush_old_exec(struct linux_binprm * bprm)
 	int retval;
 
 	/*
-	 * Make sure we have a private signal table and that
-	 * we are unassociated from the previous thread group.
+	 * Make this the only thread in the thread group.
 	 */
 	retval = de_thread(me);
 	if (retval)
 		goto out;
 
+	/*
+	 * Make the signal table private.
+	 */
+	retval = unshare_sighand(me);
+	if (retval)
+		goto out;
+
 	/*
 	 * Must be called _before_ exec_mmap() as bprm->mm is
 	 * not visibile until then. This also enables the update

From 153ffb6ba49fd80dc607a9f230415af02b728d70 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Mar 2020 10:00:38 -0500
Subject: [PATCH 220/327] exec: Move cleanup of posix timers on exec out of
 de_thread

These functions have very little to do with de_thread move them out
of de_thread an into flush_old_exec proper so it can be more clearly
seen what flush_old_exec is doing.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index ff74b9a74d34..215d86f77b63 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1189,11 +1189,6 @@ static int de_thread(struct task_struct *tsk)
 	/* we have changed execution domain */
 	tsk->exit_signal = SIGCHLD;
 
-#ifdef CONFIG_POSIX_TIMERS
-	exit_itimers(sig);
-	flush_itimer_signals();
-#endif
-
 	BUG_ON(!thread_group_leader(tsk));
 	return 0;
 
@@ -1277,6 +1272,11 @@ int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
+#ifdef CONFIG_POSIX_TIMERS
+	exit_itimers(me->signal);
+	flush_itimer_signals();
+#endif
+
 	/*
 	 * Make the signal table private.
 	 */

From ccf0fa6be02687868006fa0c85af32c40cae6fb6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Mar 2020 10:03:21 -0500
Subject: [PATCH 221/327] exec: Move exec_mmap right after de_thread in
 flush_old_exec

I have read through the code in exec_mmap and I do not see anything
that depends on sighand or the sighand lock, or on signals in anyway
so this should be safe.

This rearrangement of code has two significant benefits.  It makes
the determination of passing the point of no return by testing bprm->mm
accurate.  All failures prior to that point in flush_old_exec are
either truly recoverable or they are fatal.

Further this consolidates all of the possible indefinite waits for
userspace together at the top of flush_old_exec.  The possible wait
for a ptracer on PTRACE_EVENT_EXIT, the possible wait for a page fault
to be resolved in clear_child_tid, and the possible wait for a page
fault in exit_robust_list.

This consolidation allows the creation of a mutex to replace
cred_guard_mutex that is not held over possible indefinite userspace
waits.  Which will allow removing deadlock scenarios from the kernel.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 215d86f77b63..d820a7272a76 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1272,18 +1272,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 	if (retval)
 		goto out;
 
-#ifdef CONFIG_POSIX_TIMERS
-	exit_itimers(me->signal);
-	flush_itimer_signals();
-#endif
-
-	/*
-	 * Make the signal table private.
-	 */
-	retval = unshare_sighand(me);
-	if (retval)
-		goto out;
-
 	/*
 	 * Must be called _before_ exec_mmap() as bprm->mm is
 	 * not visibile until then. This also enables the update
@@ -1307,6 +1295,18 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 */
 	bprm->mm = NULL;
 
+#ifdef CONFIG_POSIX_TIMERS
+	exit_itimers(me->signal);
+	flush_itimer_signals();
+#endif
+
+	/*
+	 * Make the signal table private.
+	 */
+	retval = unshare_sighand(me);
+	if (retval)
+		goto out;
+
 	set_fs(USER_DS);
 	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);

From eea9673250db4e854e9998ef9da6d4584857f0ea Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 25 Mar 2020 10:03:36 -0500
Subject: [PATCH 222/327] exec: Add exec_update_mutex to replace
 cred_guard_mutex

The cred_guard_mutex is problematic as it is held over possibly
indefinite waits for userspace.  The possible indefinite waits for
userspace that I have identified are: The cred_guard_mutex is held in
PTRACE_EVENT_EXIT waiting for the tracer.  The cred_guard_mutex is
held over "put_user(0, tsk->clear_child_tid)" in exit_mm().  The
cred_guard_mutex is held over "get_user(futex_offset, ...")  in
exit_robust_list.  The cred_guard_mutex held over copy_strings.

The functions get_user and put_user can trigger a page fault which can
potentially wait indefinitely in the case of userfaultfd or if
userspace implements part of the page fault path.

In any of those cases the userspace process that the kernel is waiting
for might make a different system call that winds up taking the
cred_guard_mutex and result in deadlock.

Holding a mutex over any of those possibly indefinite waits for
userspace does not appear necessary.  Add exec_update_mutex that will
just cover updating the process during exec where the permissions and
the objects pointed to by the task struct may be out of sync.

The plan is to switch the users of cred_guard_mutex to
exec_update_mutex one by one.  This lets us move forward while still
being careful and not introducing any regressions.

Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/
Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/
Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/
Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/
Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/
Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/exec.c                    | 22 +++++++++++++++++++---
 include/linux/binfmts.h      |  8 +++++++-
 include/linux/sched/signal.h |  9 ++++++++-
 init/init_task.c             |  1 +
 kernel/fork.c                |  1 +
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index d820a7272a76..0e46ec57fe0a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1010,16 +1010,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 }
 EXPORT_SYMBOL(read_code);
 
+/*
+ * Maps the mm_struct mm into the current task struct.
+ * On success, this function returns with the mutex
+ * exec_update_mutex locked.
+ */
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
 	struct mm_struct *old_mm, *active_mm;
+	int ret;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
 	exec_mm_release(tsk, old_mm);
 
+	ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
+	if (ret)
+		return ret;
+
 	if (old_mm) {
 		sync_mm_rss(old_mm);
 		/*
@@ -1031,9 +1041,11 @@ static int exec_mmap(struct mm_struct *mm)
 		down_read(&old_mm->mmap_sem);
 		if (unlikely(old_mm->core_state)) {
 			up_read(&old_mm->mmap_sem);
+			mutex_unlock(&tsk->signal->exec_update_mutex);
 			return -EINTR;
 		}
 	}
+
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	membarrier_exec_mmap(mm);
@@ -1288,11 +1300,12 @@ int flush_old_exec(struct linux_binprm * bprm)
 		goto out;
 
 	/*
-	 * After clearing bprm->mm (to mark that current is using the
-	 * prepared mm now), we have nothing left of the original
+	 * After setting bprm->called_exec_mmap (to mark that current is
+	 * using the prepared mm now), we have nothing left of the original
 	 * process. If anything from here on returns an error, the check
 	 * in search_binary_handler() will SEGV current.
 	 */
+	bprm->called_exec_mmap = 1;
 	bprm->mm = NULL;
 
 #ifdef CONFIG_POSIX_TIMERS
@@ -1438,6 +1451,8 @@ static void free_bprm(struct linux_binprm *bprm)
 {
 	free_arg_pages(bprm);
 	if (bprm->cred) {
+		if (bprm->called_exec_mmap)
+			mutex_unlock(&current->signal->exec_update_mutex);
 		mutex_unlock(&current->signal->cred_guard_mutex);
 		abort_creds(bprm->cred);
 	}
@@ -1487,6 +1502,7 @@ void install_exec_creds(struct linux_binprm *bprm)
 	 * credentials; any time after this it may be unlocked.
 	 */
 	security_bprm_committed_creds(bprm);
+	mutex_unlock(&current->signal->exec_update_mutex);
 	mutex_unlock(&current->signal->cred_guard_mutex);
 }
 EXPORT_SYMBOL(install_exec_creds);
@@ -1678,7 +1694,7 @@ int search_binary_handler(struct linux_binprm *bprm)
 
 		read_lock(&binfmt_lock);
 		put_binfmt(fmt);
-		if (retval < 0 && !bprm->mm) {
+		if (retval < 0 && bprm->called_exec_mmap) {
 			/* we got to flush_old_exec() and failed after it */
 			read_unlock(&binfmt_lock);
 			force_sigsegv(SIGSEGV);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index b40fc633f3be..a345d9fed3d8 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -44,7 +44,13 @@ struct linux_binprm {
 		 * exec has happened. Used to sanitize execution environment
 		 * and to set AT_SECURE auxv for glibc.
 		 */
-		secureexec:1;
+		secureexec:1,
+		/*
+		 * Set by flush_old_exec, when exec_mmap has been called.
+		 * This is past the point of no return, when the
+		 * exec_update_mutex has been taken.
+		 */
+		called_exec_mmap:1;
 #ifdef __alpha__
 	unsigned int taso:1;
 #endif
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 88050259c466..a29df79540ce 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -224,7 +224,14 @@ struct signal_struct {
 
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
-					 * (notably. ptrace) */
+					 * (notably. ptrace)
+					 * Deprecated do not use in new code.
+					 * Use exec_update_mutex instead.
+					 */
+	struct mutex exec_update_mutex;	/* Held while task_struct is being
+					 * updated during exec, and may have
+					 * inconsistent permissions.
+					 */
 } __randomize_layout;
 
 /*
diff --git a/init/init_task.c b/init/init_task.c
index 9e5cbe5eab7b..bd403ed3e418 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -26,6 +26,7 @@ static struct signal_struct init_signals = {
 	.multiprocess	= HLIST_HEAD_INIT,
 	.rlim		= INIT_RLIMITS,
 	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
+	.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
 #ifdef CONFIG_POSIX_TIMERS
 	.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
 	.cputimer	= {
diff --git a/kernel/fork.c b/kernel/fork.c
index 60a1295f4384..12896a6ecee6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
+	mutex_init(&sig->exec_update_mutex);
 
 	return 0;
 }

From 3e74fabd39710ee29fa25618d2c2b40cfa7d76c7 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:26:04 +0100
Subject: [PATCH 223/327] exec: Fix a deadlock in strace

This fixes a deadlock in the tracer when tracing a multi-threaded
application that calls execve while more than one thread are running.

I observed that when running strace on the gcc test suite, it always
blocks after a while, when expect calls execve, because other threads
have to be terminated.  They send ptrace events, but the strace is no
longer able to respond, since it is blocked in vm_access.

The deadlock is always happening when strace needs to access the
tracees process mmap, while another thread in the tracee starts to
execve a child process, but that cannot continue until the
PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received:

strace          D    0 30614  30584 0x00000000
Call Trace:
__schedule+0x3ce/0x6e0
schedule+0x5c/0xd0
schedule_preempt_disabled+0x15/0x20
__mutex_lock.isra.13+0x1ec/0x520
__mutex_lock_killable_slowpath+0x13/0x20
mutex_lock_killable+0x28/0x30
mm_access+0x27/0xa0
process_vm_rw_core.isra.3+0xff/0x550
process_vm_rw+0xdd/0xf0
__x64_sys_process_vm_readv+0x31/0x40
do_syscall_64+0x64/0x220
entry_SYSCALL_64_after_hwframe+0x44/0xa9

expect          D    0 31933  30876 0x80004003
Call Trace:
__schedule+0x3ce/0x6e0
schedule+0x5c/0xd0
flush_old_exec+0xc4/0x770
load_elf_binary+0x35a/0x16c0
search_binary_handler+0x97/0x1d0
__do_execve_file.isra.40+0x5d4/0x8a0
__x64_sys_execve+0x49/0x60
do_syscall_64+0x64/0x220
entry_SYSCALL_64_after_hwframe+0x44/0xa9

This changes mm_access to use the new exec_update_mutex
instead of cred_guard_mutex.

This patch is based on the following patch by Eric W. Biederman:
"[PATCH 0/5] Infrastructure to allow fixing exec deadlocks"
Link: https://lore.kernel.org/lkml/87v9ne5y4y.fsf_-_@x220.int.ebiederm.org/

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 12896a6ecee6..e0668a79bca1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 	struct mm_struct *mm;
 	int err;
 
-	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
+	err =  mutex_lock_killable(&task->signal->exec_update_mutex);
 	if (err)
 		return ERR_PTR(err);
 
@@ -1234,7 +1234,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 		mmput(mm);
 		mm = ERR_PTR(-EACCES);
 	}
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&task->signal->exec_update_mutex);
 
 	return mm;
 }

From 2de4e82318c7f9d34f4b08599a612cd4cd10bf0b Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:26:19 +0100
Subject: [PATCH 224/327] selftests/ptrace: add test cases for dead-locks

This adds test cases for ptrace deadlocks.

Additionally fixes a compile problem in get_syscall_info.c,
observed with gcc-4.8.4:

get_syscall_info.c: In function 'get_syscall_info':
get_syscall_info.c:93:3: error: 'for' loop initial declarations are only
                                 allowed in C99 mode
   for (unsigned int i = 0; i < ARRAY_SIZE(args); ++i) {
   ^
get_syscall_info.c:93:3: note: use option -std=c99 or -std=gnu99 to compile
                               your code

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 tools/testing/selftests/ptrace/Makefile   |  4 +-
 tools/testing/selftests/ptrace/vmaccess.c | 86 +++++++++++++++++++++++
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/ptrace/vmaccess.c

diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
index c0b7f89f0930..2f1f532c39db 100644
--- a/tools/testing/selftests/ptrace/Makefile
+++ b/tools/testing/selftests/ptrace/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-CFLAGS += -iquote../../../../include/uapi -Wall
+CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
 
-TEST_GEN_PROGS := get_syscall_info peeksiginfo
+TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
 
 include ../lib.mk
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
new file mode 100644
index 000000000000..4db327b44586
--- /dev/null
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
+ * All rights reserved.
+ *
+ * Check whether /proc/$pid/mem can be accessed without causing deadlocks
+ * when de_thread is blocked with ->cred_guard_mutex held.
+ */
+
+#include "../kselftest_harness.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+
+static void *thread(void *arg)
+{
+	ptrace(PTRACE_TRACEME, 0, 0L, 0L);
+	return NULL;
+}
+
+TEST(vmaccess)
+{
+	int f, pid = fork();
+	char mm[64];
+
+	if (!pid) {
+		pthread_t pt;
+
+		pthread_create(&pt, NULL, thread, NULL);
+		pthread_join(pt, NULL);
+		execlp("true", "true", NULL);
+	}
+
+	sleep(1);
+	sprintf(mm, "/proc/%d/mem", pid);
+	f = open(mm, O_RDONLY);
+	ASSERT_GE(f, 0);
+	close(f);
+	f = kill(pid, SIGCONT);
+	ASSERT_EQ(f, 0);
+}
+
+TEST(attach)
+{
+	int s, k, pid = fork();
+
+	if (!pid) {
+		pthread_t pt;
+
+		pthread_create(&pt, NULL, thread, NULL);
+		pthread_join(pt, NULL);
+		execlp("sleep", "sleep", "2", NULL);
+	}
+
+	sleep(1);
+	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+	ASSERT_EQ(errno, EAGAIN);
+	ASSERT_EQ(k, -1);
+	k = waitpid(-1, &s, WNOHANG);
+	ASSERT_NE(k, -1);
+	ASSERT_NE(k, 0);
+	ASSERT_NE(k, pid);
+	ASSERT_EQ(WIFEXITED(s), 1);
+	ASSERT_EQ(WEXITSTATUS(s), 0);
+	sleep(1);
+	k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+	ASSERT_EQ(k, 0);
+	k = waitpid(-1, &s, 0);
+	ASSERT_EQ(k, pid);
+	ASSERT_EQ(WIFSTOPPED(s), 1);
+	ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
+	k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+	ASSERT_EQ(k, 0);
+	k = waitpid(-1, &s, 0);
+	ASSERT_EQ(k, pid);
+	ASSERT_EQ(WIFEXITED(s), 1);
+	ASSERT_EQ(WEXITSTATUS(s), 0);
+	k = waitpid(-1, NULL, 0);
+	ASSERT_EQ(k, -1);
+	ASSERT_EQ(errno, ECHILD);
+}
+
+TEST_HARNESS_MAIN

From 8d09db80ef404d8e2e143677e0716cfcd544e29b Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:26:34 +0100
Subject: [PATCH 225/327] mm: docs: Fix a comment in process_vm_rw_core

This removes a duplicate "a" in the comment in process_vm_rw_core.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 mm/process_vm_access.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index de41e830cdac..74e957e302fe 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -206,7 +206,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
 	if (!mm || IS_ERR(mm)) {
 		rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
 		/*
-		 * Explicitly map EACCES to EPERM as EPERM is a more a
+		 * Explicitly map EACCES to EPERM as EPERM is a more
 		 * appropriate error code for process_vw_readv/writev
 		 */
 		if (rc == -EACCES)

From aa884c11313656fee7b12972614b6333f154655c Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:26:50 +0100
Subject: [PATCH 226/327] kernel: doc: remove outdated comment cred.c

This removes an outdated comment in prepare_kernel_cred.

There is no "cred_replace_mutex" any more, so the comment must
go away.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/cred.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/cred.c b/kernel/cred.c
index 809a985b1793..71a792616917 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -675,8 +675,6 @@ void __init cred_init(void)
  * The caller may change these controls afterwards if desired.
  *
  * Returns the new credentials or NULL if out of memory.
- *
- * Does not take, and does not return holding current->cred_replace_mutex.
  */
 struct cred *prepare_kernel_cred(struct task_struct *daemon)
 {

From 454e3126cb842388e22df6b3ac3da44062c00765 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:27:05 +0100
Subject: [PATCH 227/327] kernel/kcmp.c: Use new infrastructure to fix
 deadlocks in execve

This changes kcmp_epoll_target to use the new exec_update_mutex
instead of cred_guard_mutex.

This should be safe, as the credentials are only used for reading,
and furthermore ->mm and ->sighand are updated on execve,
but only under the new exec_update_mutex.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/kcmp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index a0e3d7a0e8b8..b3ff9288c6cc 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 	/*
 	 * One should have enough rights to inspect task details.
 	 */
-	ret = kcmp_lock(&task1->signal->cred_guard_mutex,
-			&task2->signal->cred_guard_mutex);
+	ret = kcmp_lock(&task1->signal->exec_update_mutex,
+			&task2->signal->exec_update_mutex);
 	if (ret)
 		goto err;
 	if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
@@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 	}
 
 err_unlock:
-	kcmp_unlock(&task1->signal->cred_guard_mutex,
-		    &task2->signal->cred_guard_mutex);
+	kcmp_unlock(&task1->signal->exec_update_mutex,
+		    &task2->signal->exec_update_mutex);
 err:
 	put_task_struct(task1);
 	put_task_struct(task2);

From 2db9dbf71bf98d02a0bf33e798e5bfd2a9944696 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:27:24 +0100
Subject: [PATCH 228/327] proc: Use new infrastructure to fix deadlocks in
 execve

This changes lock_trace to use the new exec_update_mutex
instead of cred_guard_mutex.

This fixes possible deadlocks when the trace is accessing
/proc/$pid/stack for instance.

This should be safe, as the credentials are only used for reading,
and task->mm is updated on execve under the new exec_update_mutex.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e7efe9d6f3d6..a278fda0458b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -405,11 +405,11 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
 
 static int lock_trace(struct task_struct *task)
 {
-	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	int err = mutex_lock_killable(&task->signal->exec_update_mutex);
 	if (err)
 		return err;
 	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
-		mutex_unlock(&task->signal->cred_guard_mutex);
+		mutex_unlock(&task->signal->exec_update_mutex);
 		return -EPERM;
 	}
 	return 0;
@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)
 
 static void unlock_trace(struct task_struct *task)
 {
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&task->signal->exec_update_mutex);
 }
 
 #ifdef CONFIG_STACKTRACE

From 76518d3798855242817e8a8ed76b2d72f4415624 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:27:41 +0100
Subject: [PATCH 229/327] proc: io_accounting: Use new infrastructure to fix
 deadlocks in execve

This changes do_io_accounting to use the new exec_update_mutex
instead of cred_guard_mutex.

This fixes possible deadlocks when the trace is accessing
/proc/$pid/io for instance.

This should be safe, as the credentials are only used for reading.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a278fda0458b..74f948a6b621 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2883,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
 	unsigned long flags;
 	int result;
 
-	result = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	result = mutex_lock_killable(&task->signal->exec_update_mutex);
 	if (result)
 		return result;
 
@@ -2919,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
 	result = 0;
 
 out_unlock:
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&task->signal->exec_update_mutex);
 	return result;
 }
 

From 6914303824bb572278568330d72fc1f8f9814e67 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Fri, 20 Mar 2020 21:27:55 +0100
Subject: [PATCH 230/327] perf: Use new infrastructure to fix deadlocks in
 execve

This changes perf_event_set_clock to use the new exec_update_mutex
instead of cred_guard_mutex.

This should be safe, as the credentials are only used for reading.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/events/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..71cba8cfccbc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,7 +1249,7 @@ static void put_ctx(struct perf_event_context *ctx)
  * function.
  *
  * Lock order:
- *    cred_guard_mutex
+ *    exec_update_mutex
  *	task_struct::perf_event_mutex
  *	  perf_event_context::mutex
  *	    perf_event::child_mutex;
@@ -11263,14 +11263,14 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	if (task) {
-		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+		err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
 		if (err)
 			goto err_task;
 
 		/*
 		 * Reuse ptrace permission checks for now.
 		 *
-		 * We must hold cred_guard_mutex across this and any potential
+		 * We must hold exec_update_mutex across this and any potential
 		 * perf_install_in_context() call for this new event to
 		 * serialize against exec() altering our credentials (and the
 		 * perf_event_exit_task() that could imply).
@@ -11559,7 +11559,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&ctx->mutex);
 
 	if (task) {
-		mutex_unlock(&task->signal->cred_guard_mutex);
+		mutex_unlock(&task->signal->exec_update_mutex);
 		put_task_struct(task);
 	}
 
@@ -11595,7 +11595,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		free_event(event);
 err_cred:
 	if (task)
-		mutex_unlock(&task->signal->cred_guard_mutex);
+		mutex_unlock(&task->signal->exec_update_mutex);
 err_task:
 	if (task)
 		put_task_struct(task);
@@ -11900,7 +11900,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 /*
  * When a child task exits, feed back event values to parent events.
  *
- * Can be called with cred_guard_mutex held when called from
+ * Can be called with exec_update_mutex held when called from
  * install_exec_creds().
  */
 void perf_event_exit_task(struct task_struct *child)

From 501f9328bf5c6b5e4863da4b50e0e86792de3aa9 Mon Sep 17 00:00:00 2001
From: Bernd Edlinger <bernd.edlinger@hotmail.de>
Date: Sat, 21 Mar 2020 02:46:16 +0000
Subject: [PATCH 231/327] pidfd: Use new infrastructure to fix deadlocks in
 execve

This changes __pidfd_fget to use the new exec_update_mutex
instead of cred_guard_mutex.

This should be safe, as the credentials do not change
before exec_update_mutex is locked.  Therefore whatever
file access is possible with holding the cred_guard_mutex
here is also possbile with the exec_update_mutex.

Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/pid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 60820e72634c..efd34874b3d1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -577,7 +577,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
 	struct file *file;
 	int ret;
 
-	ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	ret = mutex_lock_killable(&task->signal->exec_update_mutex);
 	if (ret)
 		return ERR_PTR(ret);
 
@@ -586,7 +586,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
 	else
 		file = ERR_PTR(-EPERM);
 
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&task->signal->exec_update_mutex);
 
 	return file ?: ERR_PTR(-EBADF);
 }

From 2d68df6cc4bf5822d78cd0f067174d6e29a2f739 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Thu, 21 Nov 2019 15:50:58 -0800
Subject: [PATCH 232/327] kunit: Always print actual pointer values in asserts

KUnit assertions and expectations will print the values being tested. If
these are pointers (e.g., KUNIT_EXPECT_PTR_EQ(test, a, b)), these
pointers are currently printed with the %pK format specifier, which -- to
prevent information leaks which may compromise, e.g., ASLR -- are often
either hashed or replaced with ____ptrval____ or similar, making debugging
tests difficult.

By replacing %pK with %px as Documentation/core-api/printk-formats.rst
suggests, we disable this security feature for KUnit assertions and
expectations, allowing the actual pointer values to be printed. Given
that KUnit is not intended for use in production kernels, and the
pointers are only printed on failing tests, this seems like a worthwhile
tradeoff.

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 lib/kunit/assert.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/kunit/assert.c b/lib/kunit/assert.c
index b24bebca052d..02ecd0d7d80a 100644
--- a/lib/kunit/assert.c
+++ b/lib/kunit/assert.c
@@ -118,10 +118,10 @@ void kunit_binary_ptr_assert_format(const struct kunit_assert *assert,
 			 binary_assert->left_text,
 			 binary_assert->operation,
 			 binary_assert->right_text);
-	string_stream_add(stream, "\t\t%s == %pK\n",
+	string_stream_add(stream, "\t\t%s == %px\n",
 			 binary_assert->left_text,
 			 binary_assert->left_value);
-	string_stream_add(stream, "\t\t%s == %pK",
+	string_stream_add(stream, "\t\t%s == %px",
 			 binary_assert->right_text,
 			 binary_assert->right_value);
 	kunit_assert_print_msg(assert, stream);

From 97752c39bdaeea990260d3111250f6c861519808 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Mon, 23 Mar 2020 19:43:33 -0700
Subject: [PATCH 233/327] kunit: kunit_tool: Allow .kunitconfig to disable
 config items

Rework kunit_tool in order to allow .kunitconfig files to better enforce
that disabled items in .kunitconfig are disabled in the generated
.config.

Previously, kunit_tool simply enforced that any line present in
.kunitconfig was also present in .config, but this could cause problems
if a config option was disabled in .kunitconfig, but not listed in .config
due to (for example) having disabled dependencies.

To fix this, re-work the parser to track config names and values, and
require values to match unless they are explicitly disabled with the
"CONFIG_x is not set" comment (or by setting its value to 'n'). Those
"disabled" values will pass validation if omitted from the .config, but
not if they have a different value.

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_config.py    | 41 ++++++++++++++++++++------
 tools/testing/kunit/kunit_tool_test.py | 22 +++++++-------
 2 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py
index ebf3942b23f5..e75063d603b5 100644
--- a/tools/testing/kunit/kunit_config.py
+++ b/tools/testing/kunit/kunit_config.py
@@ -9,16 +9,18 @@
 import collections
 import re
 
-CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_\w+ is not set$'
-CONFIG_PATTERN = r'^CONFIG_\w+=\S+$'
-
-KconfigEntryBase = collections.namedtuple('KconfigEntry', ['raw_entry'])
+CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$'
+CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+)$'
 
+KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value'])
 
 class KconfigEntry(KconfigEntryBase):
 
 	def __str__(self) -> str:
-		return self.raw_entry
+		if self.value == 'n':
+			return r'# CONFIG_%s is not set' % (self.name)
+		else:
+			return r'CONFIG_%s=%s' % (self.name, self.value)
 
 
 class KconfigParseError(Exception):
@@ -38,7 +40,17 @@ class Kconfig(object):
 		self._entries.append(entry)
 
 	def is_subset_of(self, other: 'Kconfig') -> bool:
-		return self.entries().issubset(other.entries())
+		for a in self.entries():
+			found = False
+			for b in other.entries():
+				if a.name != b.name:
+					continue
+				if a.value != b.value:
+					return False
+				found = True
+			if a.value != 'n' and found == False:
+				return False
+		return True
 
 	def write_to_file(self, path: str) -> None:
 		with open(path, 'w') as f:
@@ -54,9 +66,20 @@ class Kconfig(object):
 			line = line.strip()
 			if not line:
 				continue
-			elif config_matcher.match(line) or is_not_set_matcher.match(line):
-				self._entries.append(KconfigEntry(line))
-			elif line[0] == '#':
+
+			match = config_matcher.match(line)
+			if match:
+				entry = KconfigEntry(match.group(1), match.group(2))
+				self.add_entry(entry)
+				continue
+
+			empty_match = is_not_set_matcher.match(line)
+			if empty_match:
+				entry = KconfigEntry(empty_match.group(1), 'n')
+				self.add_entry(entry)
+				continue
+
+			if line[0] == '#':
 				continue
 			else:
 				raise KconfigParseError('Failed to parse: ' + line)
diff --git a/tools/testing/kunit/kunit_tool_test.py b/tools/testing/kunit/kunit_tool_test.py
index ce47e87b633a..984588d6ba95 100755
--- a/tools/testing/kunit/kunit_tool_test.py
+++ b/tools/testing/kunit/kunit_tool_test.py
@@ -37,7 +37,7 @@ class KconfigTest(unittest.TestCase):
 		self.assertTrue(kconfig0.is_subset_of(kconfig0))
 
 		kconfig1 = kunit_config.Kconfig()
-		kconfig1.add_entry(kunit_config.KconfigEntry('CONFIG_TEST=y'))
+		kconfig1.add_entry(kunit_config.KconfigEntry('TEST', 'y'))
 		self.assertTrue(kconfig1.is_subset_of(kconfig1))
 		self.assertTrue(kconfig0.is_subset_of(kconfig1))
 		self.assertFalse(kconfig1.is_subset_of(kconfig0))
@@ -51,15 +51,15 @@ class KconfigTest(unittest.TestCase):
 
 		expected_kconfig = kunit_config.Kconfig()
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_UML=y'))
+			kunit_config.KconfigEntry('UML', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_MMU=y'))
+			kunit_config.KconfigEntry('MMU', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_TEST=y'))
+			kunit_config.KconfigEntry('TEST', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_EXAMPLE_TEST=y'))
+			kunit_config.KconfigEntry('EXAMPLE_TEST', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('# CONFIG_MK8 is not set'))
+			kunit_config.KconfigEntry('MK8', 'n'))
 
 		self.assertEqual(kconfig.entries(), expected_kconfig.entries())
 
@@ -68,15 +68,15 @@ class KconfigTest(unittest.TestCase):
 
 		expected_kconfig = kunit_config.Kconfig()
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_UML=y'))
+			kunit_config.KconfigEntry('UML', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_MMU=y'))
+			kunit_config.KconfigEntry('MMU', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_TEST=y'))
+			kunit_config.KconfigEntry('TEST', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('CONFIG_EXAMPLE_TEST=y'))
+			kunit_config.KconfigEntry('EXAMPLE_TEST', 'y'))
 		expected_kconfig.add_entry(
-			kunit_config.KconfigEntry('# CONFIG_MK8 is not set'))
+			kunit_config.KconfigEntry('MK8', 'n'))
 
 		expected_kconfig.write_to_file(kconfig_path)
 

From cb88577bb6b27f8f5b3d61b1006c1ff7709916be Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 24 Jan 2020 11:45:08 -0800
Subject: [PATCH 234/327] Fix linked-list KUnit test when run multiple times

A few of the lists used in the linked-list KUnit tests (the
for_each_entry{,_reverse} tests) are declared 'static', and so are
not-reinitialised if the test runs multiple times. This was not a
problem when KUnit tests were run once on startup, but when tests are
able to be run manually (e.g. from debugfs[1]), this is no longer the
case.

Making these lists no longer 'static' causes the lists to be
reinitialised, and the test passes each time it is run. While there may
be some value in testing that initialising static lists works, the
for_each_entry_* tests are unlikely to be the right place for it.

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 lib/list-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/list-test.c b/lib/list-test.c
index 76babb1df889..ee09505df16f 100644
--- a/lib/list-test.c
+++ b/lib/list-test.c
@@ -659,7 +659,7 @@ static void list_test_list_for_each_prev_safe(struct kunit *test)
 static void list_test_list_for_each_entry(struct kunit *test)
 {
 	struct list_test_struct entries[5], *cur;
-	static LIST_HEAD(list);
+	LIST_HEAD(list);
 	int i = 0;
 
 	for (i = 0; i < 5; ++i) {
@@ -680,7 +680,7 @@ static void list_test_list_for_each_entry(struct kunit *test)
 static void list_test_list_for_each_entry_reverse(struct kunit *test)
 {
 	struct list_test_struct entries[5], *cur;
-	static LIST_HEAD(list);
+	LIST_HEAD(list);
 	int i = 0;
 
 	for (i = 0; i < 5; ++i) {

From 0d5792c9bc458774a3bfdf4b194f1903fe558231 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 28 Feb 2020 11:18:21 -0800
Subject: [PATCH 235/327] Documentation: kunit: Make the KUnit documentation
 less UML-specific

Remove some of the outmoded "Why KUnit" rationale, and move some
UML-specific information to the kunit_tool page. Also update the Getting
Started guide to mention running tests without the kunit_tool wrapper.

Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Acked-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/dev-tools/kunit/index.rst      | 40 ++++++----
 Documentation/dev-tools/kunit/kunit-tool.rst |  7 ++
 Documentation/dev-tools/kunit/start.rst      | 78 +++++++++++++++++---
 3 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/Documentation/dev-tools/kunit/index.rst b/Documentation/dev-tools/kunit/index.rst
index d16a4d2c3a41..e93606ecfb01 100644
--- a/Documentation/dev-tools/kunit/index.rst
+++ b/Documentation/dev-tools/kunit/index.rst
@@ -17,14 +17,23 @@ What is KUnit?
 ==============
 
 KUnit is a lightweight unit testing and mocking framework for the Linux kernel.
-These tests are able to be run locally on a developer's workstation without a VM
-or special hardware.
 
 KUnit is heavily inspired by JUnit, Python's unittest.mock, and
 Googletest/Googlemock for C++. KUnit provides facilities for defining unit test
 cases, grouping related test cases into test suites, providing common
 infrastructure for running tests, and much more.
 
+KUnit consists of a kernel component, which provides a set of macros for easily
+writing unit tests. Tests written against KUnit will run on kernel boot if
+built-in, or when loaded if built as a module. These tests write out results to
+the kernel log in `TAP <https://testanything.org/>`_ format.
+
+To make running these tests (and reading the results) easier, KUnit offers
+:doc:`kunit_tool <kunit-tool>`, which builds a `User Mode Linux
+<http://user-mode-linux.sourceforge.net>`_ kernel, runs it, and parses the test
+results. This provides a quick way of running KUnit tests during development,
+without requiring a virtual machine or separate hardware.
+
 Get started now: :doc:`start`
 
 Why KUnit?
@@ -36,21 +45,20 @@ allow all possible code paths to be tested in the code under test; this is only
 possible if the code under test is very small and does not have any external
 dependencies outside of the test's control like hardware.
 
-Outside of KUnit, there are no testing frameworks currently
-available for the kernel that do not require installing the kernel on a test
-machine or in a VM and all require tests to be written in userspace running on
-the kernel; this is true for Autotest, and kselftest, disqualifying
-any of them from being considered unit testing frameworks.
+KUnit provides a common framework for unit tests within the kernel.
 
-KUnit addresses the problem of being able to run tests without needing a virtual
-machine or actual hardware with User Mode Linux. User Mode Linux is a Linux
-architecture, like ARM or x86; however, unlike other architectures it compiles
-to a standalone program that can be run like any other program directly inside
-of a host operating system; to be clear, it does not require any virtualization
-support; it is just a regular program.
+KUnit tests can be run on most architectures, and most tests are architecture
+independent. All built-in KUnit tests run on kernel startup.  Alternatively,
+KUnit and KUnit tests can be built as modules and tests will run when the test
+module is loaded.
 
-Alternatively, kunit and kunit tests can be built as modules and tests will
-run when the test module is loaded.
+.. note::
+
+        KUnit can also run tests without needing a virtual machine or actual
+        hardware under User Mode Linux. User Mode Linux is a Linux architecture,
+        like ARM or x86, which compiles the kernel as a Linux executable. KUnit
+        can be used with UML either by building with ``ARCH=um`` (like any other
+        architecture), or by using :doc:`kunit_tool <kunit-tool>`.
 
 KUnit is fast. Excluding build time, from invocation to completion KUnit can run
 several dozen tests in only 10 to 20 seconds; this might not sound like a big
@@ -81,3 +89,5 @@ How do I use it?
 *   :doc:`start` - for new users of KUnit
 *   :doc:`usage` - for a more detailed explanation of KUnit features
 *   :doc:`api/index` - for the list of KUnit APIs used for testing
+*   :doc:`kunit-tool` - for more information on the kunit_tool helper script
+*   :doc:`faq` - for answers to some common questions about KUnit
diff --git a/Documentation/dev-tools/kunit/kunit-tool.rst b/Documentation/dev-tools/kunit/kunit-tool.rst
index 50d46394e97e..949af2da81e5 100644
--- a/Documentation/dev-tools/kunit/kunit-tool.rst
+++ b/Documentation/dev-tools/kunit/kunit-tool.rst
@@ -12,6 +12,13 @@ the Linux kernel as UML (`User Mode Linux
 <http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
 the test results and displaying them in a user friendly manner.
 
+kunit_tool addresses the problem of being able to run tests without needing a
+virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
+Linux architecture, like ARM or x86; however, unlike other architectures it
+compiles the kernel as a standalone Linux executable that can be run like any
+other program directly inside of a host operating system. To be clear, it does
+not require any virtualization support: it is just a regular program.
+
 What is a kunitconfig?
 ======================
 
diff --git a/Documentation/dev-tools/kunit/start.rst b/Documentation/dev-tools/kunit/start.rst
index 4e1d24db6b13..e1c5ce80ce12 100644
--- a/Documentation/dev-tools/kunit/start.rst
+++ b/Documentation/dev-tools/kunit/start.rst
@@ -9,11 +9,10 @@ Installing dependencies
 KUnit has the same dependencies as the Linux kernel. As long as you can build
 the kernel, you can run KUnit.
 
-KUnit Wrapper
-=============
-Included with KUnit is a simple Python wrapper that helps format the output to
-easily use and read KUnit output. It handles building and running the kernel, as
-well as formatting the output.
+Running tests with the KUnit Wrapper
+====================================
+Included with KUnit is a simple Python wrapper which runs tests under User Mode
+Linux, and formats the test results.
 
 The wrapper can be run with:
 
@@ -21,22 +20,42 @@ The wrapper can be run with:
 
 	./tools/testing/kunit/kunit.py run --defconfig
 
-For more information on this wrapper (also called kunit_tool) checkout the
+For more information on this wrapper (also called kunit_tool) check out the
 :doc:`kunit-tool` page.
 
 Creating a .kunitconfig
-=======================
-The Python script is a thin wrapper around Kbuild. As such, it needs to be
-configured with a ``.kunitconfig`` file. This file essentially contains the
-regular Kernel config, with the specific test targets as well.
+-----------------------
+If you want to run a specific set of tests (rather than those listed in the
+KUnit defconfig), you can provide Kconfig options in the ``.kunitconfig`` file.
+This file essentially contains the regular Kernel config, with the specific
+test targets as well. The ``.kunitconfig`` should also contain any other config
+options required by the tests.
 
+A good starting point for a ``.kunitconfig`` is the KUnit defconfig:
 .. code-block:: bash
 
 	cd $PATH_TO_LINUX_REPO
 	cp arch/um/configs/kunit_defconfig .kunitconfig
 
-Verifying KUnit Works
----------------------
+You can then add any other Kconfig options you wish, e.g.:
+.. code-block:: none
+
+        CONFIG_LIST_KUNIT_TEST=y
+
+:doc:`kunit_tool <kunit-tool>` will ensure that all config options set in
+``.kunitconfig`` are set in the kernel ``.config`` before running the tests.
+It'll warn you if you haven't included the dependencies of the options you're
+using.
+
+.. note::
+   Note that removing something from the ``.kunitconfig`` will not trigger a
+   rebuild of the ``.config`` file: the configuration is only updated if the
+   ``.kunitconfig`` is not a subset of ``.config``. This means that you can use
+   other tools (such as make menuconfig) to adjust other config options.
+
+
+Running the tests
+-----------------
 
 To make sure that everything is set up correctly, simply invoke the Python
 wrapper from your kernel repo:
@@ -62,6 +81,41 @@ followed by a list of tests that are run. All of them should be passing.
 	Because it is building a lot of sources for the first time, the
 	``Building KUnit kernel`` step may take a while.
 
+Running tests without the KUnit Wrapper
+=======================================
+
+If you'd rather not use the KUnit Wrapper (if, for example, you need to
+integrate with other systems, or use an architecture other than UML), KUnit can
+be included in any kernel, and the results read out and parsed manually.
+
+.. note::
+   KUnit is not designed for use in a production system, and it's possible that
+   tests may reduce the stability or security of the system.
+
+
+
+Configuring the kernel
+----------------------
+
+In order to enable KUnit itself, you simply need to enable the ``CONFIG_KUNIT``
+Kconfig option (it's under Kernel Hacking/Kernel Testing and Coverage in
+menuconfig). From there, you can enable any KUnit tests you want: they usually
+have config options ending in ``_KUNIT_TEST``.
+
+KUnit and KUnit tests can be compiled as modules: in this case the tests in a
+module will be run when the module is loaded.
+
+Running the tests
+-----------------
+
+Build and run your kernel as usual. Test output will be written to the kernel
+log in `TAP <https://testanything.org/>`_ format.
+
+.. note::
+   It's possible that there will be other lines and/or data interspersed in the
+   TAP output.
+
+
 Writing your first test
 =======================
 

From dfdc22078f3f064d2659acdc42d886834f3a3863 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 28 Feb 2020 15:30:37 -0400
Subject: [PATCH 236/327] mm/hmm: add missing unmaps of the ptep during
 hmm_vma_handle_pte()

Many of the direct returns of error skipped doing the pte_unmap(). All non
zero exit paths must unmap the pte.

The pte_unmap() is split unnaturally like this because some of the error
exit paths trigger a sleep and must release the lock before sleeping.

Fixes: 992de9a8b751 ("mm/hmm: allow to mirror vma of a file on a DAX backed filesystem")
Fixes: 53f5c3f489ec ("mm/hmm: factor out pte and pmd handling to simplify hmm_vma_walk_pmd()")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 72e5a6d9a417..35f85424176d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -325,6 +325,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		}
 
 		/* Report error for everything else */
+		pte_unmap(ptep);
 		*pfn = range->values[HMM_PFN_ERROR];
 		return -EFAULT;
 	} else {
@@ -339,10 +340,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (pte_devmap(pte)) {
 		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
 					      hmm_vma_walk->pgmap);
-		if (unlikely(!hmm_vma_walk->pgmap))
+		if (unlikely(!hmm_vma_walk->pgmap)) {
+			pte_unmap(ptep);
 			return -EBUSY;
+		}
 	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
 		if (!is_zero_pfn(pte_pfn(pte))) {
+			pte_unmap(ptep);
 			*pfn = range->values[HMM_PFN_SPECIAL];
 			return -EFAULT;
 		}
@@ -437,7 +441,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 
 		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
 		if (r) {
-			/* hmm_vma_handle_pte() did unmap pte directory */
+			/* hmm_vma_handle_pte() did pte_unmap() */
 			hmm_vma_walk->last = addr;
 			return r;
 		}

From 05fc1df95e5dc09802813bab9c1e718f1e419d93 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Mon, 2 Mar 2020 15:26:44 -0400
Subject: [PATCH 237/327] mm/hmm: do not call hmm_vma_walk_hole() while holding
 a spinlock

This eventually calls into handle_mm_fault() which is a sleeping function.
Release the lock first.

hmm_vma_walk_hole() does not touch the contents of the PUD, so it does not
need the lock.

Fixes: 3afc423632a1 ("mm: pagewalk: add p4d_entry() and pgd_entry()")
Cc: Steven Price <steven.price@arm.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 35f85424176d..0e7a57246f03 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -491,8 +491,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 
 	pud = READ_ONCE(*pudp);
 	if (pud_none(pud)) {
-		ret = hmm_vma_walk_hole(start, end, -1, walk);
-		goto out_unlock;
+		spin_unlock(ptl);
+		return hmm_vma_walk_hole(start, end, -1, walk);
 	}
 
 	if (pud_huge(pud) && pud_devmap(pud)) {
@@ -501,8 +501,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 		bool fault, write_fault;
 
 		if (!pud_present(pud)) {
-			ret = hmm_vma_walk_hole(start, end, -1, walk);
-			goto out_unlock;
+			spin_unlock(ptl);
+			return hmm_vma_walk_hole(start, end, -1, walk);
 		}
 
 		i = (addr - range->start) >> PAGE_SHIFT;
@@ -513,9 +513,9 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
 				     cpu_flags, &fault, &write_fault);
 		if (fault || write_fault) {
-			ret = hmm_vma_walk_hole_(addr, end, fault,
-						 write_fault, walk);
-			goto out_unlock;
+			spin_unlock(ptl);
+			return hmm_vma_walk_hole_(addr, end, fault, write_fault,
+						  walk);
 		}
 
 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

From 7d082987e5e562c07a208503a607a733d50553ba Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 4 Mar 2020 16:25:56 -0400
Subject: [PATCH 238/327] mm/hmm: add missing pfns set to hmm_vma_walk_pmd()

All success exit paths from the walker functions must set the pfns array.

A migration entry with no required fault is a HMM_PFN_NONE return, just
like the pte case.

Fixes: d08faca018c4 ("mm/hmm: properly handle migration pmd")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 0e7a57246f03..09fc4569468e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -402,7 +402,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			pmd_migration_entry_wait(walk->mm, pmdp);
 			return -EBUSY;
 		}
-		return 0;
+		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
 	} else if (!pmd_present(pmd))
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 

From c2579c9c4add3110b2ce81198f8a6bbb5055cfda Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 5 Mar 2020 12:00:22 -0400
Subject: [PATCH 239/327] mm/hmm: add missing call to hmm_range_need_fault()
 before returning EFAULT

All return paths that do EFAULT must call hmm_range_need_fault() to
determine if the user requires this page to be valid.

If the page cannot be made valid if the user later requires it, due to vma
flags in this case, then the return should be HMM_PFN_ERROR.

Fixes: a3e0d41c2b1f ("mm/hmm: improve driver API to work and wait over a range")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 09fc4569468e..a16b8e360e4c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -604,18 +604,15 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
 	struct vm_area_struct *vma = walk->vma;
 
 	/*
-	 * Skip vma ranges that don't have struct page backing them or
-	 * map I/O devices directly.
-	 */
-	if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
-		return -EFAULT;
-
-	/*
+	 * Skip vma ranges that don't have struct page backing them or map I/O
+	 * devices directly.
+	 *
 	 * If the vma does not allow read access, then assume that it does not
-	 * allow write access either. HMM does not support architectures
-	 * that allow write without read.
+	 * allow write access either. HMM does not support architectures that
+	 * allow write without read.
 	 */
-	if (!(vma->vm_flags & VM_READ)) {
+	if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) ||
+	    !(vma->vm_flags & VM_READ)) {
 		bool fault, write_fault;
 
 		/*
@@ -629,7 +626,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
 		if (fault || write_fault)
 			return -EFAULT;
 
-		hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+		hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 		hmm_vma_walk->last = end;
 
 		/* Skip this vma and continue processing the next vma. */

From 76612d6ce4ccd21329ce8c90dc51c5f747057b5b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 28 Feb 2020 15:52:32 -0400
Subject: [PATCH 240/327] mm/hmm: reorganize how !pte_present is handled in
 hmm_vma_handle_pte()

The intention with this code is to determine if the caller required the
pages to be valid, and if so, then take some action to make them valid.
The action varies depending on the page type.

In all cases, if the caller doesn't ask for the page, then
hmm_range_fault() should not return an error.

Revise the implementation to be clearer, and fix some bugs:

 - hmm_pte_need_fault() must always be called before testing fault or
   write_fault otherwise the defaults of false apply and the if()'s don't
   work. This was missed on the is_migration_entry() branch

 - -EFAULT should not be returned unless hmm_pte_need_fault() indicates
   fault is required - ie snapshotting should not fail.

 - For !pte_present() the cpu_flags are always 0, except in the special
   case of is_device_private_entry(), calling pte_to_hmm_pfn_flags() is
   confusing.

Reorganize the flow so that it always follows the pattern of calling
hmm_pte_need_fault() and then checking fault || write_fault.

Fixes: 2aee09d8c116 ("mm/hmm: change hmm_vma_fault() to allow write fault on page basis")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index a16b8e360e4c..b98f492984e5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -286,15 +286,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (!pte_present(pte)) {
 		swp_entry_t entry = pte_to_swp_entry(pte);
 
-		if (!non_swap_entry(entry)) {
-			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-					   &fault, &write_fault);
-			if (fault || write_fault)
-				goto fault;
-			return 0;
-		}
-
 		/*
 		 * This is a special swap entry, ignore migration, use
 		 * device and report anything else as error.
@@ -314,26 +305,30 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			return 0;
 		}
 
-		if (is_migration_entry(entry)) {
-			if (fault || write_fault) {
-				pte_unmap(ptep);
-				hmm_vma_walk->last = addr;
-				migration_entry_wait(walk->mm, pmdp, addr);
-				return -EBUSY;
-			}
+		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
+				   &write_fault);
+		if (!fault && !write_fault)
 			return 0;
+
+		if (!non_swap_entry(entry))
+			goto fault;
+
+		if (is_migration_entry(entry)) {
+			pte_unmap(ptep);
+			hmm_vma_walk->last = addr;
+			migration_entry_wait(walk->mm, pmdp, addr);
+			return -EBUSY;
 		}
 
 		/* Report error for everything else */
 		pte_unmap(ptep);
 		*pfn = range->values[HMM_PFN_ERROR];
 		return -EFAULT;
-	} else {
-		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-				   &fault, &write_fault);
 	}
 
+	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, &fault,
+			   &write_fault);
 	if (fault || write_fault)
 		goto fault;
 

From 2288a9a68175cec9f91afb52948ba585b690774b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 5 Mar 2020 15:26:33 -0400
Subject: [PATCH 241/327] mm/hmm: return -EFAULT when setting HMM_PFN_ERROR on
 requested valid pages

hmm_range_fault() should never return 0 if the caller requested a valid
page, but the pfns output for that page would be HMM_PFN_ERROR.

hmm_pte_need_fault() must always be called before setting HMM_PFN_ERROR to
detect if the page is in faulting mode or not.

Fix two cases in hmm_vma_walk_pmd() and reorganize some of the duplicated
code.

Fixes: d08faca018c4 ("mm/hmm: properly handle migration pmd")
Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index b98f492984e5..3a03fcff2668 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -371,8 +371,10 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	uint64_t *pfns = range->pfns;
-	unsigned long addr = start, i;
+	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+	unsigned long npages = (end - start) >> PAGE_SHIFT;
+	unsigned long addr = start;
+	bool fault, write_fault;
 	pte_t *ptep;
 	pmd_t pmd;
 
@@ -382,14 +384,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_vma_walk_hole(start, end, -1, walk);
 
 	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
-		bool fault, write_fault;
-		unsigned long npages;
-		uint64_t *pfns;
-
-		i = (addr - range->start) >> PAGE_SHIFT;
-		npages = (end - addr) >> PAGE_SHIFT;
-		pfns = &range->pfns[i];
-
 		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
 				     0, &fault, &write_fault);
 		if (fault || write_fault) {
@@ -398,8 +392,15 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return -EBUSY;
 		}
 		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
-	} else if (!pmd_present(pmd))
+	}
+
+	if (!pmd_present(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
 		/*
@@ -416,8 +417,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
 			goto again;
 
-		i = (addr - range->start) >> PAGE_SHIFT;
-		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
+		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
 	}
 
 	/*
@@ -426,15 +426,19 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	 * entry pointing to pte directory or it is a bad pmd that will not
 	 * recover.
 	 */
-	if (pmd_bad(pmd))
+	if (pmd_bad(pmd)) {
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
+				     &write_fault);
+		if (fault || write_fault)
+			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	}
 
 	ptep = pte_offset_map(pmdp, addr);
-	i = (addr - range->start) >> PAGE_SHIFT;
-	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
+	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
 		int r;
 
-		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
+		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
 		if (r) {
 			/* hmm_vma_handle_pte() did pte_unmap() */
 			hmm_vma_walk->last = addr;

From 4055062749229101365e5f9e87cb1c5a93e292f8 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 5 Mar 2020 14:27:20 -0400
Subject: [PATCH 242/327] mm/hmm: add missing call to hmm_pte_need_fault in
 HMM_PFN_SPECIAL handling

Currently if a special PTE is encountered hmm_range_fault() immediately
returns EFAULT and sets the HMM_PFN_SPECIAL error output (which nothing
uses).

EFAULT should only be returned after testing with hmm_pte_need_fault().

Also pte_devmap() and pte_special() are exclusive, and there is no need to
check IS_ENABLED, pte_special() is stubbed out to return false on
unsupported architectures.

Fixes: 992de9a8b751 ("mm/hmm: allow to mirror vma of a file on a DAX backed filesystem")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 3a03fcff2668..9c82ea972d4b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -339,16 +339,21 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			pte_unmap(ptep);
 			return -EBUSY;
 		}
-	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
-		if (!is_zero_pfn(pte_pfn(pte))) {
+	}
+
+	/*
+	 * Since each architecture defines a struct page for the zero page, just
+	 * fall through and treat it like a normal page.
+	 */
+	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
+		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
+				   &write_fault);
+		if (fault || write_fault) {
 			pte_unmap(ptep);
-			*pfn = range->values[HMM_PFN_SPECIAL];
 			return -EFAULT;
 		}
-		/*
-		 * Since each architecture defines a struct page for the zero
-		 * page, just fall through and treat it like a normal page.
-		 */
+		*pfn = range->values[HMM_PFN_SPECIAL];
+		return 0;
 	}
 
 	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;

From 24cee8ab41eec51ea4cabd19c311719038084648 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Wed, 11 Mar 2020 17:03:33 -0300
Subject: [PATCH 243/327] mm/hmm: do not check pmd_protnone twice in
 hmm_vma_handle_pmd()

pmd_to_hmm_pfn_flags() already checks it and makes the cpu flags 0. If no
fault is requested then the pfns should be returned with the not valid
flags.

It should not unconditionally fault if faulting is not requested.

Fixes: 2aee09d8c116 ("mm/hmm: change hmm_vma_fault() to allow write fault on page basis")
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 9c82ea972d4b..37a6fca7da43 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -226,7 +226,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
 			     &fault, &write_fault);
 
-	if (pmd_protnone(pmd) || fault || write_fault)
+	if (fault || write_fault)
 		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

From ddfaed17a779ddb728e5534a87596950842d6b04 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 14:53:06 +0100
Subject: [PATCH 244/327] mm/hmm: don't provide a stub for hmm_range_fault()

All callers of hmm_range_fault depend on CONFIG_HMM_MIRROR, so don't
bother with a stub.

Link: https://lore.kernel.org/r/20200316135310.899364-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ddf9f7144c43..c102e359b59d 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -225,17 +225,10 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
 /* Don't fault in missing PTEs, just snapshot the current state. */
 #define HMM_FAULT_SNAPSHOT		(1 << 1)
 
-#ifdef CONFIG_HMM_MIRROR
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
 long hmm_range_fault(struct hmm_range *range, unsigned int flags);
-#else
-static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
-{
-	return -EOPNOTSUPP;
-}
-#endif
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range

From 96268163f9c9443e7f73a202253f68566f93dc79 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 14:53:07 +0100
Subject: [PATCH 245/327] mm/hmm: remove the unused HMM_FAULT_ALLOW_RETRY flag

The HMM_FAULT_ALLOW_RETRY isn't used anywhere in the tree.  Remove it and
the weird -EAGAIN handling where handle_mm_fault() drops the mmap_sem.

Link: https://lore.kernel.org/r/20200316135310.899364-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 5 -----
 mm/hmm.c            | 7 -------
 2 files changed, 12 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index c102e359b59d..4bf8d6997b12 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -217,11 +217,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
 		range->flags[HMM_PFN_VALID];
 }
 
-/*
- * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
- */
-#define HMM_FAULT_ALLOW_RETRY		(1 << 0)
-
 /* Don't fault in missing PTEs, just snapshot the current state. */
 #define HMM_FAULT_SNAPSHOT		(1 << 1)
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 37a6fca7da43..df98297afe80 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -45,16 +45,10 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
 	if (!vma)
 		goto err;
 
-	if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
-		flags |= FAULT_FLAG_ALLOW_RETRY;
 	if (write_fault)
 		flags |= FAULT_FLAG_WRITE;
 
 	ret = handle_mm_fault(vma, addr, flags);
-	if (ret & VM_FAULT_RETRY) {
-		/* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
-		return -EAGAIN;
-	}
 	if (ret & VM_FAULT_ERROR)
 		goto err;
 
@@ -662,7 +656,6 @@ static const struct mm_walk_ops hmm_walk_ops = {
  * -ENOMEM:	Out of memory.
  * -EPERM:	Invalid permission (e.g., asking for write and range is read
  *		only).
- * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
  * -EBUSY:	The range has been invalidated and the caller needs to wait for
  *		the invalidation to finish.
  * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to access

From 45050692dec83a67c0325535aae984f56560e3a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 14:53:08 +0100
Subject: [PATCH 246/327] mm/hmm: simplify hmm_vma_walk_hugetlb_entry()

Remove the rather confusing goto label and just handle the fault case
directly in the branch checking for it.

Link: https://lore.kernel.org/r/20200316135310.899364-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index df98297afe80..d13dedfb4781 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -559,7 +559,6 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	bool fault, write_fault;
 	spinlock_t *ptl;
 	pte_t entry;
-	int ret = 0;
 
 	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
 	entry = huge_ptep_get(pte);
@@ -572,8 +571,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
 			   &fault, &write_fault);
 	if (fault || write_fault) {
-		ret = -ENOENT;
-		goto unlock;
+		spin_unlock(ptl);
+		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -581,14 +580,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 				 cpu_flags;
 	hmm_vma_walk->last = end;
-
-unlock:
 	spin_unlock(ptl);
-
-	if (ret == -ENOENT)
-		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
-
-	return ret;
+	return 0;
 }
 #else
 #define hmm_vma_walk_hugetlb_entry NULL

From f8c888a304e12074d941428b4aa1b13f04dd54ee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 14:53:09 +0100
Subject: [PATCH 247/327] mm/hmm: don't handle the non-fault case in
 hmm_vma_walk_hole_()

Setting a pfns entry to NONE before returning -EBUSY is a bug that will
cause corruption of the input flags on the next loop.

There is just a single caller using hmm_vma_walk_hole_() for the non-fault
case.  Use hmm_pfns_fill() to fill the whole pfn array with zeroes in the
only caller for the non-fault case and remove the non-fault path from
hmm_vma_walk_hole_(). This avoids setting NONE before returning -EBUSY.

Also rename the function to hmm_vma_fault() to better describe what it
does.

Fixes: 2aee09d8c116 ("mm/hmm: change hmm_vma_fault() to allow write fault on page basis")
Link: https://lore.kernel.org/r/20200316135310.899364-5-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 37 ++++++++++++++++++-------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index d13dedfb4781..b15bf4041803 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -73,45 +73,41 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 }
 
 /*
- * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
  * @addr: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
  * @walk: mm_walk structure
- * Return: 0 on success, -EBUSY after page fault, or page fault error
+ * Return: -EBUSY after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
  */
-static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
+static int hmm_vma_fault(unsigned long addr, unsigned long end,
 			      bool fault, bool write_fault,
 			      struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	uint64_t *pfns = range->pfns;
-	unsigned long i;
+	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
 
+	WARN_ON_ONCE(!fault && !write_fault);
 	hmm_vma_walk->last = addr;
-	i = (addr - range->start) >> PAGE_SHIFT;
 
 	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
 		return -EPERM;
 
 	for (; addr < end; addr += PAGE_SIZE, i++) {
-		pfns[i] = range->values[HMM_PFN_NONE];
-		if (fault || write_fault) {
-			int ret;
+		int ret;
 
-			ret = hmm_vma_do_fault(walk, addr, write_fault,
-					       &pfns[i]);
-			if (ret != -EBUSY)
-				return ret;
-		}
+		ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]);
+		if (ret != -EBUSY)
+			return ret;
 	}
 
-	return (fault || write_fault) ? -EBUSY : 0;
+	return -EBUSY;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -193,7 +189,10 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
 	pfns = &range->pfns[i];
 	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
 			     0, &fault, &write_fault);
-	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+	if (fault || write_fault)
+		return hmm_vma_fault(addr, end, fault, write_fault, walk);
+	hmm_vma_walk->last = addr;
+	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
 }
 
 static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
@@ -221,7 +220,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 			     &fault, &write_fault);
 
 	if (fault || write_fault)
-		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+		return hmm_vma_fault(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
@@ -360,7 +359,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
-	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+	return hmm_vma_fault(addr, end, fault, write_fault, walk);
 }
 
 static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -512,7 +511,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 				     cpu_flags, &fault, &write_fault);
 		if (fault || write_fault) {
 			spin_unlock(ptl);
-			return hmm_vma_walk_hole_(addr, end, fault, write_fault,
+			return hmm_vma_fault(addr, end, fault, write_fault,
 						  walk);
 		}
 
@@ -572,7 +571,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 			   &fault, &write_fault);
 	if (fault || write_fault) {
 		spin_unlock(ptl);
-		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+		return hmm_vma_fault(addr, end, fault, write_fault, walk);
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);

From 5a0c38d307afde167175602aa52ca4e6c5c42c44 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 14:53:10 +0100
Subject: [PATCH 248/327] mm: merge hmm_vma_do_fault into into
 hmm_vma_walk_hole_

There is no good reason for this split, as it just obsfucates the flow.

Link: https://lore.kernel.org/r/20200316135310.899364-6-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 50 ++++++++++++++++----------------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index b15bf4041803..a5f4f8010965 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -33,32 +33,6 @@ struct hmm_vma_walk {
 	unsigned int		flags;
 };
 
-static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
-			    bool write_fault, uint64_t *pfn)
-{
-	unsigned int flags = FAULT_FLAG_REMOTE;
-	struct hmm_vma_walk *hmm_vma_walk = walk->private;
-	struct hmm_range *range = hmm_vma_walk->range;
-	struct vm_area_struct *vma = walk->vma;
-	vm_fault_t ret;
-
-	if (!vma)
-		goto err;
-
-	if (write_fault)
-		flags |= FAULT_FLAG_WRITE;
-
-	ret = handle_mm_fault(vma, addr, flags);
-	if (ret & VM_FAULT_ERROR)
-		goto err;
-
-	return -EBUSY;
-
-err:
-	*pfn = range->values[HMM_PFN_ERROR];
-	return -EFAULT;
-}
-
 static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 		struct hmm_range *range, enum hmm_pfn_value_e value)
 {
@@ -90,24 +64,32 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
+	struct vm_area_struct *vma = walk->vma;
 	uint64_t *pfns = range->pfns;
 	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
+	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 
 	WARN_ON_ONCE(!fault && !write_fault);
 	hmm_vma_walk->last = addr;
 
-	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
-		return -EPERM;
+	if (!vma)
+		goto out_error;
 
-	for (; addr < end; addr += PAGE_SIZE, i++) {
-		int ret;
-
-		ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]);
-		if (ret != -EBUSY)
-			return ret;
+	if (write_fault) {
+		if (!(vma->vm_flags & VM_WRITE))
+			return -EPERM;
+		fault_flags |= FAULT_FLAG_WRITE;
 	}
 
+	for (; addr < end; addr += PAGE_SIZE, i++)
+		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+			goto out_error;
+
 	return -EBUSY;
+
+out_error:
+	pfns[i] = range->values[HMM_PFN_ERROR];
+	return -EFAULT;
 }
 
 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,

From f894ddd5ff01d3bb48faf4dc533536bf5269c17a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 20:32:13 +0100
Subject: [PATCH 249/327] memremap: add an owner field to struct dev_pagemap

Add a new opaque owner field to struct dev_pagemap, which will allow the
hmm and migrate_vma code to identify who owns ZONE_DEVICE memory, and
refuse to work on mappings not owned by the calling entity.

Link: https://lore.kernel.org/r/20200316193216.920734-2-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Tested-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c     | 2 ++
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
 include/linux/memremap.h               | 4 ++++
 mm/memremap.c                          | 4 ++++
 4 files changed, 11 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 79b1202b1c62..67fefb03b9b7 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -779,6 +779,8 @@ int kvmppc_uvmem_init(void)
 	kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
 	kvmppc_uvmem_pgmap.res = *res;
 	kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
+	/* just one global instance: */
+	kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
 	addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE);
 	if (IS_ERR(addr)) {
 		ret = PTR_ERR(addr);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0ad5d87b5a8e..a4682272586e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -526,6 +526,7 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 	drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
 	drm->dmem->pagemap.res = *res;
 	drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops;
+	drm->dmem->pagemap.owner = drm->dev;
 	if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap)))
 		goto out_free;
 
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 6fefb09af7c3..60d97e8fd3c0 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -103,6 +103,9 @@ struct dev_pagemap_ops {
  * @type: memory type: see MEMORY_* in memory_hotplug.h
  * @flags: PGMAP_* flags to specify defailed behavior
  * @ops: method table
+ * @owner: an opaque pointer identifying the entity that manages this
+ *	instance.  Used by various helpers to make sure that no
+ *	foreign ZONE_DEVICE memory is accessed.
  */
 struct dev_pagemap {
 	struct vmem_altmap altmap;
@@ -113,6 +116,7 @@ struct dev_pagemap {
 	enum memory_type type;
 	unsigned int flags;
 	const struct dev_pagemap_ops *ops;
+	void *owner;
 };
 
 static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
diff --git a/mm/memremap.c b/mm/memremap.c
index 09b5b7adc773..9b2c97ceb775 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 			WARN(1, "Missing migrate_to_ram method\n");
 			return ERR_PTR(-EINVAL);
 		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
 		break;
 	case MEMORY_DEVICE_FS_DAX:
 		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||

From 800bb1c8dc80bb4121446b56813067f3ea44edee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 20:32:14 +0100
Subject: [PATCH 250/327] mm: handle multiple owners of device private pages in
 migrate_vma

Add a new src_owner field to struct migrate_vma.  If the field is set,
only device private pages with page->pgmap->owner equal to that field are
migrated.  If the field is not set only "normal" pages are migrated.

Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU")
Link: https://lore.kernel.org/r/20200316193216.920734-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Tested-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 arch/powerpc/kvm/book3s_hv_uvmem.c     | 1 +
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
 include/linux/migrate.h                | 8 ++++++++
 mm/migrate.c                           | 9 ++++++---
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 67fefb03b9b7..f44f6b27950f 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -563,6 +563,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
 	mig.end = end;
 	mig.src = &src_pfn;
 	mig.dst = &dst_pfn;
+	mig.src_owner = &kvmppc_uvmem_pgmap;
 
 	mutex_lock(&kvm->arch.uvmem_lock);
 	/* The requested page is already paged-out, nothing to do */
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index a4682272586e..0e36345d395c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -176,6 +176,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
 		.end		= vmf->address + PAGE_SIZE,
 		.src		= &src,
 		.dst		= &dst,
+		.src_owner	= drm->dev,
 	};
 
 	/*
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 72120061b7d4..3e546cbf03dd 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -196,6 +196,14 @@ struct migrate_vma {
 	unsigned long		npages;
 	unsigned long		start;
 	unsigned long		end;
+
+	/*
+	 * Set to the owner value also stored in page->pgmap->owner for
+	 * migrating out of device private memory.  If set only device
+	 * private pages with this owner are migrated.  If not set
+	 * device private pages are not migrated at all.
+	 */
+	void			*src_owner;
 };
 
 int migrate_vma_setup(struct migrate_vma *args);
diff --git a/mm/migrate.c b/mm/migrate.c
index b1092876e537..7605d2c23433 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2241,7 +2241,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	arch_enter_lazy_mmu_mode();
 
 	for (; addr < end; addr += PAGE_SIZE, ptep++) {
-		unsigned long mpfn, pfn;
+		unsigned long mpfn = 0, pfn;
 		struct page *page;
 		swp_entry_t entry;
 		pte_t pte;
@@ -2255,8 +2255,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		}
 
 		if (!pte_present(pte)) {
-			mpfn = 0;
-
 			/*
 			 * Only care about unaddressable device page special
 			 * page table entry. Other special swap entries are not
@@ -2267,11 +2265,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 				goto next;
 
 			page = device_private_entry_to_page(entry);
+			if (page->pgmap->owner != migrate->src_owner)
+				goto next;
+
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 					MIGRATE_PFN_MIGRATE;
 			if (is_write_device_private_entry(entry))
 				mpfn |= MIGRATE_PFN_WRITE;
 		} else {
+			if (migrate->src_owner)
+				goto next;
 			pfn = pte_pfn(pte);
 			if (is_zero_pfn(pfn)) {
 				mpfn = MIGRATE_PFN_MIGRATE;

From 17ffdc482982af92bddb59692af1c5e1de23d184 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 20:32:15 +0100
Subject: [PATCH 251/327] mm: simplify device private page handling in
 hmm_range_fault

Remove the HMM_PFN_DEVICE_PRIVATE flag, no driver has ever set this flag
on input, and the only place that uses it on output can be trivially
changed to use is_device_private_page().

This removes the ability to request that device_private pages are faulted
back into system memory.

Link: https://lore.kernel.org/r/20200316193216.920734-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  1 -
 drivers/gpu/drm/nouveau/nouveau_dmem.c  |  5 +++--
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  1 -
 include/linux/hmm.h                     |  2 --
 mm/hmm.c                                | 25 +++++--------------------
 5 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dee446278417..90821ce5e6ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -776,7 +776,6 @@ struct amdgpu_ttm_tt {
 static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
 	(1 << 0), /* HMM_PFN_VALID */
 	(1 << 1), /* HMM_PFN_WRITE */
-	0 /* HMM_PFN_DEVICE_PRIVATE */
 };
 
 static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0e36345d395c..edfd0805fba4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -28,6 +28,7 @@
 
 #include <nvif/class.h>
 #include <nvif/object.h>
+#include <nvif/if000c.h>
 #include <nvif/if500b.h>
 #include <nvif/if900b.h>
 
@@ -692,9 +693,8 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 		if (page == NULL)
 			continue;
 
-		if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
+		if (!is_device_private_page(page))
 			continue;
-		}
 
 		if (!nouveau_dmem_page(drm, page)) {
 			WARN(1, "Some unknown device memory !\n");
@@ -705,5 +705,6 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 		addr = nouveau_dmem_page_addr(page);
 		range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
 		range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
+		range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
 	}
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index df9bf1fd1bc0..39c731a99937 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -367,7 +367,6 @@ static const u64
 nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
 	[HMM_PFN_VALID         ] = NVIF_VMM_PFNMAP_V0_V,
 	[HMM_PFN_WRITE         ] = NVIF_VMM_PFNMAP_V0_W,
-	[HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM,
 };
 
 static const u64
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4bf8d6997b12..5e6034f105c3 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -74,7 +74,6 @@
  * Flags:
  * HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
  * HMM_PFN_WRITE: CPU page table has write permission set
- * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
  *
  * The driver provides a flags array for mapping page protections to device
  * PTE bits. If the driver valid bit for an entry is bit 3,
@@ -86,7 +85,6 @@
 enum hmm_pfn_flag_e {
 	HMM_PFN_VALID = 0,
 	HMM_PFN_WRITE,
-	HMM_PFN_DEVICE_PRIVATE,
 	HMM_PFN_FLAG_MAX
 };
 
diff --git a/mm/hmm.c b/mm/hmm.c
index a5f4f8010965..613b34950c30 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -116,15 +116,6 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 	/* We aren't ask to do anything ... */
 	if (!(pfns & range->flags[HMM_PFN_VALID]))
 		return;
-	/* If this is device memory then only fault if explicitly requested */
-	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
-		/* Do we fault on device memory ? */
-		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
-			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
-			*fault = true;
-		}
-		return;
-	}
 
 	/* If CPU page table is not valid then we need to fault */
 	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
@@ -262,21 +253,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		swp_entry_t entry = pte_to_swp_entry(pte);
 
 		/*
-		 * This is a special swap entry, ignore migration, use
-		 * device and report anything else as error.
+		 * Never fault in device private pages pages, but just report
+		 * the PFN even if not present.
 		 */
 		if (is_device_private_entry(entry)) {
-			cpu_flags = range->flags[HMM_PFN_VALID] |
-				range->flags[HMM_PFN_DEVICE_PRIVATE];
-			cpu_flags |= is_write_device_private_entry(entry) ?
-				range->flags[HMM_PFN_WRITE] : 0;
-			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-					   &fault, &write_fault);
-			if (fault || write_fault)
-				goto fault;
 			*pfn = hmm_device_entry_from_pfn(range,
 					    swp_offset(entry));
-			*pfn |= cpu_flags;
+			*pfn |= range->flags[HMM_PFN_VALID];
+			if (is_write_device_private_entry(entry))
+				*pfn |= range->flags[HMM_PFN_WRITE];
 			return 0;
 		}
 

From 08ddddda667b3b7aaac10641418283f78118c5cd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 16 Mar 2020 20:32:16 +0100
Subject: [PATCH 252/327] mm/hmm: check the device private page owner in
 hmm_range_fault()

hmm_range_fault() will succeed for any kind of device private memory, even
if it doesn't belong to the calling entity.  While nouveau has some crude
checks for that, they are broken because they assume nouveau is the only
user of device private memory.  Fix this by passing in an expected pgmap
owner in the hmm_range_fault structure.

If a device_private page is found and doesn't match the owner then it is
treated as an non-present and non-faultable page.

This prevents a bug in amdgpu, where it doesn't know how to handle
device_private pages, but hmm_range_fault would return them anyhow.

Fixes: 4ef589dc9b10 ("mm/hmm/devmem: device memory hotplug using ZONE_DEVICE")
Link: https://lore.kernel.org/r/20200316193216.920734-5-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 12 ------------
 include/linux/hmm.h                    |  2 ++
 mm/hmm.c                               | 10 +++++++++-
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index edfd0805fba4..ad89e09a0be3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -672,12 +672,6 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
 	return ret;
 }
 
-static inline bool
-nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
-{
-	return is_device_private_page(page) && drm->dmem == page_to_dmem(page);
-}
-
 void
 nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 			 struct hmm_range *range)
@@ -696,12 +690,6 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 		if (!is_device_private_page(page))
 			continue;
 
-		if (!nouveau_dmem_page(drm, page)) {
-			WARN(1, "Some unknown device memory !\n");
-			range->pfns[i] = 0;
-			continue;
-		}
-
 		addr = nouveau_dmem_page_addr(page);
 		range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
 		range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 5e6034f105c3..bb6be4428633 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -132,6 +132,7 @@ enum hmm_pfn_value_e {
  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
  * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
  * @valid: pfns array did not change since it has been fill by an HMM function
+ * @dev_private_owner: owner of device private pages
  */
 struct hmm_range {
 	struct mmu_interval_notifier *notifier;
@@ -144,6 +145,7 @@ struct hmm_range {
 	uint64_t		default_flags;
 	uint64_t		pfn_flags_mask;
 	uint8_t			pfn_shift;
+	void			*dev_private_owner;
 };
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 613b34950c30..a491d9aaafe4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -218,6 +218,14 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 		unsigned long end, uint64_t *pfns, pmd_t pmd);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline bool hmm_is_device_private_entry(struct hmm_range *range,
+		swp_entry_t entry)
+{
+	return is_device_private_entry(entry) &&
+		device_private_entry_to_page(entry)->pgmap->owner ==
+		range->dev_private_owner;
+}
+
 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
 {
 	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
@@ -256,7 +264,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		 * Never fault in device private pages pages, but just report
 		 * the PFN even if not present.
 		 */
-		if (is_device_private_entry(entry)) {
+		if (hmm_is_device_private_entry(range, entry)) {
 			*pfn = hmm_device_entry_from_pfn(range,
 					    swp_offset(entry));
 			*pfn |= range->flags[HMM_PFN_VALID];

From 987914ab841e2ec281a35b54348ab109b4c0bb4e Mon Sep 17 00:00:00 2001
From: Avihai Horon <avihaih@mellanox.com>
Date: Wed, 18 Mar 2020 12:17:41 +0200
Subject: [PATCH 253/327] RDMA/cm: Update num_paths in cma_resolve_iboe_route
 error flow

After a successful allocation of path_rec, num_paths is set to 1, but any
error after such allocation will leave num_paths uncleared.

This causes to de-referencing a NULL pointer later on. Hence, num_paths
needs to be set back to 0 if such an error occurs.

The following crash from syzkaller revealed it.

  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
  CPU: 0 PID: 357 Comm: syz-executor060 Not tainted 4.18.0+ #311
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
  RIP: 0010:ib_copy_path_rec_to_user+0x94/0x3e0
  Code: f1 f1 f1 f1 c7 40 0c 00 00 f4 f4 65 48 8b 04 25 28 00 00 00 48 89
  45 c8 31 c0 e8 d7 60 24 ff 48 8d 7b 4c 48 89 f8 48 c1 e8 03 <42> 0f b6
  14 30 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85
  RSP: 0018:ffff88006586f980 EFLAGS: 00010207
  RAX: 0000000000000009 RBX: 0000000000000000 RCX: 1ffff1000d5fe475
  RDX: ffff8800621e17c0 RSI: ffffffff820d45f9 RDI: 000000000000004c
  RBP: ffff88006586fa50 R08: ffffed000cb0df73 R09: ffffed000cb0df72
  R10: ffff88006586fa70 R11: ffffed000cb0df73 R12: 1ffff1000cb0df30
  R13: ffff88006586fae8 R14: dffffc0000000000 R15: ffff88006aff2200
  FS: 00000000016fc880(0000) GS:ffff88006d000000(0000)
  knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000020000040 CR3: 0000000063fec000 CR4: 00000000000006b0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
  ? ib_copy_path_rec_from_user+0xcc0/0xcc0
  ? __mutex_unlock_slowpath+0xfc/0x670
  ? wait_for_completion+0x3b0/0x3b0
  ? ucma_query_route+0x818/0xc60
  ucma_query_route+0x818/0xc60
  ? ucma_listen+0x1b0/0x1b0
  ? sched_clock_cpu+0x18/0x1d0
  ? sched_clock_cpu+0x18/0x1d0
  ? ucma_listen+0x1b0/0x1b0
  ? ucma_write+0x292/0x460
  ucma_write+0x292/0x460
  ? ucma_close_id+0x60/0x60
  ? sched_clock_cpu+0x18/0x1d0
  ? sched_clock_cpu+0x18/0x1d0
  __vfs_write+0xf7/0x620
  ? ucma_close_id+0x60/0x60
  ? kernel_read+0x110/0x110
  ? time_hardirqs_on+0x19/0x580
  ? lock_acquire+0x18b/0x3a0
  ? finish_task_switch+0xf3/0x5d0
  ? _raw_spin_unlock_irq+0x29/0x40
  ? _raw_spin_unlock_irq+0x29/0x40
  ? finish_task_switch+0x1be/0x5d0
  ? __switch_to_asm+0x34/0x70
  ? __switch_to_asm+0x40/0x70
  ? security_file_permission+0x172/0x1e0
  vfs_write+0x192/0x460
  ksys_write+0xc6/0x1a0
  ? __ia32_sys_read+0xb0/0xb0
  ? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
  ? do_syscall_64+0x1d/0x470
  do_syscall_64+0x9e/0x470
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 3c86aa70bf67 ("RDMA/cm: Add RDMA CM support for IBoE devices")
Link: https://lore.kernel.org/r/20200318101741.47211-1-leon@kernel.org
Signed-off-by: Avihai Horon <avihaih@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 4df75ab4ee9d..26e6f7df247b 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2978,6 +2978,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 err2:
 	kfree(route->path_rec);
 	route->path_rec = NULL;
+	route->num_paths = 0;
 err1:
 	kfree(work);
 	return ret;

From 23ab5261e29b6b95803ee8dc919ae76e260b358d Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 19 Mar 2020 16:46:41 +0100
Subject: [PATCH 254/327] IB/hfi1: Use scnprintf() for avoiding potential
 buffer overflow

Since snprintf() returns the would-be-output size instead of the actual
output size, the succeeding calls may go beyond the given buffer limit.
Fix it by replacing with scnprintf().

Link: https://lore.kernel.org/r/20200319154641.23711-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/fault.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index 986c12153e62..0dfbcfb048ca 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -222,11 +222,11 @@ static ssize_t fault_opcodes_read(struct file *file, char __user *buf,
 	while (bit < bitsize) {
 		zero = find_next_zero_bit(fault->opcodes, bitsize, bit);
 		if (zero - 1 != bit)
-			size += snprintf(data + size,
+			size += scnprintf(data + size,
 					 datalen - size - 1,
 					 "0x%lx-0x%lx,", bit, zero - 1);
 		else
-			size += snprintf(data + size,
+			size += scnprintf(data + size,
 					 datalen - size - 1, "0x%lx,",
 					 bit);
 		bit = find_next_bit(fault->opcodes, bitsize, zero);

From d0ca2c35dd15a3d989955caec02beea02f735ee6 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjunz@mellanox.com>
Date: Mon, 23 Mar 2020 13:28:00 +0200
Subject: [PATCH 255/327] RDMA/rxe: Set sys_image_guid to be aligned with HW IB
 devices

The RXE driver doesn't set sys_image_guid and user space applications see
zeros. This causes to pyverbs tests to fail with the following traceback,
because the IBTA spec requires to have valid sys_image_guid.

 Traceback (most recent call last):
   File "./tests/test_device.py", line 51, in test_query_device
     self.verify_device_attr(attr)
   File "./tests/test_device.py", line 74, in verify_device_attr
     assert attr.sys_image_guid != 0

In order to fix it, set sys_image_guid to be equal to node_guid.

Before:
 5: rxe0: ... node_guid 5054:00ff:feaa:5363 sys_image_guid
 0000:0000:0000:0000

After:
 5: rxe0: ... node_guid 5054:00ff:feaa:5363 sys_image_guid
 5054:00ff:feaa:5363

Fixes: 8700e3e7c485 ("Soft RoCE driver")
Link: https://lore.kernel.org/r/20200323112800.1444784-1-leon@kernel.org
Signed-off-by: Zhu Yanjun <yanjunz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/rxe/rxe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 0946a301a5c5..4afdd2e20883 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -103,6 +103,8 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
 	rxe->attr.max_fast_reg_page_list_len	= RXE_MAX_FMR_PAGE_LIST_LEN;
 	rxe->attr.max_pkeys			= RXE_MAX_PKEYS;
 	rxe->attr.local_ca_ack_delay		= RXE_LOCAL_CA_ACK_DELAY;
+	addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+			rxe->ndev->dev_addr);
 
 	rxe->max_ucontext			= RXE_MAX_UCONTEXT;
 }

From 26e28deb813eed908cf31a6052870b6493ec0e86 Mon Sep 17 00:00:00 2001
From: Sergey Gorenko <sergeygo@mellanox.com>
Date: Wed, 25 Mar 2020 15:12:10 +0000
Subject: [PATCH 256/327] IB/iser: Always check sig MR before putting it to the
 free pool

libiscsi calls the check_protection transport handler only if SCSI-Respose
is received. So, the handler is never called if iSCSI task is completed
for some other reason like a timeout or error handling. And this behavior
looks correct. But the iSER does not handle this case properly because it
puts a non-checked signature MR to the free pool. Then the error occurs at
reusing the MR because it is not allowed to invalidate a signature MR
without checking.

This commit adds an extra check to iser_unreg_mem_fastreg(), which is a
part of the task cleanup flow. Now the signature MR is checked there if it
is needed.

Link: https://lore.kernel.org/r/20200325151210.1548-1-sergeygo@mellanox.com
Signed-off-by: Sergey Gorenko <sergeygo@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/iser/iser_memory.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 7a8f24de3631..999ef7cdd05e 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -292,12 +292,27 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
 {
 	struct iser_device *device = iser_task->iser_conn->ib_conn.device;
 	struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
+	struct iser_fr_desc *desc;
+	struct ib_mr_status mr_status;
 
-	if (!reg->mem_h)
+	desc = reg->mem_h;
+	if (!desc)
 		return;
 
-	device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn,
-				     reg->mem_h);
+	/*
+	 * The signature MR cannot be invalidated and reused without checking.
+	 * libiscsi calls the check_protection transport handler only if
+	 * SCSI-Response is received. And the signature MR is not checked if
+	 * the task is completed for some other reason like a timeout or error
+	 * handling. That's why we must check the signature MR here before
+	 * putting it to the free pool.
+	 */
+	if (unlikely(desc->sig_protected)) {
+		desc->sig_protected = false;
+		ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS,
+				   &mr_status);
+	}
+	device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn, desc);
 	reg->mem_h = NULL;
 }
 

From ae1c61489c7fa02f32cc45464dfb5612fa812b32 Mon Sep 17 00:00:00 2001
From: Lijun Ou <oulijun@huawei.com>
Date: Fri, 20 Mar 2020 11:23:33 +0800
Subject: [PATCH 257/327] RDMA/hns: Unify format of prints

Use ibdev_err/dbg/warn() instead of dev_err/dbg/warn(), and modify some
prints into format of "failed to do something, ret = n".

Link: https://lore.kernel.org/r/1584674622-52773-2-git-send-email-liweihang@huawei.com
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 156 ++++++++++++---------
 drivers/infiniband/hw/hns/hns_roce_pd.c    |   6 +-
 2 files changed, 89 insertions(+), 73 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 9bd8fbf2e96b..94cb2984ad6d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -266,21 +266,24 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 static int check_send_valid(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_qp *hr_qp)
 {
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct ib_qp *ibqp = &hr_qp->ibqp;
-	struct device *dev = hr_dev->dev;
 
 	if (unlikely(ibqp->qp_type != IB_QPT_RC &&
 		     ibqp->qp_type != IB_QPT_GSI &&
 		     ibqp->qp_type != IB_QPT_UD)) {
-		dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type);
+		ibdev_err(ibdev, "Not supported QP(0x%x)type!\n",
+			  ibqp->qp_type);
 		return -EOPNOTSUPP;
 	} else if (unlikely(hr_qp->state == IB_QPS_RESET ||
 		   hr_qp->state == IB_QPS_INIT ||
 		   hr_qp->state == IB_QPS_RTR)) {
-		dev_err(dev, "Post WQE fail, QP state %d!\n", hr_qp->state);
+		ibdev_err(ibdev, "failed to post WQE, QP state %d!\n",
+			  hr_qp->state);
 		return -EINVAL;
 	} else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
-		dev_err(dev, "Post WQE fail, dev state %d!\n", hr_dev->state);
+		ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
+			  hr_dev->state);
 		return -EIO;
 	}
 
@@ -625,9 +628,9 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct hns_roce_rinl_sge *sge_list;
-	struct device *dev = hr_dev->dev;
 	unsigned long flags;
 	void *wqe = NULL;
 	u32 wqe_idx;
@@ -655,8 +658,8 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 		wqe_idx = (hr_qp->rq.head + nreq) & (hr_qp->rq.wqe_cnt - 1);
 
 		if (unlikely(wr->num_sge > hr_qp->rq.max_gs)) {
-			dev_err(dev, "rq:num_sge=%d > qp->sq.max_gs=%d\n",
-				wr->num_sge, hr_qp->rq.max_gs);
+			ibdev_err(ibdev, "rq:num_sge=%d >= qp->sq.max_gs=%d\n",
+				  wr->num_sge, hr_qp->rq.max_gs);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			goto out;
@@ -2440,7 +2443,9 @@ static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u8 port,
 
 	ret = hns_roce_config_sgid_table(hr_dev, gid_index, gid, sgid_type);
 	if (ret)
-		dev_err(hr_dev->dev, "Configure sgid table failed(%d)!\n", ret);
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to configure sgid table, ret = %d!\n",
+			  ret);
 
 	return ret;
 }
@@ -3022,8 +3027,9 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 	if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) {
 		hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
 		if (unlikely(!hr_qp)) {
-			dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n",
-				hr_cq->cqn, (qpn & HNS_ROCE_V2_CQE_QPN_MASK));
+			ibdev_err(&hr_dev->ib_dev,
+				  "CQ %06lx with entry for unknown QPN %06x\n",
+				  hr_cq->cqn, qpn & HNS_ROCE_V2_CQE_QPN_MASK);
 			return -EINVAL;
 		}
 		*cur_qp = hr_qp;
@@ -3125,8 +3131,8 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 	 */
 	if (wc->status != IB_WC_SUCCESS &&
 	    wc->status != IB_WC_WR_FLUSH_ERR) {
-		dev_err(hr_dev->dev, "error cqe status is: 0x%x\n",
-			status & HNS_ROCE_V2_CQE_STATUS_MASK);
+		ibdev_err(&hr_dev->ib_dev, "error cqe status is: 0x%x\n",
+			  status & HNS_ROCE_V2_CQE_STATUS_MASK);
 
 		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag))
 			init_flush_work(hr_dev, hr_qp);
@@ -3974,21 +3980,22 @@ static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev,
 				   struct hns_roce_qp *hr_qp, int mtt_cnt,
 				   u32 page_size)
 {
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 
 	if (hr_qp->rq.wqe_cnt < 1)
 		return true;
 
 	if (mtt_cnt < 1) {
-		dev_err(dev, "qp(0x%lx) rqwqe buf ba find failed\n",
-			hr_qp->qpn);
+		ibdev_err(ibdev, "failed to find RQWQE buf ba of QP(0x%lx)\n",
+			  hr_qp->qpn);
 		return false;
 	}
 
 	if (mtt_cnt < MTT_MIN_COUNT &&
 		(hr_qp->rq.offset + page_size) < hr_qp->buff_size) {
-		dev_err(dev, "qp(0x%lx) next rqwqe buf ba find failed\n",
-			hr_qp->qpn);
+		ibdev_err(ibdev,
+			  "failed to find next RQWQE buf ba of QP(0x%lx)\n",
+			  hr_qp->qpn);
 		return false;
 	}
 
@@ -4003,7 +4010,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u64 mtts[MTT_MIN_COUNT] = { 0 };
 	dma_addr_t dma_handle_3;
 	dma_addr_t dma_handle_2;
@@ -4030,7 +4037,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
 				     hr_qp->qpn, &dma_handle_2);
 	if (!mtts_2) {
-		dev_err(dev, "qp irrl_table find failed\n");
+		ibdev_err(ibdev, "failed to find QP irrl_table\n");
 		return -EINVAL;
 	}
 
@@ -4038,12 +4045,13 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 	mtts_3 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table,
 				     hr_qp->qpn, &dma_handle_3);
 	if (!mtts_3) {
-		dev_err(dev, "qp trrl_table find failed\n");
+		ibdev_err(ibdev, "failed to find QP trrl_table\n");
 		return -EINVAL;
 	}
 
 	if (attr_mask & IB_QP_ALT_PATH) {
-		dev_err(dev, "INIT2RTR attr_mask (0x%x) error\n", attr_mask);
+		ibdev_err(ibdev, "INIT2RTR attr_mask (0x%x) error\n",
+			  attr_mask);
 		return -EINVAL;
 	}
 
@@ -4246,7 +4254,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	u64 sge_cur_blk = 0;
 	u64 sq_cur_blk = 0;
 	u32 page_size;
@@ -4255,7 +4263,8 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 	/* Search qp buf's mtts */
 	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
 	if (count < 1) {
-		dev_err(dev, "qp(0x%lx) buf pa find failed\n", hr_qp->qpn);
+		ibdev_err(ibdev, "failed to find buf pa of QP(0x%lx)\n",
+			  hr_qp->qpn);
 		return -EINVAL;
 	}
 
@@ -4265,8 +4274,8 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 					  hr_qp->sge.offset / page_size,
 					  &sge_cur_blk, 1, NULL);
 		if (count < 1) {
-			dev_err(dev, "qp(0x%lx) sge pa find failed\n",
-				hr_qp->qpn);
+			ibdev_err(ibdev, "failed to find sge pa of QP(0x%lx)\n",
+				  hr_qp->qpn);
 			return -EINVAL;
 		}
 	}
@@ -4274,7 +4283,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 	/* Not support alternate path and path migration */
 	if ((attr_mask & IB_QP_ALT_PATH) ||
 	    (attr_mask & IB_QP_PATH_MIG_STATE)) {
-		dev_err(dev, "RTR2RTS attr_mask (0x%x)error\n", attr_mask);
+		ibdev_err(ibdev, "RTR2RTS attr_mask (0x%x)error\n", attr_mask);
 		return -EINVAL;
 	}
 
@@ -4392,6 +4401,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	const struct ib_gid_attr *gid_attr = NULL;
 	int is_roce_protocol;
 	u16 vlan_id = 0xffff;
@@ -4433,13 +4443,13 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
 
 	if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
-		dev_err(hr_dev->dev, "sgid_index(%u) too large. max is %d\n",
-			grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
+		ibdev_err(ibdev, "sgid_index(%u) too large. max is %d\n",
+			  grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
 		return -EINVAL;
 	}
 
 	if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
-		dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
+		ibdev_err(ibdev, "ah attr is not RDMA roce type\n");
 		return -EINVAL;
 	}
 
@@ -4517,7 +4527,7 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
 		/* Nothing */
 		;
 	} else {
-		dev_err(hr_dev->dev, "Illegal state for QP!\n");
+		ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4552,8 +4562,8 @@ static int hns_roce_v2_set_opt_fields(struct ib_qp *ibqp,
 				       V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
 				       0);
 		} else {
-			dev_warn(hr_dev->dev,
-				 "Local ACK timeout shall be 0 to 30.\n");
+			ibdev_warn(&hr_dev->ib_dev,
+				   "Local ACK timeout shall be 0 to 30.\n");
 		}
 	}
 
@@ -4721,7 +4731,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	struct hns_roce_v2_qp_context ctx[2];
 	struct hns_roce_v2_qp_context *context = ctx;
 	struct hns_roce_v2_qp_context *qpc_mask = ctx + 1;
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	unsigned long sq_flag = 0;
 	unsigned long rq_flag = 0;
 	int ret;
@@ -4785,7 +4795,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	/* SW pass context to HW */
 	ret = hns_roce_v2_qp_modify(hr_dev, ctx, hr_qp);
 	if (ret) {
-		dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to modify QP, ret = %d\n", ret);
 		goto out;
 	}
 
@@ -4842,10 +4852,8 @@ static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev,
 	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0,
 				HNS_ROCE_CMD_QUERY_QPC,
 				HNS_ROCE_CMD_TIMEOUT_MSECS);
-	if (ret) {
-		dev_err(hr_dev->dev, "QUERY QP cmd process error\n");
+	if (ret)
 		goto out;
-	}
 
 	memcpy(hr_context, mailbox->buf, sizeof(*hr_context));
 
@@ -4861,7 +4869,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_v2_qp_context context = {};
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int tmp_qp_state;
 	int state;
 	int ret;
@@ -4879,7 +4887,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 
 	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp, &context);
 	if (ret) {
-		dev_err(dev, "query qpc error\n");
+		ibdev_err(ibdev, "failed to query QPC, ret = %d\n", ret);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4888,7 +4896,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 			       V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S);
 	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
 	if (tmp_qp_state == -1) {
-		dev_err(dev, "Illegal ib_qp_state\n");
+		ibdev_err(ibdev, "Illegal ib_qp_state\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4986,8 +4994,8 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 					 struct hns_roce_qp *hr_qp,
 					 struct ib_udata *udata)
 {
-	struct hns_roce_cq *send_cq, *recv_cq;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_cq *send_cq, *recv_cq;
 	unsigned long flags;
 	int ret = 0;
 
@@ -4996,7 +5004,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
 					    hr_qp->state, IB_QPS_RESET);
 		if (ret)
-			ibdev_err(ibdev, "modify QP to Reset failed.\n");
+			ibdev_err(ibdev,
+				  "failed to modify QP to RST, ret = %d\n",
+				  ret);
 	}
 
 	send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
@@ -5033,7 +5043,8 @@ static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 
 	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
 	if (ret)
-		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx failed(%d)\n",
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to destroy QP 0x%06lx, ret = %d\n",
 			  hr_qp->qpn, ret);
 
 	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
@@ -5042,8 +5053,9 @@ static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 }
 
 static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
-						struct hns_roce_qp *hr_qp)
+					    struct hns_roce_qp *hr_qp)
 {
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_sccc_clr_done *resp;
 	struct hns_roce_sccc_clr *clr;
 	struct hns_roce_cmq_desc desc;
@@ -5055,7 +5067,7 @@ static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
 	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_RESET_SCCC, false);
 	ret =  hns_roce_cmq_send(hr_dev, &desc, 1);
 	if (ret) {
-		dev_err(hr_dev->dev, "Reset SCC ctx  failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to reset SCC ctx, ret = %d\n", ret);
 		goto out;
 	}
 
@@ -5065,7 +5077,7 @@ static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
 	clr->qpn = cpu_to_le32(hr_qp->qpn);
 	ret =  hns_roce_cmq_send(hr_dev, &desc, 1);
 	if (ret) {
-		dev_err(hr_dev->dev, "Clear SCC ctx failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to clear SCC ctx, ret = %d\n", ret);
 		goto out;
 	}
 
@@ -5076,7 +5088,8 @@ static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
 					      HNS_ROCE_OPC_QUERY_SCCC, true);
 		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
 		if (ret) {
-			dev_err(hr_dev->dev, "Query clr cmq failed(%d)\n", ret);
+			ibdev_err(ibdev, "failed to query clr cmq, ret = %d\n",
+				  ret);
 			goto out;
 		}
 
@@ -5086,7 +5099,7 @@ static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
 		msleep(20);
 	}
 
-	dev_err(hr_dev->dev, "Query SCC clr done flag overtime.\n");
+	ibdev_err(ibdev, "Query SCC clr done flag overtime.\n");
 	ret = -ETIMEDOUT;
 
 out:
@@ -5130,7 +5143,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 				HNS_ROCE_CMD_TIMEOUT_MSECS);
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	if (ret)
-		dev_err(hr_dev->dev, "MODIFY CQ Failed to cmd mailbox.\n");
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to process cmd when modifying CQ, ret = %d\n",
+			  ret);
 
 	return ret;
 }
@@ -5139,54 +5154,54 @@ static void hns_roce_irq_work_handle(struct work_struct *work)
 {
 	struct hns_roce_work *irq_work =
 				container_of(work, struct hns_roce_work, work);
-	struct device *dev = irq_work->hr_dev->dev;
+	struct ib_device *ibdev = &irq_work->hr_dev->ib_dev;
 	u32 qpn = irq_work->qpn;
 	u32 cqn = irq_work->cqn;
 
 	switch (irq_work->event_type) {
 	case HNS_ROCE_EVENT_TYPE_PATH_MIG:
-		dev_info(dev, "Path migrated succeeded.\n");
+		ibdev_info(ibdev, "Path migrated succeeded.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
-		dev_warn(dev, "Path migration failed.\n");
+		ibdev_warn(ibdev, "Path migration failed.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_COMM_EST:
 		break;
 	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
-		dev_warn(dev, "Send queue drained.\n");
+		ibdev_warn(ibdev, "Send queue drained.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n",
-			qpn, irq_work->sub_type);
+		ibdev_err(ibdev, "Local work queue 0x%x catast error, sub_event type is: %d\n",
+			  qpn, irq_work->sub_type);
 		break;
 	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		dev_err(dev, "Invalid request local work queue 0x%x error.\n",
-			qpn);
+		ibdev_err(ibdev, "Invalid request local work queue 0x%x error.\n",
+			  qpn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-		dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n",
-			qpn, irq_work->sub_type);
+		ibdev_err(ibdev, "Local access violation work queue 0x%x error, sub_event type is: %d\n",
+			  qpn, irq_work->sub_type);
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
-		dev_warn(dev, "SRQ limit reach.\n");
+		ibdev_warn(ibdev, "SRQ limit reach.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
-		dev_warn(dev, "SRQ last wqe reach.\n");
+		ibdev_warn(ibdev, "SRQ last wqe reach.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
-		dev_err(dev, "SRQ catas error.\n");
+		ibdev_err(ibdev, "SRQ catas error.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		dev_err(dev, "CQ 0x%x access err.\n", cqn);
+		ibdev_err(ibdev, "CQ 0x%x access err.\n", cqn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		ibdev_warn(ibdev, "CQ 0x%x overflow\n", cqn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
-		dev_warn(dev, "DB overflow.\n");
+		ibdev_warn(ibdev, "DB overflow.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_FLR:
-		dev_warn(dev, "Function level reset.\n");
+		ibdev_warn(ibdev, "Function level reset.\n");
 		break;
 	default:
 		break;
@@ -6119,8 +6134,9 @@ static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
 					HNS_ROCE_CMD_TIMEOUT_MSECS);
 		hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 		if (ret) {
-			dev_err(hr_dev->dev,
-				"MODIFY SRQ Failed to cmd mailbox.\n");
+			ibdev_err(&hr_dev->ib_dev,
+				  "failed to process cmd when modifying SRQ, ret = %d\n",
+				  ret);
 			return ret;
 		}
 	}
@@ -6146,7 +6162,9 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
 				HNS_ROCE_CMD_QUERY_SRQC,
 				HNS_ROCE_CMD_TIMEOUT_MSECS);
 	if (ret) {
-		dev_err(hr_dev->dev, "QUERY SRQ cmd process error\n");
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to process cmd when querying SRQ, ret = %d\n",
+			  ret);
 		goto out;
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index 780c780fdb22..b10c50b8736e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -60,14 +60,12 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev)
 int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct ib_device *ib_dev = ibpd->device;
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_pd *pd = to_hr_pd(ibpd);
 	int ret;
 
 	ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn);
 	if (ret) {
-		dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n");
+		ibdev_err(ib_dev, "failed to alloc pd, ret = %d\n", ret);
 		return ret;
 	}
 
@@ -76,7 +74,7 @@ int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 
 		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
 			hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn);
-			dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n");
+			ibdev_err(ib_dev, "failed to copy to udata\n");
 			return -EFAULT;
 		}
 	}

From 30d41e18c3a6eac39b7348ae7e3388f0ec7e680e Mon Sep 17 00:00:00 2001
From: Weihang Li <liweihang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:34 +0800
Subject: [PATCH 258/327] RDMA/hns: Fix a wrong judgment of return value

hns_roce_alloc_mtt_range() never return -1, ret should be checked
whether it is zero instead of -1.

Fixes: 1ceb0b11a8a2 ("RDMA/hns: Fix non-standard error codes")
Link: https://lore.kernel.org/r/1584674622-52773-3-git-send-email-liweihang@huawei.com
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index b9898e71655a..176f34692f88 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -243,7 +243,7 @@ int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift,
 	/* Allocate MTT entry */
 	ret = hns_roce_alloc_mtt_range(hr_dev, mtt->order, &mtt->first_seg,
 				       mtt->mtt_type);
-	if (ret == -1)
+	if (ret)
 		return -ENOMEM;
 
 	return 0;

From d398d4ca5f17e7ba762bc897c82afca868e47a7c Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:35 +0800
Subject: [PATCH 259/327] RDMA/hns: Simplify attribute judgment code

Combine attribute flags before masking them.

Link: https://lore.kernel.org/r/1584674622-52773-4-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 94cb2984ad6d..518a6491c03f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -4281,8 +4281,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
 	}
 
 	/* Not support alternate path and path migration */
-	if ((attr_mask & IB_QP_ALT_PATH) ||
-	    (attr_mask & IB_QP_PATH_MIG_STATE)) {
+	if (attr_mask & (IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE)) {
 		ibdev_err(ibdev, "RTR2RTS attr_mask (0x%x)error\n", attr_mask);
 		return -EINVAL;
 	}

From 99e713f8daf8e0ddb728ba543a05a2b67d8c47cc Mon Sep 17 00:00:00 2001
From: Lijun Ou <oulijun@huawei.com>
Date: Fri, 20 Mar 2020 11:23:36 +0800
Subject: [PATCH 260/327] RDMA/hns: Optimize hns_roce_alloc_vf_resource()

The capbilities of hardware should be got at first and then used in
hns_roce_alloc_vf_resource(). Also removes an unnecessary if ... else
condition in it.

Link: https://lore.kernel.org/r/1584674622-52773-5-git-send-email-liweihang@huawei.com
Signed-off-by: Lijun Ou <oulijun@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 143 +++++++++------------
 1 file changed, 62 insertions(+), 81 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 518a6491c03f..aff7c5da7080 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1432,82 +1432,63 @@ static int hns_roce_alloc_vf_resource(struct hns_roce_dev *hr_dev)
 			desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
 		else
 			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
-
-		if (i == 0) {
-			roce_set_field(req_a->vf_qpc_bt_idx_num,
-				       VF_RES_A_DATA_1_VF_QPC_BT_IDX_M,
-				       VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_qpc_bt_idx_num,
-				       VF_RES_A_DATA_1_VF_QPC_BT_NUM_M,
-				       VF_RES_A_DATA_1_VF_QPC_BT_NUM_S,
-				       HNS_ROCE_VF_QPC_BT_NUM);
-
-			roce_set_field(req_a->vf_srqc_bt_idx_num,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_srqc_bt_idx_num,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S,
-				       HNS_ROCE_VF_SRQC_BT_NUM);
-
-			roce_set_field(req_a->vf_cqc_bt_idx_num,
-				       VF_RES_A_DATA_3_VF_CQC_BT_IDX_M,
-				       VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_cqc_bt_idx_num,
-				       VF_RES_A_DATA_3_VF_CQC_BT_NUM_M,
-				       VF_RES_A_DATA_3_VF_CQC_BT_NUM_S,
-				       HNS_ROCE_VF_CQC_BT_NUM);
-
-			roce_set_field(req_a->vf_mpt_bt_idx_num,
-				       VF_RES_A_DATA_4_VF_MPT_BT_IDX_M,
-				       VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_mpt_bt_idx_num,
-				       VF_RES_A_DATA_4_VF_MPT_BT_NUM_M,
-				       VF_RES_A_DATA_4_VF_MPT_BT_NUM_S,
-				       HNS_ROCE_VF_MPT_BT_NUM);
-
-			roce_set_field(req_a->vf_eqc_bt_idx_num,
-				       VF_RES_A_DATA_5_VF_EQC_IDX_M,
-				       VF_RES_A_DATA_5_VF_EQC_IDX_S, 0);
-			roce_set_field(req_a->vf_eqc_bt_idx_num,
-				       VF_RES_A_DATA_5_VF_EQC_NUM_M,
-				       VF_RES_A_DATA_5_VF_EQC_NUM_S,
-				       HNS_ROCE_VF_EQC_NUM);
-		} else {
-			roce_set_field(req_b->vf_smac_idx_num,
-				       VF_RES_B_DATA_1_VF_SMAC_IDX_M,
-				       VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0);
-			roce_set_field(req_b->vf_smac_idx_num,
-				       VF_RES_B_DATA_1_VF_SMAC_NUM_M,
-				       VF_RES_B_DATA_1_VF_SMAC_NUM_S,
-				       HNS_ROCE_VF_SMAC_NUM);
-
-			roce_set_field(req_b->vf_sgid_idx_num,
-				       VF_RES_B_DATA_2_VF_SGID_IDX_M,
-				       VF_RES_B_DATA_2_VF_SGID_IDX_S, 0);
-			roce_set_field(req_b->vf_sgid_idx_num,
-				       VF_RES_B_DATA_2_VF_SGID_NUM_M,
-				       VF_RES_B_DATA_2_VF_SGID_NUM_S,
-				       HNS_ROCE_VF_SGID_NUM);
-
-			roce_set_field(req_b->vf_qid_idx_sl_num,
-				       VF_RES_B_DATA_3_VF_QID_IDX_M,
-				       VF_RES_B_DATA_3_VF_QID_IDX_S, 0);
-			roce_set_field(req_b->vf_qid_idx_sl_num,
-				       VF_RES_B_DATA_3_VF_SL_NUM_M,
-				       VF_RES_B_DATA_3_VF_SL_NUM_S,
-				       HNS_ROCE_VF_SL_NUM);
-
-			roce_set_field(req_b->vf_sccc_idx_num,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
-			roce_set_field(req_b->vf_sccc_idx_num,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
-				       HNS_ROCE_VF_SCCC_BT_NUM);
-		}
 	}
 
+	roce_set_field(req_a->vf_qpc_bt_idx_num,
+		       VF_RES_A_DATA_1_VF_QPC_BT_IDX_M,
+		       VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_qpc_bt_idx_num,
+		       VF_RES_A_DATA_1_VF_QPC_BT_NUM_M,
+		       VF_RES_A_DATA_1_VF_QPC_BT_NUM_S, HNS_ROCE_VF_QPC_BT_NUM);
+
+	roce_set_field(req_a->vf_srqc_bt_idx_num,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_srqc_bt_idx_num,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S,
+		       HNS_ROCE_VF_SRQC_BT_NUM);
+
+	roce_set_field(req_a->vf_cqc_bt_idx_num,
+		       VF_RES_A_DATA_3_VF_CQC_BT_IDX_M,
+		       VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_cqc_bt_idx_num,
+		       VF_RES_A_DATA_3_VF_CQC_BT_NUM_M,
+		       VF_RES_A_DATA_3_VF_CQC_BT_NUM_S, HNS_ROCE_VF_CQC_BT_NUM);
+
+	roce_set_field(req_a->vf_mpt_bt_idx_num,
+		       VF_RES_A_DATA_4_VF_MPT_BT_IDX_M,
+		       VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_mpt_bt_idx_num,
+		       VF_RES_A_DATA_4_VF_MPT_BT_NUM_M,
+		       VF_RES_A_DATA_4_VF_MPT_BT_NUM_S, HNS_ROCE_VF_MPT_BT_NUM);
+
+	roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_IDX_M,
+		       VF_RES_A_DATA_5_VF_EQC_IDX_S, 0);
+	roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_NUM_M,
+		       VF_RES_A_DATA_5_VF_EQC_NUM_S, HNS_ROCE_VF_EQC_NUM);
+
+	roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_IDX_M,
+		       VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0);
+	roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_NUM_M,
+		       VF_RES_B_DATA_1_VF_SMAC_NUM_S, HNS_ROCE_VF_SMAC_NUM);
+
+	roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_IDX_M,
+		       VF_RES_B_DATA_2_VF_SGID_IDX_S, 0);
+	roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_NUM_M,
+		       VF_RES_B_DATA_2_VF_SGID_NUM_S, HNS_ROCE_VF_SGID_NUM);
+
+	roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_QID_IDX_M,
+		       VF_RES_B_DATA_3_VF_QID_IDX_S, 0);
+	roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_SL_NUM_M,
+		       VF_RES_B_DATA_3_VF_SL_NUM_S, HNS_ROCE_VF_SL_NUM);
+
+	roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
+		       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
+	roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
+		       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
+		       HNS_ROCE_VF_SCCC_BT_NUM);
+
 	return hns_roce_cmq_send(hr_dev, desc, 2);
 }
 
@@ -2001,13 +1982,6 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 		}
 	}
 
-	ret = hns_roce_alloc_vf_resource(hr_dev);
-	if (ret) {
-		dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n",
-			ret);
-		return ret;
-	}
-
 	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
 	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
 
@@ -2028,6 +2002,13 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 	if (ret)
 		set_default_caps(hr_dev);
 
+	ret = hns_roce_alloc_vf_resource(hr_dev);
+	if (ret) {
+		dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n",
+			ret);
+		return ret;
+	}
+
 	ret = hns_roce_v2_set_bt(hr_dev);
 	if (ret)
 		dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n",

From fd72926c332eaa28845b1f655b24006158ec5207 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:37 +0800
Subject: [PATCH 261/327] RDMA/hns: Adjust the qp status value sequence of the
 hardware

Interchange SQD and SQE to match the protocol.

Link: https://lore.kernel.org/r/1584674622-52773-6-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 2a117ff6a6be..83e94df7f190 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -460,8 +460,8 @@ enum hns_roce_v2_qp_state {
 	HNS_ROCE_QP_ST_INIT,
 	HNS_ROCE_QP_ST_RTR,
 	HNS_ROCE_QP_ST_RTS,
-	HNS_ROCE_QP_ST_SQER,
 	HNS_ROCE_QP_ST_SQD,
+	HNS_ROCE_QP_ST_SQER,
 	HNS_ROCE_QP_ST_ERR,
 	HNS_ROCE_QP_ST_SQ_DRAINING,
 	HNS_ROCE_QP_NUM_ST

From f91b9196875262f256cc018bb55585bd1e473fd8 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:38 +0800
Subject: [PATCH 262/327] RDMA/hns: Remove definition of cq doorbell structure

The struct hns_roce_v2_cq_db is unused, it should be removed.

Link: https://lore.kernel.org/r/1584674622-52773-7-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 83e94df7f190..7c999536a4c5 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -1056,11 +1056,6 @@ struct hns_roce_v2_mpt_entry {
 #define V2_DB_PARAMETER_SL_S 16
 #define V2_DB_PARAMETER_SL_M GENMASK(18, 16)
 
-struct hns_roce_v2_cq_db {
-	__le32	byte_4;
-	__le32	parameter;
-};
-
 #define	V2_CQ_DB_BYTE_4_TAG_S 0
 #define V2_CQ_DB_BYTE_4_TAG_M GENMASK(23, 0)
 

From bceda6e67b8f211419876c64ce2125f683927e64 Mon Sep 17 00:00:00 2001
From: Wenpeng Liang <liangwenpeng@huawei.com>
Date: Fri, 20 Mar 2020 11:23:39 +0800
Subject: [PATCH 263/327] RDMA/hns: Remove meaningless prints

ceq and aeq is a ring buffer, consumer index of them will be set to zero
after reaching the maximum value. The warning should be removed or it may
mislead the users.

Link: https://lore.kernel.org/r/1584674622-52773-8-git-send-email-liweihang@huawei.com
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 9 ++-------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 +----
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 2e5304502a08..5ff028d77be3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -3935,10 +3935,8 @@ static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev,
 		eq->cons_index++;
 		aeqes_found = 1;
 
-		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) {
-			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1)
 			eq->cons_index = 0;
-		}
 	}
 
 	set_eq_cons_index_v1(eq, 0);
@@ -3988,11 +3986,8 @@ static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev,
 		ceqes_found = 1;
 
 		if (eq->cons_index >
-		    EQ_DEPTH_COEFF * hr_dev->caps.ceqe_depth - 1) {
-			dev_warn(&eq->hr_dev->pdev->dev,
-				"cons_index overflow, set back to 0.\n");
+		    EQ_DEPTH_COEFF * hr_dev->caps.ceqe_depth - 1)
 			eq->cons_index = 0;
-		}
 	}
 
 	set_eq_cons_index_v1(eq, 0);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index aff7c5da7080..bd14e71f4566 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -5361,7 +5361,6 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
 static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_eq *eq)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq);
 	int ceqe_found = 0;
 	u32 cqn;
@@ -5380,10 +5379,8 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
 		++eq->cons_index;
 		ceqe_found = 1;
 
-		if (eq->cons_index > (EQ_DEPTH_COEFF * eq->entries - 1)) {
-			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+		if (eq->cons_index > (EQ_DEPTH_COEFF * eq->entries - 1))
 			eq->cons_index = 0;
-		}
 
 		ceqe = next_ceqe_sw_v2(eq);
 	}

From f4c5d869c8270703c3f3267242e4cc968c723370 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:40 +0800
Subject: [PATCH 264/327] RDMA/hns: Remove redundant qpc setup operations

Before calling modify_qp_reset_to_init(), the entire qpc mask has been
cleared, so it is no longer necessary to clear the specific fields in the
mask.

Link: https://lore.kernel.org/r/1584674622-52773-9-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 237 +--------------------
 1 file changed, 1 insertion(+), 236 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index bd14e71f4566..2b03c722c2d9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3541,14 +3541,9 @@ static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
 			       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
 			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
 
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
-
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
 		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
@@ -3556,9 +3551,6 @@ static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
 		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT ||
 		       hr_qp->ibqp.srq) ? 0 :
 		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
 }
 
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
@@ -3578,280 +3570,53 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
 	 */
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type));
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
-		       V2_QPC_BYTE_4_TST_S, 0);
 
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
-		       V2_QPC_BYTE_4_SQPN_S, 0);
 
 	roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
 		       V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
-	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
-		       V2_QPC_BYTE_16_PD_S, 0);
 
 	roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
 		       V2_QPC_BYTE_20_RQWS_S, ilog2(hr_qp->rq.max_gs));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
-		       V2_QPC_BYTE_20_RQWS_S, 0);
 
 	set_qpc_wqe_cnt(hr_qp, context, qpc_mask);
 
 	/* No VLAN need to set 0xFFF */
 	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
 		       V2_QPC_BYTE_24_VLAN_ID_S, 0xfff);
-	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
-		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
 
-	/*
-	 * Set some fields in context to zero, Because the default values
-	 * of all fields in context are zero, we need not set them to 0 again.
-	 * but we should set the relevant fields of context mask to 0.
-	 */
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_TX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_RX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0);
-
-	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M,
-		       V2_QPC_BYTE_60_TEMPID_S, 0);
-
-	roce_set_field(qpc_mask->byte_60_qpst_tempid,
-		       V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S,
-		       0);
-	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
-		     V2_QPC_BYTE_60_SQ_DB_DOING_S, 0);
-	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
-		     V2_QPC_BYTE_60_RQ_DB_DOING_S, 0);
-	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
-	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
-
-	if (hr_qp->rdb_en) {
+	if (hr_qp->rdb_en)
 		roce_set_bit(context->byte_68_rq_db,
 			     V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1);
-		roce_set_bit(qpc_mask->byte_68_rq_db,
-			     V2_QPC_BYTE_68_RQ_RECORD_EN_S, 0);
-	}
 
 	roce_set_field(context->byte_68_rq_db,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S,
 		       ((u32)hr_qp->rdb.dma) >> 1);
-	roce_set_field(qpc_mask->byte_68_rq_db,
-		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M,
-		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, 0);
 	context->rq_db_record_addr = cpu_to_le32(hr_qp->rdb.dma >> 32);
-	qpc_mask->rq_db_record_addr = 0;
 
 	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S,
 		    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) ? 1 : 0);
-	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0);
 
 	roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
 		       V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn);
-	roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
-		       V2_QPC_BYTE_80_RX_CQN_S, 0);
 	if (ibqp->srq) {
 		roce_set_field(context->byte_76_srqn_op_en,
 			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S,
 			       to_hr_srq(ibqp->srq)->srqn);
-		roce_set_field(qpc_mask->byte_76_srqn_op_en,
-			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0);
 		roce_set_bit(context->byte_76_srqn_op_en,
 			     V2_QPC_BYTE_76_SRQ_EN_S, 1);
-		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
-			     V2_QPC_BYTE_76_SRQ_EN_S, 0);
 	}
 
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0);
-
-	roce_set_field(qpc_mask->byte_92_srq_info, V2_QPC_BYTE_92_SRQ_INFO_M,
-		       V2_QPC_BYTE_92_SRQ_INFO_S, 0);
-
-	roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M,
-		       V2_QPC_BYTE_96_RX_REQ_MSN_S, 0);
-
-	roce_set_field(qpc_mask->byte_104_rq_sge,
-		       V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_M,
-		       V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_S, 0);
-
-	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
-		     V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0);
-	roce_set_field(qpc_mask->byte_108_rx_reqepsn,
-		       V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_M,
-		       V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_S, 0);
-	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
-		     V2_QPC_BYTE_108_RX_REQ_RNR_S, 0);
-
-	qpc_mask->rq_rnr_timer = 0;
-	qpc_mask->rx_msg_len = 0;
-	qpc_mask->rx_rkey_pkt_info = 0;
-	qpc_mask->rx_va = 0;
-
-	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M,
-		       V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0);
-	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
-		       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S,
-		     0);
-	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M,
-		       V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0);
-	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M,
-		       V2_QPC_BYTE_140_RAQ_TRRL_TAIL_S, 0);
-
-	roce_set_field(qpc_mask->byte_144_raq,
-		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M,
-		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0);
-	roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M,
-		       V2_QPC_BYTE_144_RAQ_CREDIT_S, 0);
-	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0);
-
-	roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RQ_MSN_M,
-		       V2_QPC_BYTE_148_RQ_MSN_S, 0);
-	roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RAQ_SYNDROME_M,
-		       V2_QPC_BYTE_148_RAQ_SYNDROME_S, 0);
-
-	roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-		       V2_QPC_BYTE_152_RAQ_PSN_S, 0);
-	roce_set_field(qpc_mask->byte_152_raq,
-		       V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M,
-		       V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S, 0);
-
-	roce_set_field(qpc_mask->byte_156_raq, V2_QPC_BYTE_156_RAQ_USE_PKTN_M,
-		       V2_QPC_BYTE_156_RAQ_USE_PKTN_S, 0);
-
-	roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-		       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M,
-		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_SQ_INVLD_FLG_S, 0);
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_IRRL_IDX_LSB_M,
-		       V2_QPC_BYTE_168_IRRL_IDX_LSB_S, 0);
-
 	roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
 		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 4);
-	roce_set_field(qpc_mask->byte_172_sq_psn,
-		       V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
-		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 0);
-
-	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S,
-		     0);
 
 	roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1);
-	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0);
-
-	roce_set_field(qpc_mask->byte_176_msg_pktn,
-		       V2_QPC_BYTE_176_MSG_USE_PKTN_M,
-		       V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0);
-	roce_set_field(qpc_mask->byte_176_msg_pktn,
-		       V2_QPC_BYTE_176_IRRL_HEAD_PRE_M,
-		       V2_QPC_BYTE_176_IRRL_HEAD_PRE_S, 0);
-
-	roce_set_field(qpc_mask->byte_184_irrl_idx,
-		       V2_QPC_BYTE_184_IRRL_IDX_MSB_M,
-		       V2_QPC_BYTE_184_IRRL_IDX_MSB_S, 0);
-
-	qpc_mask->cur_sge_offset = 0;
-
-	roce_set_field(qpc_mask->byte_192_ext_sge,
-		       V2_QPC_BYTE_192_CUR_SGE_IDX_M,
-		       V2_QPC_BYTE_192_CUR_SGE_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_192_ext_sge,
-		       V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_M,
-		       V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_S, 0);
-
-	roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M,
-		       V2_QPC_BYTE_196_IRRL_HEAD_S, 0);
-
-	roce_set_field(qpc_mask->byte_200_sq_max, V2_QPC_BYTE_200_SQ_MAX_IDX_M,
-		       V2_QPC_BYTE_200_SQ_MAX_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_200_sq_max,
-		       V2_QPC_BYTE_200_LCL_OPERATED_CNT_M,
-		       V2_QPC_BYTE_200_LCL_OPERATED_CNT_S, 0);
-
-	roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RNR_FLG_S, 0);
-	roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RTY_FLG_S, 0);
-
-	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M,
-		       V2_QPC_BYTE_212_CHECK_FLG_S, 0);
-
-	qpc_mask->sq_timer = 0;
-
-	roce_set_field(qpc_mask->byte_220_retry_psn_msn,
-		       V2_QPC_BYTE_220_RETRY_MSG_MSN_M,
-		       V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0);
-	roce_set_field(qpc_mask->byte_232_irrl_sge,
-		       V2_QPC_BYTE_232_IRRL_SGE_IDX_M,
-		       V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_232_irrl_sge,
-		     V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0);
-	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S,
-		     0);
-
-	qpc_mask->irrl_cur_sge_offset = 0;
-
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_M,
-		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0);
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_IRRL_TAIL_RD_M,
-		       V2_QPC_BYTE_240_IRRL_TAIL_RD_S, 0);
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_RX_ACK_MSN_M,
-		       V2_QPC_BYTE_240_RX_ACK_MSN_S, 0);
-
-	roce_set_field(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_M,
-		       V2_QPC_BYTE_248_IRRL_PSN_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_ACK_PSN_ERR_S,
-		     0);
-	roce_set_field(qpc_mask->byte_248_ack_psn,
-		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M,
-		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_VLD_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn,
-		     V2_QPC_BYTE_248_RNR_RETRY_FLAG_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_CQ_ERR_IND_S,
-		     0);
 
 	hr_qp->access_flags = attr->qp_access_flags;
 	roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
 		       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
-	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
-		       V2_QPC_BYTE_252_TX_CQN_S, 0);
-
-	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_ERR_TYPE_M,
-		       V2_QPC_BYTE_252_ERR_TYPE_S, 0);
-
-	roce_set_field(qpc_mask->byte_256_sqflush_rqcqe,
-		       V2_QPC_BYTE_256_RQ_CQE_IDX_M,
-		       V2_QPC_BYTE_256_RQ_CQE_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_256_sqflush_rqcqe,
-		       V2_QPC_BYTE_256_SQ_FLUSH_IDX_M,
-		       V2_QPC_BYTE_256_SQ_FLUSH_IDX_S, 0);
 }
 
 static void modify_qp_init_to_init(struct ib_qp *ibqp,

From cd4a70bb7d19063f63c8189bef08e2149f116ef0 Mon Sep 17 00:00:00 2001
From: Weihang Li <liweihang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:41 +0800
Subject: [PATCH 265/327] RDMA/hns: Remove redundant assignment of wc->smac
 when polling cq

The field smac in ib_wc was used for create AH and then it will be treated
as destination mac address in UD sqwqe, but related code about filling smac
into AH has been removed in core. Actually, the dmac in UD sqwqe is parsed
from the dgid in grh which is passed in by ULP now, so this assignment
should be removed.

Link: https://lore.kernel.org/r/1584674622-52773-10-git-send-email-liweihang@huawei.com
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 2b03c722c2d9..31b6146bf975 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3236,14 +3236,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->port_num = roce_get_field(cqe->byte_32,
 				V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
 		wc->pkey_index = 0;
-		memcpy(wc->smac, cqe->smac, 4);
-		wc->smac[4] = roce_get_field(cqe->byte_28,
-					     V2_CQE_BYTE_28_SMAC_4_M,
-					     V2_CQE_BYTE_28_SMAC_4_S);
-		wc->smac[5] = roce_get_field(cqe->byte_28,
-					     V2_CQE_BYTE_28_SMAC_5_M,
-					     V2_CQE_BYTE_28_SMAC_5_S);
-		wc->wc_flags |= IB_WC_WITH_SMAC;
+
 		if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
 			wc->vlan_id = (u16)roce_get_field(cqe->byte_28,
 							  V2_CQE_BYTE_28_VID_M,

From e0b0722643fc3bb7e2401a868cc28e65c31ce07d Mon Sep 17 00:00:00 2001
From: Weihang Li <liweihang@huawei.com>
Date: Fri, 20 Mar 2020 11:23:42 +0800
Subject: [PATCH 266/327] RDMA/hns: Remove redundant judgment of qp_type

Type of qp has been checked in check_send_valid(), so this judgment should
be removed.

Link: https://lore.kernel.org/r/1584674622-52773-11-git-send-email-liweihang@huawei.com
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 31b6146bf975..7eceeea9ccea 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -583,13 +583,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 			ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit);
 		else if (ibqp->qp_type == IB_QPT_RC)
 			ret = set_rc_wqe(qp, wr, wqe, &sge_idx, owner_bit);
-		else {
-			ibdev_err(ibdev, "Illegal qp_type(0x%x)\n",
-				  ibqp->qp_type);
-			spin_unlock_irqrestore(&qp->sq.lock, flags);
-			*bad_wr = wr;
-			return -EOPNOTSUPP;
-		}
 
 		if (ret) {
 			*bad_wr = wr;

From e2219db280e3fe52e5cc242e4225dd2685af3c56 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 26 Mar 2020 14:25:07 +0000
Subject: [PATCH 267/327] kunit: add debugfs
 /sys/kernel/debug/kunit/<suite>/results display

add debugfs support for displaying kunit test suite results; this is
especially useful for module-loaded tests to allow disentangling of
test result display from other dmesg events.  debugfs support is
provided if CONFIG_KUNIT_DEBUGFS=y.

As well as printk()ing messages, we append them to a per-test log.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h   |  54 ++++++++++++---
 lib/kunit/Kconfig      |   8 +++
 lib/kunit/Makefile     |   4 ++
 lib/kunit/debugfs.c    | 116 ++++++++++++++++++++++++++++++++
 lib/kunit/debugfs.h    |  30 +++++++++
 lib/kunit/kunit-test.c |   4 +-
 lib/kunit/test.c       | 147 ++++++++++++++++++++++++++++++++---------
 7 files changed, 322 insertions(+), 41 deletions(-)
 create mode 100644 lib/kunit/debugfs.c
 create mode 100644 lib/kunit/debugfs.h

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 2dfb550c6723..f7b2ed4c127e 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -81,6 +81,9 @@ struct kunit_resource {
 
 struct kunit;
 
+/* Size of log associated with test. */
+#define KUNIT_LOG_SIZE	512
+
 /**
  * struct kunit_case - represents an individual test case.
  *
@@ -123,8 +126,14 @@ struct kunit_case {
 
 	/* private: internal use only. */
 	bool success;
+	char *log;
 };
 
+static inline char *kunit_status_to_string(bool status)
+{
+	return status ? "ok" : "not ok";
+}
+
 /**
  * KUNIT_CASE - A helper for creating a &struct kunit_case
  *
@@ -157,6 +166,10 @@ struct kunit_suite {
 	int (*init)(struct kunit *test);
 	void (*exit)(struct kunit *test);
 	struct kunit_case *test_cases;
+
+	/* private - internal use only */
+	struct dentry *debugfs;
+	char *log;
 };
 
 /**
@@ -175,6 +188,7 @@ struct kunit {
 
 	/* private: internal use only. */
 	const char *name; /* Read only after initialization! */
+	char *log; /* Points at case log after initialization */
 	struct kunit_try_catch try_catch;
 	/*
 	 * success starts as true, and may only be set to false during a
@@ -193,10 +207,19 @@ struct kunit {
 	struct list_head resources; /* Protected by lock. */
 };
 
-void kunit_init_test(struct kunit *test, const char *name);
+void kunit_init_test(struct kunit *test, const char *name, char *log);
 
 int kunit_run_tests(struct kunit_suite *suite);
 
+size_t kunit_suite_num_test_cases(struct kunit_suite *suite);
+
+unsigned int kunit_test_case_num(struct kunit_suite *suite,
+				 struct kunit_case *test_case);
+
+int __kunit_test_suites_init(struct kunit_suite **suites);
+
+void __kunit_test_suites_exit(struct kunit_suite **suites);
+
 /**
  * kunit_test_suites() - used to register one or more &struct kunit_suite
  *			 with KUnit.
@@ -226,20 +249,22 @@ int kunit_run_tests(struct kunit_suite *suite);
 	static struct kunit_suite *suites[] = { __VA_ARGS__, NULL};	\
 	static int kunit_test_suites_init(void)				\
 	{								\
-		unsigned int i;						\
-		for (i = 0; suites[i] != NULL; i++)			\
-			kunit_run_tests(suites[i]);			\
-		return 0;						\
+		return __kunit_test_suites_init(suites);		\
 	}								\
 	late_initcall(kunit_test_suites_init);				\
 	static void __exit kunit_test_suites_exit(void)			\
 	{								\
-		return;							\
+		return __kunit_test_suites_exit(suites);		\
 	}								\
 	module_exit(kunit_test_suites_exit)
 
 #define kunit_test_suite(suite)	kunit_test_suites(&suite)
 
+#define kunit_suite_for_each_test_case(suite, test_case)		\
+	for (test_case = suite->test_cases; test_case->run_case; test_case++)
+
+bool kunit_suite_has_succeeded(struct kunit_suite *suite);
+
 /*
  * Like kunit_alloc_resource() below, but returns the struct kunit_resource
  * object that contains the allocation. This is mostly for testing purposes.
@@ -356,8 +381,21 @@ static inline void *kunit_kzalloc(struct kunit *test, size_t size, gfp_t gfp)
 
 void kunit_cleanup(struct kunit *test);
 
-#define kunit_printk(lvl, test, fmt, ...) \
-	printk(lvl "\t# %s: " fmt, (test)->name, ##__VA_ARGS__)
+void kunit_log_append(char *log, const char *fmt, ...);
+
+/*
+ * printk and log to per-test or per-suite log buffer.  Logging only done
+ * if CONFIG_KUNIT_DEBUGFS is 'y'; if it is 'n', no log is allocated/used.
+ */
+#define kunit_log(lvl, test_or_suite, fmt, ...)				\
+	do {								\
+		printk(lvl fmt, ##__VA_ARGS__);				\
+		kunit_log_append((test_or_suite)->log,	fmt "\n",	\
+				 ##__VA_ARGS__);			\
+	} while (0)
+
+#define kunit_printk(lvl, test, fmt, ...)				\
+	kunit_log(lvl, test, "\t# %s: " fmt, (test)->name, ##__VA_ARGS__)
 
 /**
  * kunit_info() - Prints an INFO level message associated with @test.
diff --git a/lib/kunit/Kconfig b/lib/kunit/Kconfig
index 065aa16f448b..95d12e3d6d95 100644
--- a/lib/kunit/Kconfig
+++ b/lib/kunit/Kconfig
@@ -14,6 +14,14 @@ menuconfig KUNIT
 
 if KUNIT
 
+config KUNIT_DEBUGFS
+	bool "KUnit - Enable /sys/kernel/debug/kunit debugfs representation"
+	help
+	  Enable debugfs representation for kunit.  Currently this consists
+	  of /sys/kernel/debug/kunit/<test_suite>/results files for each
+	  test suite, which allow users to see results of the last test suite
+	  run that occurred.
+
 config KUNIT_TEST
 	tristate "KUnit test for KUnit"
 	help
diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile
index fab55649b69a..724b94311ca3 100644
--- a/lib/kunit/Makefile
+++ b/lib/kunit/Makefile
@@ -5,6 +5,10 @@ kunit-objs +=				test.o \
 					assert.o \
 					try-catch.o
 
+ifeq ($(CONFIG_KUNIT_DEBUGFS),y)
+kunit-objs +=				debugfs.o
+endif
+
 obj-$(CONFIG_KUNIT_TEST) +=		kunit-test.o
 
 # string-stream-test compiles built-in only.
diff --git a/lib/kunit/debugfs.c b/lib/kunit/debugfs.c
new file mode 100644
index 000000000000..9214c493d8b7
--- /dev/null
+++ b/lib/kunit/debugfs.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates.
+ *    Author: Alan Maguire <alan.maguire@oracle.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include <kunit/test.h>
+
+#include "string-stream.h"
+
+#define KUNIT_DEBUGFS_ROOT             "kunit"
+#define KUNIT_DEBUGFS_RESULTS          "results"
+
+/*
+ * Create a debugfs representation of test suites:
+ *
+ * Path						Semantics
+ * /sys/kernel/debug/kunit/<testsuite>/results	Show results of last run for
+ *						testsuite
+ *
+ */
+
+static struct dentry *debugfs_rootdir;
+
+void kunit_debugfs_cleanup(void)
+{
+	debugfs_remove_recursive(debugfs_rootdir);
+}
+
+void kunit_debugfs_init(void)
+{
+	if (!debugfs_rootdir)
+		debugfs_rootdir = debugfs_create_dir(KUNIT_DEBUGFS_ROOT, NULL);
+}
+
+static void debugfs_print_result(struct seq_file *seq,
+				 struct kunit_suite *suite,
+				 struct kunit_case *test_case)
+{
+	if (!test_case || !test_case->log)
+		return;
+
+	seq_printf(seq, "%s", test_case->log);
+}
+
+/*
+ * /sys/kernel/debug/kunit/<testsuite>/results shows all results for testsuite.
+ */
+static int debugfs_print_results(struct seq_file *seq, void *v)
+{
+	struct kunit_suite *suite = (struct kunit_suite *)seq->private;
+	bool success = kunit_suite_has_succeeded(suite);
+	struct kunit_case *test_case;
+
+	if (!suite || !suite->log)
+		return 0;
+
+	seq_printf(seq, "%s", suite->log);
+
+	kunit_suite_for_each_test_case(suite, test_case)
+		debugfs_print_result(seq, suite, test_case);
+
+	seq_printf(seq, "%s %d - %s\n",
+		   kunit_status_to_string(success), 1, suite->name);
+	return 0;
+}
+
+static int debugfs_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static int debugfs_results_open(struct inode *inode, struct file *file)
+{
+	struct kunit_suite *suite;
+
+	suite = (struct kunit_suite *)inode->i_private;
+
+	return single_open(file, debugfs_print_results, suite);
+}
+
+static const struct file_operations debugfs_results_fops = {
+	.open = debugfs_results_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = debugfs_release,
+};
+
+void kunit_debugfs_create_suite(struct kunit_suite *suite)
+{
+	struct kunit_case *test_case;
+
+	/* Allocate logs before creating debugfs representation. */
+	suite->log = kzalloc(KUNIT_LOG_SIZE, GFP_KERNEL);
+	kunit_suite_for_each_test_case(suite, test_case)
+		test_case->log = kzalloc(KUNIT_LOG_SIZE, GFP_KERNEL);
+
+	suite->debugfs = debugfs_create_dir(suite->name, debugfs_rootdir);
+
+	debugfs_create_file(KUNIT_DEBUGFS_RESULTS, S_IFREG | 0444,
+			    suite->debugfs,
+			    suite, &debugfs_results_fops);
+}
+
+void kunit_debugfs_destroy_suite(struct kunit_suite *suite)
+{
+	struct kunit_case *test_case;
+
+	debugfs_remove_recursive(suite->debugfs);
+	kfree(suite->log);
+	kunit_suite_for_each_test_case(suite, test_case)
+		kfree(test_case->log);
+}
diff --git a/lib/kunit/debugfs.h b/lib/kunit/debugfs.h
new file mode 100644
index 000000000000..dcc7d7556107
--- /dev/null
+++ b/lib/kunit/debugfs.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020, Oracle and/or its affiliates.
+ */
+
+#ifndef _KUNIT_DEBUGFS_H
+#define _KUNIT_DEBUGFS_H
+
+#include <kunit/test.h>
+
+#ifdef CONFIG_KUNIT_DEBUGFS
+
+void kunit_debugfs_create_suite(struct kunit_suite *suite);
+void kunit_debugfs_destroy_suite(struct kunit_suite *suite);
+void kunit_debugfs_init(void);
+void kunit_debugfs_cleanup(void);
+
+#else
+
+static inline void kunit_debugfs_create_suite(struct kunit_suite *suite) { }
+
+static inline void kunit_debugfs_destroy_suite(struct kunit_suite *suite) { }
+
+static inline void kunit_debugfs_init(void) { }
+
+static inline void kunit_debugfs_cleanup(void) { }
+
+#endif /* CONFIG_KUNIT_DEBUGFS */
+
+#endif /* _KUNIT_DEBUGFS_H */
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index ccb8d2e332f7..aceb5bf0212f 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -134,7 +134,7 @@ static void kunit_resource_test_init_resources(struct kunit *test)
 {
 	struct kunit_test_resource_context *ctx = test->priv;
 
-	kunit_init_test(&ctx->test, "testing_test_init_test");
+	kunit_init_test(&ctx->test, "testing_test_init_test", NULL);
 
 	KUNIT_EXPECT_TRUE(test, list_empty(&ctx->test.resources));
 }
@@ -301,7 +301,7 @@ static int kunit_resource_test_init(struct kunit *test)
 
 	test->priv = ctx;
 
-	kunit_init_test(&ctx->test, "test_test_context");
+	kunit_init_test(&ctx->test, "test_test_context", NULL);
 
 	return 0;
 }
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 9242f932896c..a3fa21f9d918 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/sched/debug.h>
 
+#include "debugfs.h"
 #include "string-stream.h"
 #include "try-catch-impl.h"
 
@@ -28,73 +29,116 @@ static void kunit_print_tap_version(void)
 	}
 }
 
-static size_t kunit_test_cases_len(struct kunit_case *test_cases)
+/*
+ * Append formatted message to log, size of which is limited to
+ * KUNIT_LOG_SIZE bytes (including null terminating byte).
+ */
+void kunit_log_append(char *log, const char *fmt, ...)
+{
+	char line[KUNIT_LOG_SIZE];
+	va_list args;
+	int len_left;
+
+	if (!log)
+		return;
+
+	len_left = KUNIT_LOG_SIZE - strlen(log) - 1;
+	if (len_left <= 0)
+		return;
+
+	va_start(args, fmt);
+	vsnprintf(line, sizeof(line), fmt, args);
+	va_end(args);
+
+	strncat(log, line, len_left);
+}
+EXPORT_SYMBOL_GPL(kunit_log_append);
+
+size_t kunit_suite_num_test_cases(struct kunit_suite *suite)
 {
 	struct kunit_case *test_case;
 	size_t len = 0;
 
-	for (test_case = test_cases; test_case->run_case; test_case++)
+	kunit_suite_for_each_test_case(suite, test_case)
 		len++;
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(kunit_suite_num_test_cases);
 
 static void kunit_print_subtest_start(struct kunit_suite *suite)
 {
 	kunit_print_tap_version();
-	pr_info("\t# Subtest: %s\n", suite->name);
-	pr_info("\t1..%zd\n", kunit_test_cases_len(suite->test_cases));
+	kunit_log(KERN_INFO, suite, "\t# Subtest: %s", suite->name);
+	kunit_log(KERN_INFO, suite, "\t1..%zd",
+		  kunit_suite_num_test_cases(suite));
 }
 
-static void kunit_print_ok_not_ok(bool should_indent,
+static void kunit_print_ok_not_ok(void *test_or_suite,
+				  bool is_test,
 				  bool is_ok,
 				  size_t test_number,
 				  const char *description)
 {
-	const char *indent, *ok_not_ok;
+	struct kunit_suite *suite = is_test ? NULL : test_or_suite;
+	struct kunit *test = is_test ? test_or_suite : NULL;
 
-	if (should_indent)
-		indent = "\t";
+	/*
+	 * We do not log the test suite results as doing so would
+	 * mean debugfs display would consist of the test suite
+	 * description and status prior to individual test results.
+	 * Hence directly printk the suite status, and we will
+	 * separately seq_printf() the suite status for the debugfs
+	 * representation.
+	 */
+	if (suite)
+		pr_info("%s %zd - %s",
+			kunit_status_to_string(is_ok),
+			test_number, description);
 	else
-		indent = "";
-
-	if (is_ok)
-		ok_not_ok = "ok";
-	else
-		ok_not_ok = "not ok";
-
-	pr_info("%s%s %zd - %s\n", indent, ok_not_ok, test_number, description);
+		kunit_log(KERN_INFO, test, "\t%s %zd - %s",
+			  kunit_status_to_string(is_ok),
+			  test_number, description);
 }
 
-static bool kunit_suite_has_succeeded(struct kunit_suite *suite)
+bool kunit_suite_has_succeeded(struct kunit_suite *suite)
 {
 	const struct kunit_case *test_case;
 
-	for (test_case = suite->test_cases; test_case->run_case; test_case++)
+	kunit_suite_for_each_test_case(suite, test_case) {
 		if (!test_case->success)
 			return false;
+	}
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(kunit_suite_has_succeeded);
 
 static void kunit_print_subtest_end(struct kunit_suite *suite)
 {
 	static size_t kunit_suite_counter = 1;
 
-	kunit_print_ok_not_ok(false,
+	kunit_print_ok_not_ok((void *)suite, false,
 			      kunit_suite_has_succeeded(suite),
 			      kunit_suite_counter++,
 			      suite->name);
 }
 
-static void kunit_print_test_case_ok_not_ok(struct kunit_case *test_case,
-					    size_t test_number)
+unsigned int kunit_test_case_num(struct kunit_suite *suite,
+				 struct kunit_case *test_case)
 {
-	kunit_print_ok_not_ok(true,
-			      test_case->success,
-			      test_number,
-			      test_case->name);
+	struct kunit_case *tc;
+	unsigned int i = 1;
+
+	kunit_suite_for_each_test_case(suite, tc) {
+		if (tc == test_case)
+			return i;
+		i++;
+	}
+
+	return 0;
 }
+EXPORT_SYMBOL_GPL(kunit_test_case_num);
 
 static void kunit_print_string_stream(struct kunit *test,
 				      struct string_stream *stream)
@@ -102,6 +146,9 @@ static void kunit_print_string_stream(struct kunit *test,
 	struct string_stream_fragment *fragment;
 	char *buf;
 
+	if (string_stream_is_empty(stream))
+		return;
+
 	buf = string_stream_get_string(stream);
 	if (!buf) {
 		kunit_err(test,
@@ -175,11 +222,14 @@ void kunit_do_assertion(struct kunit *test,
 }
 EXPORT_SYMBOL_GPL(kunit_do_assertion);
 
-void kunit_init_test(struct kunit *test, const char *name)
+void kunit_init_test(struct kunit *test, const char *name, char *log)
 {
 	spin_lock_init(&test->lock);
 	INIT_LIST_HEAD(&test->resources);
 	test->name = name;
+	test->log = log;
+	if (test->log)
+		test->log[0] = '\0';
 	test->success = true;
 }
 EXPORT_SYMBOL_GPL(kunit_init_test);
@@ -290,7 +340,7 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
 	struct kunit_try_catch *try_catch;
 	struct kunit test;
 
-	kunit_init_test(&test, test_case->name);
+	kunit_init_test(&test, test_case->name, test_case->log);
 	try_catch = &test.try_catch;
 
 	kunit_try_catch_init(try_catch,
@@ -303,19 +353,20 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
 	kunit_try_catch_run(try_catch, &context);
 
 	test_case->success = test.success;
+
+	kunit_print_ok_not_ok(&test, true, test_case->success,
+			      kunit_test_case_num(suite, test_case),
+			      test_case->name);
 }
 
 int kunit_run_tests(struct kunit_suite *suite)
 {
 	struct kunit_case *test_case;
-	size_t test_case_count = 1;
 
 	kunit_print_subtest_start(suite);
 
-	for (test_case = suite->test_cases; test_case->run_case; test_case++) {
+	kunit_suite_for_each_test_case(suite, test_case)
 		kunit_run_case_catch_errors(suite, test_case);
-		kunit_print_test_case_ok_not_ok(test_case, test_case_count++);
-	}
 
 	kunit_print_subtest_end(suite);
 
@@ -323,6 +374,37 @@ int kunit_run_tests(struct kunit_suite *suite)
 }
 EXPORT_SYMBOL_GPL(kunit_run_tests);
 
+static void kunit_init_suite(struct kunit_suite *suite)
+{
+	kunit_debugfs_create_suite(suite);
+}
+
+int __kunit_test_suites_init(struct kunit_suite **suites)
+{
+	unsigned int i;
+
+	for (i = 0; suites[i] != NULL; i++) {
+		kunit_init_suite(suites[i]);
+		kunit_run_tests(suites[i]);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__kunit_test_suites_init);
+
+static void kunit_exit_suite(struct kunit_suite *suite)
+{
+	kunit_debugfs_destroy_suite(suite);
+}
+
+void __kunit_test_suites_exit(struct kunit_suite **suites)
+{
+	unsigned int i;
+
+	for (i = 0; suites[i] != NULL; i++)
+		kunit_exit_suite(suites[i]);
+}
+EXPORT_SYMBOL_GPL(__kunit_test_suites_exit);
+
 struct kunit_resource *kunit_alloc_and_get_resource(struct kunit *test,
 						    kunit_resource_init_t init,
 						    kunit_resource_free_t free,
@@ -489,12 +571,15 @@ EXPORT_SYMBOL_GPL(kunit_cleanup);
 
 static int __init kunit_init(void)
 {
+	kunit_debugfs_init();
+
 	return 0;
 }
 late_initcall(kunit_init);
 
 static void __exit kunit_exit(void)
 {
+	kunit_debugfs_cleanup();
 }
 module_exit(kunit_exit);
 

From eda8e324f70815e90360bef1032d49ef7c61e877 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 26 Mar 2020 14:25:08 +0000
Subject: [PATCH 268/327] kunit: add log test

the logging test ensures multiple strings logged appear in the
log string associated with the test when CONFIG_KUNIT_DEBUGFS is
enabled.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 lib/kunit/kunit-test.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index aceb5bf0212f..4f3d36a72f8f 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -329,6 +329,44 @@ static struct kunit_suite kunit_resource_test_suite = {
 	.exit = kunit_resource_test_exit,
 	.test_cases = kunit_resource_test_cases,
 };
-kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite);
+
+static void kunit_log_test(struct kunit *test);
+
+static struct kunit_case kunit_log_test_cases[] = {
+	KUNIT_CASE(kunit_log_test),
+	{}
+};
+
+static struct kunit_suite kunit_log_test_suite = {
+	.name = "kunit-log-test",
+	.test_cases = kunit_log_test_cases,
+};
+
+static void kunit_log_test(struct kunit *test)
+{
+	struct kunit_suite *suite = &kunit_log_test_suite;
+
+	kunit_log(KERN_INFO, test, "put this in log.");
+	kunit_log(KERN_INFO, test, "this too.");
+	kunit_log(KERN_INFO, suite, "add to suite log.");
+	kunit_log(KERN_INFO, suite, "along with this.");
+
+#ifdef CONFIG_KUNIT_DEBUGFS
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test,
+				     strstr(test->log, "put this in log."));
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test,
+				     strstr(test->log, "this too."));
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test,
+				     strstr(suite->log, "add to suite log."));
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test,
+				     strstr(suite->log, "along with this."));
+#else
+	KUNIT_EXPECT_PTR_EQ(test, test->log, (char *)NULL);
+	KUNIT_EXPECT_PTR_EQ(test, suite->log, (char *)NULL);
+#endif
+}
+
+kunit_test_suites(&kunit_try_catch_test_suite, &kunit_resource_test_suite,
+		  &kunit_log_test_suite);
 
 MODULE_LICENSE("GPL v2");

From c3bba690a2643245f59a4d5d66e6b687459696d9 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 26 Mar 2020 14:25:09 +0000
Subject: [PATCH 269/327] kunit: subtests should be indented 4 spaces according
 to TAP

Introduce KUNIT_SUBTEST_INDENT macro which corresponds to 4-space
indentation and KUNIT_SUBSUBTEST_INDENT macro which corresponds to
8-space indentation in line with TAP spec (e.g. see "Subtests"
section of https://node-tap.org/tap-protocol/).

Use these macros in place of one or two tabs in strings to clarify
why we are indenting.

Suggested-by: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h                | 11 +++-
 lib/kunit/assert.c                  | 79 +++++++++++++++--------------
 lib/kunit/test.c                    |  7 +--
 tools/testing/kunit/kunit_parser.py | 10 ++--
 4 files changed, 59 insertions(+), 48 deletions(-)

diff --git a/include/kunit/test.h b/include/kunit/test.h
index f7b2ed4c127e..9b0c46a6ca1f 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -84,6 +84,14 @@ struct kunit;
 /* Size of log associated with test. */
 #define KUNIT_LOG_SIZE	512
 
+/*
+ * TAP specifies subtest stream indentation of 4 spaces, 8 spaces for a
+ * sub-subtest.  See the "Subtests" section in
+ * https://node-tap.org/tap-protocol/
+ */
+#define KUNIT_SUBTEST_INDENT		"    "
+#define KUNIT_SUBSUBTEST_INDENT		"        "
+
 /**
  * struct kunit_case - represents an individual test case.
  *
@@ -395,7 +403,8 @@ void kunit_log_append(char *log, const char *fmt, ...);
 	} while (0)
 
 #define kunit_printk(lvl, test, fmt, ...)				\
-	kunit_log(lvl, test, "\t# %s: " fmt, (test)->name, ##__VA_ARGS__)
+	kunit_log(lvl, test, KUNIT_SUBTEST_INDENT "# %s: " fmt,		\
+		  (test)->name,	##__VA_ARGS__)
 
 /**
  * kunit_info() - Prints an INFO level message associated with @test.
diff --git a/lib/kunit/assert.c b/lib/kunit/assert.c
index 02ecd0d7d80a..33acdaa28a7d 100644
--- a/lib/kunit/assert.c
+++ b/lib/kunit/assert.c
@@ -6,6 +6,7 @@
  * Author: Brendan Higgins <brendanhiggins@google.com>
  */
 #include <kunit/assert.h>
+#include <kunit/test.h>
 
 #include "string-stream.h"
 
@@ -53,12 +54,12 @@ void kunit_unary_assert_format(const struct kunit_assert *assert,
 	kunit_base_assert_format(assert, stream);
 	if (unary_assert->expected_true)
 		string_stream_add(stream,
-				 "\tExpected %s to be true, but is false\n",
-				 unary_assert->condition);
+				  KUNIT_SUBTEST_INDENT "Expected %s to be true, but is false\n",
+				  unary_assert->condition);
 	else
 		string_stream_add(stream,
-				 "\tExpected %s to be false, but is true\n",
-				 unary_assert->condition);
+				  KUNIT_SUBTEST_INDENT "Expected %s to be false, but is true\n",
+				  unary_assert->condition);
 	kunit_assert_print_msg(assert, stream);
 }
 EXPORT_SYMBOL_GPL(kunit_unary_assert_format);
@@ -72,13 +73,13 @@ void kunit_ptr_not_err_assert_format(const struct kunit_assert *assert,
 	kunit_base_assert_format(assert, stream);
 	if (!ptr_assert->value) {
 		string_stream_add(stream,
-				 "\tExpected %s is not null, but is\n",
-				 ptr_assert->text);
+				  KUNIT_SUBTEST_INDENT "Expected %s is not null, but is\n",
+				  ptr_assert->text);
 	} else if (IS_ERR(ptr_assert->value)) {
 		string_stream_add(stream,
-				 "\tExpected %s is not error, but is: %ld\n",
-				 ptr_assert->text,
-				 PTR_ERR(ptr_assert->value));
+				  KUNIT_SUBTEST_INDENT "Expected %s is not error, but is: %ld\n",
+				  ptr_assert->text,
+				  PTR_ERR(ptr_assert->value));
 	}
 	kunit_assert_print_msg(assert, stream);
 }
@@ -92,16 +93,16 @@ void kunit_binary_assert_format(const struct kunit_assert *assert,
 
 	kunit_base_assert_format(assert, stream);
 	string_stream_add(stream,
-			 "\tExpected %s %s %s, but\n",
-			 binary_assert->left_text,
-			 binary_assert->operation,
-			 binary_assert->right_text);
-	string_stream_add(stream, "\t\t%s == %lld\n",
-			 binary_assert->left_text,
-			 binary_assert->left_value);
-	string_stream_add(stream, "\t\t%s == %lld",
-			 binary_assert->right_text,
-			 binary_assert->right_value);
+			  KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n",
+			  binary_assert->left_text,
+			  binary_assert->operation,
+			  binary_assert->right_text);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %lld\n",
+			  binary_assert->left_text,
+			  binary_assert->left_value);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %lld",
+			  binary_assert->right_text,
+			  binary_assert->right_value);
 	kunit_assert_print_msg(assert, stream);
 }
 EXPORT_SYMBOL_GPL(kunit_binary_assert_format);
@@ -114,16 +115,16 @@ void kunit_binary_ptr_assert_format(const struct kunit_assert *assert,
 
 	kunit_base_assert_format(assert, stream);
 	string_stream_add(stream,
-			 "\tExpected %s %s %s, but\n",
-			 binary_assert->left_text,
-			 binary_assert->operation,
-			 binary_assert->right_text);
-	string_stream_add(stream, "\t\t%s == %px\n",
-			 binary_assert->left_text,
-			 binary_assert->left_value);
-	string_stream_add(stream, "\t\t%s == %px",
-			 binary_assert->right_text,
-			 binary_assert->right_value);
+			  KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n",
+			  binary_assert->left_text,
+			  binary_assert->operation,
+			  binary_assert->right_text);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %px\n",
+			  binary_assert->left_text,
+			  binary_assert->left_value);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %px",
+			  binary_assert->right_text,
+			  binary_assert->right_value);
 	kunit_assert_print_msg(assert, stream);
 }
 EXPORT_SYMBOL_GPL(kunit_binary_ptr_assert_format);
@@ -136,16 +137,16 @@ void kunit_binary_str_assert_format(const struct kunit_assert *assert,
 
 	kunit_base_assert_format(assert, stream);
 	string_stream_add(stream,
-			 "\tExpected %s %s %s, but\n",
-			 binary_assert->left_text,
-			 binary_assert->operation,
-			 binary_assert->right_text);
-	string_stream_add(stream, "\t\t%s == %s\n",
-			 binary_assert->left_text,
-			 binary_assert->left_value);
-	string_stream_add(stream, "\t\t%s == %s",
-			 binary_assert->right_text,
-			 binary_assert->right_value);
+			  KUNIT_SUBTEST_INDENT "Expected %s %s %s, but\n",
+			  binary_assert->left_text,
+			  binary_assert->operation,
+			  binary_assert->right_text);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %s\n",
+			  binary_assert->left_text,
+			  binary_assert->left_value);
+	string_stream_add(stream, KUNIT_SUBSUBTEST_INDENT "%s == %s",
+			  binary_assert->right_text,
+			  binary_assert->right_value);
 	kunit_assert_print_msg(assert, stream);
 }
 EXPORT_SYMBOL_GPL(kunit_binary_str_assert_format);
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index a3fa21f9d918..7a6430a7fca0 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -69,8 +69,9 @@ EXPORT_SYMBOL_GPL(kunit_suite_num_test_cases);
 static void kunit_print_subtest_start(struct kunit_suite *suite)
 {
 	kunit_print_tap_version();
-	kunit_log(KERN_INFO, suite, "\t# Subtest: %s", suite->name);
-	kunit_log(KERN_INFO, suite, "\t1..%zd",
+	kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "# Subtest: %s",
+		  suite->name);
+	kunit_log(KERN_INFO, suite, KUNIT_SUBTEST_INDENT "1..%zd",
 		  kunit_suite_num_test_cases(suite));
 }
 
@@ -96,7 +97,7 @@ static void kunit_print_ok_not_ok(void *test_or_suite,
 			kunit_status_to_string(is_ok),
 			test_number, description);
 	else
-		kunit_log(KERN_INFO, test, "\t%s %zd - %s",
+		kunit_log(KERN_INFO, test, KUNIT_SUBTEST_INDENT "%s %zd - %s",
 			  kunit_status_to_string(is_ok),
 			  test_number, description);
 }
diff --git a/tools/testing/kunit/kunit_parser.py b/tools/testing/kunit/kunit_parser.py
index 02815062d5a6..64aac9dcd431 100644
--- a/tools/testing/kunit/kunit_parser.py
+++ b/tools/testing/kunit/kunit_parser.py
@@ -94,7 +94,7 @@ def print_log(log):
 	for m in log:
 		print_with_timestamp(m)
 
-TAP_ENTRIES = re.compile(r'^(TAP|\t?ok|\t?not ok|\t?[0-9]+\.\.[0-9]+|\t?#).*$')
+TAP_ENTRIES = re.compile(r'^(TAP|[\s]*ok|[\s]*not ok|[\s]*[0-9]+\.\.[0-9]+|[\s]*#).*$')
 
 def consume_non_diagnositic(lines: List[str]) -> None:
 	while lines and not TAP_ENTRIES.match(lines[0]):
@@ -107,7 +107,7 @@ def save_non_diagnositic(lines: List[str], test_case: TestCase) -> None:
 
 OkNotOkResult = namedtuple('OkNotOkResult', ['is_ok','description', 'text'])
 
-OK_NOT_OK_SUBTEST = re.compile(r'^\t(ok|not ok) [0-9]+ - (.*)$')
+OK_NOT_OK_SUBTEST = re.compile(r'^[\s]+(ok|not ok) [0-9]+ - (.*)$')
 
 OK_NOT_OK_MODULE = re.compile(r'^(ok|not ok) [0-9]+ - (.*)$')
 
@@ -134,7 +134,7 @@ def parse_ok_not_ok_test_case(lines: List[str], test_case: TestCase) -> bool:
 	else:
 		return False
 
-SUBTEST_DIAGNOSTIC = re.compile(r'^\t# .*?: (.*)$')
+SUBTEST_DIAGNOSTIC = re.compile(r'^[\s]+# .*?: (.*)$')
 DIAGNOSTIC_CRASH_MESSAGE = 'kunit test case crashed!'
 
 def parse_diagnostic(lines: List[str], test_case: TestCase) -> bool:
@@ -161,7 +161,7 @@ def parse_test_case(lines: List[str]) -> TestCase:
 	else:
 		return None
 
-SUBTEST_HEADER = re.compile(r'^\t# Subtest: (.*)$')
+SUBTEST_HEADER = re.compile(r'^[\s]+# Subtest: (.*)$')
 
 def parse_subtest_header(lines: List[str]) -> str:
 	consume_non_diagnositic(lines)
@@ -174,7 +174,7 @@ def parse_subtest_header(lines: List[str]) -> str:
 	else:
 		return None
 
-SUBTEST_PLAN = re.compile(r'\t[0-9]+\.\.([0-9]+)')
+SUBTEST_PLAN = re.compile(r'[\s]+[0-9]+\.\.([0-9]+)')
 
 def parse_subtest_plan(lines: List[str]) -> int:
 	consume_non_diagnositic(lines)

From 32526908fe86dce55eb1af184335bdb863578eb2 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Thu, 26 Mar 2020 14:25:10 +0000
Subject: [PATCH 270/327] kunit: update documentation to describe debugfs
 representation

Documentation should describe debugfs layout and semantics.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/dev-tools/kunit/usage.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index 607758a66a99..473a2361ec37 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -591,3 +591,17 @@ able to run one test case per invocation.
 
 .. TODO(brendanhiggins@google.com): Add an actual example of an architecture
    dependent KUnit test.
+
+KUnit debugfs representation
+============================
+When kunit test suites are initialized, they create an associated directory
+in /sys/kernel/debug/kunit/<test-suite>.  The directory contains one file
+
+- results: "cat results" displays results of each test case and the results
+  of the entire suite for the last test run.
+
+The debugfs representation is primarily of use when kunit test suites are
+run in a native environment, either as modules or builtin.  Having a way
+to display results like this is valuable as otherwise results can be
+intermixed with other events in dmesg output.  The maximum size of each
+results file is KUNIT_LOG_SIZE bytes (defined in include/kunit/test.h).

From e23349af9ee25a5760112a2f8476b94a4ec86f1c Mon Sep 17 00:00:00 2001
From: Brendan Higgins <brendanhiggins@google.com>
Date: Mon, 23 Mar 2020 12:22:36 -0700
Subject: [PATCH 271/327] kunit: tool: add missing test data file content

Add a missing raw dmesg test log to test the kunit_tool's dmesg parser.
test_prefix_poundsign and test_output_with_prefix_isolated_correctly
fail without this test log.

Signed-off-by: Brendan Higgins <brendanhiggins@google.com>
Tested-by: David Gow <davidgow@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .../kunit/test_data/test_pound_sign.log       | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tools/testing/kunit/test_data/test_pound_sign.log b/tools/testing/kunit/test_data/test_pound_sign.log
index e69de29bb2d1..28ffa5ba03bf 100644
--- a/tools/testing/kunit/test_data/test_pound_sign.log
+++ b/tools/testing/kunit/test_data/test_pound_sign.log
@@ -0,0 +1,33 @@
+[    0.060000] printk: console [mc-1] enabled
+[    0.060000] random: get_random_bytes called from init_oops_id+0x35/0x40 with crng_init=0
+[    0.060000] TAP version 14
+[    0.060000] 	# Subtest: kunit-resource-test
+[    0.060000] 	1..5
+[    0.060000] 	ok 1 - kunit_resource_test_init_resources
+[    0.060000] 	ok 2 - kunit_resource_test_alloc_resource
+[    0.060000] 	ok 3 - kunit_resource_test_destroy_resource
+[    0.060000]  foo bar 	#
+[    0.060000] 	ok 4 - kunit_resource_test_cleanup_resources
+[    0.060000] 	ok 5 - kunit_resource_test_proper_free_ordering
+[    0.060000] ok 1 - kunit-resource-test
+[    0.060000]  foo bar 	# non-kunit output
+[    0.060000] 	# Subtest: kunit-try-catch-test
+[    0.060000] 	1..2
+[    0.060000] 	ok 1 - kunit_test_try_catch_successful_try_no_catch
+[    0.060000] 	ok 2 - kunit_test_try_catch_unsuccessful_try_does_catch
+[    0.060000] ok 2 - kunit-try-catch-test
+[    0.060000] 	# Subtest: string-stream-test
+[    0.060000] 	1..3
+[    0.060000] 	ok 1 - string_stream_test_empty_on_creation
+[    0.060000] 	ok 2 - string_stream_test_not_empty_after_add
+[    0.060000] 	ok 3 - string_stream_test_get_string
+[    0.060000] ok 3 - string-stream-test
+[    0.060000] List of all partitions:
+[    0.060000] No filesystem could mount root, tried:
+[    0.060000]
+[    0.060000] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(98,0)
+[    0.060000] CPU: 0 PID: 1 Comm: swapper Not tainted 5.4.0-rc1-gea2dd7c0875e-dirty #2
+[    0.060000] Stack:
+[    0.060000]  602086f8 601bc260 705c0000 705c0000
+[    0.060000]  602086f8 6005fcec 705c0000 6002c6ab
+[    0.060000]  6005fcec 601bc260 705c0000 3000000010

From f46f576280595f7c8feba454526b95a54e635798 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Mar 2020 16:12:51 -0700
Subject: [PATCH 272/327] selftests/harness: Move test child waiting logic

In order to better handle timeout failures, rearrange the child waiting
logic into a separate function. This is mostly a copy/paste with an
indentation change. To handle pid tracking, a new field is added for
the child pid. Also move the alarm() pairing into the function.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/kselftest_harness.h | 93 +++++++++++----------
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index 5336b26506ab..c7b67e379219 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -635,6 +635,7 @@
 struct __test_metadata {
 	const char *name;
 	void (*fn)(struct __test_metadata *);
+	pid_t pid;	/* pid of test when being run */
 	int termsig;
 	int passed;
 	int trigger; /* extra handler after the evaluation */
@@ -695,64 +696,68 @@ static inline int __bail(int for_realz, bool no_print, __u8 step)
 	return 0;
 }
 
-void __run_test(struct __test_metadata *t)
+void __wait_for_test(struct __test_metadata *t)
 {
-	pid_t child_pid;
 	int status;
 
+	alarm(t->timeout);
+	waitpid(t->pid, &status, 0);
+	alarm(0);
+
+	if (WIFEXITED(status)) {
+		t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0;
+		if (t->termsig != -1) {
+			fprintf(TH_LOG_STREAM,
+				"%s: Test exited normally "
+				"instead of by signal (code: %d)\n",
+				t->name,
+				WEXITSTATUS(status));
+		} else if (!t->passed) {
+			fprintf(TH_LOG_STREAM,
+				"%s: Test failed at step #%d\n",
+				t->name,
+				WEXITSTATUS(status));
+		}
+	} else if (WIFSIGNALED(status)) {
+		t->passed = 0;
+		if (WTERMSIG(status) == SIGABRT) {
+			fprintf(TH_LOG_STREAM,
+				"%s: Test terminated by assertion\n",
+				t->name);
+		} else if (WTERMSIG(status) == t->termsig) {
+			t->passed = 1;
+		} else {
+			fprintf(TH_LOG_STREAM,
+				"%s: Test terminated unexpectedly "
+				"by signal %d\n",
+				t->name,
+				WTERMSIG(status));
+		}
+	} else {
+		fprintf(TH_LOG_STREAM,
+			"%s: Test ended in some other way [%u]\n",
+			t->name,
+			status);
+	}
+}
+
+void __run_test(struct __test_metadata *t)
+{
 	t->passed = 1;
 	t->trigger = 0;
 	printf("[ RUN      ] %s\n", t->name);
-	alarm(t->timeout);
-	child_pid = fork();
-	if (child_pid < 0) {
+	t->pid = fork();
+	if (t->pid < 0) {
 		printf("ERROR SPAWNING TEST CHILD\n");
 		t->passed = 0;
-	} else if (child_pid == 0) {
+	} else if (t->pid == 0) {
 		t->fn(t);
 		/* return the step that failed or 0 */
 		_exit(t->passed ? 0 : t->step);
 	} else {
-		/* TODO(wad) add timeout support. */
-		waitpid(child_pid, &status, 0);
-		if (WIFEXITED(status)) {
-			t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0;
-			if (t->termsig != -1) {
-				fprintf(TH_LOG_STREAM,
-					"%s: Test exited normally "
-					"instead of by signal (code: %d)\n",
-					t->name,
-					WEXITSTATUS(status));
-			} else if (!t->passed) {
-				fprintf(TH_LOG_STREAM,
-					"%s: Test failed at step #%d\n",
-					t->name,
-					WEXITSTATUS(status));
-			}
-		} else if (WIFSIGNALED(status)) {
-			t->passed = 0;
-			if (WTERMSIG(status) == SIGABRT) {
-				fprintf(TH_LOG_STREAM,
-					"%s: Test terminated by assertion\n",
-					t->name);
-			} else if (WTERMSIG(status) == t->termsig) {
-				t->passed = 1;
-			} else {
-				fprintf(TH_LOG_STREAM,
-					"%s: Test terminated unexpectedly "
-					"by signal %d\n",
-					t->name,
-					WTERMSIG(status));
-			}
-		} else {
-			fprintf(TH_LOG_STREAM,
-				"%s: Test ended in some other way [%u]\n",
-				t->name,
-				status);
-		}
+		__wait_for_test(t);
 	}
 	printf("[     %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name);
-	alarm(0);
 }
 
 static int test_harness_run(int __attribute__((unused)) argc,

From c31801da6e3d7ef6115a0b5af1816a8f7ab120c2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Mar 2020 16:12:52 -0700
Subject: [PATCH 273/327] selftests/harness: Handle timeouts cleanly

When a selftest would timeout before, the program would just fall over
and no accounting of failures would be reported (i.e. it would result in
an incomplete TAP report). Instead, add an explicit SIGALRM handler to
cleanly catch and report the timeout.

Before:

	[==========] Running 2 tests from 2 test cases.
	[ RUN      ] timeout.finish
	[       OK ] timeout.finish
	[ RUN      ] timeout.too_long
	Alarm clock

After:

	[==========] Running 2 tests from 2 test cases.
	[ RUN      ] timeout.finish
	[       OK ] timeout.finish
	[ RUN      ] timeout.too_long
	timeout.too_long: Test terminated by timeout
	[     FAIL ] timeout.too_long
	[==========] 1 / 2 tests passed.
	[  FAILED  ]

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/kselftest_harness.h | 53 ++++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
index c7b67e379219..2902f6a78f8a 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -639,7 +639,8 @@ struct __test_metadata {
 	int termsig;
 	int passed;
 	int trigger; /* extra handler after the evaluation */
-	int timeout;
+	int timeout;	/* seconds to wait for test timeout */
+	bool timed_out;	/* did this test timeout instead of exiting? */
 	__u8 step;
 	bool no_print; /* manual trigger when TH_LOG_STREAM is not available */
 	struct __test_metadata *prev, *next;
@@ -696,15 +697,63 @@ static inline int __bail(int for_realz, bool no_print, __u8 step)
 	return 0;
 }
 
+struct __test_metadata *__active_test;
+static void __timeout_handler(int sig, siginfo_t *info, void *ucontext)
+{
+	struct __test_metadata *t = __active_test;
+
+	/* Sanity check handler execution environment. */
+	if (!t) {
+		fprintf(TH_LOG_STREAM,
+			"no active test in SIGARLM handler!?\n");
+		abort();
+	}
+	if (sig != SIGALRM || sig != info->si_signo) {
+		fprintf(TH_LOG_STREAM,
+			"%s: SIGALRM handler caught signal %d!?\n",
+			t->name, sig != SIGALRM ? sig : info->si_signo);
+		abort();
+	}
+
+	t->timed_out = true;
+	kill(t->pid, SIGKILL);
+}
+
 void __wait_for_test(struct __test_metadata *t)
 {
+	struct sigaction action = {
+		.sa_sigaction = __timeout_handler,
+		.sa_flags = SA_SIGINFO,
+	};
+	struct sigaction saved_action;
 	int status;
 
+	if (sigaction(SIGALRM, &action, &saved_action)) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"%s: unable to install SIGARLM handler\n",
+			t->name);
+		return;
+	}
+	__active_test = t;
+	t->timed_out = false;
 	alarm(t->timeout);
 	waitpid(t->pid, &status, 0);
 	alarm(0);
+	if (sigaction(SIGALRM, &saved_action, NULL)) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"%s: unable to uninstall SIGARLM handler\n",
+			t->name);
+		return;
+	}
+	__active_test = NULL;
 
-	if (WIFEXITED(status)) {
+	if (t->timed_out) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"%s: Test terminated by timeout\n", t->name);
+	} else if (WIFEXITED(status)) {
 		t->passed = t->termsig == -1 ? !WEXITSTATUS(status) : 0;
 		if (t->termsig != -1) {
 			fprintf(TH_LOG_STREAM,

From 860f0a7792a5abcaae047a114a8b24807ce0880f Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 25 Mar 2020 17:16:02 -0600
Subject: [PATCH 274/327] selftests: Fix seccomp to support relocatable build
 (O=objdir)

Fix seccomp relocatable builds. This is a simple fix to use the
right lib.mk variable TEST_GEN_PROGS. Local header dependency
is addressed in a change to lib.mk as a framework change that
enforces the dependency without requiring changes to individual
tests.

The following use-cases work with this change:

In seccomp directory:
make all and make clean

From top level from main Makefile:
make kselftest-install O=objdir ARCH=arm64 HOSTCC=gcc \
 CROSS_COMPILE=aarch64-linux-gnu- TARGETS=seccomp

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/seccomp/Makefile | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile
index 1760b3e39730..0ebfe8b0e147 100644
--- a/tools/testing/selftests/seccomp/Makefile
+++ b/tools/testing/selftests/seccomp/Makefile
@@ -1,17 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-all:
-
-include ../lib.mk
-
-.PHONY: all clean
-
-BINARIES := seccomp_bpf seccomp_benchmark
 CFLAGS += -Wl,-no-as-needed -Wall
+LDFLAGS += -lpthread
 
-seccomp_bpf: seccomp_bpf.c ../kselftest_harness.h
-	$(CC) $(CFLAGS) $(LDFLAGS) $< -lpthread -o $@
-
-TEST_PROGS += $(BINARIES)
-EXTRA_CLEAN := $(BINARIES)
-
-all: $(BINARIES)
+TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark
+include ../lib.mk

From d3fd949abd3e3005dab7ccb185c3c8bc0ad639f1 Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 25 Mar 2020 17:17:09 -0600
Subject: [PATCH 275/327] selftests: Fix memfd to support relocatable build
 (O=objdir)

Fix memfd to support relocatable build (O=objdir). This calls out
source files necessary to build tests and simplfies the dependency
enforcement.

Tested the following:

Note that cross-build for fuse_mnt has dependency on -lfuse.

make all
make clean
make kselftest-install O=/arm64_build/ ARCH=arm64 HOSTCC=gcc \
CROSS_COMPILE=aarch64-linux-gnu- TARGETS=memfd

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/memfd/Makefile | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
index 53a848109f7b..0a15f9e23431 100644
--- a/tools/testing/selftests/memfd/Makefile
+++ b/tools/testing/selftests/memfd/Makefile
@@ -4,9 +4,8 @@ CFLAGS += -I../../../../include/uapi/
 CFLAGS += -I../../../../include/
 CFLAGS += -I../../../../usr/include/
 
-TEST_GEN_PROGS := memfd_test
+TEST_GEN_PROGS := memfd_test fuse_test fuse_mnt
 TEST_PROGS := run_fuse_test.sh run_hugetlbfs_test.sh
-TEST_GEN_FILES := fuse_mnt fuse_test
 
 fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags)
 
@@ -14,7 +13,7 @@ include ../lib.mk
 
 $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs)
 
-$(OUTPUT)/memfd_test: memfd_test.c common.o
-$(OUTPUT)/fuse_test: fuse_test.c common.o
+$(OUTPUT)/memfd_test: memfd_test.c common.c
+$(OUTPUT)/fuse_test: fuse_test.c common.c
 
-EXTRA_CLEAN = common.o
+EXTRA_CLEAN = $(OUTPUT)/common.o

From 1056d3d2c97e47397d0037cbbdf24235ae8f88cb Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Wed, 25 Mar 2020 17:16:49 -0600
Subject: [PATCH 276/327] selftests: enforce local header dependency in lib.mk

Add local header dependency in lib.mk. This enforces the dependency
blindly even when a test doesn't include the file, with the benefit
of a simpler common logic without requiring individual tests to have
special rule for it.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/lib.mk | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index 1c8a1963d03f..70f03dff5479 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -134,7 +134,8 @@ endif
 # Selftest makefiles can override those targets by setting
 # OVERRIDE_TARGETS = 1.
 ifeq ($(OVERRIDE_TARGETS),)
-$(OUTPUT)/%:%.c
+LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h
+$(OUTPUT)/%:%.c $(LOCAL_HDRS)
 	$(LINK.c) $^ $(LDLIBS) -o $@
 
 $(OUTPUT)/%.o:%.S

From a4da83c215ac4e69ef7ae9ec3ee9ef63a3edafdf Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 17 Mar 2020 15:54:23 +0100
Subject: [PATCH 277/327] IB/hfi1: Get rid of a warning

The right markup for a variable is @foo, and not @foo[].

Using a wrong markup caused this warning:

	./drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h:243: WARNING: Inline strong start-string without end-string.

Link: https://lore.kernel.org/r/9dce702510505556d75a13d9641e09218a4b4a65.1584456635.git.mchehab+huawei@kernel.org
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
index 0b3570dc606d..d324312a373c 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.h
@@ -239,7 +239,7 @@ struct opa_veswport_mactable_entry {
  * @offset: mac table starting offset
  * @num_entries: Number of entries to get or set
  * @mac_tbl_digest: mac table digest
- * @tbl_entries[]: Array of table entries
+ * @tbl_entries: Array of table entries
  *
  * The EM sends down this structure in a MAD indicating
  * the starting offset in the forwarding table that this

From 342ee59de98a2ecdf15a46849a2534e7c808eb1f Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 24 Mar 2020 08:01:39 +0200
Subject: [PATCH 278/327] IB/mlx5: Expose UAR object and its alloc/destroy
 commands

Expose UAR object and its alloc/destroy commands to be used over the ioctl
interface by user space applications.

This API supports both BF & NC modes and enables a dynamic allocation of
UARs once really needed.

As the number of driver objects were limited by the core ones when the
merged tree is prepared, had to decrease the number of core objects to
enable the new UAR object usage.

Link: https://lore.kernel.org/r/20200324060143.1569116-2-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/main.c         | 172 ++++++++++++++++++++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h      |   2 +
 include/rdma/uverbs_ioctl.h               |   2 +-
 include/uapi/rdma/mlx5_user_ioctl_cmds.h  |  18 +++
 include/uapi/rdma/mlx5_user_ioctl_verbs.h |   5 +
 5 files changed, 189 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 1f91d9e543e2..289445f6af21 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2021,6 +2021,17 @@ static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
 	return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
 }
 
+static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
+				 int uar_idx)
+{
+	unsigned int fw_uars_per_page;
+
+	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_UARS_IN_PAGE : 1;
+
+	return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
+}
+
 static int get_command(unsigned long offset)
 {
 	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
@@ -2105,6 +2116,11 @@ static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
 		mutex_unlock(&var_table->bitmap_lock);
 		kfree(mentry);
 		break;
+	case MLX5_IB_MMAP_TYPE_UAR_WC:
+	case MLX5_IB_MMAP_TYPE_UAR_NC:
+		mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
+		kfree(mentry);
+		break;
 	default:
 		WARN_ON(true);
 	}
@@ -2256,7 +2272,8 @@ static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
 
 	mentry = to_mmmap(entry);
 	pfn = (mentry->address >> PAGE_SHIFT);
-	if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR)
+	if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
+	    mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
 		prot = pgprot_noncached(vma->vm_page_prot);
 	else
 		prot = pgprot_writecombine(vma->vm_page_prot);
@@ -6078,9 +6095,9 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
 	mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
-static int var_obj_cleanup(struct ib_uobject *uobject,
-			   enum rdma_remove_reason why,
-			   struct uverbs_attr_bundle *attrs)
+static int mmap_obj_cleanup(struct ib_uobject *uobject,
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_user_mmap_entry *obj = uobject->object;
 
@@ -6088,6 +6105,16 @@ static int var_obj_cleanup(struct ib_uobject *uobject,
 	return 0;
 }
 
+static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
+					    struct mlx5_user_mmap_entry *entry,
+					    size_t length)
+{
+	return rdma_user_mmap_entry_insert_range(
+		&c->ibucontext, &entry->rdma_entry, length,
+		(MLX5_IB_MMAP_OFFSET_START << 16),
+		((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
+}
+
 static struct mlx5_user_mmap_entry *
 alloc_var_entry(struct mlx5_ib_ucontext *c)
 {
@@ -6118,10 +6145,8 @@ alloc_var_entry(struct mlx5_ib_ucontext *c)
 	entry->page_idx = page_idx;
 	entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
 
-	err = rdma_user_mmap_entry_insert_range(
-		&c->ibucontext, &entry->rdma_entry, var_table->stride_size,
-		MLX5_IB_MMAP_OFFSET_START << 16,
-		(MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1);
+	err = mlx5_rdma_user_mmap_entry_insert(c, entry,
+					       var_table->stride_size);
 	if (err)
 		goto err_insert;
 
@@ -6205,7 +6230,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY(
 			UA_MANDATORY));
 
 DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
-			    UVERBS_TYPE_ALLOC_IDR(var_obj_cleanup),
+			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
 			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
 
@@ -6217,6 +6242,134 @@ static bool var_is_supported(struct ib_device *device)
 			MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
 }
 
+static struct mlx5_user_mmap_entry *
+alloc_uar_entry(struct mlx5_ib_ucontext *c,
+		enum mlx5_ib_uapi_uar_alloc_type alloc_type)
+{
+	struct mlx5_user_mmap_entry *entry;
+	struct mlx5_ib_dev *dev;
+	u32 uar_index;
+	int err;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	dev = to_mdev(c->ibucontext.device);
+	err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
+	if (err)
+		goto end;
+
+	entry->page_idx = uar_index;
+	entry->address = uar_index2paddress(dev, uar_index);
+	if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
+	else
+		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
+
+	err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
+	if (err)
+		goto err_insert;
+
+	return entry;
+
+err_insert:
+	mlx5_cmd_free_uar(dev->mdev, uar_index);
+end:
+	kfree(entry);
+	return ERR_PTR(err);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
+	enum mlx5_ib_uapi_uar_alloc_type alloc_type;
+	struct mlx5_ib_ucontext *c;
+	struct mlx5_user_mmap_entry *entry;
+	u64 mmap_offset;
+	u32 length;
+	int err;
+
+	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	err = uverbs_get_const(&alloc_type, attrs,
+			       MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
+	if (err)
+		return err;
+
+	if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
+	    alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
+		return -EOPNOTSUPP;
+
+	if (!to_mdev(c->ibucontext.device)->wc_support &&
+	    alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+		return -EOPNOTSUPP;
+
+	entry = alloc_uar_entry(c, alloc_type);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	mmap_offset = mlx5_entry_to_mmap_offset(entry);
+	length = entry->rdma_entry.npages * PAGE_SIZE;
+	uobj->object = entry;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+			     &mmap_offset, sizeof(mmap_offset));
+	if (err)
+		goto err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+			     &entry->page_idx, sizeof(entry->page_idx));
+	if (err)
+		goto err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+			     &length, sizeof(length));
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	rdma_user_mmap_entry_remove(&entry->rdma_entry);
+	return err;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_UAR_OBJ_ALLOC,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_UAR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
+			     enum mlx5_ib_uapi_uar_alloc_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+			    UVERBS_ATTR_TYPE(u64),
+			    UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_UAR_OBJ_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_UAR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
+			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
+
 ADD_UVERBS_ATTRIBUTES_SIMPLE(
 	mlx5_ib_dm,
 	UVERBS_OBJECT_DM,
@@ -6248,6 +6401,7 @@ static const struct uapi_definition mlx5_ib_defs[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
 				UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
 	{}
 };
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 2be773a24dda..3ba6175f949e 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -124,6 +124,8 @@ enum {
 enum mlx5_ib_mmap_type {
 	MLX5_IB_MMAP_TYPE_MEMIC = 1,
 	MLX5_IB_MMAP_TYPE_VAR = 2,
+	MLX5_IB_MMAP_TYPE_UAR_WC = 3,
+	MLX5_IB_MMAP_TYPE_UAR_NC = 4,
 };
 
 struct mlx5_ib_ucontext {
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index 28570ac2b6a0..9f3b1e004046 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -173,7 +173,7 @@ enum uapi_radix_data {
 	UVERBS_API_OBJ_KEY_BITS = 5,
 	UVERBS_API_OBJ_KEY_SHIFT =
 		UVERBS_API_METHOD_KEY_BITS + UVERBS_API_METHOD_KEY_SHIFT,
-	UVERBS_API_OBJ_KEY_NUM_CORE = 24,
+	UVERBS_API_OBJ_KEY_NUM_CORE = 20,
 	UVERBS_API_OBJ_KEY_NUM_DRIVER =
 		(1 << UVERBS_API_OBJ_KEY_BITS) - UVERBS_API_OBJ_KEY_NUM_CORE,
 	UVERBS_API_OBJ_KEY_MASK = GENMASK(31, UVERBS_API_OBJ_KEY_SHIFT),
diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
index 8f4a417fc70a..24f3388c3182 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h
@@ -131,6 +131,23 @@ enum mlx5_ib_var_obj_methods {
 	MLX5_IB_METHOD_VAR_OBJ_DESTROY,
 };
 
+enum mlx5_ib_uar_alloc_attrs {
+	MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
+	MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+	MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+	MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+};
+
+enum mlx5_ib_uar_obj_destroy_attrs {
+	MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
+};
+
+enum mlx5_ib_uar_obj_methods {
+	MLX5_IB_METHOD_UAR_OBJ_ALLOC = (1U << UVERBS_ID_NS_SHIFT),
+	MLX5_IB_METHOD_UAR_OBJ_DESTROY,
+};
+
 enum mlx5_ib_devx_umem_reg_attrs {
 	MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE = (1U << UVERBS_ID_NS_SHIFT),
 	MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR,
@@ -190,6 +207,7 @@ enum mlx5_ib_objects {
 	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 	MLX5_IB_OBJECT_VAR,
 	MLX5_IB_OBJECT_PP,
+	MLX5_IB_OBJECT_UAR,
 };
 
 enum mlx5_ib_flow_matcher_create_attrs {
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index b4641a7865f7..3f7a97c28045 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -77,5 +77,10 @@ enum mlx5_ib_uapi_pp_alloc_flags {
 	MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX = 1 << 0,
 };
 
+enum mlx5_ib_uapi_uar_alloc_type {
+	MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF = 0x0,
+	MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC = 0x1,
+};
+
 #endif
 

From 64d99f6a62b98532886ede9913a026b2e2bc0419 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 24 Mar 2020 08:01:40 +0200
Subject: [PATCH 279/327] IB/mlx5: Extend CQ creation to get uar page index
 from user space

Extend CQ creation to get uar page index from user space, this mode can be
used with the UAR dynamic mode APIs to allocate/destroy a UAR object.

Link: https://lore.kernel.org/r/20200324060143.1569116-3-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c | 17 +++++++++++------
 include/uapi/rdma/mlx5-abi.h    |  4 ++++
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 367a71bc5f4b..1d184bd5c759 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -692,17 +692,19 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
 
-	ucmdlen = udata->inlen < sizeof(ucmd) ?
-		  (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd);
+	ucmdlen = min(udata->inlen, sizeof(ucmd));
+	if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags))
+		return -EINVAL;
 
 	if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
 		return -EFAULT;
 
-	if (ucmdlen == sizeof(ucmd) &&
-	    (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD)))
+	if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD |
+			    MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)))
 		return -EINVAL;
 
-	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+	if ((ucmd.cqe_size != 64 && ucmd.cqe_size != 128) ||
+	    ucmd.reserved0 || ucmd.reserved1)
 		return -EINVAL;
 
 	*cqe_size = ucmd.cqe_size;
@@ -739,7 +741,10 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = context->bfregi.sys_pages[0];
+	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)
+		*index = ucmd.uar_page_index;
+	else
+		*index = context->bfregi.sys_pages[0];
 
 	if (ucmd.cqe_comp_en == 1) {
 		int mini_cqe_format;
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 624f5b53eb1f..e900f9a64feb 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -266,6 +266,7 @@ struct mlx5_ib_query_device_resp {
 
 enum mlx5_ib_create_cq_flags {
 	MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD	= 1 << 0,
+	MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX  = 1 << 1,
 };
 
 struct mlx5_ib_create_cq {
@@ -275,6 +276,9 @@ struct mlx5_ib_create_cq {
 	__u8    cqe_comp_en;
 	__u8    cqe_comp_res_format;
 	__u16	flags;
+	__u16	uar_page_index;
+	__u16	reserved0;
+	__u32	reserved1;
 };
 
 struct mlx5_ib_create_cq_resp {

From ac42a5ee922503f99e8a71d41b0067fa19f23ca6 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 24 Mar 2020 08:01:41 +0200
Subject: [PATCH 280/327] IB/mlx5: Extend QP creation to get uar page index
 from user space

Extend QP creation to get uar page index from user space, this mode can be
used with the UAR dynamic mode APIs to allocate/destroy a UAR object.

As part of enabling this option blocked the weird/un-supported cross
channel option which uses index 0 hard-coded.

This QP flag wasn't exposed to user space as part of any formal upstream
release, the dynamic option can allow having valid UAR page index instead.

Link: https://lore.kernel.org/r/20200324060143.1569116-4-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 27 +++++++++++++++++----------
 include/uapi/rdma/mlx5-abi.h    |  1 +
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 7b4e936ad210..b708a0484f77 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -919,6 +919,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	void *qpc;
 	int err;
 	u16 uid;
+	u32 uar_flags;
 
 	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
 	if (err) {
@@ -928,24 +929,29 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
 					    ibucontext);
-	if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
+	uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX |
+				  MLX5_QP_FLAG_BFREG_INDEX);
+	switch (uar_flags) {
+	case MLX5_QP_FLAG_UAR_PAGE_INDEX:
+		uar_index = ucmd.bfreg_index;
+		bfregn = MLX5_IB_INVALID_BFREG;
+		break;
+	case MLX5_QP_FLAG_BFREG_INDEX:
 		uar_index = bfregn_to_uar_index(dev, &context->bfregi,
 						ucmd.bfreg_index, true);
 		if (uar_index < 0)
 			return uar_index;
-
 		bfregn = MLX5_IB_INVALID_BFREG;
-	} else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) {
-		/*
-		 * TBD: should come from the verbs when we have the API
-		 */
-		/* In CROSS_CHANNEL CQ and QP must use the same UAR */
-		bfregn = MLX5_CROSS_CHANNEL_BFREG;
-	}
-	else {
+		break;
+	case 0:
+		if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+			return -EINVAL;
 		bfregn = alloc_bfreg(dev, &context->bfregi);
 		if (bfregn < 0)
 			return bfregn;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
@@ -2100,6 +2106,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
 				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
 				      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+				      MLX5_QP_FLAG_UAR_PAGE_INDEX |
 				      MLX5_QP_FLAG_TYPE_DCI |
 				      MLX5_QP_FLAG_TYPE_DCT))
 			return -EINVAL;
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index e900f9a64feb..a65d60b44829 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -49,6 +49,7 @@ enum {
 	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7,
 	MLX5_QP_FLAG_ALLOW_SCATTER_CQE	= 1 << 8,
 	MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE	= 1 << 9,
+	MLX5_QP_FLAG_UAR_PAGE_INDEX = 1 << 10,
 };
 
 enum {

From 2152862298fbfd237d37c231dfca8ae8f3ed0e48 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@mellanox.com>
Date: Tue, 24 Mar 2020 08:01:42 +0200
Subject: [PATCH 281/327] IB/mlx5: Limit the scope of struct mlx5_bfreg_info to
 mlx5_ib

struct mlx5_bfreg_info is used by mlx5_ib only but is exposed to both RDMA
and netdev parts of mlx5 driver. Move that struct to mlx5_ib namespace,
clean vertical space alignment and convert lib_uar_4k from bool to
bitfield.

Link: https://lore.kernel.org/r/20200324060143.1569116-5-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 17 +++++++++++++++++
 include/linux/mlx5/driver.h          | 17 -----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 3ba6175f949e..ae6289133cd8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -128,6 +128,23 @@ enum mlx5_ib_mmap_type {
 	MLX5_IB_MMAP_TYPE_UAR_NC = 4,
 };
 
+struct mlx5_bfreg_info {
+	u32 *sys_pages;
+	int num_low_latency_bfregs;
+	unsigned int *count;
+
+	/*
+	 * protect bfreg allocation data structs
+	 */
+	struct mutex lock;
+	u32 ver;
+	u8 lib_uar_4k : 1;
+	u32 num_sys_pages;
+	u32 num_static_sys_pages;
+	u32 total_num_bfregs;
+	u32 num_dyn_bfregs;
+};
+
 struct mlx5_ib_ucontext {
 	struct ib_ucontext	ibucontext;
 	struct list_head	db_page_list;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1de78f001d26..a30d834fdf7e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -213,23 +213,6 @@ enum mlx5_port_status {
 	MLX5_PORT_DOWN      = 2,
 };
 
-struct mlx5_bfreg_info {
-	u32		       *sys_pages;
-	int			num_low_latency_bfregs;
-	unsigned int	       *count;
-
-	/*
-	 * protect bfreg allocation data structs
-	 */
-	struct mutex		lock;
-	u32			ver;
-	bool			lib_uar_4k;
-	u32			num_sys_pages;
-	u32			num_static_sys_pages;
-	u32			total_num_bfregs;
-	u32			num_dyn_bfregs;
-};
-
 struct mlx5_cmd_first {
 	__be32		data[4];
 };

From 0a2fd01c28ae490a639a32a52b81fb2df48b92a0 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@mellanox.com>
Date: Tue, 24 Mar 2020 08:01:43 +0200
Subject: [PATCH 282/327] IB/mlx5: Move to fully dynamic UAR mode once user
 space supports it

Move to fully dynamic UAR mode once user space supports it.  In this case
we prevent any legacy mode of UARs on the allocated context and prevent
redundant allocation of the static ones.

Link: https://lore.kernel.org/r/20200324060143.1569116-6-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Reviewed-by: Michael Guralnik <michaelgur@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/cq.c      |  8 ++++++--
 drivers/infiniband/hw/mlx5/main.c    | 13 ++++++++++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  1 +
 drivers/infiniband/hw/mlx5/qp.c      |  6 ++++++
 include/uapi/rdma/mlx5-abi.h         |  1 +
 5 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 1d184bd5c759..f1c7fa561b16 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -741,10 +741,14 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)
+	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
 		*index = ucmd.uar_page_index;
-	else
+	} else if (context->bfregi.lib_uar_dyn) {
+		err = -EINVAL;
+		goto err_cqb;
+	} else {
 		*index = context->bfregi.sys_pages[0];
+	}
 
 	if (ucmd.cqe_comp_en == 1) {
 		int mini_cqe_format;
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 289445f6af21..12273851f2db 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1786,6 +1786,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 				     max_cqe_version);
 	u32 dump_fill_mkey;
 	bool lib_uar_4k;
+	bool lib_uar_dyn;
 
 	if (!dev->ib_active)
 		return -EAGAIN;
@@ -1844,8 +1845,14 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	}
 
 	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
+	lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
 	bfregi = &context->bfregi;
 
+	if (lib_uar_dyn) {
+		bfregi->lib_uar_dyn = lib_uar_dyn;
+		goto uar_done;
+	}
+
 	/* updates req->total_num_bfregs */
 	err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
 	if (err)
@@ -1872,6 +1879,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	if (err)
 		goto out_sys_pages;
 
+uar_done:
 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
 		err = mlx5_ib_devx_create(dev, true);
 		if (err < 0)
@@ -1893,7 +1901,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	resp.tot_bfregs = req.total_num_bfregs;
+	resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs;
 	resp.num_ports = dev->num_ports;
 
 	if (offsetofend(typeof(resp), cqe_version) <= udata->outlen)
@@ -2141,6 +2149,9 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 	int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
 				bfregi->num_static_sys_pages;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 		return -EINVAL;
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ae6289133cd8..b88414acb993 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -139,6 +139,7 @@ struct mlx5_bfreg_info {
 	struct mutex lock;
 	u32 ver;
 	u8 lib_uar_4k : 1;
+	u8 lib_uar_dyn : 1;
 	u32 num_sys_pages;
 	u32 num_static_sys_pages;
 	u32 total_num_bfregs;
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index b708a0484f77..aa7834d80493 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -697,6 +697,9 @@ static int alloc_bfreg(struct mlx5_ib_dev *dev,
 {
 	int bfregn = -ENOMEM;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	mutex_lock(&bfregi->lock);
 	if (bfregi->ver >= 2) {
 		bfregn = alloc_high_class_bfreg(dev, bfregi);
@@ -768,6 +771,9 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
 	u32 index_of_sys_page;
 	u32 offset;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) *
 				MLX5_NON_FP_BFREGS_PER_UAR;
 	index_of_sys_page = bfregn / bfregs_per_sys_page;
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index a65d60b44829..df1cc3641bda 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -79,6 +79,7 @@ struct mlx5_ib_alloc_ucontext_req {
 
 enum mlx5_lib_caps {
 	MLX5_LIB_CAP_4K_UAR	= (__u64)1 << 0,
+	MLX5_LIB_CAP_DYN_UAR	= (__u64)1 << 1,
 };
 
 enum mlx5_ib_alloc_uctx_v2_flags {

From 5c15abc4328ad696fa61e2f3604918ed0c207755 Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Thu, 26 Mar 2020 12:38:07 -0400
Subject: [PATCH 283/327] IB/hfi1: Fix memory leaks in sysfs registration and
 unregistration

When the hfi1 driver is unloaded, kmemleak will report the following
issue:

unreferenced object 0xffff8888461a4c08 (size 8):
comm "kworker/0:0", pid 5, jiffies 4298601264 (age 2047.134s)
hex dump (first 8 bytes):
73 64 6d 61 30 00 ff ff sdma0...
backtrace:
[<00000000311a6ef5>] kvasprintf+0x62/0xd0
[<00000000ade94d9f>] kobject_set_name_vargs+0x1c/0x90
[<0000000060657dbb>] kobject_init_and_add+0x5d/0xb0
[<00000000346fe72b>] 0xffffffffa0c5ecba
[<000000006cfc5819>] 0xffffffffa0c866b9
[<0000000031c65580>] 0xffffffffa0c38e87
[<00000000e9739b3f>] local_pci_probe+0x41/0x80
[<000000006c69911d>] work_for_cpu_fn+0x16/0x20
[<00000000601267b5>] process_one_work+0x171/0x380
[<0000000049a0eefa>] worker_thread+0x1d1/0x3f0
[<00000000909cf2b9>] kthread+0xf8/0x130
[<0000000058f5f874>] ret_from_fork+0x35/0x40

This patch fixes the issue by:

- Releasing dd->per_sdma[i].kobject in hfi1_unregister_sysfs().
  - This will fix the memory leak.

- Calling kobject_put() to unwind operations only for those entries in
   dd->per_sdma[] whose operations have succeeded (including the current
   one that has just failed) in hfi1_verbs_register_sysfs().

Cc: <stable@vger.kernel.org>
Fixes: 0cb2aa690c7e ("IB/hfi1: Add sysfs interface for affinity setup")
Link: https://lore.kernel.org/r/20200326163807.21129.27371.stgit@awfm-01.aw.intel.com
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/sysfs.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 90f62c4bddba..f1bcecf92535 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -853,8 +853,13 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd)
 
 	return 0;
 bail:
-	for (i = 0; i < dd->num_sdma; i++)
-		kobject_del(&dd->per_sdma[i].kobj);
+	/*
+	 * The function kobject_put() will call kobject_del() if the kobject
+	 * has been added successfully. The sysfs files created under the
+	 * kobject directory will also be removed during the process.
+	 */
+	for (; i >= 0; i--)
+		kobject_put(&dd->per_sdma[i].kobj);
 
 	return ret;
 }
@@ -867,6 +872,10 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd)
 	struct hfi1_pportdata *ppd;
 	int i;
 
+	/* Unwind operations in hfi1_verbs_register_sysfs() */
+	for (i = 0; i < dd->num_sdma; i++)
+		kobject_put(&dd->per_sdma[i].kobj);
+
 	for (i = 0; i < dd->num_pports; i++) {
 		ppd = &dd->pport[i];
 

From dfb5394f804ed4fcea1fc925be275a38d66712ab Mon Sep 17 00:00:00 2001
From: Kaike Wan <kaike.wan@intel.com>
Date: Thu, 26 Mar 2020 12:38:14 -0400
Subject: [PATCH 284/327] IB/hfi1: Call kobject_put() when
 kobject_init_and_add() fails

When kobject_init_and_add() returns an error in the function
hfi1_create_port_files(), the function kobject_put() is not called for the
corresponding kobject, which potentially leads to memory leak.

This patch fixes the issue by calling kobject_put() even if
kobject_init_and_add() fails.

Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20200326163813.21129.44280.stgit@awfm-01.aw.intel.com
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Kaike Wan <kaike.wan@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/sysfs.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index f1bcecf92535..074ec71772d2 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -674,7 +674,11 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
 		dd_dev_err(dd,
 			   "Skipping sc2vl sysfs info, (err %d) port %u\n",
 			   ret, port_num);
-		goto bail;
+		/*
+		 * Based on the documentation for kobject_init_and_add(), the
+		 * caller should call kobject_put even if this call fails.
+		 */
+		goto bail_sc2vl;
 	}
 	kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD);
 
@@ -684,7 +688,7 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
 		dd_dev_err(dd,
 			   "Skipping sl2sc sysfs info, (err %d) port %u\n",
 			   ret, port_num);
-		goto bail_sc2vl;
+		goto bail_sl2sc;
 	}
 	kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD);
 
@@ -694,7 +698,7 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
 		dd_dev_err(dd,
 			   "Skipping vl2mtu sysfs info, (err %d) port %u\n",
 			   ret, port_num);
-		goto bail_sl2sc;
+		goto bail_vl2mtu;
 	}
 	kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD);
 
@@ -704,7 +708,7 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
 		dd_dev_err(dd,
 			   "Skipping Congestion Control sysfs info, (err %d) port %u\n",
 			   ret, port_num);
-		goto bail_vl2mtu;
+		goto bail_cc;
 	}
 
 	kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
@@ -742,7 +746,6 @@ int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
 	kobject_put(&ppd->sl2sc_kobj);
 bail_sc2vl:
 	kobject_put(&ppd->sc2vl_kobj);
-bail:
 	return ret;
 }
 

From 24670b1a31661815777c2e88b94c162e47ea43fc Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Tue, 24 Mar 2020 08:14:24 +0200
Subject: [PATCH 285/327] net/mlx5: Add support for RDMA TX steering

Add new RDMA TX flow steering namespace. Flow steering rules in
this namespace are used to filter transmitted RDMA traffic.

Link: https://lore.kernel.org/r/20200324061425.1570190-2-leon@kernel.org
Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/fs_cmd.c  |  1 +
 .../net/ethernet/mellanox/mlx5/core/fs_core.c | 53 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/fs_core.h |  7 ++-
 include/linux/mlx5/device.h                   |  6 +++
 include/linux/mlx5/fs.h                       |  1 +
 include/linux/mlx5/mlx5_ifc.h                 |  2 +-
 6 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index b25465d9e030..90048697b2ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -904,6 +904,7 @@ const struct mlx5_flow_cmds *mlx5_fs_cmd_get_default(enum fs_flow_table_type typ
 	case FS_FT_SNIFFER_TX:
 	case FS_FT_NIC_TX:
 	case FS_FT_RDMA_RX:
+	case FS_FT_RDMA_TX:
 		return mlx5_fs_cmd_get_fw_cmds();
 	default:
 		return mlx5_fs_cmd_get_stub_cmds();
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9dc24241dc91..98c74a867ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -87,6 +87,15 @@
 			       .identified_miss_table_mode),                   \
 		FS_CAP(flow_table_properties_nic_transmit.flow_table_modify))
 
+#define FS_CHAINING_CAPS_RDMA_TX                                                \
+	FS_REQUIRED_CAPS(                                                       \
+		FS_CAP(flow_table_properties_nic_transmit_rdma.flow_modify_en), \
+		FS_CAP(flow_table_properties_nic_transmit_rdma.modify_root),    \
+		FS_CAP(flow_table_properties_nic_transmit_rdma                  \
+			       .identified_miss_table_mode),                    \
+		FS_CAP(flow_table_properties_nic_transmit_rdma                  \
+			       .flow_table_modify))
+
 #define LEFTOVERS_NUM_LEVELS 1
 #define LEFTOVERS_NUM_PRIOS 1
 
@@ -202,6 +211,18 @@ static struct init_tree_node rdma_rx_root_fs = {
 	}
 };
 
+static struct init_tree_node rdma_tx_root_fs = {
+	.type = FS_TYPE_NAMESPACE,
+	.ar_size = 1,
+	.children = (struct init_tree_node[]) {
+		ADD_PRIO(0, MLX5_BY_PASS_NUM_PRIOS, 0,
+			 FS_CHAINING_CAPS_RDMA_TX,
+			 ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
+				ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS,
+						  BY_PASS_PRIO_NUM_LEVELS))),
+	}
+};
+
 enum fs_i_lock_class {
 	FS_LOCK_GRANDPARENT,
 	FS_LOCK_PARENT,
@@ -2132,6 +2153,8 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 	} else if (type == MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL) {
 		root_ns = steering->rdma_rx_root_ns;
 		prio = RDMA_RX_KERNEL_PRIO;
+	} else if (type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+		root_ns = steering->rdma_tx_root_ns;
 	} else { /* Must be NIC RX */
 		root_ns = steering->root_ns;
 		prio = type;
@@ -2535,6 +2558,7 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_root_ns(steering->sniffer_rx_root_ns);
 	cleanup_root_ns(steering->sniffer_tx_root_ns);
 	cleanup_root_ns(steering->rdma_rx_root_ns);
+	cleanup_root_ns(steering->rdma_tx_root_ns);
 	cleanup_root_ns(steering->egress_root_ns);
 	mlx5_cleanup_fc_stats(dev);
 	kmem_cache_destroy(steering->ftes_cache);
@@ -2591,6 +2615,29 @@ static int init_rdma_rx_root_ns(struct mlx5_flow_steering *steering)
 	return err;
 }
 
+static int init_rdma_tx_root_ns(struct mlx5_flow_steering *steering)
+{
+	int err;
+
+	steering->rdma_tx_root_ns = create_root_ns(steering, FS_FT_RDMA_TX);
+	if (!steering->rdma_tx_root_ns)
+		return -ENOMEM;
+
+	err = init_root_tree(steering, &rdma_tx_root_fs,
+			     &steering->rdma_tx_root_ns->ns.node);
+	if (err)
+		goto out_err;
+
+	set_prio_attrs(steering->rdma_tx_root_ns);
+
+	return 0;
+
+out_err:
+	cleanup_root_ns(steering->rdma_tx_root_ns);
+	steering->rdma_tx_root_ns = NULL;
+	return err;
+}
+
 /* FT and tc chains are stored in the same array so we can re-use the
  * mlx5_get_fdb_sub_ns() and tc api for FT chains.
  * When creating a new ns for each chain store it in the first available slot.
@@ -2890,6 +2937,12 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 			goto err;
 	}
 
+	if (MLX5_CAP_FLOWTABLE_RDMA_TX(dev, ft_support)) {
+		err = init_rdma_tx_root_ns(steering);
+		if (err)
+			goto err;
+	}
+
 	if (MLX5_IPSEC_DEV(dev) || MLX5_CAP_FLOWTABLE_NIC_TX(dev, ft_support)) {
 		err = init_egress_root_ns(steering);
 		if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index be5f5e32c1e8..508108c58dae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -86,7 +86,8 @@ enum fs_flow_table_type {
 	FS_FT_SNIFFER_RX	= 0X5,
 	FS_FT_SNIFFER_TX	= 0X6,
 	FS_FT_RDMA_RX		= 0X7,
-	FS_FT_MAX_TYPE = FS_FT_RDMA_RX,
+	FS_FT_RDMA_TX		= 0X8,
+	FS_FT_MAX_TYPE = FS_FT_RDMA_TX,
 };
 
 enum fs_flow_table_op_mod {
@@ -116,6 +117,7 @@ struct mlx5_flow_steering {
 	struct mlx5_flow_root_namespace	*sniffer_tx_root_ns;
 	struct mlx5_flow_root_namespace	*sniffer_rx_root_ns;
 	struct mlx5_flow_root_namespace	*rdma_rx_root_ns;
+	struct mlx5_flow_root_namespace	*rdma_tx_root_ns;
 	struct mlx5_flow_root_namespace	*egress_root_ns;
 };
 
@@ -316,7 +318,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 	(type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) :		\
 	(type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) :		\
 	(type == FS_FT_RDMA_RX) ? MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) :		\
-	(BUILD_BUG_ON_ZERO(FS_FT_RDMA_RX != FS_FT_MAX_TYPE))\
+	(type == FS_FT_RDMA_TX) ? MLX5_CAP_FLOWTABLE_RDMA_TX(mdev, cap) :      \
+	(BUILD_BUG_ON_ZERO(FS_FT_RDMA_TX != FS_FT_MAX_TYPE))\
 	)
 
 #endif
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 0e62c3db45e5..2b90097a6cf9 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1211,6 +1211,12 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_FLOWTABLE_RDMA_RX_MAX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_rdma.cap)
 
+#define MLX5_CAP_FLOWTABLE_RDMA_TX(mdev, cap) \
+	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit_rdma.cap)
+
+#define MLX5_CAP_FLOWTABLE_RDMA_TX_MAX(mdev, cap) \
+	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_rdma.cap)
+
 #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
 		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 4cae16016b2b..44c9fe792fc4 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -77,6 +77,7 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_EGRESS,
 	MLX5_FLOW_NAMESPACE_RDMA_RX,
 	MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL,
+	MLX5_FLOW_NAMESPACE_RDMA_TX,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 9b8ff4e57002..600366ef2d84 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -705,7 +705,7 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit;
 
-	u8         reserved_at_a00[0x200];
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_rdma;
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
 

From af9c38411d188021900031d00bd8e8dafd4ad557 Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@mellanox.com>
Date: Tue, 24 Mar 2020 08:14:25 +0200
Subject: [PATCH 286/327] RDMA/mlx5: Add support for RDMA TX flow table

Enable user application to add rules for RDMA TX steering table.
Rules in this steering table will allow to steer transmitted RDMA
traffic.

Link: https://lore.kernel.org/r/20200324061425.1570190-3-leon@kernel.org
Signed-off-by: Michael Guralnik <michaelgur@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/flow.c         | 3 +++
 drivers/infiniband/hw/mlx5/main.c         | 7 +++++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h      | 1 +
 include/uapi/rdma/mlx5_user_ioctl_verbs.h | 1 +
 4 files changed, 12 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
index dbee17d22d50..862b7bf3e646 100644
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ b/drivers/infiniband/hw/mlx5/flow.c
@@ -35,6 +35,9 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
 	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
 		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
 		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_TX;
+		break;
 	default:
 		return -EINVAL;
 	}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fce863621414..96dcaecf69f2 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4045,6 +4045,11 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 			BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
 						       log_max_ft_size));
 		priority = fs_matcher->priority;
+	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) {
+		max_table_size =
+			BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
+						       log_max_ft_size));
+		priority = fs_matcher->priority;
 	}
 
 	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
@@ -4061,6 +4066,8 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 		prio = &dev->flow_db->fdb;
 	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
 		prio = &dev->flow_db->rdma_rx[priority];
+	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX)
+		prio = &dev->flow_db->rdma_tx[priority];
 
 	if (!prio)
 		return ERR_PTR(-EINVAL);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 89a050e516a8..6fe01d6142aa 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -210,6 +210,7 @@ struct mlx5_ib_flow_db {
 	struct mlx5_ib_flow_prio	egress[MLX5_IB_NUM_EGRESS_FTS];
 	struct mlx5_ib_flow_prio	fdb;
 	struct mlx5_ib_flow_prio	rdma_rx[MLX5_IB_NUM_FLOW_FT];
+	struct mlx5_ib_flow_prio	rdma_tx[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_flow_table		*lag_demux_ft;
 	/* Protect flow steering bypass flow tables
 	 * when add/del flow rules.
diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
index 88b6ca70c2fe..7ab4b92966e2 100644
--- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h
+++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h
@@ -44,6 +44,7 @@ enum mlx5_ib_uapi_flow_table_type {
 	MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX	= 0x1,
 	MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB	= 0x2,
 	MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX	= 0x3,
+	MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX	= 0x4,
 };
 
 enum mlx5_ib_uapi_flow_action_packet_reformat_type {

From 1edaa447d958bec24c6a79685a5790d98976fd16 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 27 Mar 2020 07:22:36 -0400
Subject: [PATCH 287/327] dm writecache: add cond_resched to avoid CPU hangs

Initializing a dm-writecache device can take a long time when the
persistent memory device is large.  Add cond_resched() to a few loops
to avoid warnings that the CPU is stuck.

Cc: stable@vger.kernel.org # v4.18+
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-writecache.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index e274e5a4d425..114927da9cc9 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -925,6 +925,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
 		struct wc_entry *e = &wc->entries[b];
 		e->index = b;
 		e->write_in_progress = false;
+		cond_resched();
 	}
 
 	return 0;
@@ -979,6 +980,7 @@ static void writecache_resume(struct dm_target *ti)
 			e->original_sector = le64_to_cpu(wme.original_sector);
 			e->seq_count = le64_to_cpu(wme.seq_count);
 		}
+		cond_resched();
 	}
 #endif
 	for (b = 0; b < wc->n_blocks; b++) {
@@ -1886,8 +1888,10 @@ static int init_memory(struct dm_writecache *wc)
 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
 
-	for (b = 0; b < wc->n_blocks; b++)
+	for (b = 0; b < wc->n_blocks; b++) {
 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
+		cond_resched();
+	}
 
 	writecache_flush_all_metadata(wc);
 	writecache_commit_flushed(wc, false);

From 4b5142905d4ff58a4b93f7c8eaa7ba829c0a53c9 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Fri, 27 Mar 2020 16:01:08 +0200
Subject: [PATCH 288/327] dm clone: Fix handling of partial region discards

There is a bug in the way dm-clone handles discards, which can lead to
discarding the wrong blocks or trying to discard blocks beyond the end
of the device.

This could lead to data corruption, if the destination device indeed
discards the underlying blocks, i.e., if the discard operation results
in the original contents of a block to be lost.

The root of the problem is the code that calculates the range of regions
covered by a discard request and decides which regions to discard.

Since dm-clone handles the device in units of regions, we don't discard
parts of a region, only whole regions.

The range is calculated as:

    rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
    re = bio_end_sector(bio) >> clone->region_shift;

, where 'rs' is the first region to discard and (re - rs) is the number
of regions to discard.

The bug manifests when we try to discard part of a single region, i.e.,
when we try to discard a block with size < region_size, and the discard
request both starts at an offset with respect to the beginning of that
region and ends before the end of the region.

The root cause is the following comparison:

  if (rs == re)
    // skip discard and complete original bio immediately

, which doesn't take into account that 'rs' might be greater than 're'.

Thus, we then issue a discard request for the wrong blocks, instead of
skipping the discard all together.

Fix the check to also take into account the above case, so we don't end
up discarding the wrong blocks.

Also, add some range checks to dm_clone_set_region_hydrated() and
dm_clone_cond_set_range(), which update dm-clone's region bitmap.

Note that the aforementioned bug doesn't cause invalid memory accesses,
because dm_clone_is_range_hydrated() returns True for this case, so the
checks are just precautionary.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-metadata.c | 13 ++++++++++
 drivers/md/dm-clone-target.c   | 43 +++++++++++++++++++++++-----------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index c05b12110456..199e7af00858 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -850,6 +850,12 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
 	struct dirty_map *dmap;
 	unsigned long word, flags;
 
+	if (unlikely(region_nr >= cmd->nr_regions)) {
+		DMERR("Region %lu out of range (total number of regions %lu)",
+		      region_nr, cmd->nr_regions);
+		return -ERANGE;
+	}
+
 	word = region_nr / BITS_PER_LONG;
 
 	spin_lock_irqsave(&cmd->bitmap_lock, flags);
@@ -879,6 +885,13 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
 	struct dirty_map *dmap;
 	unsigned long word, region_nr;
 
+	if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start ||
+		     (start + nr_regions) > cmd->nr_regions)) {
+		DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)",
+		      start, nr_regions, cmd->nr_regions);
+		return -ERANGE;
+	}
+
 	spin_lock_irq(&cmd->bitmap_lock);
 
 	if (cmd->read_only) {
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index d1e1b5b56b1b..022dddcad647 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -293,10 +293,17 @@ static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
 
 /* Get the region range covered by the bio */
 static void bio_region_range(struct clone *clone, struct bio *bio,
-			     unsigned long *rs, unsigned long *re)
+			     unsigned long *rs, unsigned long *nr_regions)
 {
+	unsigned long end;
+
 	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
-	*re = bio_end_sector(bio) >> clone->region_shift;
+	end = bio_end_sector(bio) >> clone->region_shift;
+
+	if (*rs >= end)
+		*nr_regions = 0;
+	else
+		*nr_regions = end - *rs;
 }
 
 /* Check whether a bio overwrites a region */
@@ -454,7 +461,7 @@ static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
 
 static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
 {
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 
 	/*
 	 * If the destination device supports discards, remap and trim the
@@ -463,9 +470,9 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
 	 */
 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
 		remap_to_dest(clone, bio);
-		bio_region_range(clone, bio, &rs, &re);
+		bio_region_range(clone, bio, &rs, &nr_regions);
 		trim_bio(bio, rs << clone->region_shift,
-			 (re - rs) << clone->region_shift);
+			 nr_regions << clone->region_shift);
 		generic_make_request(bio);
 	} else
 		bio_endio(bio);
@@ -473,12 +480,21 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
 
 static void process_discard_bio(struct clone *clone, struct bio *bio)
 {
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 
-	bio_region_range(clone, bio, &rs, &re);
-	BUG_ON(re > clone->nr_regions);
+	bio_region_range(clone, bio, &rs, &nr_regions);
+	if (!nr_regions) {
+		bio_endio(bio);
+		return;
+	}
 
-	if (unlikely(rs == re)) {
+	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
+		    (rs + nr_regions) > clone->nr_regions)) {
+		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
+		      clone_device_name(clone), rs, nr_regions,
+		      clone->nr_regions,
+		      (unsigned long long)bio->bi_iter.bi_sector,
+		      bio_sectors(bio));
 		bio_endio(bio);
 		return;
 	}
@@ -487,7 +503,7 @@ static void process_discard_bio(struct clone *clone, struct bio *bio)
 	 * The covered regions are already hydrated so we just need to pass
 	 * down the discard.
 	 */
-	if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
+	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
 		complete_discard_bio(clone, bio, true);
 		return;
 	}
@@ -1169,7 +1185,7 @@ static void process_deferred_discards(struct clone *clone)
 	int r = -EPERM;
 	struct bio *bio;
 	struct blk_plug plug;
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 	struct bio_list discards = BIO_EMPTY_LIST;
 
 	spin_lock_irq(&clone->lock);
@@ -1185,14 +1201,13 @@ static void process_deferred_discards(struct clone *clone)
 
 	/* Update the metadata */
 	bio_list_for_each(bio, &discards) {
-		bio_region_range(clone, bio, &rs, &re);
+		bio_region_range(clone, bio, &rs, &nr_regions);
 		/*
 		 * A discard request might cover regions that have been already
 		 * hydrated. There is no need to update the metadata for these
 		 * regions.
 		 */
-		r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
-
+		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
 		if (unlikely(r))
 			break;
 	}

From cd481c12269b4d276f1a52eda0ebd419079bfe3a Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Fri, 27 Mar 2020 16:01:09 +0200
Subject: [PATCH 289/327] dm clone: Add overflow check for number of regions

Add overflow check for clone->nr_regions variable, which holds the
number of regions of the target.

The overflow can occur with sufficiently large devices, if BITS_PER_LONG
== 32. E.g., if the region size is 8 sectors (4K), the overflow would
occur for device sizes > 34359738360 sectors (~16TB).

This could result in multiple device sectors wrongly mapping to the same
region number, due to the truncation from 64 bits to 32 bits, which
would lead to data corruption.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-target.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 022dddcad647..6ee85fb3388a 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1790,6 +1790,7 @@ static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char
 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
+	sector_t nr_regions;
 	struct clone *clone;
 	struct dm_arg_set as;
 
@@ -1831,7 +1832,16 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto out_with_source_dev;
 
 	clone->region_shift = __ffs(clone->region_size);
-	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+	nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+	/* Check for overflow */
+	if (nr_regions != (unsigned long)nr_regions) {
+		ti->error = "Too many regions. Consider increasing the region size";
+		r = -EOVERFLOW;
+		goto out_with_source_dev;
+	}
+
+	clone->nr_regions = nr_regions;
 
 	r = validate_nr_regions(clone->nr_regions, &ti->error);
 	if (r)

From 9fc06ff56845cc5ccafec52f545fc2e08d22f849 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Fri, 27 Mar 2020 16:01:10 +0200
Subject: [PATCH 290/327] dm clone: Add missing casts to prevent overflows and
 data corruption

Add missing casts when converting from regions to sectors.

In case BITS_PER_LONG == 32, the lack of the appropriate casts can lead
to overflows and miscalculation of the device sector.

As a result, we could end up discarding and/or copying the wrong parts
of the device, thus corrupting the device's data.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-target.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 6ee85fb3388a..ca5020c58f7c 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -282,7 +282,7 @@ static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
 /* Get the address of the region in sectors */
 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
 {
-	return (region_nr << clone->region_shift);
+	return ((sector_t)region_nr << clone->region_shift);
 }
 
 /* Get the region number of the bio */
@@ -471,7 +471,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
 		remap_to_dest(clone, bio);
 		bio_region_range(clone, bio, &rs, &nr_regions);
-		trim_bio(bio, rs << clone->region_shift,
+		trim_bio(bio, region_to_sector(clone, rs),
 			 nr_regions << clone->region_shift);
 		generic_make_request(bio);
 	} else
@@ -804,11 +804,14 @@ static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr
 	struct dm_io_region from, to;
 	struct clone *clone = hd->clone;
 
+	if (WARN_ON(!nr_regions))
+		return;
+
 	region_size = clone->region_size;
 	region_start = hd->region_nr;
 	region_end = region_start + nr_regions - 1;
 
-	total_size = (nr_regions - 1) << clone->region_shift;
+	total_size = region_to_sector(clone, nr_regions - 1);
 
 	if (region_end == clone->nr_regions - 1) {
 		/*

From 81d5553d1288c2ec0390f02f84d71ca0f0f9f137 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Fri, 27 Mar 2020 16:01:11 +0200
Subject: [PATCH 291/327] dm clone metadata: Fix return type of
 dm_clone_nr_of_hydrated_regions()

dm_clone_nr_of_hydrated_regions() returns the number of regions that
have been hydrated so far. In order to do so it employs bitmap_weight().

Until now, the return type of dm_clone_nr_of_hydrated_regions() was
unsigned long.

Because bitmap_weight() returns an int, in case BITS_PER_LONG == 64 and
the return value of bitmap_weight() is 2^31 (the maximum allowed number
of regions for a device), the result is sign extended from 32 bits to 64
bits and an incorrect value is displayed, in the status output of
dm-clone, as the number of hydrated regions.

Fix this by having dm_clone_nr_of_hydrated_regions() return an unsigned
int.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-metadata.c | 2 +-
 drivers/md/dm-clone-metadata.h | 2 +-
 drivers/md/dm-clone-target.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index 199e7af00858..17712456fa63 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -656,7 +656,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 	return (bit >= (start + nr_regions));
 }
 
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
 {
 	return bitmap_weight(cmd->region_map, cmd->nr_regions);
 }
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 14af1ebd853f..d848b8799c07 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -156,7 +156,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 /*
  * Returns the number of hydrated regions.
  */
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
 
 /*
  * Returns the first unhydrated region with region_nr >= @start
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index ca5020c58f7c..5ce96ddf1ce1 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1473,7 +1473,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
 			goto error;
 		}
 
-		DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
 		       DM_CLONE_METADATA_BLOCK_SIZE,
 		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
 		       (unsigned long long)nr_metadata_blocks,

From 068354ade5dd9e2b07d9b0c57055a681db6f4e37 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:13 -0300
Subject: [PATCH 292/327] mm/hmm: remove pgmap checking for devmap pages

The checking boils down to some racy check if the pagemap is still
available or not. Instead of checking this, rely entirely on the
notifiers, if a pagemap is destroyed then all pages that belong to it must
be removed from the tables and the notifiers triggered.

Link: https://lore.kernel.org/r/20200327200021.29372-2-jgg@ziepe.ca
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 50 ++------------------------------------------------
 1 file changed, 2 insertions(+), 48 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index a491d9aaafe4..3a2610e07133 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -28,7 +28,6 @@
 
 struct hmm_vma_walk {
 	struct hmm_range	*range;
-	struct dev_pagemap	*pgmap;
 	unsigned long		last;
 	unsigned int		flags;
 };
@@ -196,19 +195,8 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 		return hmm_vma_fault(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
-		if (pmd_devmap(pmd)) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap))
-				return -EBUSY;
-		}
+	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
 		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
-	}
-	if (hmm_vma_walk->pgmap) {
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	hmm_vma_walk->last = end;
 	return 0;
 }
@@ -300,15 +288,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	if (fault || write_fault)
 		goto fault;
 
-	if (pte_devmap(pte)) {
-		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
-					      hmm_vma_walk->pgmap);
-		if (unlikely(!hmm_vma_walk->pgmap)) {
-			pte_unmap(ptep);
-			return -EBUSY;
-		}
-	}
-
 	/*
 	 * Since each architecture defines a struct page for the zero page, just
 	 * fall through and treat it like a normal page.
@@ -328,10 +307,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	return 0;
 
 fault:
-	if (hmm_vma_walk->pgmap) {
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
 	return hmm_vma_fault(addr, end, fault, write_fault, walk);
@@ -418,16 +393,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 			return r;
 		}
 	}
-	if (hmm_vma_walk->pgmap) {
-		/*
-		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
-		 * so that we can leverage get_dev_pagemap() optimization which
-		 * will not re-take a reference on a pgmap if we already have
-		 * one.
-		 */
-		put_dev_pagemap(hmm_vma_walk->pgmap);
-		hmm_vma_walk->pgmap = NULL;
-	}
 	pte_unmap(ptep - 1);
 
 	hmm_vma_walk->last = addr;
@@ -491,20 +456,9 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 		}
 
 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-		for (i = 0; i < npages; ++i, ++pfn) {
-			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
-					      hmm_vma_walk->pgmap);
-			if (unlikely(!hmm_vma_walk->pgmap)) {
-				ret = -EBUSY;
-				goto out_unlock;
-			}
+		for (i = 0; i < npages; ++i, ++pfn)
 			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 				  cpu_flags;
-		}
-		if (hmm_vma_walk->pgmap) {
-			put_dev_pagemap(hmm_vma_walk->pgmap);
-			hmm_vma_walk->pgmap = NULL;
-		}
 		hmm_vma_walk->last = end;
 		goto out_unlock;
 	}

From a3eb13c1579ba97d79fbbc98bc5b1296a3688a25 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:14 -0300
Subject: [PATCH 293/327] mm/hmm: return the fault type from
 hmm_pte_need_fault()

Using two bools instead of flags return is not necessary and leads to
bugs. Returning a value is easier for the compiler to check and easier to
pass around the code flow.

Convert the two bools into flags and push the change to all callers.

Link: https://lore.kernel.org/r/20200327200021.29372-3-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 183 ++++++++++++++++++++++++-------------------------------
 1 file changed, 81 insertions(+), 102 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 3a2610e07133..d208ddd35106 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -32,6 +32,12 @@ struct hmm_vma_walk {
 	unsigned int		flags;
 };
 
+enum {
+	HMM_NEED_FAULT = 1 << 0,
+	HMM_NEED_WRITE_FAULT = 1 << 1,
+	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
+};
+
 static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 		struct hmm_range *range, enum hmm_pfn_value_e value)
 {
@@ -49,8 +55,7 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
  * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s)
  * @addr: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
- * @fault: should we fault or not ?
- * @write_fault: write fault ?
+ * @required_fault: HMM_NEED_* flags
  * @walk: mm_walk structure
  * Return: -EBUSY after page fault, or page fault error
  *
@@ -58,8 +63,7 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end,
  * or whenever there is no page directory covering the virtual address range.
  */
 static int hmm_vma_fault(unsigned long addr, unsigned long end,
-			      bool fault, bool write_fault,
-			      struct mm_walk *walk)
+			 unsigned int required_fault, struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
@@ -68,13 +72,13 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
 	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 
-	WARN_ON_ONCE(!fault && !write_fault);
+	WARN_ON_ONCE(!required_fault);
 	hmm_vma_walk->last = addr;
 
 	if (!vma)
 		goto out_error;
 
-	if (write_fault) {
+	if (required_fault & HMM_NEED_WRITE_FAULT) {
 		if (!(vma->vm_flags & VM_WRITE))
 			return -EPERM;
 		fault_flags |= FAULT_FLAG_WRITE;
@@ -91,14 +95,13 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 	return -EFAULT;
 }
 
-static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
-				      uint64_t pfns, uint64_t cpu_flags,
-				      bool *fault, bool *write_fault)
+static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+				       uint64_t pfns, uint64_t cpu_flags)
 {
 	struct hmm_range *range = hmm_vma_walk->range;
 
 	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
-		return;
+		return 0;
 
 	/*
 	 * So we not only consider the individual per page request we also
@@ -114,37 +117,37 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 
 	/* We aren't ask to do anything ... */
 	if (!(pfns & range->flags[HMM_PFN_VALID]))
-		return;
+		return 0;
 
-	/* If CPU page table is not valid then we need to fault */
-	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
 	/* Need to write fault ? */
 	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
-	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
-		*write_fault = true;
-		*fault = true;
-	}
+	    !(cpu_flags & range->flags[HMM_PFN_WRITE]))
+		return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
+
+	/* If CPU page table is not valid then we need to fault */
+	if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
+		return HMM_NEED_FAULT;
+	return 0;
 }
 
-static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
-				 const uint64_t *pfns, unsigned long npages,
-				 uint64_t cpu_flags, bool *fault,
-				 bool *write_fault)
+static unsigned int
+hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+		     const uint64_t *pfns, unsigned long npages,
+		     uint64_t cpu_flags)
 {
+	unsigned int required_fault = 0;
 	unsigned long i;
 
-	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
-		*fault = *write_fault = false;
-		return;
-	}
+	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
+		return 0;
 
-	*fault = *write_fault = false;
 	for (i = 0; i < npages; ++i) {
-		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
-				   fault, write_fault);
-		if ((*write_fault))
-			return;
+		required_fault |=
+			hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
+		if (required_fault == HMM_NEED_ALL_BITS)
+			return required_fault;
 	}
+	return required_fault;
 }
 
 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
@@ -152,17 +155,16 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	bool fault, write_fault;
+	unsigned int required_fault;
 	unsigned long i, npages;
 	uint64_t *pfns;
 
 	i = (addr - range->start) >> PAGE_SHIFT;
 	npages = (end - addr) >> PAGE_SHIFT;
 	pfns = &range->pfns[i];
-	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
-			     0, &fault, &write_fault);
-	if (fault || write_fault)
-		return hmm_vma_fault(addr, end, fault, write_fault, walk);
+	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+	if (required_fault)
+		return hmm_vma_fault(addr, end, required_fault, walk);
 	hmm_vma_walk->last = addr;
 	return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
 }
@@ -183,16 +185,15 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned long pfn, npages, i;
-	bool fault, write_fault;
+	unsigned int required_fault;
 	uint64_t cpu_flags;
 
 	npages = (end - addr) >> PAGE_SHIFT;
 	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
-	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
-			     &fault, &write_fault);
-
-	if (fault || write_fault)
-		return hmm_vma_fault(addr, end, fault, write_fault, walk);
+	required_fault =
+		hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
+	if (required_fault)
+		return hmm_vma_fault(addr, end, required_fault, walk);
 
 	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
@@ -229,18 +230,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
-	bool fault, write_fault;
+	unsigned int required_fault;
 	uint64_t cpu_flags;
 	pte_t pte = *ptep;
 	uint64_t orig_pfn = *pfn;
 
 	*pfn = range->values[HMM_PFN_NONE];
-	fault = write_fault = false;
-
 	if (pte_none(pte)) {
-		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
-				   &fault, &write_fault);
-		if (fault || write_fault)
+		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+		if (required_fault)
 			goto fault;
 		return 0;
 	}
@@ -261,9 +259,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			return 0;
 		}
 
-		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
-				   &write_fault);
-		if (!fault && !write_fault)
+		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+		if (!required_fault)
 			return 0;
 
 		if (!non_swap_entry(entry))
@@ -283,9 +280,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	}
 
 	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
-	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, &fault,
-			   &write_fault);
-	if (fault || write_fault)
+	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+	if (required_fault)
 		goto fault;
 
 	/*
@@ -293,9 +289,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	 * fall through and treat it like a normal page.
 	 */
 	if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
-		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, &fault,
-				   &write_fault);
-		if (fault || write_fault) {
+		if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
 			pte_unmap(ptep);
 			return -EFAULT;
 		}
@@ -309,7 +303,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 fault:
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
-	return hmm_vma_fault(addr, end, fault, write_fault, walk);
+	return hmm_vma_fault(addr, end, required_fault, walk);
 }
 
 static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -322,7 +316,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
 	unsigned long npages = (end - start) >> PAGE_SHIFT;
 	unsigned long addr = start;
-	bool fault, write_fault;
 	pte_t *ptep;
 	pmd_t pmd;
 
@@ -332,9 +325,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_vma_walk_hole(start, end, -1, walk);
 
 	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
-		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
-				     0, &fault, &write_fault);
-		if (fault || write_fault) {
+		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
 			hmm_vma_walk->last = addr;
 			pmd_migration_entry_wait(walk->mm, pmdp);
 			return -EBUSY;
@@ -343,9 +334,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	}
 
 	if (!pmd_present(pmd)) {
-		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
-				     &write_fault);
-		if (fault || write_fault)
+		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
 			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
@@ -375,9 +364,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 	 * recover.
 	 */
 	if (pmd_bad(pmd)) {
-		hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0, &fault,
-				     &write_fault);
-		if (fault || write_fault)
+		if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
 			return -EFAULT;
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
@@ -434,8 +421,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 
 	if (pud_huge(pud) && pud_devmap(pud)) {
 		unsigned long i, npages, pfn;
+		unsigned int required_fault;
 		uint64_t *pfns, cpu_flags;
-		bool fault, write_fault;
 
 		if (!pud_present(pud)) {
 			spin_unlock(ptl);
@@ -447,12 +434,11 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
 		pfns = &range->pfns[i];
 
 		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
-		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
-				     cpu_flags, &fault, &write_fault);
-		if (fault || write_fault) {
+		required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
+						      npages, cpu_flags);
+		if (required_fault) {
 			spin_unlock(ptl);
-			return hmm_vma_fault(addr, end, fault, write_fault,
-						  walk);
+			return hmm_vma_fault(addr, end, required_fault, walk);
 		}
 
 		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -484,7 +470,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
 	uint64_t orig_pfn, cpu_flags;
-	bool fault, write_fault;
+	unsigned int required_fault;
 	spinlock_t *ptl;
 	pte_t entry;
 
@@ -495,12 +481,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 	orig_pfn = range->pfns[i];
 	range->pfns[i] = range->values[HMM_PFN_NONE];
 	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
-	fault = write_fault = false;
-	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
-			   &fault, &write_fault);
-	if (fault || write_fault) {
+	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+	if (required_fault) {
 		spin_unlock(ptl);
-		return hmm_vma_fault(addr, end, fault, write_fault, walk);
+		return hmm_vma_fault(addr, end, required_fault, walk);
 	}
 
 	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
@@ -522,37 +506,32 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
 	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
 
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+	    vma->vm_flags & VM_READ)
+		return 0;
+
 	/*
-	 * Skip vma ranges that don't have struct page backing them or map I/O
-	 * devices directly.
+	 * vma ranges that don't have struct page backing them or map I/O
+	 * devices directly cannot be handled by hmm_range_fault().
 	 *
 	 * If the vma does not allow read access, then assume that it does not
 	 * allow write access either. HMM does not support architectures that
 	 * allow write without read.
+	 *
+	 * If a fault is requested for an unsupported range then it is a hard
+	 * failure.
 	 */
-	if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) ||
-	    !(vma->vm_flags & VM_READ)) {
-		bool fault, write_fault;
+	if (hmm_range_need_fault(hmm_vma_walk,
+				 range->pfns +
+					 ((start - range->start) >> PAGE_SHIFT),
+				 (end - start) >> PAGE_SHIFT, 0))
+		return -EFAULT;
 
-		/*
-		 * Check to see if a fault is requested for any page in the
-		 * range.
-		 */
-		hmm_range_need_fault(hmm_vma_walk, range->pfns +
-					((start - range->start) >> PAGE_SHIFT),
-					(end - start) >> PAGE_SHIFT,
-					0, &fault, &write_fault);
-		if (fault || write_fault)
-			return -EFAULT;
+	hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
+	hmm_vma_walk->last = end;
 
-		hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
-		hmm_vma_walk->last = end;
-
-		/* Skip this vma and continue processing the next vma. */
-		return 1;
-	}
-
-	return 0;
+	/* Skip this vma and continue processing the next vma. */
+	return 1;
 }
 
 static const struct mm_walk_ops hmm_walk_ops = {

From f970b977e068aa54e6eaf916a964a0abaf028afe Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:15 -0300
Subject: [PATCH 294/327] mm/hmm: remove unused code and tidy comments

Delete several functions that are never called, fix some desync between
comments and structure content, toss the now out of date top of file
header, and move one function only used by hmm.c into hmm.c

Link: https://lore.kernel.org/r/20200327200021.29372-4-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 include/linux/hmm.h | 104 +-------------------------------------------
 mm/hmm.c            |  24 +++++++---
 2 files changed, 19 insertions(+), 109 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bb6be4428633..daee6508a3f6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -3,58 +3,8 @@
  * Copyright 2013 Red Hat Inc.
  *
  * Authors: Jérôme Glisse <jglisse@redhat.com>
- */
-/*
- * Heterogeneous Memory Management (HMM)
  *
- * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it
- * is for. Here we focus on the HMM API description, with some explanation of
- * the underlying implementation.
- *
- * Short description: HMM provides a set of helpers to share a virtual address
- * space between CPU and a device, so that the device can access any valid
- * address of the process (while still obeying memory protection). HMM also
- * provides helpers to migrate process memory to device memory, and back. Each
- * set of functionality (address space mirroring, and migration to and from
- * device memory) can be used independently of the other.
- *
- *
- * HMM address space mirroring API:
- *
- * Use HMM address space mirroring if you want to mirror a range of the CPU
- * page tables of a process into a device page table. Here, "mirror" means "keep
- * synchronized". Prerequisites: the device must provide the ability to write-
- * protect its page tables (at PAGE_SIZE granularity), and must be able to
- * recover from the resulting potential page faults.
- *
- * HMM guarantees that at any point in time, a given virtual address points to
- * either the same memory in both CPU and device page tables (that is: CPU and
- * device page tables each point to the same pages), or that one page table (CPU
- * or device) points to no entry, while the other still points to the old page
- * for the address. The latter case happens when the CPU page table update
- * happens first, and then the update is mirrored over to the device page table.
- * This does not cause any issue, because the CPU page table cannot start
- * pointing to a new page until the device page table is invalidated.
- *
- * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
- * updates to each device driver that has registered a mirror. It also provides
- * some API calls to help with taking a snapshot of the CPU page table, and to
- * synchronize with any updates that might happen concurrently.
- *
- *
- * HMM migration to and from device memory:
- *
- * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
- * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
- * of the device memory, and allows the device driver to manage its memory
- * using those struct pages. Having struct pages for device memory makes
- * migration easier. Because that memory is not addressable by the CPU it must
- * never be pinned to the device; in other words, any CPU page fault can always
- * cause the device memory to be migrated (copied/moved) back to regular memory.
- *
- * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
- * allows use of a device DMA engine to perform the copy operation between
- * regular system memory and device memory.
+ * See Documentation/vm/hmm.rst for reasons and overview of what HMM is.
  */
 #ifndef LINUX_HMM_H
 #define LINUX_HMM_H
@@ -120,9 +70,6 @@ enum hmm_pfn_value_e {
  *
  * @notifier: a mmu_interval_notifier that includes the start/end
  * @notifier_seq: result of mmu_interval_read_begin()
- * @hmm: the core HMM structure this range is active against
- * @vma: the vm area struct for the range
- * @list: all range lock are on a list
  * @start: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
  * @pfns: array of pfns (big enough for the range)
@@ -130,8 +77,7 @@ enum hmm_pfn_value_e {
  * @values: pfn value for some special case (none, special, error, ...)
  * @default_flags: default flags for the range (write, read, ... see hmm doc)
  * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
- * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
- * @valid: pfns array did not change since it has been fill by an HMM function
+ * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
  * @dev_private_owner: owner of device private pages
  */
 struct hmm_range {
@@ -171,52 +117,6 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
 	return pfn_to_page(entry >> range->pfn_shift);
 }
 
-/*
- * hmm_device_entry_to_pfn() - return pfn value store in a device entry
- * @range: range use to decode device entry value
- * @entry: device entry to extract pfn from
- * Return: pfn value if device entry is valid, -1UL otherwise
- */
-static inline unsigned long
-hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
-{
-	if (pfn == range->values[HMM_PFN_NONE])
-		return -1UL;
-	if (pfn == range->values[HMM_PFN_ERROR])
-		return -1UL;
-	if (pfn == range->values[HMM_PFN_SPECIAL])
-		return -1UL;
-	if (!(pfn & range->flags[HMM_PFN_VALID]))
-		return -1UL;
-	return (pfn >> range->pfn_shift);
-}
-
-/*
- * hmm_device_entry_from_page() - create a valid device entry for a page
- * @range: range use to encode HMM pfn value
- * @page: page for which to create the device entry
- * Return: valid device entry for the page
- */
-static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
-						  struct page *page)
-{
-	return (page_to_pfn(page) << range->pfn_shift) |
-		range->flags[HMM_PFN_VALID];
-}
-
-/*
- * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
- * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the device entry
- * Return: valid device entry for the pfn
- */
-static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
-						 unsigned long pfn)
-{
-	return (pfn << range->pfn_shift) |
-		range->flags[HMM_PFN_VALID];
-}
-
 /* Don't fault in missing PTEs, just snapshot the current state. */
 #define HMM_FAULT_SNAPSHOT		(1 << 1)
 
diff --git a/mm/hmm.c b/mm/hmm.c
index d208ddd35106..136de474221d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -38,6 +38,18 @@ enum {
 	HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
 };
 
+/*
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
+ * @range: range use to encode HMM pfn value
+ * @pfn: pfn value for which to create the device entry
+ * Return: valid device entry for the pfn
+ */
+static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+					  unsigned long pfn)
+{
+	return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
+}
+
 static int hmm_pfns_fill(unsigned long addr, unsigned long end,
 		struct hmm_range *range, enum hmm_pfn_value_e value)
 {
@@ -544,7 +556,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
 
 /**
  * hmm_range_fault - try to fault some address in a virtual address range
- * @range:	range being faulted
+ * @range:	argument structure
  * @flags:	HMM_FAULT_* flags
  *
  * Return: the number of valid pages in range->pfns[] (from range start
@@ -558,13 +570,11 @@ static const struct mm_walk_ops hmm_walk_ops = {
  *		only).
  * -EBUSY:	The range has been invalidated and the caller needs to wait for
  *		the invalidation to finish.
- * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to access
- *		that range) number of valid pages in range->pfns[] (from
- *              range start address).
+ * -EFAULT:     A page was requested to be valid and could not be made valid
+ *              ie it has no backing VMA or it is illegal to access
  *
- * This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs
- * and caller does not ask for migration.
+ * This is similar to get_user_pages(), except that it can read the page tables
+ * without mutating them (ie causing faults).
  *
  * On error, for one virtual address in the range, the function will mark the
  * corresponding HMM pfn entry with an error flag.

From 6bfef2f9194519ca23dee405a9f4db461a7a7826 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:16 -0300
Subject: [PATCH 295/327] mm/hmm: remove HMM_FAULT_SNAPSHOT

Now that flags are handled on a fine-grained per-page basis this global
flag is redundant and has a confusing overlap with the pfn_flags_mask and
default_flags.

Normalize the HMM_FAULT_SNAPSHOT behavior into one place. Callers needing
the SNAPSHOT behavior should set a pfn_flags_mask and default_flags that
always results in a cleared HMM_PFN_VALID. Then no pages will be faulted,
and HMM_FAULT_SNAPSHOT is not a special flow that overrides the masking
mechanism.

As this is the last flag, also remove the flags argument. If future flags
are needed they can be part of the struct hmm_range function arguments.

Link: https://lore.kernel.org/r/20200327200021.29372-5-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 Documentation/vm/hmm.rst                | 12 +++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  2 +-
 include/linux/hmm.h                     |  5 +----
 mm/hmm.c                                | 17 +++++++++--------
 5 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 95fec5968362..4e3e9362afeb 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -161,13 +161,11 @@ device must complete the update before the driver callback returns.
 When the device driver wants to populate a range of virtual addresses, it can
 use::
 
-  long hmm_range_fault(struct hmm_range *range, unsigned int flags);
+  long hmm_range_fault(struct hmm_range *range);
 
-With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table
-entries and will not trigger a page fault on missing or non-present entries.
-Without that flag, it does trigger a page fault on missing or read-only entries
-if write access is requested (see below). Page faults use the generic mm page
-fault code path just like a CPU page fault.
+It will trigger a page fault on missing or read-only entries if write access is
+requested (see below). Page faults use the generic mm page fault code path just
+like a CPU page fault.
 
 Both functions copy CPU page table entries into their pfns array argument. Each
 entry in that array corresponds to an address in the virtual range. HMM
@@ -197,7 +195,7 @@ The usage pattern is::
  again:
       range.notifier_seq = mmu_interval_read_begin(&interval_sub);
       down_read(&mm->mmap_sem);
-      ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
+      ret = hmm_range_fault(&range);
       if (ret) {
           up_read(&mm->mmap_sem);
           if (ret == -EBUSY)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 90821ce5e6ca..c52029070937 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -856,7 +856,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
 	range->notifier_seq = mmu_interval_read_begin(&bo->notifier);
 
 	down_read(&mm->mmap_sem);
-	r = hmm_range_fault(range, 0);
+	r = hmm_range_fault(range);
 	up_read(&mm->mmap_sem);
 	if (unlikely(r <= 0)) {
 		/*
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 39c731a99937..e3797b2d4d17 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -540,7 +540,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
 		range.default_flags = 0;
 		range.pfn_flags_mask = -1UL;
 		down_read(&mm->mmap_sem);
-		ret = hmm_range_fault(&range, 0);
+		ret = hmm_range_fault(&range);
 		up_read(&mm->mmap_sem);
 		if (ret <= 0) {
 			if (ret == 0 || ret == -EBUSY)
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index daee6508a3f6..7475051100c7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -117,13 +117,10 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
 	return pfn_to_page(entry >> range->pfn_shift);
 }
 
-/* Don't fault in missing PTEs, just snapshot the current state. */
-#define HMM_FAULT_SNAPSHOT		(1 << 1)
-
 /*
  * Please see Documentation/vm/hmm.rst for how to use the range API.
  */
-long hmm_range_fault(struct hmm_range *range, unsigned int flags);
+long hmm_range_fault(struct hmm_range *range);
 
 /*
  * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
diff --git a/mm/hmm.c b/mm/hmm.c
index 136de474221d..8dbd9e1d0308 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -29,7 +29,6 @@
 struct hmm_vma_walk {
 	struct hmm_range	*range;
 	unsigned long		last;
-	unsigned int		flags;
 };
 
 enum {
@@ -112,9 +111,6 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 {
 	struct hmm_range *range = hmm_vma_walk->range;
 
-	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
-		return 0;
-
 	/*
 	 * So we not only consider the individual per page request we also
 	 * consider the default flags requested for the range. The API can
@@ -147,10 +143,17 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
 		     const uint64_t *pfns, unsigned long npages,
 		     uint64_t cpu_flags)
 {
+	struct hmm_range *range = hmm_vma_walk->range;
 	unsigned int required_fault = 0;
 	unsigned long i;
 
-	if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
+	/*
+	 * If the default flags do not request to fault pages, and the mask does
+	 * not allow for individual pages to be faulted, then
+	 * hmm_pte_need_fault() will always return 0.
+	 */
+	if (!((range->default_flags | range->pfn_flags_mask) &
+	      range->flags[HMM_PFN_VALID]))
 		return 0;
 
 	for (i = 0; i < npages; ++i) {
@@ -557,7 +560,6 @@ static const struct mm_walk_ops hmm_walk_ops = {
 /**
  * hmm_range_fault - try to fault some address in a virtual address range
  * @range:	argument structure
- * @flags:	HMM_FAULT_* flags
  *
  * Return: the number of valid pages in range->pfns[] (from range start
  * address), which may be zero.  On error one of the following status codes
@@ -579,12 +581,11 @@ static const struct mm_walk_ops hmm_walk_ops = {
  * On error, for one virtual address in the range, the function will mark the
  * corresponding HMM pfn entry with an error flag.
  */
-long hmm_range_fault(struct hmm_range *range, unsigned int flags)
+long hmm_range_fault(struct hmm_range *range)
 {
 	struct hmm_vma_walk hmm_vma_walk = {
 		.range = range,
 		.last = range->start,
-		.flags = flags,
 	};
 	struct mm_struct *mm = range->notifier->mm;
 	int ret;

From 9d04d56c47b11962391b96202e181042941e30b3 Mon Sep 17 00:00:00 2001
From: Jihua Tao <taojihua4@huawei.com>
Date: Thu, 26 Mar 2020 11:40:16 +0800
Subject: [PATCH 296/327] RDMA/hns: Reduce PFC frames in congestion scenarios

The original value means sending 16 packets at a time, and it should be
configured to 0 which means sending 1 packet instead. It is modified to
reduce the number of PFC frames to make sure the performance meets
expectations when flow control is enabled on hip08.

Link: https://lore.kernel.org/r/1585194018-4381-2-git-send-email-liweihang@huawei.com
Signed-off-by: Jihua Tao <taojihua4@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 7eceeea9ccea..c3316672b70e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -3928,7 +3928,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
 
 	/* mtu*(2^LP_PKTN_INI) should not bigger than 1 message length 64kb */
 	roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
-		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 4);
+		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 0);
 	roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 0);
 

From 019cd05ce59d1f7c9cde4b77237d2c5fd114ce83 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Thu, 26 Mar 2020 11:40:17 +0800
Subject: [PATCH 297/327] RDMA/hns: Reduce the maximum number of extend SGE per
 WQE

Just reduce the default number to 64 for backward compatibility, the
driver can still get this configuration from the firmware.

Link: https://lore.kernel.org/r/1585194018-4381-3-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 7c999536a4c5..c74bf3cdc9b1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -50,15 +50,14 @@
 #define HNS_ROCE_V2_MAX_WQE_NUM			0x8000
 #define	HNS_ROCE_V2_MAX_SRQ			0x100000
 #define HNS_ROCE_V2_MAX_SRQ_WR			0x8000
-#define HNS_ROCE_V2_MAX_SRQ_SGE			0x100
+#define HNS_ROCE_V2_MAX_SRQ_SGE			64
 #define HNS_ROCE_V2_MAX_CQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQC_TIMER_NUM		0x100
 #define HNS_ROCE_V2_MAX_SRQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQE_NUM			0x400000
 #define HNS_ROCE_V2_MAX_SRQWQE_NUM		0x8000
-#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0x100
-#define HNS_ROCE_V2_MAX_SQ_SGE_NUM		0xff
-#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM		0x100
+#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		64
+#define HNS_ROCE_V2_MAX_SQ_SGE_NUM		64
 #define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM		0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
 #define HNS_ROCE_V2_UAR_NUM			256

From 90e735aecc4ce94eb4a2838f9fe04ee8da70b529 Mon Sep 17 00:00:00 2001
From: Lang Cheng <chenglang@huawei.com>
Date: Thu, 26 Mar 2020 11:40:18 +0800
Subject: [PATCH 298/327] RDMA/hns: Modify the mask of QP number for CQE of
 hip08

The hip08 supports up to 1M QPs, so the qpn mask of cqe should be
modified.

Link: https://lore.kernel.org/r/1585194018-4381-4-git-send-email-liweihang@huawei.com
Signed-off-by: Lang Cheng <chenglang@huawei.com>
Signed-off-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index c74bf3cdc9b1..82dd9f6f4845 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -162,7 +162,7 @@ enum {
 
 #define	GID_LEN_V2				16
 
-#define HNS_ROCE_V2_CQE_QPN_MASK		0x3ffff
+#define HNS_ROCE_V2_CQE_QPN_MASK		0xfffff
 
 enum {
 	HNS_ROCE_V2_WQE_OP_SEND				= 0x0,

From d35dc58dd26e2aef5dd7ed1929179c9974f8bb97 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 27 Mar 2020 21:35:39 -0500
Subject: [PATCH 299/327] RDMA/hns: Fix uninitialized variable bug

There is a potential execution path in which variable *ret* is returned
without being properly initialized, previously.

Fix this by initializing variable *ret* to 0.

Link: https://lore.kernel.org/r/20200328023539.GA32016@embeddedor
Addresses-Coverity-ID: 1491917 ("Uninitialized scalar variable")
Fixes: 2f49de21f3e9 ("RDMA/hns: Optimize mhop get flow for multi-hop addressing")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Weihang Li <liweihang@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hns/hns_roce_hem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index c96378718f88..263338b90d7a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -603,7 +603,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev,
 {
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int step_idx;
-	int ret;
+	int ret = 0;
 
 	if (index->inited & HEM_INDEX_L0) {
 		ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0);

From 3e87f4313035c86999281336582ff554e9a17bef Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Wed, 27 Mar 2019 12:55:00 -0400
Subject: [PATCH 300/327] IB/qib: Delete struct qib_ivdev.qp_rnd

I was checking the field to see if it needed the full get_random_bytes()
and discovered it's unused.

Only compile-tested, as I don't have the hardware, but I'm still pretty
confident.

Link: https://lore.kernel.org/r/202003281643.02SGh6eG002694@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/qib/qib_verbs.c | 2 --
 drivers/infiniband/hw/qib/qib_verbs.h | 1 -
 2 files changed, 3 deletions(-)

diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 5ef93f8f17a1..7508abb6a0fa 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -39,7 +39,6 @@
 #include <linux/utsname.h>
 #include <linux/rculist.h>
 #include <linux/mm.h>
-#include <linux/random.h>
 #include <linux/vmalloc.h>
 #include <rdma/rdma_vt.h>
 
@@ -1503,7 +1502,6 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	unsigned i, ctxt;
 	int ret;
 
-	get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd));
 	for (i = 0; i < dd->num_pports; i++)
 		init_ibport(ppd + i);
 
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 8bf414b47b96..dc0e81f3b6f4 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -177,7 +177,6 @@ struct qib_ibdev {
 	struct timer_list mem_timer;
 	struct qib_pio_header *pio_hdrs;
 	dma_addr_t pio_hdrs_phys;
-	u32 qp_rnd; /* random bytes for hash */
 
 	u32 n_piowait;
 	u32 n_txwait;

From b4d8ddf8356d8ac73fb931d16bcc661a83b2c0fe Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Mon, 30 Mar 2020 19:02:19 +0800
Subject: [PATCH 301/327] RDMA/bnxt_re: make bnxt_re_ib_init static

Fix sparse warning:

drivers/infiniband/hw/bnxt_re/main.c:1313:5:
 warning: symbol 'bnxt_re_ib_init' was not declared. Should it be static?

Link: https://lore.kernel.org/r/20200330110219.24448-1-yuehaibing@huawei.com
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/bnxt_re/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 4a8fb1ad74a8..b12fbc857f94 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1310,7 +1310,7 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev)
 		le16_to_cpu(resp.hwrm_intf_patch);
 }
 
-int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
+static int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
 {
 	int rc = 0;
 	u32 event;

From f66c9a33aee943aa43b3698c1f6f2619e28a1c77 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:18 -0300
Subject: [PATCH 302/327] mm/hmm: use device_private_entry_to_pfn()

swp_offset() should not be called directly, the wrappers are supposed to
abstract away the encoding of the device_private specific information in
the swap entry.

Link: https://lore.kernel.org/r/20200327200021.29372-7-jgg@ziepe.ca
Reviewed-by: Ralph Campbell <rcampbell@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Ralph Campbell <rcampbell@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 8dbd9e1d0308..53192c32abc9 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -267,7 +267,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		 */
 		if (hmm_is_device_private_entry(range, entry)) {
 			*pfn = hmm_device_entry_from_pfn(range,
-					    swp_offset(entry));
+				device_private_entry_to_pfn(entry));
 			*pfn |= range->flags[HMM_PFN_VALID];
 			if (is_write_device_private_entry(entry))
 				*pfn |= range->flags[HMM_PFN_WRITE];

From 846babe85efdda49feba5b169668333dcf3edf25 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:19 -0300
Subject: [PATCH 303/327] mm/hmm: do not unconditionally set pfns when
 returning EBUSY

In hmm_vma_handle_pte() and hmm_vma_walk_hugetlb_entry() if fault happens
then -EBUSY will be returned and the pfns input flags will have been
destroyed.

For hmm_vma_handle_pte() set HMM_PFN_NONE only on the success returns that
don't otherwise store to pfns.

For hmm_vma_walk_hugetlb_entry() all exit paths already set pfns, so
remove the redundant store.

Fixes: 2aee09d8c116 ("mm/hmm: change hmm_vma_fault() to allow write fault on page basis")
Link: https://lore.kernel.org/r/20200327200021.29372-8-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 53192c32abc9..660e8db44811 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -250,11 +250,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 	pte_t pte = *ptep;
 	uint64_t orig_pfn = *pfn;
 
-	*pfn = range->values[HMM_PFN_NONE];
 	if (pte_none(pte)) {
 		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
 		if (required_fault)
 			goto fault;
+		*pfn = range->values[HMM_PFN_NONE];
 		return 0;
 	}
 
@@ -275,8 +275,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		}
 
 		required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
-		if (!required_fault)
+		if (!required_fault) {
+			*pfn = range->values[HMM_PFN_NONE];
 			return 0;
+		}
 
 		if (!non_swap_entry(entry))
 			goto fault;
@@ -494,7 +496,6 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 
 	i = (start - range->start) >> PAGE_SHIFT;
 	orig_pfn = range->pfns[i];
-	range->pfns[i] = range->values[HMM_PFN_NONE];
 	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
 	required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
 	if (required_fault) {

From 53bfe17ff88faaadf024956e7cb2b295fae7744b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:20 -0300
Subject: [PATCH 304/327] mm/hmm: do not set pfns when returning an error code

Most places that return an error code, like -EFAULT, do not set
HMM_PFN_ERROR, only two places do this.

Resolve this inconsistency by never setting the pfns on an error
exit. This doesn't seem like a worthwhile thing to do anyhow.

If for some reason it becomes important, it makes more sense to directly
return the address of the failing page rather than have the caller scan
for the HMM_PFN_ERROR.

No caller inspects the pnfs output array if hmm_range_fault() fails.

Link: https://lore.kernel.org/r/20200327200021.29372-9-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 660e8db44811..9b6a8a26a1fa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -77,17 +77,14 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 			 unsigned int required_fault, struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
-	struct hmm_range *range = hmm_vma_walk->range;
 	struct vm_area_struct *vma = walk->vma;
-	uint64_t *pfns = range->pfns;
-	unsigned long i = (addr - range->start) >> PAGE_SHIFT;
 	unsigned int fault_flags = FAULT_FLAG_REMOTE;
 
 	WARN_ON_ONCE(!required_fault);
 	hmm_vma_walk->last = addr;
 
 	if (!vma)
-		goto out_error;
+		return -EFAULT;
 
 	if (required_fault & HMM_NEED_WRITE_FAULT) {
 		if (!(vma->vm_flags & VM_WRITE))
@@ -95,15 +92,10 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 		fault_flags |= FAULT_FLAG_WRITE;
 	}
 
-	for (; addr < end; addr += PAGE_SIZE, i++)
+	for (; addr < end; addr += PAGE_SIZE)
 		if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
-			goto out_error;
-
+			return -EFAULT;
 	return -EBUSY;
-
-out_error:
-	pfns[i] = range->values[HMM_PFN_ERROR];
-	return -EFAULT;
 }
 
 static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -292,7 +284,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 
 		/* Report error for everything else */
 		pte_unmap(ptep);
-		*pfn = range->values[HMM_PFN_ERROR];
 		return -EFAULT;
 	}
 
@@ -578,9 +569,6 @@ static const struct mm_walk_ops hmm_walk_ops = {
  *
  * This is similar to get_user_pages(), except that it can read the page tables
  * without mutating them (ie causing faults).
- *
- * On error, for one virtual address in the range, the function will mark the
- * corresponding HMM pfn entry with an error flag.
  */
 long hmm_range_fault(struct hmm_range *range)
 {

From bd5d3587b218d33d70a835582dfe1d8f8498e702 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 27 Mar 2020 17:00:21 -0300
Subject: [PATCH 305/327] mm/hmm: return error for non-vma snapshots

The pagewalker does not call most ops with NULL vma, those are all routed
to hmm_vma_walk_hole() via ops->pte_hole instead.

Thus hmm_vma_fault() is only called with a NULL vma from
hmm_vma_walk_hole(), so hoist the NULL vma check to there.

Now it is clear that snapshotting with no vma is a HMM_PFN_ERROR as
without a vma we have no path to call hmm_vma_fault().

Link: https://lore.kernel.org/r/20200327200021.29372-10-jgg@ziepe.ca
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 mm/hmm.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 9b6a8a26a1fa..280585833adf 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -83,9 +83,6 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
 	WARN_ON_ONCE(!required_fault);
 	hmm_vma_walk->last = addr;
 
-	if (!vma)
-		return -EFAULT;
-
 	if (required_fault & HMM_NEED_WRITE_FAULT) {
 		if (!(vma->vm_flags & VM_WRITE))
 			return -EPERM;
@@ -170,6 +167,11 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
 	npages = (end - addr) >> PAGE_SHIFT;
 	pfns = &range->pfns[i];
 	required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+	if (!walk->vma) {
+		if (required_fault)
+			return -EFAULT;
+		return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR);
+	}
 	if (required_fault)
 		return hmm_vma_fault(addr, end, required_fault, walk);
 	hmm_vma_walk->last = addr;

From d1e7fd6462ca9fc76650fbe6ca800e35b24267da Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 30 Mar 2020 19:01:04 -0500
Subject: [PATCH 306/327] signal: Extend exec_id to 64bits

Replace the 32bit exec_id with a 64bit exec_id to make it impossible
to wrap the exec_id counter.  With care an attacker can cause exec_id
wrap and send arbitrary signals to a newly exec'd parent.  This
bypasses the signal sending checks if the parent changes their
credentials during exec.

The severity of this problem can been seen that in my limited testing
of a 32bit exec_id it can take as little as 19s to exec 65536 times.
Which means that it can take as little as 14 days to wrap a 32bit
exec_id.  Adam Zabrocki has succeeded wrapping the self_exe_id in 7
days.  Even my slower timing is in the uptime of a typical server.
Which means self_exec_id is simply a speed bump today, and if exec
gets noticably faster self_exec_id won't even be a speed bump.

Extending self_exec_id to 64bits introduces a problem on 32bit
architectures where reading self_exec_id is no longer atomic and can
take two read instructions.  Which means that is is possible to hit
a window where the read value of exec_id does not match the written
value.  So with very lucky timing after this change this still
remains expoiltable.

I have updated the update of exec_id on exec to use WRITE_ONCE
and the read of exec_id in do_notify_parent to use READ_ONCE
to make it clear that there is no locking between these two
locations.

Link: https://lore.kernel.org/kernel-hardening/20200324215049.GA3710@pi3.com.pl
Fixes: 2.3.23pre2
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c             | 2 +-
 include/linux/sched.h | 4 ++--
 kernel/signal.c       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 0e46ec57fe0a..d55710a36056 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1413,7 +1413,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
-	current->self_exec_id++;
+	WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
 	flush_signal_handlers(current, 0);
 }
 EXPORT_SYMBOL(setup_new_exec);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04278493bf15..0323e4f0982a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -939,8 +939,8 @@ struct task_struct {
 	struct seccomp			seccomp;
 
 	/* Thread group tracking: */
-	u32				parent_exec_id;
-	u32				self_exec_id;
+	u64				parent_exec_id;
+	u64				self_exec_id;
 
 	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
 	spinlock_t			alloc_lock;
diff --git a/kernel/signal.c b/kernel/signal.c
index 9ad8dea93dbb..5383b562df85 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1926,7 +1926,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 		 * This is only possible if parent == real_parent.
 		 * Check if it has changed security domain.
 		 */
-		if (tsk->parent_exec_id != tsk->parent->self_exec_id)
+		if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
 			sig = SIGCHLD;
 	}
 

From c2df196876609e00d90909f51b7a3392dc8e377c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 14:33:30 -0500
Subject: [PATCH 307/327] lift all calls of step_into() out of
 follow_dotdot/follow_dotdot_rcu

lift step_into() into handle_dots() (where they merge with each other);
have follow_... return dentry and pass inode/seq to the caller.

[braino fix folded; kudos to Qian Cai <cai@lca.pw> for reporting it]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 71 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 9c775013368a..fda239b9baff 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1687,31 +1687,29 @@ static const char *step_into(struct nameidata *nd, int flags,
 	return pick_link(nd, &path, inode, seq, flags);
 }
 
-static const char *follow_dotdot_rcu(struct nameidata *nd)
+static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
+					struct inode **inodep,
+					unsigned *seqp)
 {
-	struct dentry *parent = NULL;
-	struct inode *inode = nd->inode;
-	unsigned seq;
-
 	while (1) {
 		if (path_equal(&nd->path, &nd->root))
 			break;
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
+			struct dentry *parent = old->d_parent;
 
-			parent = old->d_parent;
-			inode = parent->d_inode;
-			seq = read_seqcount_begin(&parent->d_seq);
+			*inodep = parent->d_inode;
+			*seqp = read_seqcount_begin(&parent->d_seq);
 			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
 				return ERR_PTR(-ECHILD);
 			if (unlikely(!path_connected(nd->path.mnt, parent)))
 				return ERR_PTR(-ECHILD);
-			break;
+			return parent;
 		} else {
 			struct mount *mnt = real_mount(nd->path.mnt);
 			struct mount *mparent = mnt->mnt_parent;
 			struct dentry *mountpoint = mnt->mnt_mountpoint;
-			struct inode *inode2 = mountpoint->d_inode;
+			struct inode *inode = mountpoint->d_inode;
 			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
 			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
 				return ERR_PTR(-ECHILD);
@@ -1722,54 +1720,51 @@ static const char *follow_dotdot_rcu(struct nameidata *nd)
 			/* we know that mountpoint was pinned */
 			nd->path.dentry = mountpoint;
 			nd->path.mnt = &mparent->mnt;
-			inode = nd->inode = inode2;
+			nd->inode = inode;
 			nd->seq = seq;
 		}
 	}
-	if (unlikely(!parent)) {
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return ERR_PTR(-ECHILD);
-		return step_into(nd, WALK_NOFOLLOW,
-				 nd->path.dentry, nd->inode, nd->seq);
-	} else {
-		return step_into(nd, WALK_NOFOLLOW, parent, inode, seq);
-	}
+	if (unlikely(nd->flags & LOOKUP_BENEATH))
+		return ERR_PTR(-ECHILD);
+	return NULL;
 }
 
-static const char *follow_dotdot(struct nameidata *nd)
+static struct dentry *follow_dotdot(struct nameidata *nd,
+				 struct inode **inodep,
+				 unsigned *seqp)
 {
-	struct dentry *parent = NULL;
 	while (1) {
 		if (path_equal(&nd->path, &nd->root))
 			break;
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			/* rare case of legitimate dget_parent()... */
-			parent = dget_parent(nd->path.dentry);
+			struct dentry *parent = dget_parent(nd->path.dentry);
 			if (unlikely(!path_connected(nd->path.mnt, parent))) {
 				dput(parent);
 				return ERR_PTR(-ENOENT);
 			}
-			break;
+			*seqp = 0;
+			*inodep = parent->d_inode;
+			return parent;
 		}
 		if (!follow_up(&nd->path))
 			break;
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			return ERR_PTR(-EXDEV);
 	}
-	if (unlikely(!parent)) {
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
-			return ERR_PTR(-EXDEV);
-		return step_into(nd, WALK_NOFOLLOW,
-				 dget(nd->path.dentry), nd->inode, nd->seq);
-	} else {
-		return step_into(nd, WALK_NOFOLLOW, parent, parent->d_inode, 0);
-	}
+	if (unlikely(nd->flags & LOOKUP_BENEATH))
+		return ERR_PTR(-EXDEV);
+	dget(nd->path.dentry);
+	return NULL;
 }
 
 static const char *handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
 		const char *error = NULL;
+		struct dentry *parent;
+		struct inode *inode;
+		unsigned seq;
 
 		if (!nd->root.mnt) {
 			error = ERR_PTR(set_root(nd));
@@ -1777,10 +1772,18 @@ static const char *handle_dots(struct nameidata *nd, int type)
 				return error;
 		}
 		if (nd->flags & LOOKUP_RCU)
-			error = follow_dotdot_rcu(nd);
+			parent = follow_dotdot_rcu(nd, &inode, &seq);
 		else
-			error = follow_dotdot(nd);
-		if (error)
+			parent = follow_dotdot(nd, &inode, &seq);
+		if (IS_ERR(parent))
+			return ERR_CAST(parent);
+		if (unlikely(!parent))
+			error = step_into(nd, WALK_NOFOLLOW,
+					 nd->path.dentry, nd->inode, nd->seq);
+		else
+			error = step_into(nd, WALK_NOFOLLOW,
+					 parent, inode, seq);
+		if (unlikely(error))
 			return error;
 
 		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {

From 12487f3067d8d68a90dd2fc0dbc2a287ae699236 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 14:59:56 -0500
Subject: [PATCH 308/327] follow_dotdot{,_rcu}(): massage loops

The logics in both of them is the same:
	while true
		if in process' root	// uncommon
			break
		if *not* in mount root	// normal case
			find the parent
			return
		if at absolute root	// very uncommon
			break
		move to underlying mountpoint
	report that we are in root

Pull the common path out of the loop:
	if in process' root		// uncommon
		goto in_root
	if unlikely(in mount root)
		while true
			if at absolute root
				goto in_root
			move to underlying mountpoint
			if in process' root
				goto in_root
			if in mount root
				break;
	find the parent	// we are not in mount root
	return
in_root:
	report that we are in root

The reason for that transformation is that we get to keep the
common path straight *and* get a separate block for "move
through underlying mountpoints", which will allow to sanitize
NO_XDEV handling there.  What's more, the pared-down loops
will be easier to deal with - in particular, non-RCU case
has no need to grab mount_lock and rewriting it to the
form that wouldn't do that is a non-trivial change.  Better
do that with less stuff getting in the way...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 77 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index fda239b9baff..8cfbd5fd0c78 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1691,21 +1691,12 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 					struct inode **inodep,
 					unsigned *seqp)
 {
-	while (1) {
-		if (path_equal(&nd->path, &nd->root))
-			break;
-		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			struct dentry *old = nd->path.dentry;
-			struct dentry *parent = old->d_parent;
+	struct dentry *parent, *old;
 
-			*inodep = parent->d_inode;
-			*seqp = read_seqcount_begin(&parent->d_seq);
-			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
-				return ERR_PTR(-ECHILD);
-			if (unlikely(!path_connected(nd->path.mnt, parent)))
-				return ERR_PTR(-ECHILD);
-			return parent;
-		} else {
+	if (path_equal(&nd->path, &nd->root))
+		goto in_root;
+	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+		while (1) {
 			struct mount *mnt = real_mount(nd->path.mnt);
 			struct mount *mparent = mnt->mnt_parent;
 			struct dentry *mountpoint = mnt->mnt_mountpoint;
@@ -1714,7 +1705,7 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
 				return ERR_PTR(-ECHILD);
 			if (&mparent->mnt == nd->path.mnt)
-				break;
+				goto in_root;
 			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 				return ERR_PTR(-ECHILD);
 			/* we know that mountpoint was pinned */
@@ -1722,8 +1713,22 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 			nd->path.mnt = &mparent->mnt;
 			nd->inode = inode;
 			nd->seq = seq;
+			if (path_equal(&nd->path, &nd->root))
+				goto in_root;
+			if (nd->path.dentry != nd->path.mnt->mnt_root)
+				break;
 		}
 	}
+	old = nd->path.dentry;
+	parent = old->d_parent;
+	*inodep = parent->d_inode;
+	*seqp = read_seqcount_begin(&parent->d_seq);
+	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
+		return ERR_PTR(-ECHILD);
+	if (unlikely(!path_connected(nd->path.mnt, parent)))
+		return ERR_PTR(-ECHILD);
+	return parent;
+in_root:
 	if (unlikely(nd->flags & LOOKUP_BENEATH))
 		return ERR_PTR(-ECHILD);
 	return NULL;
@@ -1733,25 +1738,33 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
 				 struct inode **inodep,
 				 unsigned *seqp)
 {
-	while (1) {
-		if (path_equal(&nd->path, &nd->root))
-			break;
-		if (nd->path.dentry != nd->path.mnt->mnt_root) {
-			/* rare case of legitimate dget_parent()... */
-			struct dentry *parent = dget_parent(nd->path.dentry);
-			if (unlikely(!path_connected(nd->path.mnt, parent))) {
-				dput(parent);
-				return ERR_PTR(-ENOENT);
-			}
-			*seqp = 0;
-			*inodep = parent->d_inode;
-			return parent;
+	struct dentry *parent;
+
+	if (path_equal(&nd->path, &nd->root))
+		goto in_root;
+	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+		while (1) {
+			if (!follow_up(&nd->path))
+				goto in_root;
+			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+				return ERR_PTR(-EXDEV);
+			if (path_equal(&nd->path, &nd->root))
+				goto in_root;
+			if (nd->path.dentry != nd->path.mnt->mnt_root)
+				break;
 		}
-		if (!follow_up(&nd->path))
-			break;
-		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-			return ERR_PTR(-EXDEV);
 	}
+	/* rare case of legitimate dget_parent()... */
+	parent = dget_parent(nd->path.dentry);
+	if (unlikely(!path_connected(nd->path.mnt, parent))) {
+		dput(parent);
+		return ERR_PTR(-ENOENT);
+	}
+	*seqp = 0;
+	*inodep = parent->d_inode;
+	return parent;
+
+in_root:
 	if (unlikely(nd->flags & LOOKUP_BENEATH))
 		return ERR_PTR(-EXDEV);
 	dget(nd->path.dentry);

From efe772d6283b3672c0880b10fc4ceea69bc782ca Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 28 Feb 2020 10:06:37 -0500
Subject: [PATCH 309/327] follow_dotdot_rcu(): be lazy about changing nd->path

Change nd->path only after the loop is done and only in case we hadn't
ended up finding ourselves in root.  Same for NO_XDEV check.  Don't
recheck mount_lock on each step either.

That separates the "check how far back do we need to go through the
mount stack" logics from the rest of .. traversal.

Note that the sequence for d_seq/d_inode here is
	* sample mount_lock seqcount
...
	* sample d_seq
	* fetch d_inode
	* verify mount_lock seqcount
The last step makes sure that d_inode value we'd got matches d_seq -
it dentry is guaranteed to have been a mountpoint through the
entire thing, so its d_inode must have been stable.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 8cfbd5fd0c78..af18ede95e2d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1696,28 +1696,31 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 	if (path_equal(&nd->path, &nd->root))
 		goto in_root;
 	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+		struct path path = nd->path;
+		unsigned seq;
+
 		while (1) {
-			struct mount *mnt = real_mount(nd->path.mnt);
+			struct mount *mnt = real_mount(path.mnt);
 			struct mount *mparent = mnt->mnt_parent;
 			struct dentry *mountpoint = mnt->mnt_mountpoint;
-			struct inode *inode = mountpoint->d_inode;
-			unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
-			if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
-				return ERR_PTR(-ECHILD);
-			if (&mparent->mnt == nd->path.mnt)
+			seq = read_seqcount_begin(&mountpoint->d_seq);
+			if (&mparent->mnt == path.mnt)
 				goto in_root;
-			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-				return ERR_PTR(-ECHILD);
-			/* we know that mountpoint was pinned */
-			nd->path.dentry = mountpoint;
-			nd->path.mnt = &mparent->mnt;
-			nd->inode = inode;
-			nd->seq = seq;
-			if (path_equal(&nd->path, &nd->root))
+			path.dentry = mountpoint;
+			path.mnt = &mparent->mnt;
+			if (path_equal(&path, &nd->root))
 				goto in_root;
-			if (nd->path.dentry != nd->path.mnt->mnt_root)
+			if (path.dentry != path.mnt->mnt_root)
 				break;
 		}
+		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+			return ERR_PTR(-ECHILD);
+		nd->path = path;
+		nd->inode = path.dentry->d_inode;
+		nd->seq = seq;
+		if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+			return ERR_PTR(-ECHILD);
+		/* we know that mountpoint was pinned */
 	}
 	old = nd->path.dentry;
 	parent = old->d_parent;
@@ -1729,6 +1732,8 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 		return ERR_PTR(-ECHILD);
 	return parent;
 in_root:
+	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
+		return ERR_PTR(-ECHILD);
 	if (unlikely(nd->flags & LOOKUP_BENEATH))
 		return ERR_PTR(-ECHILD);
 	return NULL;

From 165200d6cb88a3cc1fdeb12e11e97fb96bf685d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 28 Feb 2020 10:17:52 -0500
Subject: [PATCH 310/327] follow_dotdot(): be lazy about changing nd->path

Change nd->path only after the loop is done and only in case we hadn't
ended up finding ourselves in root.  Same for NO_XDEV check.

That separates the "check how far back do we need to go through the
mount stack" logics from the rest of .. traversal.

NOTE: path_get/path_put introduced here are temporary.  They will
go away later in the series.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index af18ede95e2d..9cfb7096e307 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1748,16 +1748,24 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
 	if (path_equal(&nd->path, &nd->root))
 		goto in_root;
 	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
+		struct path path = nd->path;
+		path_get(&path);
 		while (1) {
-			if (!follow_up(&nd->path))
+			if (!follow_up(&path)) {
+				path_put(&path);
 				goto in_root;
-			if (unlikely(nd->flags & LOOKUP_NO_XDEV))
-				return ERR_PTR(-EXDEV);
-			if (path_equal(&nd->path, &nd->root))
+			}
+			if (path_equal(&path, &nd->root)) {
+				path_put(&path);
 				goto in_root;
-			if (nd->path.dentry != nd->path.mnt->mnt_root)
+			}
+			if (path.dentry != nd->path.mnt->mnt_root)
 				break;
 		}
+		path_put(&nd->path);
+		nd->path = path;
+		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
+			return ERR_PTR(-EXDEV);
 	}
 	/* rare case of legitimate dget_parent()... */
 	parent = dget_parent(nd->path.dentry);

From 7ef482fa65513b18eed05a5c5f00413aad137810 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 17:50:13 -0500
Subject: [PATCH 311/327] helper for mount rootwards traversal

The loops in follow_dotdot{_rcu()} are doing the same thing:
we have a mount and we want to find out how far up the chain
of mounts do we need to go.

We follow the chain of mount until we find one that is not
directly overmounting the root of another mount.  If such
a mount is found, we want the location it's mounted upon.
If we run out of chain (i.e. get to a mount that is not
mounted on anything else) or run into process' root, we
report failure.

On success, we want (in RCU case) d_seq of resulting location
sampled or (in non-RCU case) references to that location
acquired.

This commit introduces such primitive for RCU case and
switches follow_dotdot_rcu() to it; non-RCU case will be
go in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 9cfb7096e307..6529c2506491 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1134,6 +1134,26 @@ int follow_up(struct path *path)
 }
 EXPORT_SYMBOL(follow_up);
 
+static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
+				  struct path *path, unsigned *seqp)
+{
+	while (mnt_has_parent(m)) {
+		struct dentry *mountpoint = m->mnt_mountpoint;
+
+		m = m->mnt_parent;
+		if (unlikely(root->dentry == mountpoint &&
+			     root->mnt == &m->mnt))
+			break;
+		if (mountpoint != m->mnt.mnt_root) {
+			path->mnt = &m->mnt;
+			path->dentry = mountpoint;
+			*seqp = read_seqcount_begin(&mountpoint->d_seq);
+			return true;
+		}
+	}
+	return false;
+}
+
 /*
  * Perform an automount
  * - return -EISDIR to tell follow_managed() to stop and return the path we
@@ -1696,23 +1716,11 @@ static struct dentry *follow_dotdot_rcu(struct nameidata *nd,
 	if (path_equal(&nd->path, &nd->root))
 		goto in_root;
 	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
-		struct path path = nd->path;
+		struct path path;
 		unsigned seq;
-
-		while (1) {
-			struct mount *mnt = real_mount(path.mnt);
-			struct mount *mparent = mnt->mnt_parent;
-			struct dentry *mountpoint = mnt->mnt_mountpoint;
-			seq = read_seqcount_begin(&mountpoint->d_seq);
-			if (&mparent->mnt == path.mnt)
-				goto in_root;
-			path.dentry = mountpoint;
-			path.mnt = &mparent->mnt;
-			if (path_equal(&path, &nd->root))
-				goto in_root;
-			if (path.dentry != path.mnt->mnt_root)
-				break;
-		}
+		if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
+					   &nd->root, &path, &seq))
+			goto in_root;
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			return ERR_PTR(-ECHILD);
 		nd->path = path;

From 2aa38470853a65dc9b1b4bd0989d34cd3fc57ebd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 19:19:05 -0500
Subject: [PATCH 312/327] non-RCU analogue of the previous commit

new helper: choose_mountpoint().  Wrapper around choose_mountpoint_rcu(),
similar to lookup_mnt() vs. __lookup_mnt().  follow_dotdot() switched to
it.  Now we don't grab mount_lock exclusive anymore; note that the
primitive used non-RCU mount traversals in other direction (lookup_mnt())
doesn't bother with that either - it uses mount_lock seqcount instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 56 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6529c2506491..669a0dfa3012 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -605,10 +605,9 @@ static void terminate_walk(struct nameidata *nd)
 }
 
 /* path_put is needed afterwards regardless of success or failure */
-static bool legitimize_path(struct nameidata *nd,
-			    struct path *path, unsigned seq)
+static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
 {
-	int res = __legitimize_mnt(path->mnt, nd->m_seq);
+	int res = __legitimize_mnt(path->mnt, mseq);
 	if (unlikely(res)) {
 		if (res > 0)
 			path->mnt = NULL;
@@ -622,6 +621,12 @@ static bool legitimize_path(struct nameidata *nd,
 	return !read_seqcount_retry(&path->dentry->d_seq, seq);
 }
 
+static inline bool legitimize_path(struct nameidata *nd,
+			    struct path *path, unsigned seq)
+{
+	return __legitimize_path(path, nd->m_seq, seq);
+}
+
 static bool legitimize_links(struct nameidata *nd)
 {
 	int i;
@@ -1154,6 +1159,31 @@ static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
 	return false;
 }
 
+static bool choose_mountpoint(struct mount *m, const struct path *root,
+			      struct path *path)
+{
+	bool found;
+
+	rcu_read_lock();
+	while (1) {
+		unsigned seq, mseq = read_seqbegin(&mount_lock);
+
+		found = choose_mountpoint_rcu(m, root, path, &seq);
+		if (unlikely(!found)) {
+			if (!read_seqretry(&mount_lock, mseq))
+				break;
+		} else {
+			if (likely(__legitimize_path(path, seq, mseq)))
+				break;
+			rcu_read_unlock();
+			path_put(path);
+			rcu_read_lock();
+		}
+	}
+	rcu_read_unlock();
+	return found;
+}
+
 /*
  * Perform an automount
  * - return -EISDIR to tell follow_managed() to stop and return the path we
@@ -1756,22 +1786,14 @@ static struct dentry *follow_dotdot(struct nameidata *nd,
 	if (path_equal(&nd->path, &nd->root))
 		goto in_root;
 	if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
-		struct path path = nd->path;
-		path_get(&path);
-		while (1) {
-			if (!follow_up(&path)) {
-				path_put(&path);
-				goto in_root;
-			}
-			if (path_equal(&path, &nd->root)) {
-				path_put(&path);
-				goto in_root;
-			}
-			if (path.dentry != nd->path.mnt->mnt_root)
-				break;
-		}
+		struct path path;
+
+		if (!choose_mountpoint(real_mount(nd->path.mnt),
+				       &nd->root, &path))
+			goto in_root;
 		path_put(&nd->path);
 		nd->path = path;
+		nd->inode = path.dentry->d_inode;
 		if (unlikely(nd->flags & LOOKUP_NO_XDEV))
 			return ERR_PTR(-EXDEV);
 	}

From 19f6028a01469276abe520c46a2c0ed0204e28d7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Feb 2020 20:09:37 -0500
Subject: [PATCH 313/327] fs/namei.c: kill follow_mount()

The only remaining caller (path_pts()) should be using follow_down()
anyway.  And clean path_pts() a bit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 22 ++--------------------
 1 file changed, 2 insertions(+), 20 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 669a0dfa3012..1f092f9e321b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1413,22 +1413,6 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
 	return ret;
 }
 
-/*
- * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
- */
-static void follow_mount(struct path *path)
-{
-	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path);
-		if (!mounted)
-			break;
-		dput(path->dentry);
-		mntput(path->mnt);
-		path->mnt = mounted;
-		path->dentry = dget(mounted->mnt_root);
-	}
-}
-
 /*
  * This looks up the name in dcache and possibly revalidates the found dentry.
  * NULL is returned if the dentry does not exist in the cache.
@@ -2640,7 +2624,7 @@ int path_pts(struct path *path)
 	 */
 	struct dentry *parent = dget_parent(path->dentry);
 	struct dentry *child;
-	struct qstr this;
+	struct qstr this = QSTR_INIT("pts", 3);
 
 	if (unlikely(!path_connected(path->mnt, parent))) {
 		dput(parent);
@@ -2648,15 +2632,13 @@ int path_pts(struct path *path)
 	}
 	dput(path->dentry);
 	path->dentry = parent;
-	this.name = "pts";
-	this.len = 3;
 	child = d_hash_and_lookup(parent, &this);
 	if (!child)
 		return -ENOENT;
 
 	path->dentry = child;
 	dput(parent);
-	follow_mount(path);
+	follow_down(path);
 	return 0;
 }
 #endif

From 84f0cd9e83daa9a9c3e640f6d108e7e86edf0d86 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Mar 2020 10:14:30 -0500
Subject: [PATCH 314/327] pick_link(): pass it struct path already with normal
 refcounting rules

step_into() tries to avoid grabbing and dropping mount references
on the steps that do not involve crossing mountpoints (which is
obviously the majority of cases).  So it uses a local struct path
with unusual refcounting rules - path.mnt is pinned if and only if
it's not equal to nd->path.mnt.

We used to have similar beasts all over the place and we had quite
a few bugs crop up in their handling - it's easy to get confused
when changing e.g. cleanup on failure exits (or adding a new check,
etc.)

Now that's mostly gone - the step_into() instance (which is what
we need them for) is the only one left.  It is exposed to mount
traversal and it's (shortly) seen by pick_link().  Since pick_link()
needs to store it in link stack, where the normal rules apply,
it has to make sure that mount is pinned regardless of nd->path.mnt
value.  That's done on all calls of pick_link() and very early
in those.  Let's do that in the caller (step_into()) instead -
that way the fewer places need to be aware of such struct path
instances.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1f092f9e321b..aab4eee6d56a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1602,13 +1602,10 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	int error;
 
 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
-		path_to_nameidata(link, nd);
+		if (!(nd->flags & LOOKUP_RCU))
+			path_put(link);
 		return ERR_PTR(-ELOOP);
 	}
-	if (!(nd->flags & LOOKUP_RCU)) {
-		if (link->mnt == nd->path.mnt)
-			mntget(link->mnt);
-	}
 	error = nd_alloc_stack(nd);
 	if (unlikely(error)) {
 		if (error == -ECHILD) {
@@ -1713,10 +1710,13 @@ static const char *step_into(struct nameidata *nd, int flags,
 		nd->seq = seq;
 		return NULL;
 	}
-	/* make sure that d_is_symlink above matches inode */
 	if (nd->flags & LOOKUP_RCU) {
+		/* make sure that d_is_symlink above matches inode */
 		if (read_seqcount_retry(&path.dentry->d_seq, seq))
 			return ERR_PTR(-ECHILD);
+	} else {
+		if (path.mnt == nd->path.mnt)
+			mntget(path.mnt);
 	}
 	return pick_link(nd, &path, inode, seq, flags);
 }

From c99687a03a78775f77d57fe9b07af4c8ec3dd03c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Mar 2020 10:56:17 -0500
Subject: [PATCH 315/327] fold path_to_nameidata() into its only remaining
 caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index aab4eee6d56a..0ce515e03e11 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -860,18 +860,6 @@ static int set_root(struct nameidata *nd)
 	return 0;
 }
 
-static inline void path_to_nameidata(const struct path *path,
-					struct nameidata *nd)
-{
-	if (!(nd->flags & LOOKUP_RCU)) {
-		dput(nd->path.dentry);
-		if (nd->path.mnt != path->mnt)
-			mntput(nd->path.mnt);
-	}
-	nd->path.mnt = path->mnt;
-	nd->path.dentry = path->dentry;
-}
-
 static int nd_jump_root(struct nameidata *nd)
 {
 	if (unlikely(nd->flags & LOOKUP_BENEATH))
@@ -1705,7 +1693,12 @@ static const char *step_into(struct nameidata *nd, int flags,
 	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
 	   (flags & WALK_NOFOLLOW)) {
 		/* not a symlink or should not follow */
-		path_to_nameidata(&path, nd);
+		if (!(nd->flags & LOOKUP_RCU)) {
+			dput(nd->path.dentry);
+			if (nd->path.mnt != path.mnt)
+				mntput(nd->path.mnt);
+		}
+		nd->path = path;
 		nd->inode = inode;
 		nd->seq = seq;
 		return NULL;

From aef9404d8cc3a5730dea74d05e9cb2e4c2b4a511 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 2 Mar 2020 16:46:14 -0500
Subject: [PATCH 316/327] pick_link(): more straightforward handling of
 allocation failures

pick_link() needs to push onto stack; we start with using two-element
array embedded into struct nameidata and the first time we need
more than that we switch to separately allocated array.

Allocation can fail, of course, and handling of that would be simple
enough - we need to drop 'link' and bugger off.  However, the things
get more complicated in RCU mode.  There we must do GFP_ATOMIC
allocation.  If that fails, we try to switch to non-RCU mode and
repeat the allocation.

To switch to non-RCU mode we need to grab references to 'link' and
to everything in nameidata.  The latter done by unlazy_walk();
the former - legitimize_path().  'link' must go first - after
unlazy_walk() we are out of RCU-critical period and it's too
late to call legitimize_path() since the references in link->mnt
and link->dentry might be pointing to freed and reused memory.

So we do legitimize_path(), then unlazy_walk().  And that's where
it gets too subtle: what to do if the former fails?  We MUST
do path_put(link) to avoid leaks.  And we can't do that under
rcu_read_lock().  Solution in mainline was to empty then nameidata
manually, drop out of RCU mode and then do put_path().

In effect, we open-code the things eventual terminate_walk()
would've done on error in RCU mode.  That looks badly out of place
and confusing.  We could add a comment along the lines of the
explanation above, but... there's a simpler solution.  Call
unlazy_walk() even if legitimaze_path() fails.  It will take
us out of RCU mode, so we'll be able to do path_put(link).

Yes, it will do unnecessary work - attempt to grab references
on the stuff in nameidata, only to have them dropped as soon
as we return the error to upper layer and get terminate_walk()
called there.  So what?  We are thoroughly off the fast path
by that point - we had GFP_ATOMIC allocation fail, we had
->d_seq or mount_lock mismatch and we are about to try walking
the same path from scratch in non-RCU mode.  Which will need
to do the same allocation, this time with GFP_KERNEL, so it will
be able to apply memory pressure for blocking stuff.

Compared to that the cost of several lockref_get_not_dead()
is noise.  And the logics become much easier to understand
that way.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 0ce515e03e11..1f3d3981fe02 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1597,14 +1597,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 	error = nd_alloc_stack(nd);
 	if (unlikely(error)) {
 		if (error == -ECHILD) {
-			if (unlikely(!legitimize_path(nd, link, seq))) {
-				drop_links(nd);
-				nd->depth = 0;
-				nd->flags &= ~LOOKUP_RCU;
-				nd->path.mnt = NULL;
-				nd->path.dentry = NULL;
-				rcu_read_unlock();
-			} else if (likely(unlazy_walk(nd)) == 0)
+			// we must grab link first
+			bool grabbed_link = legitimize_path(nd, link, seq);
+			// ... and we must unlazy to be able to clean up
+			error = unlazy_walk(nd);
+			if (unlikely(!grabbed_link))
+				error = -ECHILD;
+			if (!error)
 				error = nd_alloc_stack(nd);
 		}
 		if (error) {

From 49055906af51b59d50a34b18820f85cc7d085218 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Mar 2020 11:22:34 -0500
Subject: [PATCH 317/327] pick_link(): take reserving space on stack into a new
 helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1f3d3981fe02..6025406e01e5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1580,6 +1580,28 @@ static inline int may_lookup(struct nameidata *nd)
 	return inode_permission(nd->inode, MAY_EXEC);
 }
 
+static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
+{
+	int error;
+
+	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
+		return -ELOOP;
+	error = nd_alloc_stack(nd);
+	if (likely(!error))
+		return 0;
+	if (error == -ECHILD) {
+		// we must grab link first
+		bool grabbed_link = legitimize_path(nd, link, seq);
+		// ... and we must unlazy to be able to clean up
+		error = unlazy_walk(nd);
+		if (unlikely(!grabbed_link))
+			error = -ECHILD;
+		if (!error)
+			error = nd_alloc_stack(nd);
+	}
+	return error;
+}
+
 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};
 
 static const char *pick_link(struct nameidata *nd, struct path *link,
@@ -1587,31 +1609,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
 {
 	struct saved *last;
 	const char *res;
-	int error;
+	int error = reserve_stack(nd, link, seq);
 
-	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
+	if (unlikely(error)) {
 		if (!(nd->flags & LOOKUP_RCU))
 			path_put(link);
-		return ERR_PTR(-ELOOP);
+		return ERR_PTR(error);
 	}
-	error = nd_alloc_stack(nd);
-	if (unlikely(error)) {
-		if (error == -ECHILD) {
-			// we must grab link first
-			bool grabbed_link = legitimize_path(nd, link, seq);
-			// ... and we must unlazy to be able to clean up
-			error = unlazy_walk(nd);
-			if (unlikely(!grabbed_link))
-				error = -ECHILD;
-			if (!error)
-				error = nd_alloc_stack(nd);
-		}
-		if (error) {
-			path_put(link);
-			return ERR_PTR(error);
-		}
-	}
-
 	last = nd->stack + nd->depth++;
 	last->link = *link;
 	clear_delayed_call(&last->done);

From 4542576b79a680062dc344a8a6e79d34b4c91bf1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Mar 2020 11:25:31 -0500
Subject: [PATCH 318/327] reserve_stack(): switch to __nd_alloc_stack()

expand the call of nd_alloc_stack() into it (and don't
recheck the depth on the second call)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6025406e01e5..0539a26dd9b4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -566,15 +566,6 @@ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
 	return is_subdir(dentry, mnt->mnt_root);
 }
 
-static inline int nd_alloc_stack(struct nameidata *nd)
-{
-	if (likely(nd->depth != EMBEDDED_LEVELS))
-		return 0;
-	if (likely(nd->stack != nd->internal))
-		return 0;
-	return __nd_alloc_stack(nd);
-}
-
 static void drop_links(struct nameidata *nd)
 {
 	int i = nd->depth;
@@ -1586,7 +1577,13 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 
 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
 		return -ELOOP;
-	error = nd_alloc_stack(nd);
+
+	if (likely(nd->depth != EMBEDDED_LEVELS))
+		return 0;
+	if (likely(nd->stack != nd->internal))
+		return 0;
+
+	error = __nd_alloc_stack(nd);
 	if (likely(!error))
 		return 0;
 	if (error == -ECHILD) {
@@ -1597,7 +1594,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 		if (unlikely(!grabbed_link))
 			error = -ECHILD;
 		if (!error)
-			error = nd_alloc_stack(nd);
+			error = __nd_alloc_stack(nd);
 	}
 	return error;
 }

From 60ef60c7d7291d7cdc43661363cd05a5102e69f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 3 Mar 2020 11:43:55 -0500
Subject: [PATCH 319/327] __nd_alloc_stack(): make it return bool

... and adjust the caller (reserve_stack()).  Rename to nd_alloc_stack(),
while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 45 ++++++++++++++++++---------------------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 0539a26dd9b4..7a95a3fcbf68 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -529,24 +529,17 @@ static void restore_nameidata(void)
 		kfree(now->stack);
 }
 
-static int __nd_alloc_stack(struct nameidata *nd)
+static bool nd_alloc_stack(struct nameidata *nd)
 {
 	struct saved *p;
 
-	if (nd->flags & LOOKUP_RCU) {
-		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
-				  GFP_ATOMIC);
-		if (unlikely(!p))
-			return -ECHILD;
-	} else {
-		p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
-				  GFP_KERNEL);
-		if (unlikely(!p))
-			return -ENOMEM;
-	}
+	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
+			 nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);
+	if (unlikely(!p))
+		return false;
 	memcpy(p, nd->internal, sizeof(nd->internal));
 	nd->stack = p;
-	return 0;
+	return true;
 }
 
 /**
@@ -1573,8 +1566,6 @@ static inline int may_lookup(struct nameidata *nd)
 
 static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 {
-	int error;
-
 	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
 		return -ELOOP;
 
@@ -1582,21 +1573,21 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
 		return 0;
 	if (likely(nd->stack != nd->internal))
 		return 0;
-
-	error = __nd_alloc_stack(nd);
-	if (likely(!error))
+	if (likely(nd_alloc_stack(nd)))
 		return 0;
-	if (error == -ECHILD) {
-		// we must grab link first
+
+	if (nd->flags & LOOKUP_RCU) {
+		// we need to grab link before we do unlazy.  And we can't skip
+		// unlazy even if we fail to grab the link - cleanup needs it
 		bool grabbed_link = legitimize_path(nd, link, seq);
-		// ... and we must unlazy to be able to clean up
-		error = unlazy_walk(nd);
-		if (unlikely(!grabbed_link))
-			error = -ECHILD;
-		if (!error)
-			error = __nd_alloc_stack(nd);
+
+		if (unlazy_walk(nd) != 0 || !grabbed_link)
+			return -ECHILD;
+
+		if (nd_alloc_stack(nd))
+			return 0;
 	}
-	return error;
+	return -ENOMEM;
 }
 
 enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

From 0f705953012a753068b20016f55dc12d1f7e17e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Mar 2020 11:34:48 -0500
Subject: [PATCH 320/327] link_path_walk(): sample parent's i_uid and i_mode
 for the last component

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 7a95a3fcbf68..e737d884592d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,6 +505,8 @@ struct nameidata {
 	struct nameidata *saved;
 	unsigned	root_seq;
 	int		dfd;
+	kuid_t		dir_uid;
+	umode_t		dir_mode;
 } __randomize_layout;
 
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
@@ -938,9 +940,6 @@ int sysctl_protected_regular __read_mostly;
  */
 static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
 {
-	const struct inode *parent;
-	kuid_t puid;
-
 	if (!sysctl_protected_symlinks)
 		return 0;
 
@@ -949,13 +948,11 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod
 		return 0;
 
 	/* Allowed if parent directory not sticky and world-writable. */
-	parent = nd->inode;
-	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+	if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
 		return 0;
 
 	/* Allowed if parent directory and link owner match. */
-	puid = parent->i_uid;
-	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
+	if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid))
 		return 0;
 
 	if (nd->flags & LOOKUP_RCU)
@@ -2159,6 +2156,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 OK:
 			/* pathname or trailing symlink, done */
 			if (!depth) {
+				nd->dir_uid = nd->inode->i_uid;
+				nd->dir_mode = nd->inode->i_mode;
 				nd->flags &= ~LOOKUP_PARENT;
 				return 0;
 			}
@@ -3224,8 +3223,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 static const char *do_last(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
-	kuid_t dir_uid = nd->inode->i_uid;
-	umode_t dir_mode = nd->inode->i_mode;
 	int open_flag = op->open_flag;
 	bool do_truncate;
 	int acc_mode;
@@ -3241,7 +3238,7 @@ static const char *do_last(struct nameidata *nd,
 	if (open_flag & O_CREAT) {
 		if (d_is_dir(nd->path.dentry))
 			return ERR_PTR(-EISDIR);
-		error = may_create_in_sticky(dir_mode, dir_uid,
+		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
 			return ERR_PTR(error);

From c5971b8c6354a95c9ee7eb722928af5000bac247 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Mar 2020 11:41:29 -0500
Subject: [PATCH 321/327] take post-lookup part of do_last() out of loop

now we can have open_last_lookups() directly from the loop in
path_openat() - the rest of do_last() never returns a symlink
to follow, so we can bloody well leave the loop first.

Rename the rest of that thing from do_last() to do_open() and
make it return an int.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index e737d884592d..329ade345e62 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3220,31 +3220,26 @@ static const char *open_last_lookups(struct nameidata *nd,
 /*
  * Handle the last step of open()
  */
-static const char *do_last(struct nameidata *nd,
+static int do_open(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
 	int open_flag = op->open_flag;
 	bool do_truncate;
 	int acc_mode;
-	const char *link;
 	int error;
 
-	link = open_last_lookups(nd, file, op);
-	if (unlikely(link))
-		return link;
-
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {
 		if (d_is_dir(nd->path.dentry))
-			return ERR_PTR(-EISDIR);
+			return -EISDIR;
 		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,
 					     d_backing_inode(nd->path.dentry));
 		if (unlikely(error))
-			return ERR_PTR(error);
+			return error;
 	}
 	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
-		return ERR_PTR(-ENOTDIR);
+		return -ENOTDIR;
 
 	do_truncate = false;
 	acc_mode = op->acc_mode;
@@ -3255,7 +3250,7 @@ static const char *do_last(struct nameidata *nd,
 	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
-			return ERR_PTR(error);
+			return error;
 		do_truncate = true;
 	}
 	error = may_open(&nd->path, acc_mode, open_flag);
@@ -3271,7 +3266,7 @@ static const char *do_last(struct nameidata *nd,
 	}
 	if (do_truncate)
 		mnt_drop_write(nd->path.mnt);
-	return ERR_PTR(error);
+	return error;
 }
 
 struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
@@ -3374,8 +3369,10 @@ static struct file *path_openat(struct nameidata *nd,
 	} else {
 		const char *s = path_init(nd, flags);
 		while (!(error = link_path_walk(s, nd)) &&
-		       (s = do_last(nd, file, op)) != NULL)
+		       (s = open_last_lookups(nd, file, op)) != NULL)
 			;
+		if (!error)
+			error = do_open(nd, file, op);
 		terminate_walk(nd);
 	}
 	if (likely(!error)) {

From f7bb959d9623ddf9fe9cc44234c210051548ea18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 5 Mar 2020 13:25:20 -0500
Subject: [PATCH 322/327] open_last_lookups(): consolidate fsnotify_create()
 calls

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 329ade345e62..6cb41608c47c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2957,8 +2957,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 				dput(dentry);
 				dentry = dget(file->f_path.dentry);
 			}
-			if (file->f_mode & FMODE_CREATED)
-				fsnotify_create(dir, dentry);
 		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
 			error = -EIO;
 		} else {
@@ -2966,8 +2964,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 				dput(dentry);
 				dentry = file->f_path.dentry;
 			}
-			if (file->f_mode & FMODE_CREATED)
-				fsnotify_create(dir, dentry);
 			if (unlikely(d_is_negative(dentry)))
 				error = -ENOENT;
 		}
@@ -3103,7 +3099,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 						open_flag & O_EXCL);
 		if (error)
 			goto out_dput;
-		fsnotify_create(dir_inode, dentry);
 	}
 	if (unlikely(create_error) && !dentry->d_inode) {
 		error = create_error;
@@ -3182,6 +3177,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 	else
 		inode_lock_shared(dir->d_inode);
 	dentry = lookup_open(nd, file, op, got_write);
+	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
+		fsnotify_create(dir->d_inode, dentry);
 	if (open_flag & O_CREAT)
 		inode_unlock(dir->d_inode);
 	else

From 72287417abd16c42f1d1ea8d93ee60ba08022fc1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Mar 2020 10:09:26 -0400
Subject: [PATCH 323/327] open_last_lookups(): don't abuse complete_walk() when
 all we want is unlazy

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6cb41608c47c..9f2d95aa2502 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3147,15 +3147,11 @@ static const char *open_last_lookups(struct nameidata *nd,
 		BUG_ON(nd->flags & LOOKUP_RCU);
 	} else {
 		/* create side of things */
-		/*
-		 * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
-		 * has been cleared when we got to the last component we are
-		 * about to look up
-		 */
-		error = complete_walk(nd);
-		if (unlikely(error))
-			return ERR_PTR(error);
-
+		if (nd->flags & LOOKUP_RCU) {
+			error = unlazy_walk(nd);
+			if (unlikely(error))
+				return ERR_PTR(error);
+		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		/* trailing slashes? */
 		if (unlikely(nd->last.name[nd->last.len]))

From b94e0b32c899914f045ddf05949517d64fd32499 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Mar 2020 10:13:53 -0400
Subject: [PATCH 324/327] open_last_lookups(): lift O_EXCL|O_CREAT handling
 into do_open()

Currently path_openat() has "EEXIST on O_EXCL|O_CREAT" checks done on one
of the ways out of open_last_lookups().  There are 4 cases:
	1) the last component is . or ..; check is not done.
	2) we had FMODE_OPENED or FMODE_CREATED set while in lookup_open();
check is not done.
	3) symlink to be traversed is found; check is not done (nor
should it be)
	4) everything else: check done (before complete_walk(), even).

In case (1) O_EXCL|O_CREAT ends up failing with -EISDIR - that's
	open("/tmp/.", O_CREAT|O_EXCL, 0600)
Note that in the same conditions
	open("/tmp", O_CREAT|O_EXCL, 0600)
would have yielded EEXIST.  Either error is allowed, switching to -EEXIST
in these cases would've been more consistent.

Case (2) is more subtle; first of all, if we have FMODE_CREATED set, the
object hadn't existed prior to the call.  The check should not be done in
such a case.  The rest is problematic, though - we have
	FMODE_OPENED set (i.e. it went through ->atomic_open() and got
successfully opened there)
	FMODE_CREATED is *NOT* set
	O_CREAT and O_EXCL are both set.
Any such case is a bug - either we failed to set FMODE_CREATED when we
had, in fact, created an object (no such instances in the tree) or
we have opened a pre-existing file despite having had both O_CREAT and
O_EXCL passed.  One of those was, in fact caught (and fixed) while
sorting out this mess (gfs2 on cold dcache).  And in such situations
we should fail with EEXIST.

Note that for (1) and (4) FMODE_CREATED is not set - for (1) there's nothing
in handle_dots() to set it, for (4) we'd explicitly checked that.

And (1), (2) and (4) are exactly the cases when we leave the loop in
the caller, with do_open() called immediately after that loop.  IOW, we
can move the check over there, and make it

	If we have O_CREAT|O_EXCL and after successful pathname resolution
FMODE_CREATED is *not* set, we must have run into a preexisting file and
should fail with EEXIST.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 9f2d95aa2502..2e9205599931 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3201,11 +3201,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 		return res;
 	}
 
-	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
-		audit_inode(nd->name, nd->path.dentry, 0);
-		return ERR_PTR(-EEXIST);
-	}
-
 	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
 	return ERR_PTR(complete_walk(nd));
 }
@@ -3224,6 +3219,8 @@ static int do_open(struct nameidata *nd,
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {
+		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
+			return -EEXIST;
 		if (d_is_dir(nd->path.dentry))
 			return -EISDIR;
 		error = may_create_in_sticky(nd->dir_mode, nd->dir_uid,

From ff326a3299660e46d4eb8876b217fee38638281a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Mar 2020 10:19:24 -0400
Subject: [PATCH 325/327] open_last_lookups(): move complete_walk() into
 do_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2e9205599931..2c901a022faf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3128,10 +3128,7 @@ static const char *open_last_lookups(struct nameidata *nd,
 	if (nd->last_type != LAST_NORM) {
 		if (nd->depth)
 			put_link(nd);
-		res = handle_dots(nd, nd->last_type);
-		if (likely(!res))
-			res = ERR_PTR(complete_walk(nd));
-		return res;
+		return handle_dots(nd, nd->last_type);
 	}
 
 	if (!(open_flag & O_CREAT)) {
@@ -3196,13 +3193,9 @@ static const char *open_last_lookups(struct nameidata *nd,
 	if (nd->depth)
 		put_link(nd);
 	res = step_into(nd, WALK_TRAILING, dentry, inode, seq);
-	if (unlikely(res)) {
+	if (unlikely(res))
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		return res;
-	}
-
-	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
-	return ERR_PTR(complete_walk(nd));
+	return res;
 }
 
 /*
@@ -3216,6 +3209,11 @@ static int do_open(struct nameidata *nd,
 	int acc_mode;
 	int error;
 
+	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
+		error = complete_walk(nd);
+		if (error)
+			return error;
+	}
 	if (!(file->f_mode & FMODE_CREATED))
 		audit_inode(nd->name, nd->path.dentry, 0);
 	if (open_flag & O_CREAT) {

From d489cf9a3e88aafc6c9f55c36ff7bbca2c25d54a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 11 Mar 2020 08:07:53 -0400
Subject: [PATCH 326/327] atomic_open(): no need to pass struct open_flags
 anymore

argument had been unused since 1643b43fbd052 (lookup_open(): lift the
"fallback to !O_CREAT" logics from atomic_open()) back in 2016

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2c901a022faf..1607560f2932 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2933,7 +2933,6 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
  */
 static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 				  struct file *file,
-				  const struct open_flags *op,
 				  int open_flag, umode_t mode)
 {
 	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
@@ -3066,7 +3065,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	}
 
 	if (dir_inode->i_op->atomic_open) {
-		dentry = atomic_open(nd, dentry, file, op, open_flag, mode);
+		dentry = atomic_open(nd, dentry, file, open_flag, mode);
 		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
 			dentry = ERR_PTR(create_error);
 		return dentry;

From 99a4a90c8e9337e364136393286544e3753673c3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 12 Mar 2020 14:07:27 -0400
Subject: [PATCH 327/327] lookup_open(): don't bother with fallbacks to
 lookup+create

We fall back to lookup+create (instead of atomic_open) in several cases:
	1) we don't have write access to filesystem and O_TRUNC is
present in the flags.  It's not something we want ->atomic_open() to
see - it just might go ahead and truncate the file.  However, we can
pass it the flags sans O_TRUNC - eventually do_open() will call
handle_truncate() anyway.
	2) we have O_CREAT | O_EXCL and we can't write to parent.
That's going to be an error, of course, but we want to know _which_
error should that be - might be EEXIST (if file exists), might be
EACCES or EROFS.  Simply stripping O_CREAT (and checking if we see
ENOENT) would suffice, if not for O_EXCL.  However, we used to have
->atomic_open() fully responsible for rejecting O_CREAT | O_EXCL
on existing file and just stripping O_CREAT would've disarmed
those checks.  With nothing downstream to catch the problem -
FMODE_OPENED used to be "don't bother with EEXIST checks,
->atomic_open() has done those".  Now EEXIST checks downstream
are skipped only if FMODE_CREATED is set - FMODE_OPENED alone
is not enough.  That has eliminated the need to fall back onto
lookup+create path in this case.
	3) O_WRONLY or O_RDWR when we have no write access to
filesystem, with nothing else objectionable.  Fallback is
(and had always been) pointless.

IOW, we don't really need that fallback; all we need in such
cases is to trim O_TRUNC and O_CREAT properly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 34 +++++++++-------------------------
 1 file changed, 9 insertions(+), 25 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 1607560f2932..61fdb77a7d58 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2939,9 +2939,6 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
 	struct inode *dir =  nd->path.dentry->d_inode;
 	int error;
 
-	if (!(~open_flag & (O_EXCL | O_CREAT)))	/* both O_EXCL and O_CREAT */
-		open_flag &= ~O_TRUNC;
-
 	if (nd->flags & LOOKUP_DIRECTORY)
 		open_flag |= O_DIRECTORY;
 
@@ -3038,32 +3035,20 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	 * Another problem is returing the "right" error value (e.g. for an
 	 * O_EXCL open we want to return EEXIST not EROFS).
 	 */
+	if (unlikely(!got_write))
+		open_flag &= ~O_TRUNC;
 	if (open_flag & O_CREAT) {
+		if (open_flag & O_EXCL)
+			open_flag &= ~O_TRUNC;
 		if (!IS_POSIXACL(dir->d_inode))
 			mode &= ~current_umask();
-		if (unlikely(!got_write)) {
-			create_error = -EROFS;
-			open_flag &= ~O_CREAT;
-			if (open_flag & (O_EXCL | O_TRUNC))
-				goto no_open;
-			/* No side effects, safe to clear O_CREAT */
-		} else {
+		if (likely(got_write))
 			create_error = may_o_create(&nd->path, dentry, mode);
-			if (create_error) {
-				open_flag &= ~O_CREAT;
-				if (open_flag & O_EXCL)
-					goto no_open;
-			}
-		}
-	} else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
-		   unlikely(!got_write)) {
-		/*
-		 * No O_CREATE -> atomicity not a requirement -> fall
-		 * back to lookup + open
-		 */
-		goto no_open;
+		else
+			create_error = -EROFS;
 	}
-
+	if (create_error)
+		open_flag &= ~O_CREAT;
 	if (dir_inode->i_op->atomic_open) {
 		dentry = atomic_open(nd, dentry, file, open_flag, mode);
 		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
@@ -3071,7 +3056,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		return dentry;
 	}
 
-no_open:
 	if (d_in_lookup(dentry)) {
 		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
 							     nd->flags);