From 937c6b27c73e02cd4114f95f5c37ba2c29fadba1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 9 Oct 2019 17:02:30 +0200
Subject: [PATCH 01/43] cgroup: freezer: call cgroup_enter_frozen() with
 preemption disabled in ptrace_stop()

ptrace_stop() does preempt_enable_no_resched() to avoid the preemption,
but after that cgroup_enter_frozen() does spin_lock/unlock and this adds
another preemption point.

Reported-and-tested-by: Bruce Ashfield <bruce.ashfield@gmail.com>
Fixes: 76f969e8948d ("cgroup: cgroup v2 freezer")
Cc: stable@vger.kernel.org # v5.2+
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index c4da1ef56fdf..bcd46f547db3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 		 */
 		preempt_disable();
 		read_unlock(&tasklist_lock);
-		preempt_enable_no_resched();
 		cgroup_enter_frozen();
+		preempt_enable_no_resched();
 		freezable_schedule();
 		cgroup_leave_frozen(true);
 	} else {

From c2955f270a84762343000f103e0640d29c7a96f3 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 10:45:50 +0200
Subject: [PATCH 02/43] x86/msr: Add the IA32_TSX_CTRL MSR

Transactional Synchronization Extensions (TSX) may be used on certain
processors as part of a speculative side channel attack.  A microcode
update for existing processors that are vulnerable to this attack will
add a new MSR - IA32_TSX_CTRL to allow the system administrator the
option to disable TSX as one of the possible mitigations.

The CPUs which get this new MSR after a microcode upgrade are the ones
which do not set MSR_IA32_ARCH_CAPABILITIES.MDS_NO (bit 5) because those
CPUs have CPUID.MD_CLEAR, i.e., the VERW implementation which clears all
CPU buffers takes care of the TAA case as well.

  [ Note that future processors that are not vulnerable will also
    support the IA32_TSX_CTRL MSR. ]

Add defines for the new IA32_TSX_CTRL MSR and its bits.

TSX has two sub-features:

1. Restricted Transactional Memory (RTM) is an explicitly-used feature
   where new instructions begin and end TSX transactions.
2. Hardware Lock Elision (HLE) is implicitly used when certain kinds of
   "old" style locks are used by software.

Bit 7 of the IA32_ARCH_CAPABILITIES indicates the presence of the
IA32_TSX_CTRL MSR.

There are two control bits in IA32_TSX_CTRL MSR:

  Bit 0: When set, it disables the Restricted Transactional Memory (RTM)
         sub-feature of TSX (will force all transactions to abort on the
	 XBEGIN instruction).

  Bit 1: When set, it disables the enumeration of the RTM and HLE feature
         (i.e. it will make CPUID(EAX=7).EBX{bit4} and
	  CPUID(EAX=7).EBX{bit11} read as 0).

The other TSX sub-feature, Hardware Lock Elision (HLE), is
unconditionally disabled by the new microcode but still enumerated
as present by CPUID(EAX=7).EBX{bit4}, unless disabled by
IA32_TSX_CTRL_MSR[1] - TSX_CTRL_CPUID_CLEAR.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Reviewed-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/include/asm/msr-index.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 20ce682a2540..da4caf6da739 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -93,6 +93,7 @@
 						  * Microarchitectural Data
 						  * Sampling (MDS) vulnerabilities.
 						  */
+#define ARCH_CAP_TSX_CTRL_MSR		BIT(7)	/* MSR for TSX control is available. */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
@@ -103,6 +104,10 @@
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
 
+#define MSR_IA32_TSX_CTRL		0x00000122
+#define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR		BIT(1)	/* Disable TSX enumeration */
+
 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175
 #define MSR_IA32_SYSENTER_EIP		0x00000176

From 286836a70433fb64131d2590f4bf512097c255e1 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 10:52:35 +0200
Subject: [PATCH 03/43] x86/cpu: Add a helper function x86_read_arch_cap_msr()

Add a helper function to read the IA32_ARCH_CAPABILITIES MSR.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Reviewed-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/cpu/common.c | 15 +++++++++++----
 arch/x86/kernel/cpu/cpu.h    |  2 ++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9ae7d1bcd4f4..897c8302d982 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1092,19 +1092,26 @@ static bool __init cpu_matches(unsigned long which)
 	return m && !!(m->driver_data & which);
 }
 
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+u64 x86_read_arch_cap_msr(void)
 {
 	u64 ia32_cap = 0;
 
+	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+
+	return ia32_cap;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+	u64 ia32_cap = x86_read_arch_cap_msr();
+
 	if (cpu_matches(NO_SPECULATION))
 		return;
 
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
 	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
 
-	if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
-		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
-
 	if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
 	   !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
 		setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c0e2407abdd6..a5cd0eba2746 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -62,4 +62,6 @@ unsigned int aperfmperf_get_khz(int cpu);
 
 extern void x86_spec_ctrl_setup_ap(void);
 
+extern u64 x86_read_arch_cap_msr(void);
+
 #endif /* ARCH_X86_CPU_H */

From 95c5824f75f3ba4c9e8e5a4b1a623c95390ac266 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 11:01:53 +0200
Subject: [PATCH 04/43] x86/cpu: Add a "tsx=" cmdline option with TSX disabled
 by default

Add a kernel cmdline parameter "tsx" to control the Transactional
Synchronization Extensions (TSX) feature. On CPUs that support TSX
control, use "tsx=on|off" to enable or disable TSX. Not specifying this
option is equivalent to "tsx=off". This is because on certain processors
TSX may be used as a part of a speculative side channel attack.

Carve out the TSX controlling functionality into a separate compilation
unit because TSX is a CPU feature while the TSX async abort control
machinery will go to cpu/bugs.c.

 [ bp: - Massage, shorten and clear the arg buffer.
       - Clarifications of the tsx= possible options - Josh.
       - Expand on TSX_CTRL availability - Pawan. ]

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 .../admin-guide/kernel-parameters.txt         |  26 ++++
 arch/x86/kernel/cpu/Makefile                  |   2 +-
 arch/x86/kernel/cpu/common.c                  |   2 +
 arch/x86/kernel/cpu/cpu.h                     |  16 +++
 arch/x86/kernel/cpu/intel.c                   |   5 +
 arch/x86/kernel/cpu/tsx.c                     | 125 ++++++++++++++++++
 6 files changed, 175 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/tsx.c

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a84a83f8881e..af4b1d95b08f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4848,6 +4848,32 @@
 			interruptions from clocksource watchdog are not
 			acceptable).
 
+	tsx=		[X86] Control Transactional Synchronization
+			Extensions (TSX) feature in Intel processors that
+			support TSX control.
+
+			This parameter controls the TSX feature. The options are:
+
+			on	- Enable TSX on the system. Although there are
+				mitigations for all known security vulnerabilities,
+				TSX has been known to be an accelerator for
+				several previous speculation-related CVEs, and
+				so there may be unknown	security risks associated
+				with leaving it enabled.
+
+			off	- Disable TSX on the system. (Note that this
+				option takes effect only on newer CPUs which are
+				not vulnerable to MDS, i.e., have
+				MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
+				the new IA32_TSX_CTRL MSR through a microcode
+				update. This new MSR allows for the reliable
+				deactivation of the TSX functionality.)
+
+			Not specifying this option is equivalent to tsx=off.
+
+			See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
+			for more details.
+
 	turbografx.map[2|3]=	[HW,JOY]
 			TurboGraFX parallel port interface
 			Format:
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..890f60083eca 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 
 ifdef CONFIG_CPU_SUP_INTEL
-obj-y			+= intel.o intel_pconfig.o
+obj-y			+= intel.o intel_pconfig.o tsx.o
 obj-$(CONFIG_PM)	+= intel_epb.o
 endif
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 897c8302d982..885d4ac2111a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1561,6 +1561,8 @@ void __init identify_boot_cpu(void)
 #endif
 	cpu_detect_tlb(&boot_cpu_data);
 	setup_cr_pinning();
+
+	tsx_init();
 }
 
 void identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index a5cd0eba2746..38ab6e115eac 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -44,6 +44,22 @@ struct _tlb_table {
 extern const struct cpu_dev *const __x86_cpu_dev_start[],
 			    *const __x86_cpu_dev_end[];
 
+#ifdef CONFIG_CPU_SUP_INTEL
+enum tsx_ctrl_states {
+	TSX_CTRL_ENABLE,
+	TSX_CTRL_DISABLE,
+	TSX_CTRL_NOT_SUPPORTED,
+};
+
+extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
+
+extern void __init tsx_init(void);
+extern void tsx_enable(void);
+extern void tsx_disable(void);
+#else
+static inline void tsx_init(void) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
 extern void get_cpu_cap(struct cpuinfo_x86 *c);
 extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c2fdc00df163..11d5c5950e2d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c)
 		detect_tme(c);
 
 	init_intel_misc_features(c);
+
+	if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+		tsx_enable();
+	if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+		tsx_disable();
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..04471c4378d8
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ *	Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+
+#include "cpu.h"
+
+enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
+
+void tsx_disable(void)
+{
+	u64 tsx;
+
+	rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+	/* Force all transactions to immediately abort */
+	tsx |= TSX_CTRL_RTM_DISABLE;
+
+	/*
+	 * Ensure TSX support is not enumerated in CPUID.
+	 * This is visible to userspace and will ensure they
+	 * do not waste resources trying TSX transactions that
+	 * will always abort.
+	 */
+	tsx |= TSX_CTRL_CPUID_CLEAR;
+
+	wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+void tsx_enable(void)
+{
+	u64 tsx;
+
+	rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+
+	/* Enable the RTM feature in the cpu */
+	tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+	/*
+	 * Ensure TSX support is enumerated in CPUID.
+	 * This is visible to userspace and will ensure they
+	 * can enumerate and use the TSX feature.
+	 */
+	tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+	wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static bool __init tsx_ctrl_is_supported(void)
+{
+	u64 ia32_cap = x86_read_arch_cap_msr();
+
+	/*
+	 * TSX is controlled via MSR_IA32_TSX_CTRL.  However, support for this
+	 * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+	 *
+	 * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+	 * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+	 * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+	 * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+	 * tsx= cmdline requests will do nothing on CPUs without
+	 * MSR_IA32_TSX_CTRL support.
+	 */
+	return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+}
+
+void __init tsx_init(void)
+{
+	char arg[4] = {};
+	int ret;
+
+	if (!tsx_ctrl_is_supported())
+		return;
+
+	ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
+	if (ret >= 0) {
+		if (!strcmp(arg, "on")) {
+			tsx_ctrl_state = TSX_CTRL_ENABLE;
+		} else if (!strcmp(arg, "off")) {
+			tsx_ctrl_state = TSX_CTRL_DISABLE;
+		} else {
+			tsx_ctrl_state = TSX_CTRL_DISABLE;
+			pr_err("tsx: invalid option, defaulting to off\n");
+		}
+	} else {
+		/* tsx= not provided, defaulting to off */
+		tsx_ctrl_state = TSX_CTRL_DISABLE;
+	}
+
+	if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+		tsx_disable();
+
+		/*
+		 * tsx_disable() will change the state of the
+		 * RTM CPUID bit.  Clear it here since it is now
+		 * expected to be not set.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_RTM);
+	} else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+		/*
+		 * HW defaults TSX to be enabled at bootup.
+		 * We may still need the TSX enable support
+		 * during init for special cases like
+		 * kexec after TSX is disabled.
+		 */
+		tsx_enable();
+
+		/*
+		 * tsx_enable() will change the state of the
+		 * RTM CPUID bit.  Force it here since it is now
+		 * expected to be set.
+		 */
+		setup_force_cpu_cap(X86_FEATURE_RTM);
+	}
+}

From 1b42f017415b46c317e71d41c34ec088417a1883 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 11:30:45 +0200
Subject: [PATCH 05/43] x86/speculation/taa: Add mitigation for TSX Async Abort

TSX Async Abort (TAA) is a side channel vulnerability to the internal
buffers in some Intel processors similar to Microarchitectural Data
Sampling (MDS). In this case, certain loads may speculatively pass
invalid data to dependent operations when an asynchronous abort
condition is pending in a TSX transaction.

This includes loads with no fault or assist condition. Such loads may
speculatively expose stale data from the uarch data structures as in
MDS. Scope of exposure is within the same-thread and cross-thread. This
issue affects all current processors that support TSX, but do not have
ARCH_CAP_TAA_NO (bit 8) set in MSR_IA32_ARCH_CAPABILITIES.

On CPUs which have their IA32_ARCH_CAPABILITIES MSR bit MDS_NO=0,
CPUID.MD_CLEAR=1 and the MDS mitigation is clearing the CPU buffers
using VERW or L1D_FLUSH, there is no additional mitigation needed for
TAA. On affected CPUs with MDS_NO=1 this issue can be mitigated by
disabling the Transactional Synchronization Extensions (TSX) feature.

A new MSR IA32_TSX_CTRL in future and current processors after a
microcode update can be used to control the TSX feature. There are two
bits in that MSR:

* TSX_CTRL_RTM_DISABLE disables the TSX sub-feature Restricted
Transactional Memory (RTM).

* TSX_CTRL_CPUID_CLEAR clears the RTM enumeration in CPUID. The other
TSX sub-feature, Hardware Lock Elision (HLE), is unconditionally
disabled with updated microcode but still enumerated as present by
CPUID(EAX=7).EBX{bit4}.

The second mitigation approach is similar to MDS which is clearing the
affected CPU buffers on return to user space and when entering a guest.
Relevant microcode update is required for the mitigation to work.  More
details on this approach can be found here:

  https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html

The TSX feature can be controlled by the "tsx" command line parameter.
If it is force-enabled then "Clear CPU buffers" (MDS mitigation) is
deployed. The effective mitigation state can be read from sysfs.

 [ bp:
   - massage + comments cleanup
   - s/TAA_MITIGATION_TSX_DISABLE/TAA_MITIGATION_TSX_DISABLED/g - Josh.
   - remove partial TAA mitigation in update_mds_branch_idle() - Josh.
   - s/tsx_async_abort_cmdline/tsx_async_abort_parse_cmdline/g
 ]

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/include/asm/cpufeatures.h   |   1 +
 arch/x86/include/asm/msr-index.h     |   4 +
 arch/x86/include/asm/nospec-branch.h |   4 +-
 arch/x86/include/asm/processor.h     |   7 ++
 arch/x86/kernel/cpu/bugs.c           | 108 +++++++++++++++++++++++++++
 arch/x86/kernel/cpu/common.c         |  15 ++++
 6 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 0652d3eed9bd..989e03544f18 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -399,5 +399,6 @@
 #define X86_BUG_MDS			X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
 #define X86_BUG_MSBDS_ONLY		X86_BUG(20) /* CPU is only affected by the  MSDBS variant of BUG_MDS */
 #define X86_BUG_SWAPGS			X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA			X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index da4caf6da739..b3a8bb2af0b6 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -94,6 +94,10 @@
 						  * Sampling (MDS) vulnerabilities.
 						  */
 #define ARCH_CAP_TSX_CTRL_MSR		BIT(7)	/* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO			BIT(8)	/*
+						 * Not susceptible to
+						 * TSX Async Abort (TAA) vulnerabilities.
+						 */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 80bc209c0708..5c24a7b35166 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
 #include <asm/segment.h>
 
 /**
- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
  *
  * This uses the otherwise unused and obsolete VERW instruction in
  * combination with microcode which triggers a CPU buffer flush when the
@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void)
 }
 
 /**
- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
  *
  * Clear CPU buffers if the corresponding static key is enabled
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6e0a3b43d027..54f5d54280f6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -988,4 +988,11 @@ enum mds_mitigations {
 	MDS_MITIGATION_VMWERV,
 };
 
+enum taa_mitigations {
+	TAA_MITIGATION_OFF,
+	TAA_MITIGATION_UCODE_NEEDED,
+	TAA_MITIGATION_VERW,
+	TAA_MITIGATION_TSX_DISABLED,
+};
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 91c2561b905f..58fe3746e333 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
 static void __init l1tf_select_mitigation(void);
 static void __init mds_select_mitigation(void);
+static void __init taa_select_mitigation(void);
 
 /* The base value of the SPEC_CTRL MSR that always has to be preserved. */
 u64 x86_spec_ctrl_base;
@@ -105,6 +106,7 @@ void __init check_bugs(void)
 	ssb_select_mitigation();
 	l1tf_select_mitigation();
 	mds_select_mitigation();
+	taa_select_mitigation();
 
 	arch_smt_update();
 
@@ -268,6 +270,100 @@ static int __init mds_cmdline(char *str)
 }
 early_param("mds", mds_cmdline);
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"TAA: " fmt
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+	[TAA_MITIGATION_OFF]		= "Vulnerable",
+	[TAA_MITIGATION_UCODE_NEEDED]	= "Vulnerable: Clear CPU buffers attempted, no microcode",
+	[TAA_MITIGATION_VERW]		= "Mitigation: Clear CPU buffers",
+	[TAA_MITIGATION_TSX_DISABLED]	= "Mitigation: TSX disabled",
+};
+
+static void __init taa_select_mitigation(void)
+{
+	u64 ia32_cap;
+
+	if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+		taa_mitigation = TAA_MITIGATION_OFF;
+		return;
+	}
+
+	/* TSX previously disabled by tsx=off */
+	if (!boot_cpu_has(X86_FEATURE_RTM)) {
+		taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+		goto out;
+	}
+
+	if (cpu_mitigations_off()) {
+		taa_mitigation = TAA_MITIGATION_OFF;
+		return;
+	}
+
+	/* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */
+	if (taa_mitigation == TAA_MITIGATION_OFF)
+		goto out;
+
+	if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+		taa_mitigation = TAA_MITIGATION_VERW;
+	else
+		taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+	/*
+	 * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+	 * A microcode update fixes this behavior to clear CPU buffers. It also
+	 * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+	 * ARCH_CAP_TSX_CTRL_MSR bit.
+	 *
+	 * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+	 * update is required.
+	 */
+	ia32_cap = x86_read_arch_cap_msr();
+	if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
+	    !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+		taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+	/*
+	 * TSX is enabled, select alternate mitigation for TAA which is
+	 * the same as MDS. Enable MDS static branch to clear CPU buffers.
+	 *
+	 * For guests that can't determine whether the correct microcode is
+	 * present on host, enable the mitigation for UCODE_NEEDED as well.
+	 */
+	static_branch_enable(&mds_user_clear);
+
+	if (taa_nosmt || cpu_mitigations_auto_nosmt())
+		cpu_smt_disable(false);
+
+out:
+	pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+	if (!boot_cpu_has_bug(X86_BUG_TAA))
+		return 0;
+
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "off")) {
+		taa_mitigation = TAA_MITIGATION_OFF;
+	} else if (!strcmp(str, "full")) {
+		taa_mitigation = TAA_MITIGATION_VERW;
+	} else if (!strcmp(str, "full,nosmt")) {
+		taa_mitigation = TAA_MITIGATION_VERW;
+		taa_nosmt = true;
+	}
+
+	return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V1 : " fmt
 
@@ -786,6 +882,7 @@ static void update_mds_branch_idle(void)
 }
 
 #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
 
 void cpu_bugs_smt_update(void)
 {
@@ -819,6 +916,17 @@ void cpu_bugs_smt_update(void)
 		break;
 	}
 
+	switch (taa_mitigation) {
+	case TAA_MITIGATION_VERW:
+	case TAA_MITIGATION_UCODE_NEEDED:
+		if (sched_smt_active())
+			pr_warn_once(TAA_MSG_SMT);
+		break;
+	case TAA_MITIGATION_TSX_DISABLED:
+	case TAA_MITIGATION_OFF:
+		break;
+	}
+
 	mutex_unlock(&spec_ctrl_mutex);
 }
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 885d4ac2111a..f8b8afc8f5b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1128,6 +1128,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	if (!cpu_matches(NO_SWAPGS))
 		setup_force_cpu_bug(X86_BUG_SWAPGS);
 
+	/*
+	 * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+	 *	- TSX is supported or
+	 *	- TSX_CTRL is present
+	 *
+	 * TSX_CTRL check is needed for cases when TSX could be disabled before
+	 * the kernel boot e.g. kexec.
+	 * TSX_CTRL check alone is not sufficient for cases when the microcode
+	 * update is not present or running as guest that don't get TSX_CTRL.
+	 */
+	if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+	    (cpu_has(c, X86_FEATURE_RTM) ||
+	     (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+		setup_force_cpu_bug(X86_BUG_TAA);
+
 	if (cpu_matches(NO_MELTDOWN))
 		return;
 

From 6608b45ac5ecb56f9e171252229c39580cc85f0f Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 12:19:51 +0200
Subject: [PATCH 06/43] x86/speculation/taa: Add sysfs reporting for TSX Async
 Abort

Add the sysfs reporting file for TSX Async Abort. It exposes the
vulnerability and the mitigation state similar to the existing files for
the other hardware vulnerabilities.

Sysfs file path is:
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Reviewed-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/cpu/bugs.c | 23 +++++++++++++++++++++++
 drivers/base/cpu.c         |  9 +++++++++
 include/linux/cpu.h        |  3 +++
 3 files changed, 35 insertions(+)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 58fe3746e333..43c647e19439 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1436,6 +1436,21 @@ static ssize_t mds_show_state(char *buf)
 		       sched_smt_active() ? "vulnerable" : "disabled");
 }
 
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+	if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+	    (taa_mitigation == TAA_MITIGATION_OFF))
+		return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+		return sprintf(buf, "%s; SMT Host state unknown\n",
+			       taa_strings[taa_mitigation]);
+	}
+
+	return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+		       sched_smt_active() ? "vulnerable" : "disabled");
+}
+
 static char *stibp_state(void)
 {
 	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
@@ -1506,6 +1521,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_MDS:
 		return mds_show_state(buf);
 
+	case X86_BUG_TAA:
+		return tsx_async_abort_show_state(buf);
+
 	default:
 		break;
 	}
@@ -1542,4 +1560,9 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
 }
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
 #endif
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index cc37511de866..0fccd8c0312e 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -554,12 +554,20 @@ ssize_t __weak cpu_show_mds(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
 static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
 static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
+static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -568,6 +576,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_spec_store_bypass.attr,
 	&dev_attr_l1tf.attr,
 	&dev_attr_mds.attr,
+	&dev_attr_tsx_async_abort.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d0633ebdaa9c..f35369f79771 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,6 +59,9 @@ extern ssize_t cpu_show_l1tf(struct device *dev,
 			     struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_mds(struct device *dev,
 			    struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
+					struct device_attribute *attr,
+					char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,

From e1d38b63acd843cfdd4222bf19a26700fd5c699e Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 12:23:33 +0200
Subject: [PATCH 07/43] kvm/x86: Export MDS_NO=0 to guests when TSX is enabled

Export the IA32_ARCH_CAPABILITIES MSR bit MDS_NO=0 to guests on TSX
Async Abort (TAA) affected hosts that have TSX enabled and updated
microcode. This is required so that the guests don't complain,

  "Vulnerable: Clear CPU buffers attempted, no microcode"

when the host has the updated microcode to clear CPU buffers.

Microcode update also adds support for MSR_IA32_TSX_CTRL which is
enumerated by the ARCH_CAP_TSX_CTRL_MSR bit in IA32_ARCH_CAPABILITIES MSR.
Guests can't do this check themselves when the ARCH_CAP_TSX_CTRL_MSR bit is
not exported to the guests.

In this case export MDS_NO=0 to the guests. When guests have
CPUID.MD_CLEAR=1, they deploy MDS mitigation which also mitigates TAA.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kvm/x86.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff395f812719..32d70ca2a7fd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1298,6 +1298,25 @@ static u64 kvm_get_arch_capabilities(void)
 	if (!boot_cpu_has_bug(X86_BUG_MDS))
 		data |= ARCH_CAP_MDS_NO;
 
+	/*
+	 * On TAA affected systems, export MDS_NO=0 when:
+	 *	- TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+	 *	- Updated microcode is present. This is detected by
+	 *	  the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+	 *	  that VERW clears CPU buffers.
+	 *
+	 * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+	 * mitigation and don't complain:
+	 *
+	 *	"Vulnerable: Clear CPU buffers attempted, no microcode"
+	 *
+	 * If TSX is disabled on the system, guests are also mitigated against
+	 * TAA and clear CPU buffer mitigation is not required for guests.
+	 */
+	if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
+	    (data & ARCH_CAP_TSX_CTRL_MSR))
+		data &= ~ARCH_CAP_MDS_NO;
+
 	return data;
 }
 

From 7531a3596e3272d1f6841e0d601a614555dc6b65 Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 12:28:57 +0200
Subject: [PATCH 08/43] x86/tsx: Add "auto" option to the tsx= cmdline
 parameter

Platforms which are not affected by X86_BUG_TAA may want the TSX feature
enabled. Add "auto" option to the TSX cmdline parameter. When tsx=auto
disable TSX when X86_BUG_TAA is present, otherwise enable TSX.

More details on X86_BUG_TAA can be found here:
https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html

 [ bp: Extend the arg buffer to accommodate "auto\0". ]

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 3 +++
 arch/x86/kernel/cpu/tsx.c                       | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index af4b1d95b08f..6e548cddb6c3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4869,6 +4869,9 @@
 				update. This new MSR allows for the reliable
 				deactivation of the TSX functionality.)
 
+			auto	- Disable TSX if X86_BUG_TAA is present,
+				  otherwise enable TSX on the system.
+
 			Not specifying this option is equivalent to tsx=off.
 
 			See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 04471c4378d8..dda328ec2ba1 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -75,7 +75,7 @@ static bool __init tsx_ctrl_is_supported(void)
 
 void __init tsx_init(void)
 {
-	char arg[4] = {};
+	char arg[5] = {};
 	int ret;
 
 	if (!tsx_ctrl_is_supported())
@@ -87,6 +87,11 @@ void __init tsx_init(void)
 			tsx_ctrl_state = TSX_CTRL_ENABLE;
 		} else if (!strcmp(arg, "off")) {
 			tsx_ctrl_state = TSX_CTRL_DISABLE;
+		} else if (!strcmp(arg, "auto")) {
+			if (boot_cpu_has_bug(X86_BUG_TAA))
+				tsx_ctrl_state = TSX_CTRL_DISABLE;
+			else
+				tsx_ctrl_state = TSX_CTRL_ENABLE;
 		} else {
 			tsx_ctrl_state = TSX_CTRL_DISABLE;
 			pr_err("tsx: invalid option, defaulting to off\n");

From a7a248c593e4fd7a67c50b5f5318fe42a0db335e Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Wed, 23 Oct 2019 12:32:55 +0200
Subject: [PATCH 09/43] x86/speculation/taa: Add documentation for TSX Async
 Abort

Add the documentation for TSX Async Abort. Include the description of
the issue, how to check the mitigation state, control the mitigation,
and guidance for system administrators.

 [ bp: Add proper SPDX tags, touch ups by Josh and me. ]

Co-developed-by: Antonio Gomez Iglesias <antonio.gomez.iglesias@intel.com>

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Antonio Gomez Iglesias <antonio.gomez.iglesias@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mark Gross <mgross@linux.intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 .../ABI/testing/sysfs-devices-system-cpu      |   1 +
 Documentation/admin-guide/hw-vuln/index.rst   |   1 +
 .../admin-guide/hw-vuln/tsx_async_abort.rst   | 276 ++++++++++++++++++
 .../admin-guide/kernel-parameters.txt         |  38 +++
 Documentation/x86/index.rst                   |   1 +
 Documentation/x86/tsx_async_abort.rst         | 117 ++++++++
 6 files changed, 434 insertions(+)
 create mode 100644 Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
 create mode 100644 Documentation/x86/tsx_async_abort.rst

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 06d0931119cc..0e77569bd5e0 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -486,6 +486,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 		/sys/devices/system/cpu/vulnerabilities/l1tf
 		/sys/devices/system/cpu/vulnerabilities/mds
+		/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 49311f3da6f2..0802b1c67452 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -12,3 +12,4 @@ are configurable at compile, boot or run time.
    spectre
    l1tf
    mds
+   tsx_async_abort
diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
new file mode 100644
index 000000000000..fddbd7579c53
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
@@ -0,0 +1,276 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TAA - TSX Asynchronous Abort
+======================================
+
+TAA is a hardware vulnerability that allows unprivileged speculative access to
+data which is available in various CPU internal buffers by using asynchronous
+aborts within an Intel TSX transactional region.
+
+Affected processors
+-------------------
+
+This vulnerability only affects Intel processors that support Intel
+Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
+is 0 in the IA32_ARCH_CAPABILITIES MSR.  On processors where the MDS_NO bit
+(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
+also mitigate against TAA.
+
+Whether a processor is affected or not can be read out from the TAA
+vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entry is related to this TAA issue:
+
+   ==============  =====  ===================================================
+   CVE-2019-11135  TAA    TSX Asynchronous Abort (TAA) condition on some
+                          microprocessors utilizing speculative execution may
+                          allow an authenticated user to potentially enable
+                          information disclosure via a side channel with
+                          local access.
+   ==============  =====  ===================================================
+
+Problem
+-------
+
+When performing store, load or L1 refill operations, processors write
+data into temporary microarchitectural structures (buffers). The data in
+those buffers can be forwarded to load operations as an optimization.
+
+Intel TSX is an extension to the x86 instruction set architecture that adds
+hardware transactional memory support to improve performance of multi-threaded
+software. TSX lets the processor expose and exploit concurrency hidden in an
+application due to dynamically avoiding unnecessary synchronization.
+
+TSX supports atomic memory transactions that are either committed (success) or
+aborted. During an abort, operations that happened within the transactional region
+are rolled back. An asynchronous abort takes place, among other options, when a
+different thread accesses a cache line that is also used within the transactional
+region when that access might lead to a data race.
+
+Immediately after an uncompleted asynchronous abort, certain speculatively
+executed loads may read data from those internal buffers and pass it to dependent
+operations. This can be then used to infer the value via a cache side channel
+attack.
+
+Because the buffers are potentially shared between Hyper-Threads, cross
+Hyper-Thread attacks are possible.
+
+The victim of a malicious actor does not need to make use of TSX. Only the
+attacker needs to begin a TSX transaction and raise an asynchronous abort
+which in turn potentially leaks data stored in the buffers.
+
+More detailed technical information is available in the TAA specific x86
+architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
+
+
+Attack scenarios
+----------------
+
+Attacks against the TAA vulnerability can be implemented from unprivileged
+applications running on hosts or guests.
+
+As for MDS, the attacker has no control over the memory addresses that can
+be leaked. Only the victim is responsible for bringing data to the CPU. As
+a result, the malicious actor has to sample as much data as possible and
+then postprocess it to try to infer any useful information from it.
+
+A potential attacker only has read access to the data. Also, there is no direct
+privilege escalation by using this technique.
+
+
+.. _tsx_async_abort_sys_info:
+
+TAA system information
+-----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current TAA status
+of mitigated systems. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+
+The possible values in this file are:
+
+.. list-table::
+
+   * - 'Vulnerable'
+     - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
+   * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+     - The system tries to clear the buffers but the microcode might not support the operation.
+   * - 'Mitigation: Clear CPU buffers'
+     - The microcode has been updated to clear the buffers. TSX is still enabled.
+   * - 'Mitigation: TSX disabled'
+     - TSX is disabled.
+   * - 'Not affected'
+     - The CPU is not affected by this issue.
+
+.. _ucode_needed:
+
+Best effort mitigation mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the processor is vulnerable, but the availability of the microcode-based
+mitigation mechanism is not advertised via CPUID the kernel selects a best
+effort mitigation mode.  This mode invokes the mitigation instructions
+without a guarantee that they clear the CPU buffers.
+
+This is done to address virtualization scenarios where the host has the
+microcode update applied, but the hypervisor is not yet updated to expose the
+CPUID to the guest. If the host has updated microcode the protection takes
+effect; otherwise a few CPU cycles are wasted pointlessly.
+
+The state in the tsx_async_abort sysfs file reflects this situation
+accordingly.
+
+
+Mitigation mechanism
+--------------------
+
+The kernel detects the affected CPUs and the presence of the microcode which is
+required. If a CPU is affected and the microcode is available, then the kernel
+enables the mitigation by default.
+
+
+The mitigation can be controlled at boot time via a kernel command line option.
+See :ref:`taa_mitigation_control_command_line`.
+
+.. _virt_mechanism:
+
+Virtualization mitigation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Affected systems where the host has TAA microcode and TAA is mitigated by
+having disabled TSX previously, are not vulnerable regardless of the status
+of the VMs.
+
+In all other cases, if the host either does not have the TAA microcode or
+the kernel is not mitigated, the system might be vulnerable.
+
+
+.. _taa_mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows to control the TAA mitigations at boot time with
+the option "tsx_async_abort=". The valid arguments for this option are:
+
+  ============  =============================================================
+  off		This option disables the TAA mitigation on affected platforms.
+                If the system has TSX enabled (see next parameter) and the CPU
+                is affected, the system is vulnerable.
+
+  full	        TAA mitigation is enabled. If TSX is enabled, on an affected
+                system it will clear CPU buffers on ring transitions. On
+                systems which are MDS-affected and deploy MDS mitigation,
+                TAA is also mitigated. Specifying this option on those
+                systems will have no effect.
+
+  full,nosmt    The same as tsx_async_abort=full, with SMT disabled on
+                vulnerable CPUs that have TSX enabled. This is the complete
+                mitigation. When TSX is disabled, SMT is not disabled because
+                CPU is not vulnerable to cross-thread TAA attacks.
+  ============  =============================================================
+
+Not specifying this option is equivalent to "tsx_async_abort=full".
+
+The kernel command line also allows to control the TSX feature using the
+parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
+to control the TSX feature and the enumeration of the TSX feature bits (RTM
+and HLE) in CPUID.
+
+The valid options are:
+
+  ============  =============================================================
+  off		Disables TSX on the system.
+
+                Note that this option takes effect only on newer CPUs which are
+                not vulnerable to MDS, i.e., have MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1
+                and which get the new IA32_TSX_CTRL MSR through a microcode
+                update. This new MSR allows for the reliable deactivation of
+                the TSX functionality.
+
+  on		Enables TSX.
+
+                Although there are mitigations for all known security
+                vulnerabilities, TSX has been known to be an accelerator for
+                several previous speculation-related CVEs, and so there may be
+                unknown security risks associated with leaving it enabled.
+
+  auto		Disables TSX if X86_BUG_TAA is present, otherwise enables TSX
+                on the system.
+  ============  =============================================================
+
+Not specifying this option is equivalent to "tsx=off".
+
+The following combinations of the "tsx_async_abort" and "tsx" are possible. For
+affected platforms tsx=auto is equivalent to tsx=off and the result will be:
+
+  =========  ==========================   =========================================
+  tsx=on     tsx_async_abort=full         The system will use VERW to clear CPU
+                                          buffers. Cross-thread attacks are still
+					  possible on SMT machines.
+  tsx=on     tsx_async_abort=full,nosmt   As above, cross-thread attacks on SMT
+                                          mitigated.
+  tsx=on     tsx_async_abort=off          The system is vulnerable.
+  tsx=off    tsx_async_abort=full         TSX might be disabled if microcode
+                                          provides a TSX control MSR. If so,
+					  system is not vulnerable.
+  tsx=off    tsx_async_abort=full,nosmt   Ditto
+  tsx=off    tsx_async_abort=off          Ditto
+  =========  ==========================   =========================================
+
+
+For unaffected platforms "tsx=on" and "tsx_async_abort=full" does not clear CPU
+buffers.  For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
+"tsx" command line argument has no effect.
+
+For the affected platforms below table indicates the mitigation status for the
+combinations of CPUID bit MD_CLEAR and IA32_ARCH_CAPABILITIES MSR bits MDS_NO
+and TSX_CTRL_MSR.
+
+  =======  =========  =============  ========================================
+  MDS_NO   MD_CLEAR   TSX_CTRL_MSR   Status
+  =======  =========  =============  ========================================
+    0          0            0        Vulnerable (needs microcode)
+    0          1            0        MDS and TAA mitigated via VERW
+    1          1            0        MDS fixed, TAA vulnerable if TSX enabled
+                                     because MD_CLEAR has no meaning and
+                                     VERW is not guaranteed to clear buffers
+    1          X            1        MDS fixed, TAA can be mitigated by
+                                     VERW or TSX_CTRL_MSR
+  =======  =========  =============  ========================================
+
+Mitigation selection guide
+--------------------------
+
+1. Trusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If all user space applications are from a trusted source and do not execute
+untrusted code which is supplied externally, then the mitigation can be
+disabled. The same applies to virtualized environments with trusted guests.
+
+
+2. Untrusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there are untrusted applications or guests on the system, enabling TSX
+might allow a malicious actor to leak data from the host or from other
+processes running on the same physical core.
+
+If the microcode is available and the TSX is disabled on the host, attacks
+are prevented in a virtualized environment as well, even if the VMs do not
+explicitly enable the mitigation.
+
+
+.. _taa_default_mitigations:
+
+Default mitigations
+-------------------
+
+The kernel's default action for vulnerable processors is:
+
+  - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6e548cddb6c3..fa8f03ddff24 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2636,6 +2636,7 @@
 					       ssbd=force-off [ARM64]
 					       l1tf=off [X86]
 					       mds=off [X86]
+					       tsx_async_abort=off [X86]
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
@@ -2651,6 +2652,7 @@
 				be fully mitigated, even if it means losing SMT.
 				Equivalent to: l1tf=flush,nosmt [X86]
 					       mds=full,nosmt [X86]
+					       tsx_async_abort=full,nosmt [X86]
 
 	mminit_loglevel=
 			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
@@ -4877,6 +4879,42 @@
 			See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
 			for more details.
 
+	tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
+			Abort (TAA) vulnerability.
+
+			Similar to Micro-architectural Data Sampling (MDS)
+			certain CPUs that support Transactional
+			Synchronization Extensions (TSX) are vulnerable to an
+			exploit against CPU internal buffers which can forward
+			information to a disclosure gadget under certain
+			conditions.
+
+			In vulnerable processors, the speculatively forwarded
+			data can be used in a cache side channel attack, to
+			access data to which the attacker does not have direct
+			access.
+
+			This parameter controls the TAA mitigation.  The
+			options are:
+
+			full       - Enable TAA mitigation on vulnerable CPUs
+				     if TSX is enabled.
+
+			full,nosmt - Enable TAA mitigation and disable SMT on
+				     vulnerable CPUs. If TSX is disabled, SMT
+				     is not disabled because CPU is not
+				     vulnerable to cross-thread TAA attacks.
+			off        - Unconditionally disable TAA mitigation
+
+			Not specifying this option is equivalent to
+			tsx_async_abort=full.  On CPUs which are MDS affected
+			and deploy MDS mitigation, TAA mitigation is not
+			required and doesn't provide any additional
+			mitigation.
+
+			For details see:
+			Documentation/admin-guide/hw-vuln/tsx_async_abort.rst
+
 	turbografx.map[2|3]=	[HW,JOY]
 			TurboGraFX parallel port interface
 			Format:
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index af64c4bb4447..a8de2fbc1caa 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -27,6 +27,7 @@ x86-specific Documentation
    mds
    microcode
    resctrl_ui
+   tsx_async_abort
    usb-legacy-support
    i386/index
    x86_64/index
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
new file mode 100644
index 000000000000..583ddc185ba2
--- /dev/null
+++ b/Documentation/x86/tsx_async_abort.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TSX Async Abort (TAA) mitigation
+================================
+
+.. _tsx_async_abort:
+
+Overview
+--------
+
+TSX Async Abort (TAA) is a side channel attack on internal buffers in some
+Intel processors similar to Microarchitectural Data Sampling (MDS).  In this
+case certain loads may speculatively pass invalid data to dependent operations
+when an asynchronous abort condition is pending in a Transactional
+Synchronization Extensions (TSX) transaction.  This includes loads with no
+fault or assist condition. Such loads may speculatively expose stale data from
+the same uarch data structures as in MDS, with same scope of exposure i.e.
+same-thread and cross-thread. This issue affects all current processors that
+support TSX.
+
+Mitigation strategy
+-------------------
+
+a) TSX disable - one of the mitigations is to disable TSX. A new MSR
+IA32_TSX_CTRL will be available in future and current processors after
+microcode update which can be used to disable TSX. In addition, it
+controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
+
+b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
+vulnerability. More details on this approach can be found in
+:ref:`Documentation/admin-guide/hw-vuln/mds.rst <mds>`.
+
+Kernel internal mitigation modes
+--------------------------------
+
+ =============    ============================================================
+ off              Mitigation is disabled. Either the CPU is not affected or
+                  tsx_async_abort=off is supplied on the kernel command line.
+
+ tsx disabled     Mitigation is enabled. TSX feature is disabled by default at
+                  bootup on processors that support TSX control.
+
+ verw             Mitigation is enabled. CPU is affected and MD_CLEAR is
+                  advertised in CPUID.
+
+ ucode needed     Mitigation is enabled. CPU is affected and MD_CLEAR is not
+                  advertised in CPUID. That is mainly for virtualization
+                  scenarios where the host has the updated microcode but the
+                  hypervisor does not expose MD_CLEAR in CPUID. It's a best
+                  effort approach without guarantee.
+ =============    ============================================================
+
+If the CPU is affected and the "tsx_async_abort" kernel command line parameter is
+not provided then the kernel selects an appropriate mitigation depending on the
+status of RTM and MD_CLEAR CPUID bits.
+
+Below tables indicate the impact of tsx=on|off|auto cmdline options on state of
+TAA mitigation, VERW behavior and TSX feature for various combinations of
+MSR_IA32_ARCH_CAPABILITIES bits.
+
+1. "tsx=off"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=off
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+    0          0           0         HW default         Yes           Same as MDS           Same as MDS
+    0          0           1        Invalid case   Invalid case       Invalid case          Invalid case
+    0          1           0         HW default         No         Need ucode update     Need ucode update
+    0          1           1          Disabled          Yes           TSX disabled          TSX disabled
+    1          X           1          Disabled           X             None needed           None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+2. "tsx=on"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=on
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+    0          0           0         HW default        Yes            Same as MDS          Same as MDS
+    0          0           1        Invalid case   Invalid case       Invalid case         Invalid case
+    0          1           0         HW default        No          Need ucode update     Need ucode update
+    0          1           1          Enabled          Yes               None              Same as MDS
+    1          X           1          Enabled          X              None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+3. "tsx=auto"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits     Result with cmdline tsx=auto
+----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR  TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                    after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+    0          0           0         HW default    Yes                Same as MDS           Same as MDS
+    0          0           1        Invalid case  Invalid case        Invalid case          Invalid case
+    0          1           0         HW default    No              Need ucode update     Need ucode update
+    0          1           1          Disabled      Yes               TSX disabled          TSX disabled
+    1          X           1          Enabled       X                 None needed           None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
+indicates whether MSR_IA32_TSX_CTRL is supported.
+
+There are two control bits in IA32_TSX_CTRL MSR:
+
+      Bit 0: When set it disables the Restricted Transactional Memory (RTM)
+             sub-feature of TSX (will force all transactions to abort on the
+             XBEGIN instruction).
+
+      Bit 1: When set it disables the enumeration of the RTM and HLE feature
+             (i.e. it will make CPUID(EAX=7).EBX{bit4} and
+             CPUID(EAX=7).EBX{bit11} read as 0).

From db616173d787395787ecc93eef075fa975227b10 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 23 Oct 2019 12:35:50 +0200
Subject: [PATCH 10/43] x86/tsx: Add config options to set tsx=on|off|auto

There is a general consensus that TSX usage is not largely spread while
the history shows there is a non trivial space for side channel attacks
possible. Therefore the tsx is disabled by default even on platforms
that might have a safe implementation of TSX according to the current
knowledge. This is a fair trade off to make.

There are, however, workloads that really do benefit from using TSX and
updating to a newer kernel with TSX disabled might introduce
noticeable regressions. This would be especially a problem for Linux
distributions which will provide TAA mitigations.

Introduce config options X86_INTEL_TSX_MODE_OFF, X86_INTEL_TSX_MODE_ON
and X86_INTEL_TSX_MODE_AUTO to control the TSX feature. The config
setting can be overridden by the tsx cmdline options.

 [ bp: Text cleanups from Josh. ]

Suggested-by: Borislav Petkov <bpetkov@suse.de>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/Kconfig          | 45 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/tsx.c | 22 +++++++++++++------
 2 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d6e1faa28c58..8ef85139553f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1940,6 +1940,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 
 	  If unsure, say y.
 
+choice
+	prompt "TSX enable mode"
+	depends on CPU_SUP_INTEL
+	default X86_INTEL_TSX_MODE_OFF
+	help
+	  Intel's TSX (Transactional Synchronization Extensions) feature
+	  allows to optimize locking protocols through lock elision which
+	  can lead to a noticeable performance boost.
+
+	  On the other hand it has been shown that TSX can be exploited
+	  to form side channel attacks (e.g. TAA) and chances are there
+	  will be more of those attacks discovered in the future.
+
+	  Therefore TSX is not enabled by default (aka tsx=off). An admin
+	  might override this decision by the tsx=on command line parameter.
+	  Even with TSX enabled, the kernel will attempt to enable the best
+	  possible TAA mitigation setting depending on the microcode available
+	  for the particular machine.
+
+	  This option allows to set the default tsx mode between tsx=on, =off
+	  and =auto. See Documentation/admin-guide/kernel-parameters.txt for more
+	  details.
+
+	  Say off if not sure, auto if TSX is in use but should only be used on
+	  safe platforms, or on if TSX is in use and the security aspect of TSX
+	  is not relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+	bool "off"
+	help
+	  TSX is disabled if possible - equals the tsx=off command line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+	bool "on"
+	help
+	  TSX is always enabled on TSX capable HW - equals the tsx=on command
+	  line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+	bool "auto"
+	help
+	  TSX is enabled on TSX capable HW that is believed to be safe against
+	  side channel attacks - equals the tsx=auto command line parameter.
+endchoice
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index dda328ec2ba1..3e20d322bc98 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -73,6 +73,14 @@ static bool __init tsx_ctrl_is_supported(void)
 	return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
 }
 
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+	if (boot_cpu_has_bug(X86_BUG_TAA))
+		return TSX_CTRL_DISABLE;
+
+	return TSX_CTRL_ENABLE;
+}
+
 void __init tsx_init(void)
 {
 	char arg[5] = {};
@@ -88,17 +96,19 @@ void __init tsx_init(void)
 		} else if (!strcmp(arg, "off")) {
 			tsx_ctrl_state = TSX_CTRL_DISABLE;
 		} else if (!strcmp(arg, "auto")) {
-			if (boot_cpu_has_bug(X86_BUG_TAA))
-				tsx_ctrl_state = TSX_CTRL_DISABLE;
-			else
-				tsx_ctrl_state = TSX_CTRL_ENABLE;
+			tsx_ctrl_state = x86_get_tsx_auto_mode();
 		} else {
 			tsx_ctrl_state = TSX_CTRL_DISABLE;
 			pr_err("tsx: invalid option, defaulting to off\n");
 		}
 	} else {
-		/* tsx= not provided, defaulting to off */
-		tsx_ctrl_state = TSX_CTRL_DISABLE;
+		/* tsx= not provided */
+		if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO))
+			tsx_ctrl_state = x86_get_tsx_auto_mode();
+		else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF))
+			tsx_ctrl_state = TSX_CTRL_DISABLE;
+		else
+			tsx_ctrl_state = TSX_CTRL_ENABLE;
 	}
 
 	if (tsx_ctrl_state == TSX_CTRL_DISABLE) {

From db4d30fbb71b47e4ecb11c4efa5d8aad4b03dfae Mon Sep 17 00:00:00 2001
From: Vineela Tummalapalli <vineela.tummalapalli@intel.com>
Date: Mon, 4 Nov 2019 12:22:01 +0100
Subject: [PATCH 11/43] x86/bugs: Add ITLB_MULTIHIT bug infrastructure

Some processors may incur a machine check error possibly resulting in an
unrecoverable CPU lockup when an instruction fetch encounters a TLB
multi-hit in the instruction TLB. This can occur when the page size is
changed along with either the physical address or cache type. The relevant
erratum can be found here:

   https://bugzilla.kernel.org/show_bug.cgi?id=205195

There are other processors affected for which the erratum does not fully
disclose the impact.

This issue affects both bare-metal x86 page tables and EPT.

It can be mitigated by either eliminating the use of large pages or by
using careful TLB invalidations when changing the page size in the page
tables.

Just like Spectre, Meltdown, L1TF and MDS, a new bit has been allocated in
MSR_IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) and will be set on CPUs which
are mitigated against this issue.

Signed-off-by: Vineela Tummalapalli <vineela.tummalapalli@intel.com>
Co-developed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 .../ABI/testing/sysfs-devices-system-cpu      |  1 +
 arch/x86/include/asm/cpufeatures.h            |  1 +
 arch/x86/include/asm/msr-index.h              |  7 +++
 arch/x86/kernel/cpu/bugs.c                    | 13 ++++
 arch/x86/kernel/cpu/common.c                  | 63 ++++++++++---------
 drivers/base/cpu.c                            |  8 +++
 include/linux/cpu.h                           |  2 +
 7 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 0e77569bd5e0..fc20cde63d1e 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -487,6 +487,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/l1tf
 		/sys/devices/system/cpu/vulnerabilities/mds
 		/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+		/sys/devices/system/cpu/vulnerabilities/itlb_multihit
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 989e03544f18..c4fbe379cc0b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -400,5 +400,6 @@
 #define X86_BUG_MSBDS_ONLY		X86_BUG(20) /* CPU is only affected by the  MSDBS variant of BUG_MDS */
 #define X86_BUG_SWAPGS			X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
 #define X86_BUG_TAA			X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
+#define X86_BUG_ITLB_MULTIHIT		X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b3a8bb2af0b6..6a3124664289 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -93,6 +93,13 @@
 						  * Microarchitectural Data
 						  * Sampling (MDS) vulnerabilities.
 						  */
+#define ARCH_CAP_PSCHANGE_MC_NO		BIT(6)	 /*
+						  * The processor is not susceptible to a
+						  * machine check error due to modifying the
+						  * code page size along with either the
+						  * physical address or cache type
+						  * without TLB invalidation.
+						  */
 #define ARCH_CAP_TSX_CTRL_MSR		BIT(7)	/* MSR for TSX control is available. */
 #define ARCH_CAP_TAA_NO			BIT(8)	/*
 						 * Not susceptible to
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 43c647e19439..5364beda8c61 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1419,6 +1419,11 @@ static ssize_t l1tf_show_state(char *buf)
 }
 #endif
 
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	return sprintf(buf, "Processor vulnerable\n");
+}
+
 static ssize_t mds_show_state(char *buf)
 {
 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
@@ -1524,6 +1529,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_TAA:
 		return tsx_async_abort_show_state(buf);
 
+	case X86_BUG_ITLB_MULTIHIT:
+		return itlb_multihit_show_state(buf);
+
 	default:
 		break;
 	}
@@ -1565,4 +1573,9 @@ ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *at
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
 }
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f8b8afc8f5b5..d29b71ca3ca7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
 #endif
 }
 
-#define NO_SPECULATION	BIT(0)
-#define NO_MELTDOWN	BIT(1)
-#define NO_SSB		BIT(2)
-#define NO_L1TF		BIT(3)
-#define NO_MDS		BIT(4)
-#define MSBDS_ONLY	BIT(5)
-#define NO_SWAPGS	BIT(6)
+#define NO_SPECULATION		BIT(0)
+#define NO_MELTDOWN		BIT(1)
+#define NO_SSB			BIT(2)
+#define NO_L1TF			BIT(3)
+#define NO_MDS			BIT(4)
+#define MSBDS_ONLY		BIT(5)
+#define NO_SWAPGS		BIT(6)
+#define NO_ITLB_MULTIHIT	BIT(7)
 
 #define VULNWL(_vendor, _family, _model, _whitelist)	\
 	{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1043,27 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	VULNWL(NSC,	5, X86_MODEL_ANY,	NO_SPECULATION),
 
 	/* Intel Family 6 */
-	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION),
-	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION),
-	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION),
+	VULNWL_INTEL(ATOM_SALTWELL,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SALTWELL_TABLET,	NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SALTWELL_MID,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_BONNELL,		NO_SPECULATION | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_BONNELL_MID,		NO_SPECULATION | NO_ITLB_MULTIHIT),
 
-	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_SILVERMONT_D,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_SILVERMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SILVERMONT_D,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_SILVERMONT_MID,	NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_AIRMONT,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(XEON_PHI_KNL,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(XEON_PHI_KNM,		NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	VULNWL_INTEL(CORE_YONAH,		NO_SSB),
 
-	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_AIRMONT_NP,		NO_L1TF | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_AIRMONT_MID,		NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_AIRMONT_NP,		NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
-	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_GOLDMONT_D,		NO_MDS | NO_L1TF | NO_SWAPGS),
-	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS),
+	VULNWL_INTEL(ATOM_GOLDMONT,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_GOLDMONT_D,		NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_INTEL(ATOM_GOLDMONT_PLUS,	NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	/*
 	 * Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1074,14 +1075,14 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	 */
 
 	/* AMD Family 0xf - 0x12 */
-	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS),
+	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x11,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_AMD(0x12,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
 
 	/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
-	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
-	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS),
+	VULNWL_AMD(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+	VULNWL_HYGON(X86_FAMILY_ANY,	NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
 	{}
 };
 
@@ -1106,6 +1107,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 {
 	u64 ia32_cap = x86_read_arch_cap_msr();
 
+	/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+	if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+		setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
 	if (cpu_matches(NO_SPECULATION))
 		return;
 
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 0fccd8c0312e..6265871a4af2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -561,6 +561,12 @@ ssize_t __weak cpu_show_tsx_async_abort(struct device *dev,
 	return sprintf(buf, "Not affected\n");
 }
 
+ssize_t __weak cpu_show_itlb_multihit(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "Not affected\n");
+}
+
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
@@ -568,6 +574,7 @@ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
 static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
 static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
 static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
+static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -577,6 +584,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_l1tf.attr,
 	&dev_attr_mds.attr,
 	&dev_attr_tsx_async_abort.attr,
+	&dev_attr_itlb_multihit.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f35369f79771..2a093434e975 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -62,6 +62,8 @@ extern ssize_t cpu_show_mds(struct device *dev,
 extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
 					struct device_attribute *attr,
 					char *buf);
+extern ssize_t cpu_show_itlb_multihit(struct device *dev,
+				      struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,

From cad14885a8d32c1c0d8eaa7bf5c0152a22b6080e Mon Sep 17 00:00:00 2001
From: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Date: Mon, 4 Nov 2019 12:22:01 +0100
Subject: [PATCH 12/43] x86/cpu: Add Tremont to the cpu vulnerability whitelist

Add the new cpu family ATOM_TREMONT_D to the cpu vulnerability
whitelist. ATOM_TREMONT_D is not affected by X86_BUG_ITLB_MULTIHIT.

ATOM_TREMONT_D might have mitigations against other issues as well, but
only the ITLB multihit mitigation is confirmed at this point.

Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/common.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d29b71ca3ca7..fffe21945374 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1074,6 +1074,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 	 * good enough for our purposes.
 	 */
 
+	VULNWL_INTEL(ATOM_TREMONT_D,		NO_ITLB_MULTIHIT),
+
 	/* AMD Family 0xf - 0x12 */
 	VULNWL_AMD(0x0f,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
 	VULNWL_AMD(0x10,	NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),

From 731dc9df975a5da21237a18c3384f811a7a41cc6 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@canonical.com>
Date: Mon, 4 Nov 2019 12:22:02 +0100
Subject: [PATCH 13/43] cpu/speculation: Uninline and export CPU mitigations
 helpers

A kernel module may need to check the value of the "mitigations=" kernel
command line parameter as part of its setup when the module needs
to perform software mitigations for a CPU flaw.

Uninline and export the helper functions surrounding the cpu_mitigations
enum to allow for their usage from a module.

Lastly, privatize the enum and cpu_mitigations variable since the value of
cpu_mitigations can be checked with the exported helper functions.

Signed-off-by: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h | 25 ++-----------------------
 kernel/cpu.c        | 27 ++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2a093434e975..bc6c879bd110 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -218,28 +218,7 @@ static inline int cpuhp_smt_enable(void) { return 0; }
 static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
 #endif
 
-/*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
- */
-enum cpu_mitigations {
-	CPU_MITIGATIONS_OFF,
-	CPU_MITIGATIONS_AUTO,
-	CPU_MITIGATIONS_AUTO_NOSMT,
-};
-
-extern enum cpu_mitigations cpu_mitigations;
-
-/* mitigations=off */
-static inline bool cpu_mitigations_off(void)
-{
-	return cpu_mitigations == CPU_MITIGATIONS_OFF;
-}
-
-/* mitigations=auto,nosmt */
-static inline bool cpu_mitigations_auto_nosmt(void)
-{
-	return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
-}
+extern bool cpu_mitigations_off(void);
+extern bool cpu_mitigations_auto_nosmt(void);
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fc28e17940e0..e2cad3ee2ead 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void)
 	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
 }
 
-enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+/*
+ * These are used for a global "mitigations=" cmdline option for toggling
+ * optional CPU mitigations.
+ */
+enum cpu_mitigations {
+	CPU_MITIGATIONS_OFF,
+	CPU_MITIGATIONS_AUTO,
+	CPU_MITIGATIONS_AUTO_NOSMT,
+};
+
+static enum cpu_mitigations cpu_mitigations __ro_after_init =
+	CPU_MITIGATIONS_AUTO;
 
 static int __init mitigations_parse_cmdline(char *arg)
 {
@@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg)
 	return 0;
 }
 early_param("mitigations", mitigations_parse_cmdline);
+
+/* mitigations=off */
+bool cpu_mitigations_off(void)
+{
+	return cpu_mitigations == CPU_MITIGATIONS_OFF;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_off);
+
+/* mitigations=auto,nosmt */
+bool cpu_mitigations_auto_nosmt(void)
+{
+	return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
+}
+EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);

From b8e8c8303ff28c61046a4d0f6ea99aea609a7dc0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 4 Nov 2019 12:22:02 +0100
Subject: [PATCH 14/43] kvm: mmu: ITLB_MULTIHIT mitigation

With some Intel processors, putting the same virtual address in the TLB
as both a 4 KiB and 2 MiB page can confuse the instruction fetch unit
and cause the processor to issue a machine check resulting in a CPU lockup.

Unfortunately when EPT page tables use huge pages, it is possible for a
malicious guest to cause this situation.

Add a knob to mark huge pages as non-executable. When the nx_huge_pages
parameter is enabled (and we are using EPT), all huge pages are marked as
NX. If the guest attempts to execute in one of those pages, the page is
broken down into 4K pages, which are then marked executable.

This is not an issue for shadow paging (except nested EPT), because then
the host is in control of TLB flushes and the problematic situation cannot
happen.  With nested EPT, again the nested guest can cause problems, so
shadow and direct EPT are treated in the same way.

[ tglx: Fixup default to auto and massage wording a bit ]

Originally-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 .../admin-guide/kernel-parameters.txt         |  19 +++
 arch/x86/include/asm/kvm_host.h               |   2 +
 arch/x86/kernel/cpu/bugs.c                    |  13 +-
 arch/x86/kvm/mmu.c                            | 141 +++++++++++++++++-
 arch/x86/kvm/paging_tmpl.h                    |  29 +++-
 arch/x86/kvm/x86.c                            |   9 ++
 6 files changed, 200 insertions(+), 13 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fa8f03ddff24..9d5f123cc218 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2055,6 +2055,19 @@
 			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm.nx_huge_pages=
+			[KVM] Controls the software workaround for the
+			X86_BUG_ITLB_MULTIHIT bug.
+			force	: Always deploy workaround.
+			off	: Never deploy workaround.
+			auto    : Deploy workaround based on the presence of
+				  X86_BUG_ITLB_MULTIHIT.
+
+			Default is 'auto'.
+
+			If the software workaround is enabled for the host,
+			guests do not need to enable it for nested guests.
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
@@ -2637,6 +2650,12 @@
 					       l1tf=off [X86]
 					       mds=off [X86]
 					       tsx_async_abort=off [X86]
+					       kvm.nx_huge_pages=off [X86]
+
+				Exceptions:
+					       This does not have any effect on
+					       kvm.nx_huge_pages when
+					       kvm.nx_huge_pages=force.
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 24d6598dea29..a37b03483b66 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -315,6 +315,7 @@ struct kvm_mmu_page {
 	bool unsync;
 	u8 mmu_valid_gen;
 	bool mmio_cached;
+	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -946,6 +947,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong nx_lpage_splits;
 	ulong max_mmu_page_hash_collisions;
 };
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 5364beda8c61..850005590167 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1257,6 +1257,9 @@ void x86_spec_ctrl_setup_ap(void)
 		x86_amd_ssb_disable();
 }
 
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
+
 #undef pr_fmt
 #define pr_fmt(fmt)	"L1TF: " fmt
 
@@ -1412,17 +1415,25 @@ static ssize_t l1tf_show_state(char *buf)
 		       l1tf_vmx_states[l1tf_vmx_mitigation],
 		       sched_smt_active() ? "vulnerable" : "disabled");
 }
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+	if (itlb_multihit_kvm_mitigation)
+		return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+	else
+		return sprintf(buf, "KVM: Vulnerable\n");
+}
 #else
 static ssize_t l1tf_show_state(char *buf)
 {
 	return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
 }
-#endif
 
 static ssize_t itlb_multihit_show_state(char *buf)
 {
 	return sprintf(buf, "Processor vulnerable\n");
 }
+#endif
 
 static ssize_t mds_show_state(char *buf)
 {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24c23c66b226..bedf6864b092 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -47,6 +47,20 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+extern bool itlb_multihit_kvm_mitigation;
+
+static int __read_mostly nx_huge_pages = -1;
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+
+static struct kernel_param_ops nx_huge_pages_ops = {
+	.set = set_nx_huge_pages,
+	.get = param_get_bool,
+};
+
+module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages, "bool");
+
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
  * where the hardware walks 2 page tables:
@@ -352,6 +366,11 @@ static inline bool spte_ad_need_write_protect(u64 spte)
 	return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
 }
 
+static bool is_nx_huge_page_enabled(void)
+{
+	return READ_ONCE(nx_huge_pages);
+}
+
 static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
 	MMU_WARN_ON(is_mmio_spte(spte));
@@ -1190,6 +1209,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
+static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	if (sp->lpage_disallowed)
+		return;
+
+	++kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = true;
+}
+
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
@@ -1207,6 +1235,12 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
+static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	--kvm->stat.nx_lpage_splits;
+	sp->lpage_disallowed = false;
+}
+
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
 					  struct kvm_memory_slot *slot)
 {
@@ -2792,6 +2826,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
 			kvm_reload_remote_mmus(kvm);
 	}
 
+	if (sp->lpage_disallowed)
+		unaccount_huge_nx_page(kvm, sp);
+
 	sp->role.invalid = 1;
 	return list_unstable;
 }
@@ -3013,6 +3050,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!speculative)
 		spte |= spte_shadow_accessed_mask(spte);
 
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -3233,9 +3275,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
 	__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
+				       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+{
+	int level = *levelp;
+	u64 spte = *it.sptep;
+
+	if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
+	    is_nx_huge_page_enabled() &&
+	    is_shadow_present_pte(spte) &&
+	    !is_large_pte(spte)) {
+		/*
+		 * A small SPTE exists for this pfn, but FNAME(fetch)
+		 * and __direct_map would like to create a large PTE
+		 * instead: just force them to go down another level,
+		 * patching back for them into pfn the next 9 bits of
+		 * the address.
+		 */
+		u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+		*pfnp |= gfn & page_mask;
+		(*levelp)--;
+	}
+}
+
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 			int map_writable, int level, kvm_pfn_t pfn,
-			bool prefault)
+			bool prefault, bool lpage_disallowed)
 {
 	struct kvm_shadow_walk_iterator it;
 	struct kvm_mmu_page *sp;
@@ -3248,6 +3313,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 
 	trace_kvm_mmu_spte_requested(gpa, level, pfn);
 	for_each_shadow_entry(vcpu, gpa, it) {
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+
 		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == level)
 			break;
@@ -3258,6 +3329,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
 					      it.level - 1, true, ACC_ALL);
 
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -3550,11 +3623,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 	int r;
 	int level;
-	bool force_pt_level = false;
+	bool force_pt_level;
 	kvm_pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
+	force_pt_level = lpage_disallowed;
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		/*
@@ -3588,7 +3664,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+			 prefault, false);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -4174,6 +4251,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	bool map_writable;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
 
@@ -4184,8 +4263,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
-							   PT_DIRECTORY_LEVEL);
+	force_pt_level =
+		lpage_disallowed ||
+		!check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
 	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 		if (level > PT_DIRECTORY_LEVEL &&
@@ -4214,7 +4294,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		goto out_unlock;
 	if (likely(!force_pt_level))
 		transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
+	r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
+			 prefault, lpage_disallowed);
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_release_pfn_clean(pfn);
@@ -6155,10 +6236,58 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
 }
 
+static bool get_nx_auto_mode(void)
+{
+	/* Return true when CPU has the bug, and mitigations are ON */
+	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
+}
+
+static void __set_nx_huge_pages(bool val)
+{
+	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
+}
+
+static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
+{
+	bool old_val = nx_huge_pages;
+	bool new_val;
+
+	/* In "auto" mode deploy workaround only if CPU has the bug. */
+	if (sysfs_streq(val, "off"))
+		new_val = 0;
+	else if (sysfs_streq(val, "force"))
+		new_val = 1;
+	else if (sysfs_streq(val, "auto"))
+		new_val = get_nx_auto_mode();
+	else if (strtobool(val, &new_val) < 0)
+		return -EINVAL;
+
+	__set_nx_huge_pages(new_val);
+
+	if (new_val != old_val) {
+		struct kvm *kvm;
+		int idx;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			idx = srcu_read_lock(&kvm->srcu);
+			kvm_mmu_zap_all_fast(kvm);
+			srcu_read_unlock(&kvm->srcu, idx);
+		}
+		mutex_unlock(&kvm_lock);
+	}
+
+	return 0;
+}
+
 int kvm_mmu_module_init(void)
 {
 	int ret = -ENOMEM;
 
+	if (nx_huge_pages == -1)
+		__set_nx_huge_pages(get_nx_auto_mode());
+
 	/*
 	 * MMU roles use union aliasing which is, generally speaking, an
 	 * undefined behavior. However, we supposedly know how compilers behave
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 7d5cdb3af594..97b21e7fd013 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int write_fault, int hlevel,
-			 kvm_pfn_t pfn, bool map_writable, bool prefault)
+			 kvm_pfn_t pfn, bool map_writable, bool prefault,
+			 bool lpage_disallowed)
 {
 	struct kvm_mmu_page *sp = NULL;
 	struct kvm_shadow_walk_iterator it;
 	unsigned direct_access, access = gw->pt_access;
 	int top_level, ret;
-	gfn_t base_gfn;
+	gfn_t gfn, base_gfn;
 
 	direct_access = gw->pte_access;
 
@@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			link_shadow_page(vcpu, it.sptep, sp);
 	}
 
-	base_gfn = gw->gfn;
+	/*
+	 * FNAME(page_fault) might have clobbered the bottom bits of
+	 * gw->gfn, restore them from the virtual address.
+	 */
+	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
+	base_gfn = gfn;
 
 	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
 
 	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
 		clear_sp_write_flooding_count(it.sptep);
-		base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+		/*
+		 * We cannot overwrite existing page tables with an NX
+		 * large page, as the leaf could be executable.
+		 */
+		disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+
+		base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 		if (it.level == hlevel)
 			break;
 
@@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
 					      it.level - 1, true, direct_access);
 			link_shadow_page(vcpu, it.sptep, sp);
+			if (lpage_disallowed)
+				account_huge_nx_page(vcpu->kvm, sp);
 		}
 	}
 
@@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	kvm_pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
-	bool force_pt_level = false;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;
+	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
+				is_nx_huge_page_enabled();
+	bool force_pt_level = lpage_disallowed;
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
@@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
 	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
-			 level, pfn, map_writable, prefault);
+			 level, pfn, map_writable, prefault, lpage_disallowed);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 
 out_unlock:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 32d70ca2a7fd..b087d178a774 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -213,6 +213,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages, .mode = 0444) },
+	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
 	{ "max_mmu_page_hash_collisions",
 		VM_STAT(max_mmu_page_hash_collisions) },
 	{ NULL }
@@ -1279,6 +1280,14 @@ static u64 kvm_get_arch_capabilities(void)
 	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
 
+	/*
+	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
+	 * the nested hypervisor runs with NX huge pages.  If it is not,
+	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
+	 * L1 guests, so it need not worry about its own (L2) guests.
+	 */
+	data |= ARCH_CAP_PSCHANGE_MC_NO;
+
 	/*
 	 * If we're doing cache flushes (either "always" or "cond")
 	 * we will do one whenever the guest does a vmlaunch/vmresume.

From c57c80467f90e5504c8df9ad3555d2c78800bf94 Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Mon, 4 Nov 2019 12:22:02 +0100
Subject: [PATCH 15/43] kvm: Add helper function for creating VM worker threads

Add a function to create a kernel thread associated with a given VM. In
particular, it ensures that the worker thread inherits the priority and
cgroups of the calling thread.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/kvm_host.h |  6 +++
 virt/kvm/kvm_main.c      | 84 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 719fc3e15ea4..52ed5f66e8f9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1382,4 +1382,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
 
+typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+				uintptr_t data, const char *name,
+				struct task_struct **thread_ptr);
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d6f0696d98ef..8aed32b604d9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
 #include <linux/bsearch.h>
 #include <linux/io.h>
 #include <linux/lockdep.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -4371,3 +4372,86 @@ void kvm_exit(void)
 	kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+	struct kvm *kvm;
+	struct task_struct *parent;
+	struct completion init_done;
+	kvm_vm_thread_fn_t thread_fn;
+	uintptr_t data;
+	int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+	/*
+	 * The init_context is allocated on the stack of the parent thread, so
+	 * we have to locally copy anything that is needed beyond initialization
+	 */
+	struct kvm_vm_worker_thread_context *init_context = context;
+	struct kvm *kvm = init_context->kvm;
+	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+	uintptr_t data = init_context->data;
+	int err;
+
+	err = kthread_park(current);
+	/* kthread_park(current) is never supposed to return an error */
+	WARN_ON(err != 0);
+	if (err)
+		goto init_complete;
+
+	err = cgroup_attach_task_all(init_context->parent, current);
+	if (err) {
+		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+			__func__, err);
+		goto init_complete;
+	}
+
+	set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+	init_context->err = err;
+	complete(&init_context->init_done);
+	init_context = NULL;
+
+	if (err)
+		return err;
+
+	/* Wait to be woken up by the spawner before proceeding. */
+	kthread_parkme();
+
+	if (!kthread_should_stop())
+		err = thread_fn(kvm, data);
+
+	return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+				uintptr_t data, const char *name,
+				struct task_struct **thread_ptr)
+{
+	struct kvm_vm_worker_thread_context init_context = {};
+	struct task_struct *thread;
+
+	*thread_ptr = NULL;
+	init_context.kvm = kvm;
+	init_context.parent = current;
+	init_context.thread_fn = thread_fn;
+	init_context.data = data;
+	init_completion(&init_context.init_done);
+
+	thread = kthread_run(kvm_vm_worker_thread, &init_context,
+			     "%s-%d", name, task_pid_nr(current));
+	if (IS_ERR(thread))
+		return PTR_ERR(thread);
+
+	/* kthread_run is never supposed to return NULL */
+	WARN_ON(thread == NULL);
+
+	wait_for_completion(&init_context.init_done);
+
+	if (!init_context.err)
+		*thread_ptr = thread;
+
+	return init_context.err;
+}

From 1aa9b9572b10529c2e64e2b8f44025d86e124308 Mon Sep 17 00:00:00 2001
From: Junaid Shahid <junaids@google.com>
Date: Mon, 4 Nov 2019 20:26:00 +0100
Subject: [PATCH 16/43] kvm: x86: mmu: Recovery of shattered NX large pages

The page table pages corresponding to broken down large pages are zapped in
FIFO order, so that the large page can potentially be recovered, if it is
no longer being used for execution.  This removes the performance penalty
for walking deeper EPT page tables.

By default, one large page will last about one hour once the guest
reaches a steady state.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 .../admin-guide/kernel-parameters.txt         |   6 +
 arch/x86/include/asm/kvm_host.h               |   4 +
 arch/x86/kvm/mmu.c                            | 129 ++++++++++++++++++
 arch/x86/kvm/mmu.h                            |   4 +
 arch/x86/kvm/x86.c                            |  11 ++
 virt/kvm/kvm_main.c                           |  28 ++++
 6 files changed, 182 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9d5f123cc218..8dee8f68fe15 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2068,6 +2068,12 @@
 			If the software workaround is enabled for the host,
 			guests do need not to enable it for nested guests.
 
+	kvm.nx_huge_pages_recovery_ratio=
+			[KVM] Controls how many 4KiB pages are periodically zapped
+			back to huge pages.  0 disables the recovery, otherwise if
+			the value is N KVM will zap 1/Nth of the 4KiB pages every
+			minute.  The default is 60.
+
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a37b03483b66..4fc61483919a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -312,6 +312,8 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
+	struct list_head lpage_disallowed_link;
+
 	bool unsync;
 	u8 mmu_valid_gen;
 	bool mmio_cached;
@@ -860,6 +862,7 @@ struct kvm_arch {
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head zapped_obsolete_pages;
+	struct list_head lpage_disallowed_mmu_pages;
 	struct kvm_page_track_notifier_node mmu_sp_tracker;
 	struct kvm_page_track_notifier_head track_notifier_head;
 
@@ -934,6 +937,7 @@ struct kvm_arch {
 	bool exception_payload_enabled;
 
 	struct kvm_pmu_event_filter *pmu_event_filter;
+	struct task_struct *nx_lpage_recovery_thread;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bedf6864b092..529589a42afb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/kern_levels.h>
+#include <linux/kthread.h>
 
 #include <asm/page.h>
 #include <asm/pat.h>
@@ -50,16 +51,26 @@
 extern bool itlb_multihit_kvm_mitigation;
 
 static int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
 
 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
 
 static struct kernel_param_ops nx_huge_pages_ops = {
 	.set = set_nx_huge_pages,
 	.get = param_get_bool,
 };
 
+static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+	.set = set_nx_huge_pages_recovery_ratio,
+	.get = param_get_uint,
+};
+
 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+		&nx_huge_pages_recovery_ratio, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -1215,6 +1226,8 @@ static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		return;
 
 	++kvm->stat.nx_lpage_splits;
+	list_add_tail(&sp->lpage_disallowed_link,
+		      &kvm->arch.lpage_disallowed_mmu_pages);
 	sp->lpage_disallowed = true;
 }
 
@@ -1239,6 +1252,7 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	--kvm->stat.nx_lpage_splits;
 	sp->lpage_disallowed = false;
+	list_del(&sp->lpage_disallowed_link);
 }
 
 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
@@ -6274,6 +6288,8 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 			idx = srcu_read_lock(&kvm->srcu);
 			kvm_mmu_zap_all_fast(kvm);
 			srcu_read_unlock(&kvm->srcu, idx);
+
+			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
 		}
 		mutex_unlock(&kvm_lock);
 	}
@@ -6367,3 +6383,116 @@ void kvm_mmu_module_exit(void)
 	unregister_shrinker(&mmu_shrinker);
 	mmu_audit_disable();
 }
+
+static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+{
+	unsigned int old_val;
+	int err;
+
+	old_val = nx_huge_pages_recovery_ratio;
+	err = param_set_uint(val, kp);
+	if (err)
+		return err;
+
+	if (READ_ONCE(nx_huge_pages) &&
+	    !old_val && nx_huge_pages_recovery_ratio) {
+		struct kvm *kvm;
+
+		mutex_lock(&kvm_lock);
+
+		list_for_each_entry(kvm, &vm_list, vm_list)
+			wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+
+		mutex_unlock(&kvm_lock);
+	}
+
+	return err;
+}
+
+static void kvm_recover_nx_lpages(struct kvm *kvm)
+{
+	int rcu_idx;
+	struct kvm_mmu_page *sp;
+	unsigned int ratio;
+	LIST_HEAD(invalid_list);
+	ulong to_zap;
+
+	rcu_idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+
+	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+	to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+	while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+		/*
+		 * We use a separate list instead of just using active_mmu_pages
+		 * because the number of lpage_disallowed pages is expected to
+		 * be relatively small compared to the total.
+		 */
+		sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+				      struct kvm_mmu_page,
+				      lpage_disallowed_link);
+		WARN_ON_ONCE(!sp->lpage_disallowed);
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+		WARN_ON_ONCE(sp->lpage_disallowed);
+
+		if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
+			if (to_zap)
+				cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, rcu_idx);
+}
+
+static long get_nx_lpage_recovery_timeout(u64 start_time)
+{
+	return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
+		? start_time + 60 * HZ - get_jiffies_64()
+		: MAX_SCHEDULE_TIMEOUT;
+}
+
+static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+{
+	u64 start_time;
+	long remaining_time;
+
+	while (true) {
+		start_time = get_jiffies_64();
+		remaining_time = get_nx_lpage_recovery_timeout(start_time);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!kthread_should_stop() && remaining_time > 0) {
+			schedule_timeout(remaining_time);
+			remaining_time = get_nx_lpage_recovery_timeout(start_time);
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+
+		set_current_state(TASK_RUNNING);
+
+		if (kthread_should_stop())
+			return 0;
+
+		kvm_recover_nx_lpages(kvm);
+	}
+}
+
+int kvm_mmu_post_init_vm(struct kvm *kvm)
+{
+	int err;
+
+	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+					  "kvm-nx-lpage-recovery",
+					  &kvm->arch.nx_lpage_recovery_thread);
+	if (!err)
+		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+
+	return err;
+}
+
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
+{
+	if (kvm->arch.nx_lpage_recovery_thread)
+		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 11f8ec89433b..d55674f44a18 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn);
 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+
+int kvm_mmu_post_init_vm(struct kvm *kvm);
+void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b087d178a774..a30e9962a6ef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9456,6 +9456,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
+	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
@@ -9484,6 +9485,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	return kvm_x86_ops->vm_init(kvm);
 }
 
+int kvm_arch_post_init_vm(struct kvm *kvm)
+{
+	return kvm_mmu_post_init_vm(kvm);
+}
+
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
 	vcpu_load(vcpu);
@@ -9585,6 +9591,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 }
 EXPORT_SYMBOL_GPL(x86_set_memory_region);
 
+void kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+	kvm_mmu_pre_destroy_vm(kvm);
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	if (current->mm == kvm->mm) {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8aed32b604d9..4aab3547a165 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -626,6 +626,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 	return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
 	struct kvm *kvm = kvm_arch_alloc_vm();
@@ -682,6 +699,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		goto out_err_no_irq_srcu;
 
 	r = kvm_init_mmu_notifier(kvm);
+	if (r)
+		goto out_err_no_mmu_notifier;
+
+	r = kvm_arch_post_init_vm(kvm);
 	if (r)
 		goto out_err;
 
@@ -694,6 +715,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	return kvm;
 
 out_err:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	if (kvm->mmu_notifier.ops)
+		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
 	cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
 	cleanup_srcu_struct(&kvm->srcu);
@@ -738,6 +764,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	mutex_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	mutex_unlock(&kvm_lock);
+	kvm_arch_pre_destroy_vm(kvm);
+
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

From 7f00cc8d4a51074eb0ad4c3f16c15757b1ddfb7d Mon Sep 17 00:00:00 2001
From: "Gomez Iglesias, Antonio" <antonio.gomez.iglesias@intel.com>
Date: Mon, 4 Nov 2019 20:26:00 +0100
Subject: [PATCH 17/43] Documentation: Add ITLB_MULTIHIT documentation

Add the initial ITLB_MULTIHIT documentation.

[ tglx: Add it to the index so it gets actually built. ]

Signed-off-by: Antonio Gomez Iglesias <antonio.gomez.iglesias@intel.com>
Signed-off-by: Nelson D'Souza <nelson.dsouza@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/admin-guide/hw-vuln/index.rst   |   1 +
 .../admin-guide/hw-vuln/multihit.rst          | 163 ++++++++++++++++++
 2 files changed, 164 insertions(+)
 create mode 100644 Documentation/admin-guide/hw-vuln/multihit.rst

diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index 0802b1c67452..0795e3c2643f 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -13,3 +13,4 @@ are configurable at compile, boot or run time.
    l1tf
    mds
    tsx_async_abort
+   multihit.rst
diff --git a/Documentation/admin-guide/hw-vuln/multihit.rst b/Documentation/admin-guide/hw-vuln/multihit.rst
new file mode 100644
index 000000000000..ba9988d8bce5
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/multihit.rst
@@ -0,0 +1,163 @@
+iTLB multihit
+=============
+
+iTLB multihit is an erratum where some processors may incur a machine check
+error, possibly resulting in an unrecoverable CPU lockup, when an
+instruction fetch hits multiple entries in the instruction TLB. This can
+occur when the page size is changed along with either the physical address
+or cache type. A malicious guest running on a virtualized system can
+exploit this erratum to perform a denial of service attack.
+
+
+Affected processors
+-------------------
+
+Variations of this erratum are present on most Intel Core and Xeon processor
+models. The erratum is not present on:
+
+   - non-Intel processors
+
+   - Some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell, Silvermont)
+
+   - Intel processors that have the PSCHANGE_MC_NO bit set in the
+     IA32_ARCH_CAPABILITIES MSR.
+
+
+Related CVEs
+------------
+
+The following CVE entry is related to this issue:
+
+   ==============  =================================================
+   CVE-2018-12207  Machine Check Error Avoidance on Page Size Change
+   ==============  =================================================
+
+
+Problem
+-------
+
+Privileged software, including OS and virtual machine managers (VMM), is in
+charge of memory management. A key component in memory management is the control
+of the page tables. Modern processors use virtual memory, a technique that creates
+the illusion of a very large memory for processors. This virtual space is split
+into pages of a given size. Page tables translate virtual addresses to physical
+addresses.
+
+To reduce latency when performing a virtual to physical address translation,
+processors include a structure, called TLB, that caches recent translations.
+There are separate TLBs for instruction (iTLB) and data (dTLB).
+
+Under this erratum, instructions are fetched from a linear address translated
+using a 4 KB translation cached in the iTLB. Privileged software modifies the
+paging structure so that the same linear address uses a large page size (2 MB,
+4 MB, 1 GB) with a different physical address or memory type.  After the page
+structure modification but before the software invalidates any iTLB entries for
+the linear address, a code fetch that happens on the same linear address may
+cause a machine-check error which can result in a system hang or shutdown.
+
+
+Attack scenarios
+----------------
+
+Attacks against the iTLB multihit erratum can be mounted from malicious
+guests in a virtualized system.
+
+
+iTLB multihit system information
+--------------------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current iTLB
+multihit status of the system: whether the system is vulnerable and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/itlb_multihit
+
+The possible values in this file are:
+
+.. list-table::
+
+     * - Not affected
+       - The processor is not vulnerable.
+     * - KVM: Mitigation: Split huge pages
+       - Software changes mitigate this issue.
+     * - KVM: Vulnerable
+       - The processor is vulnerable, but no mitigation is enabled.
+
+
+Enumeration of the erratum
+--------------------------------
+
+A new bit has been allocated in the IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) MSR
+and will be set on CPUs which are mitigated against this issue.
+
+   =======================================   ===========   ===============================
+   IA32_ARCH_CAPABILITIES MSR                Not present   Possibly vulnerable,check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]    '0'           Likely vulnerable,check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]    '1'           Not vulnerable
+   =======================================   ===========   ===============================
+
+
+Mitigation mechanism
+-------------------------
+
+This erratum can be mitigated by restricting the use of large page sizes to
+non-executable pages.  This forces all iTLB entries to be 4K, and removes
+the possibility of multiple hits.
+
+In order to mitigate the vulnerability, KVM initially marks all huge pages
+as non-executable. If the guest attempts to execute in one of those pages,
+the page is broken down into 4K pages, which are then marked executable.
+
+If EPT is disabled or not available on the host, KVM is in control of TLB
+flushes and the problematic situation cannot happen.  However, the shadow
+EPT paging mechanism used by nested virtualization is vulnerable, because
+the nested guest can trigger multiple iTLB hits by modifying its own
+(non-nested) page tables.  For simplicity, KVM will make large pages
+non-executable in all shadow paging modes.
+
+Mitigation control on the kernel command line and KVM - module parameter
+------------------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism for marking huge pages as
+non-executable can be controlled with a module parameter "nx_huge_pages=".
+The kernel command line allows controlling the iTLB multihit mitigations at
+boot time with the option "kvm.nx_huge_pages=".
+
+The valid arguments for these options are:
+
+  ==========  ================================================================
+  force       Mitigation is enabled. In this case, the mitigation implements
+              non-executable huge pages in Linux kernel KVM module. All huge
+              pages in the EPT are marked as non-executable.
+              If a guest attempts to execute in one of those pages, the page is
+              broken down into 4K pages, which are then marked executable.
+
+  off	      Mitigation is disabled.
+
+  auto        Enable mitigation only if the platform is affected and the kernel
+              was not booted with the "mitigations=off" command line parameter.
+	      This is the default option.
+  ==========  ================================================================
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The system is protected by the kernel unconditionally and no further
+   action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from a trusted source, you may assume that the guest will
+   not attempt to maliciously exploit this erratum and no further action is
+   required.
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   If the guest comes from an untrusted source, the host kernel will need
+   to apply iTLB multihit mitigation via the kernel command line or kvm
+   module parameter.

From 0a2f661b6c21815a7fa60e30babe975fee8e73c6 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Fri, 20 Apr 2018 14:26:01 -0700
Subject: [PATCH 18/43] drm/i915: Rename gen7 cmdparser tables

We're about to introduce some new tables for later gens, and the
current naming for the gen7 tables will no longer make sense.

v2: rebase

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 70 +++++++++++++-------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 24555102e198..20b27cb9ed97 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -213,7 +213,7 @@ struct drm_i915_cmd_table {
 
 /*            Command                          Mask   Fixed Len   Action
 	      ---------------------------------------------------------- */
-static const struct drm_i915_cmd_descriptor common_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = {
 	CMD(  MI_NOOP,                          SMI,    F,  1,      S  ),
 	CMD(  MI_USER_INTERRUPT,                SMI,    F,  1,      R  ),
 	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      M  ),
@@ -246,7 +246,7 @@ static const struct drm_i915_cmd_descriptor common_cmds[] = {
 	CMD(  MI_BATCH_BUFFER_START,            SMI,   !F,  0xFF,   S  ),
 };
 
-static const struct drm_i915_cmd_descriptor render_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_render_cmds[] = {
 	CMD(  MI_FLUSH,                         SMI,    F,  1,      S  ),
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_PREDICATE,                     SMI,    F,  1,      S  ),
@@ -330,7 +330,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
 	CMD(  GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS,  S3D,   !F,  0x1FF,  S  ),
 };
 
-static const struct drm_i915_cmd_descriptor video_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_video_cmds[] = {
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0xFF,   B,
@@ -374,7 +374,7 @@ static const struct drm_i915_cmd_descriptor video_cmds[] = {
 	CMD(  MFX_WAIT,                         SMFX,   F,  1,      S  ),
 };
 
-static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_vecs_cmds[] = {
 	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      R  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0xFF,   B,
@@ -412,7 +412,7 @@ static const struct drm_i915_cmd_descriptor vecs_cmds[] = {
 	      }},						       ),
 };
 
-static const struct drm_i915_cmd_descriptor blt_cmds[] = {
+static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = {
 	CMD(  MI_DISPLAY_FLIP,                  SMI,   !F,  0xFF,   R  ),
 	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0x3FF,  B,
 	      .bits = {{
@@ -465,35 +465,35 @@ static const struct drm_i915_cmd_descriptor noop_desc =
 #undef B
 #undef M
 
-static const struct drm_i915_cmd_table gen7_render_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table gen7_render_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_render_ring_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ render_cmds, ARRAY_SIZE(render_cmds) },
+static const struct drm_i915_cmd_table hsw_render_ring_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_render_cmds, ARRAY_SIZE(gen7_render_cmds) },
 	{ hsw_render_cmds, ARRAY_SIZE(hsw_render_cmds) },
 };
 
-static const struct drm_i915_cmd_table gen7_video_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ video_cmds, ARRAY_SIZE(video_cmds) },
+static const struct drm_i915_cmd_table gen7_video_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_video_cmds, ARRAY_SIZE(gen7_video_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_vebox_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ vecs_cmds, ARRAY_SIZE(vecs_cmds) },
+static const struct drm_i915_cmd_table hsw_vebox_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_vecs_cmds, ARRAY_SIZE(gen7_vecs_cmds) },
 };
 
-static const struct drm_i915_cmd_table gen7_blt_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table gen7_blt_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
 };
 
-static const struct drm_i915_cmd_table hsw_blt_ring_cmds[] = {
-	{ common_cmds, ARRAY_SIZE(common_cmds) },
-	{ blt_cmds, ARRAY_SIZE(blt_cmds) },
+static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = {
+	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
+	{ gen7_blt_cmds, ARRAY_SIZE(gen7_blt_cmds) },
 	{ hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) },
 };
 
@@ -873,12 +873,12 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 	switch (engine->class) {
 	case RENDER_CLASS:
 		if (IS_HASWELL(engine->i915)) {
-			cmd_tables = hsw_render_ring_cmds;
+			cmd_tables = hsw_render_ring_cmd_table;
 			cmd_table_count =
-				ARRAY_SIZE(hsw_render_ring_cmds);
+				ARRAY_SIZE(hsw_render_ring_cmd_table);
 		} else {
-			cmd_tables = gen7_render_cmds;
-			cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
+			cmd_tables = gen7_render_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen7_render_cmd_table);
 		}
 
 		if (IS_HASWELL(engine->i915)) {
@@ -892,17 +892,17 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 		engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
 		break;
 	case VIDEO_DECODE_CLASS:
-		cmd_tables = gen7_video_cmds;
-		cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
+		cmd_tables = gen7_video_cmd_table;
+		cmd_table_count = ARRAY_SIZE(gen7_video_cmd_table);
 		engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
 		break;
 	case COPY_ENGINE_CLASS:
 		if (IS_HASWELL(engine->i915)) {
-			cmd_tables = hsw_blt_ring_cmds;
-			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
+			cmd_tables = hsw_blt_ring_cmd_table;
+			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table);
 		} else {
-			cmd_tables = gen7_blt_cmds;
-			cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
+			cmd_tables = gen7_blt_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table);
 		}
 
 		if (IS_HASWELL(engine->i915)) {
@@ -916,8 +916,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 		engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
 		break;
 	case VIDEO_ENHANCEMENT_CLASS:
-		cmd_tables = hsw_vebox_cmds;
-		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
+		cmd_tables = hsw_vebox_cmd_table;
+		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmd_table);
 		/* VECS can use the same length_mask function as VCS */
 		engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
 		break;

From 44157641d448cbc0c4b73c5231d2b911f0cb0427 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Fri, 8 Jun 2018 08:53:46 -0700
Subject: [PATCH 19/43] drm/i915: Disable Secure Batches for gen6+

Retroactively stop reporting support for secure batches
through the api for gen6+ so that older binaries trigger
the fallback path instead.

Older binaries use secure batches pre gen6 to access resources
that are not available to normal usermode processes. However,
all known userspace explicitly checks for HAS_SECURE_BATCHES
before relying on the secure batch feature.

Since there are no known binaries relying on this for newer gens
we can kill secure batches from gen6, via I915_PARAM_HAS_SECURE_BATCHES.

v2: rebase (Mika)
v3: rebase (Mika)

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 12 ++++++++++--
 drivers/gpu/drm/i915/i915_drv.h                |  1 +
 drivers/gpu/drm/i915/i915_getparam.c           |  2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b5f6937369ea..17561203ab58 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -2421,6 +2421,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		       struct drm_i915_gem_exec_object2 *exec,
 		       struct drm_syncobj **fences)
 {
+	struct drm_i915_private *i915 = to_i915(dev);
 	struct i915_execbuffer eb;
 	struct dma_fence *in_fence = NULL;
 	struct dma_fence *exec_fence = NULL;
@@ -2432,7 +2433,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	BUILD_BUG_ON(__EXEC_OBJECT_INTERNAL_FLAGS &
 		     ~__EXEC_OBJECT_UNKNOWN_FLAGS);
 
-	eb.i915 = to_i915(dev);
+	eb.i915 = i915;
 	eb.file = file;
 	eb.args = args;
 	if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
@@ -2452,8 +2453,15 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 
 	eb.batch_flags = 0;
 	if (args->flags & I915_EXEC_SECURE) {
+		if (INTEL_GEN(i915) >= 11)
+			return -ENODEV;
+
+		/* Return -EPERM to trigger fallback code on old binaries. */
+		if (!HAS_SECURE_BATCHES(i915))
+			return -EPERM;
+
 		if (!drm_is_current_master(file) || !capable(CAP_SYS_ADMIN))
-		    return -EPERM;
+			return -EPERM;
 
 		eb.batch_flags |= I915_DISPATCH_SECURE;
 	}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 953e1d12c23c..68974df69f2f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2078,6 +2078,7 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 #define HAS_LLC(dev_priv)	(INTEL_INFO(dev_priv)->has_llc)
 #define HAS_SNOOP(dev_priv)	(INTEL_INFO(dev_priv)->has_snoop)
 #define HAS_EDRAM(dev_priv)	((dev_priv)->edram_size_mb)
+#define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6)
 #define HAS_WT(dev_priv)	((IS_HASWELL(dev_priv) || \
 				 IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv))
 
diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c
index 5d9101376a3d..9f1517af5b7f 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -62,7 +62,7 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
 		value = !!(i915->caps.scheduler & I915_SCHEDULER_CAP_SEMAPHORES);
 		break;
 	case I915_PARAM_HAS_SECURE_BATCHES:
-		value = capable(CAP_SYS_ADMIN);
+		value = HAS_SECURE_BATCHES(i915) && capable(CAP_SYS_ADMIN);
 		break;
 	case I915_PARAM_CMD_PARSER_VERSION:
 		value = i915_cmd_parser_get_version(i915);

From 66d8aba1cd6db34af10de465c0d52af679288cb6 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Fri, 8 Jun 2018 10:05:26 -0700
Subject: [PATCH 20/43] drm/i915: Remove Master tables from cmdparser

The previous patch has killed support for secure batches
on gen6+, and hence the cmdparser's master tables are
now dead code. Remove them.

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  7 +-
 drivers/gpu/drm/i915/i915_cmd_parser.c        | 84 +++++--------------
 drivers/gpu/drm/i915/i915_drv.h               |  3 +-
 3 files changed, 26 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 17561203ab58..e8da0729d266 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1955,7 +1955,7 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
 	return 0;
 }
 
-static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
+static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 {
 	struct intel_engine_pool_node *pool;
 	struct i915_vma *vma;
@@ -1969,8 +1969,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
 				      eb->batch->obj,
 				      pool->obj,
 				      eb->batch_start_offset,
-				      eb->batch_len,
-				      is_master);
+				      eb->batch_len);
 	if (err) {
 		if (err == -EACCES) /* unhandled chained batch */
 			vma = NULL;
@@ -2541,7 +2540,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (eb_use_cmdparser(&eb)) {
 		struct i915_vma *vma;
 
-		vma = eb_parse(&eb, drm_is_current_master(file));
+		vma = eb_parse(&eb);
 		if (IS_ERR(vma)) {
 			err = PTR_ERR(vma);
 			goto err_vma;
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 20b27cb9ed97..fb582343373c 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -53,13 +53,11 @@
  * granting userspace undue privileges. There are three categories of privilege.
  *
  * First, commands which are explicitly defined as privileged or which should
- * only be used by the kernel driver. The parser generally rejects such
- * commands, though it may allow some from the drm master process.
+ * only be used by the kernel driver. The parser rejects such commands
  *
  * Second, commands which access registers. To support correct/enhanced
  * userspace functionality, particularly certain OpenGL extensions, the parser
- * provides a whitelist of registers which userspace may safely access (for both
- * normal and drm master processes).
+ * provides a whitelist of registers which userspace may safely access
  *
  * Third, commands which access privileged memory (i.e. GGTT, HWS page, etc).
  * The parser always rejects such commands.
@@ -84,9 +82,9 @@
  * in the per-engine command tables.
  *
  * Other command table entries map fairly directly to high level categories
- * mentioned above: rejected, master-only, register whitelist. The parser
- * implements a number of checks, including the privileged memory checks, via a
- * general bitmasking mechanism.
+ * mentioned above: rejected, register whitelist. The parser implements a number
+ * of checks, including the privileged memory checks, via a general bitmasking
+ * mechanism.
  */
 
 /*
@@ -104,8 +102,6 @@ struct drm_i915_cmd_descriptor {
 	 * CMD_DESC_REJECT: The command is never allowed
 	 * CMD_DESC_REGISTER: The command should be checked against the
 	 *                    register whitelist for the appropriate ring
-	 * CMD_DESC_MASTER: The command is allowed if the submitting process
-	 *                  is the DRM master
 	 */
 	u32 flags;
 #define CMD_DESC_FIXED    (1<<0)
@@ -113,7 +109,6 @@ struct drm_i915_cmd_descriptor {
 #define CMD_DESC_REJECT   (1<<2)
 #define CMD_DESC_REGISTER (1<<3)
 #define CMD_DESC_BITMASK  (1<<4)
-#define CMD_DESC_MASTER   (1<<5)
 
 	/*
 	 * The command's unique identification bits and the bitmask to get them.
@@ -209,14 +204,13 @@ struct drm_i915_cmd_table {
 #define R CMD_DESC_REJECT
 #define W CMD_DESC_REGISTER
 #define B CMD_DESC_BITMASK
-#define M CMD_DESC_MASTER
 
 /*            Command                          Mask   Fixed Len   Action
 	      ---------------------------------------------------------- */
 static const struct drm_i915_cmd_descriptor gen7_common_cmds[] = {
 	CMD(  MI_NOOP,                          SMI,    F,  1,      S  ),
 	CMD(  MI_USER_INTERRUPT,                SMI,    F,  1,      R  ),
-	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      M  ),
+	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      R  ),
 	CMD(  MI_ARB_CHECK,                     SMI,    F,  1,      S  ),
 	CMD(  MI_REPORT_HEAD,                   SMI,    F,  1,      S  ),
 	CMD(  MI_SUSPEND_FLUSH,                 SMI,    F,  1,      S  ),
@@ -313,7 +307,7 @@ static const struct drm_i915_cmd_descriptor hsw_render_cmds[] = {
 	CMD(  MI_URB_ATOMIC_ALLOC,              SMI,    F,  1,      S  ),
 	CMD(  MI_SET_APPID,                     SMI,    F,  1,      S  ),
 	CMD(  MI_RS_CONTEXT,                    SMI,    F,  1,      S  ),
-	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   M  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_REGISTER_REG,             SMI,   !F,  0xFF,   W,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 }    ),
@@ -446,7 +440,7 @@ static const struct drm_i915_cmd_descriptor gen7_blt_cmds[] = {
 };
 
 static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
-	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   M  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   R  ),
 	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   R  ),
 };
 
@@ -463,7 +457,6 @@ static const struct drm_i915_cmd_descriptor noop_desc =
 #undef R
 #undef W
 #undef B
-#undef M
 
 static const struct drm_i915_cmd_table gen7_render_cmd_table[] = {
 	{ gen7_common_cmds, ARRAY_SIZE(gen7_common_cmds) },
@@ -612,47 +605,29 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = {
 	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
 };
 
-static const struct drm_i915_reg_descriptor ivb_master_regs[] = {
-	REG32(FORCEWAKE_MT),
-	REG32(DERRMR),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_A)),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_B)),
-	REG32(GEN7_PIPE_DE_LOAD_SL(PIPE_C)),
-};
-
-static const struct drm_i915_reg_descriptor hsw_master_regs[] = {
-	REG32(FORCEWAKE_MT),
-	REG32(DERRMR),
-};
-
 #undef REG64
 #undef REG32
 
 struct drm_i915_reg_table {
 	const struct drm_i915_reg_descriptor *regs;
 	int num_regs;
-	bool master;
 };
 
 static const struct drm_i915_reg_table ivb_render_reg_tables[] = {
-	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
-	{ ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
 };
 
 static const struct drm_i915_reg_table ivb_blt_reg_tables[] = {
-	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
-	{ ivb_master_regs, ARRAY_SIZE(ivb_master_regs), true },
+	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
 };
 
 static const struct drm_i915_reg_table hsw_render_reg_tables[] = {
-	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs), false },
-	{ hsw_render_regs, ARRAY_SIZE(hsw_render_regs), false },
-	{ hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+	{ gen7_render_regs, ARRAY_SIZE(gen7_render_regs) },
+	{ hsw_render_regs, ARRAY_SIZE(hsw_render_regs) },
 };
 
 static const struct drm_i915_reg_table hsw_blt_reg_tables[] = {
-	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs), false },
-	{ hsw_master_regs, ARRAY_SIZE(hsw_master_regs), true },
+	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
 };
 
 static u32 gen7_render_get_cmd_length_mask(u32 cmd_header)
@@ -1029,22 +1004,16 @@ __find_reg(const struct drm_i915_reg_descriptor *table, int count, u32 addr)
 }
 
 static const struct drm_i915_reg_descriptor *
-find_reg(const struct intel_engine_cs *engine, bool is_master, u32 addr)
+find_reg(const struct intel_engine_cs *engine, u32 addr)
 {
 	const struct drm_i915_reg_table *table = engine->reg_tables;
+	const struct drm_i915_reg_descriptor *reg = NULL;
 	int count = engine->reg_table_count;
 
-	for (; count > 0; ++table, --count) {
-		if (!table->master || is_master) {
-			const struct drm_i915_reg_descriptor *reg;
+	for (; !reg && (count > 0); ++table, --count)
+		reg = __find_reg(table->regs, table->num_regs, addr);
 
-			reg = __find_reg(table->regs, table->num_regs, addr);
-			if (reg != NULL)
-				return reg;
-		}
-	}
-
-	return NULL;
+	return reg;
 }
 
 /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */
@@ -1128,8 +1097,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 
 static bool check_cmd(const struct intel_engine_cs *engine,
 		      const struct drm_i915_cmd_descriptor *desc,
-		      const u32 *cmd, u32 length,
-		      const bool is_master)
+		      const u32 *cmd, u32 length)
 {
 	if (desc->flags & CMD_DESC_SKIP)
 		return true;
@@ -1139,12 +1107,6 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 		return false;
 	}
 
-	if ((desc->flags & CMD_DESC_MASTER) && !is_master) {
-		DRM_DEBUG_DRIVER("CMD: Rejected master-only command: 0x%08X\n",
-				 *cmd);
-		return false;
-	}
-
 	if (desc->flags & CMD_DESC_REGISTER) {
 		/*
 		 * Get the distance between individual register offset
@@ -1158,7 +1120,7 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 		     offset += step) {
 			const u32 reg_addr = cmd[offset] & desc->reg.mask;
 			const struct drm_i915_reg_descriptor *reg =
-				find_reg(engine, is_master, reg_addr);
+				find_reg(engine, reg_addr);
 
 			if (!reg) {
 				DRM_DEBUG_DRIVER("CMD: Rejected register 0x%08X in command: 0x%08X (%s)\n",
@@ -1245,7 +1207,6 @@ static bool check_cmd(const struct intel_engine_cs *engine,
  * @shadow_batch_obj: copy of the batch buffer in question
  * @batch_start_offset: byte offset in the batch at which execution starts
  * @batch_len: length of the commands in batch_obj
- * @is_master: is the submitting process the drm master?
  *
  * Parses the specified batch buffer looking for privilege violations as
  * described in the overview.
@@ -1257,8 +1218,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
 			    struct drm_i915_gem_object *shadow_batch_obj,
 			    u32 batch_start_offset,
-			    u32 batch_len,
-			    bool is_master)
+			    u32 batch_len)
 {
 	u32 *cmd, *batch_end;
 	struct drm_i915_cmd_descriptor default_desc = noop_desc;
@@ -1324,7 +1284,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 			break;
 		}
 
-		if (!check_cmd(engine, desc, cmd, length, is_master)) {
+		if (!check_cmd(engine, desc, cmd, length)) {
 			ret = -EACCES;
 			break;
 		}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 68974df69f2f..5a16abea3465 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2398,8 +2398,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
 			    struct drm_i915_gem_object *shadow_batch_obj,
 			    u32 batch_start_offset,
-			    u32 batch_len,
-			    bool is_master);
+			    u32 batch_len);
 
 /* intel_device_info.c */
 static inline struct intel_device_info *

From 311a50e76a33d1e029563c24b2ff6db0c02b5afe Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Wed, 1 Aug 2018 09:33:59 -0700
Subject: [PATCH 21/43] drm/i915: Add support for mandatory cmdparsing

The existing cmdparser for gen7 can be bypassed by specifying
batch_len=0 in the execbuf call. This is safe because bypassing
simply reduces the cmd-set available.

In a later patch we will introduce cmdparsing for gen9, as a
security measure, which must be strictly enforced since without
it we are vulnerable to DoS attacks.

Introduce the concept of 'required' cmd parsing that cannot be
bypassed by submitting zero-length bb's.

v2: rebase (Mika)
v2: rebase (Mika)
v3: fix conflict on engine flags (Mika)

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |  3 ++-
 drivers/gpu/drm/i915/gt/intel_engine_types.h   | 13 ++++++++++---
 drivers/gpu/drm/i915/i915_cmd_parser.c         |  6 +++---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index e8da0729d266..2426efc05c09 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -296,7 +296,8 @@ static inline u64 gen8_noncanonical_addr(u64 address)
 
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
-	return intel_engine_needs_cmd_parser(eb->engine) && eb->batch_len;
+	return intel_engine_requires_cmd_parser(eb->engine) ||
+		(intel_engine_using_cmd_parser(eb->engine) && eb->batch_len);
 }
 
 static int eb_create(struct i915_execbuffer *eb)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index a82cea95c2f2..9dd8c299cb2d 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -475,12 +475,13 @@ struct intel_engine_cs {
 
 	struct intel_engine_hangcheck hangcheck;
 
-#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
+#define I915_ENGINE_USING_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
 #define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
 #define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
 #define I915_ENGINE_IS_VIRTUAL       BIT(5)
+#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
 	unsigned int flags;
 
 	/*
@@ -541,9 +542,15 @@ struct intel_engine_cs {
 };
 
 static inline bool
-intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine)
+intel_engine_using_cmd_parser(const struct intel_engine_cs *engine)
 {
-	return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER;
+	return engine->flags & I915_ENGINE_USING_CMD_PARSER;
+}
+
+static inline bool
+intel_engine_requires_cmd_parser(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_REQUIRES_CMD_PARSER;
 }
 
 static inline bool
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index fb582343373c..832b1ac282c0 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -918,7 +918,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 		return;
 	}
 
-	engine->flags |= I915_ENGINE_NEEDS_CMD_PARSER;
+	engine->flags |= I915_ENGINE_USING_CMD_PARSER;
 }
 
 /**
@@ -930,7 +930,7 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
  */
 void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine)
 {
-	if (!intel_engine_needs_cmd_parser(engine))
+	if (!intel_engine_using_cmd_parser(engine))
 		return;
 
 	fini_hash_table(engine);
@@ -1317,7 +1317,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
 
 	/* If the command parser is not enabled, report 0 - unsupported */
 	for_each_uabi_engine(engine, dev_priv) {
-		if (intel_engine_needs_cmd_parser(engine)) {
+		if (intel_engine_using_cmd_parser(engine)) {
 			active = true;
 			break;
 		}

From 4f7af1948abcb18b4772fe1bcd84d7d27d96258c Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Tue, 22 May 2018 13:59:06 -0700
Subject: [PATCH 22/43] drm/i915: Support ro ppgtt mapped cmdparser shadow
 buffers

For Gen7, the original cmdparser motive was to permit limited
use of register read/write instructions in unprivileged BB's.
This worked by copying the user supplied bb to a kmd owned
bb, and running it in secure mode, from the ggtt, only if
the scanner finds no unsafe commands or registers.

For Gen8+ we can't use this same technique because running bb's
from the ggtt also disables access to ppgtt space. But we also
do not actually require 'secure' execution since we are only
trying to reduce the available command/register set. Instead we
will copy the user buffer to a kmd owned read-only bb in ppgtt,
and run in the usual non-secure mode.

Note that ro pages are only supported by ppgtt (not ggtt), but
luckily that's exactly what we need.

Add the required paths to map the shadow buffer to ppgtt ro for Gen8+.

v2: IS_GEN7/IS_GEN (Mika)
v3: rebase
v4: rebase
v5: rebase

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 58 +++++++++++++------
 drivers/gpu/drm/i915/i915_drv.h               | 14 +++++
 drivers/gpu/drm/i915/i915_gem.c               | 16 ++++-
 3 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 2426efc05c09..1f423bb2d644 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1956,6 +1956,34 @@ static int i915_reset_gen7_sol_offsets(struct i915_request *rq)
 	return 0;
 }
 
+static struct i915_vma *
+shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *dev_priv = eb->i915;
+	struct i915_vma * const vma = *eb->vma;
+	struct i915_address_space *vm;
+	u64 flags;
+
+	/*
+	 * PPGTT backed shadow buffers must be mapped RO, to prevent
+	 * post-scan tampering
+	 */
+	if (CMDPARSER_USES_GGTT(dev_priv)) {
+		flags = PIN_GLOBAL;
+		vm = &dev_priv->ggtt.vm;
+		eb->batch_flags |= I915_DISPATCH_SECURE;
+	} else if (vma->vm->has_read_only) {
+		flags = PIN_USER;
+		vm = vma->vm;
+		i915_gem_object_set_readonly(obj);
+	} else {
+		DRM_DEBUG("Cannot prevent post-scan tampering without RO capable vm\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	return i915_gem_object_pin(obj, vm, NULL, 0, 0, flags);
+}
+
 static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 {
 	struct intel_engine_pool_node *pool;
@@ -1972,14 +2000,21 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 				      eb->batch_start_offset,
 				      eb->batch_len);
 	if (err) {
-		if (err == -EACCES) /* unhandled chained batch */
+		/*
+		 * Unsafe GGTT-backed buffers can still be submitted safely
+		 * as non-secure.
+		 * For PPGTT backing however, we have no choice but to forcibly
+		 * reject unsafe buffers
+		 */
+		if (CMDPARSER_USES_GGTT(eb->i915) && (err == -EACCES))
+			/* Execute original buffer non-secure */
 			vma = NULL;
 		else
 			vma = ERR_PTR(err);
 		goto err;
 	}
 
-	vma = i915_gem_object_ggtt_pin(pool->obj, NULL, 0, 0, 0);
+	vma = shadow_batch_pin(eb, pool->obj);
 	if (IS_ERR(vma))
 		goto err;
 
@@ -1989,6 +2024,10 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 	vma->exec_flags = &eb->flags[eb->buffer_count];
 	eb->buffer_count++;
 
+	eb->batch_start_offset = 0;
+	eb->batch = vma;
+	/* eb->batch_len unchanged */
+
 	vma->private = pool;
 	return vma;
 
@@ -2546,21 +2585,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 			err = PTR_ERR(vma);
 			goto err_vma;
 		}
-
-		if (vma) {
-			/*
-			 * Batch parsed and accepted:
-			 *
-			 * Set the DISPATCH_SECURE bit to remove the NON_SECURE
-			 * bit from MI_BATCH_BUFFER_START commands issued in
-			 * the dispatch_execbuffer implementations. We
-			 * specifically don't want that set on batches the
-			 * command parser has accepted.
-			 */
-			eb.batch_flags |= I915_DISPATCH_SECURE;
-			eb.batch_start_offset = 0;
-			eb.batch = vma;
-		}
 	}
 
 	if (eb.batch_len == 0)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5a16abea3465..5b338e1b79fd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2075,6 +2075,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 #define VEBOX_MASK(dev_priv) \
 	ENGINE_INSTANCES_MASK(dev_priv, VECS0, I915_MAX_VECS)
 
+/*
+ * The Gen7 cmdparser copies the scanned buffer to the ggtt for execution
+ * All later gens can run the final buffer from the ppgtt
+ */
+#define CMDPARSER_USES_GGTT(dev_priv) IS_GEN(dev_priv, 7)
+
 #define HAS_LLC(dev_priv)	(INTEL_INFO(dev_priv)->has_llc)
 #define HAS_SNOOP(dev_priv)	(INTEL_INFO(dev_priv)->has_snoop)
 #define HAS_EDRAM(dev_priv)	((dev_priv)->edram_size_mb)
@@ -2285,6 +2291,14 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj,
 			   unsigned long flags);
 #define I915_GEM_OBJECT_UNBIND_ACTIVE BIT(0)
 
+struct i915_vma * __must_check
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+		    struct i915_address_space *vm,
+		    const struct i915_ggtt_view *view,
+		    u64 size,
+		    u64 alignment,
+		    u64 flags);
+
 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv);
 
 static inline int __must_check
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d0f94f239919..98305d987ac1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -964,6 +964,20 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
 {
 	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct i915_address_space *vm = &dev_priv->ggtt.vm;
+
+	return i915_gem_object_pin(obj, vm, view, size, alignment,
+				   flags | PIN_GLOBAL);
+}
+
+struct i915_vma *
+i915_gem_object_pin(struct drm_i915_gem_object *obj,
+		    struct i915_address_space *vm,
+		    const struct i915_ggtt_view *view,
+		    u64 size,
+		    u64 alignment,
+		    u64 flags)
+{
+	struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
 	struct i915_vma *vma;
 	int ret;
 
@@ -1038,7 +1052,7 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
 			return ERR_PTR(ret);
 	}
 
-	ret = i915_vma_pin(vma, size, alignment, flags | PIN_GLOBAL);
+	ret = i915_vma_pin(vma, size, alignment, flags);
 	if (ret)
 		return ERR_PTR(ret);
 

From 435e8fc059dbe0eec823a75c22da2972390ba9e0 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Wed, 1 Aug 2018 09:45:50 -0700
Subject: [PATCH 23/43] drm/i915: Allow parsing of unsized batches

In "drm/i915: Add support for mandatory cmdparsing" we introduced the
concept of mandatory parsing. This allows the cmdparser to be invoked
even when the user passes batch_len=0 to the execbuf ioctls.

However, the cmdparser needs to know the extents of the buffer being
scanned. Refactor the code to ensure the cmdparser uses the actual
object size, instead of the incoming length, if the user passes 0.

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 1f423bb2d644..8237b2935b5f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -297,7 +297,8 @@ static inline u64 gen8_noncanonical_addr(u64 address)
 static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb)
 {
 	return intel_engine_requires_cmd_parser(eb->engine) ||
-		(intel_engine_using_cmd_parser(eb->engine) && eb->batch_len);
+		(intel_engine_using_cmd_parser(eb->engine) &&
+		 eb->args->batch_len);
 }
 
 static int eb_create(struct i915_execbuffer *eb)
@@ -2577,6 +2578,9 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		goto err_vma;
 	}
 
+	if (eb.batch_len == 0)
+		eb.batch_len = eb.batch->size - eb.batch_start_offset;
+
 	if (eb_use_cmdparser(&eb)) {
 		struct i915_vma *vma;
 
@@ -2587,9 +2591,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 		}
 	}
 
-	if (eb.batch_len == 0)
-		eb.batch_len = eb.batch->size - eb.batch_start_offset;
-
 	/*
 	 * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
 	 * batch" bit. Hence we need to pin secure batches into the global gtt.

From 0f2f39758341df70202ae1c42d5a1e4ee392b6d3 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Mon, 23 Apr 2018 11:12:15 -0700
Subject: [PATCH 24/43] drm/i915: Add gen9 BCS cmdparsing

For gen9 we enable cmdparsing on the BCS ring, specifically
to catch inadvertent accesses to sensitive registers.

Unlike gen7/hsw, we use the parser only to block certain
registers. We can rely on h/w to block restricted commands,
so the command tables only provide enough info to allow the
parser to delineate each command, and identify commands that
access registers.

Note: This patch deliberately ignores checkpatch issues in
favour of matching the style of the surrounding code. We'll
correct the entire file in one go in a later patch.

v3: rebase (Mika)
v4: Add RING_TIMESTAMP registers to whitelist (Jon)

Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 116 ++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h        |   4 +
 2 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 832b1ac282c0..6794034c8ced 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -444,6 +444,47 @@ static const struct drm_i915_cmd_descriptor hsw_blt_cmds[] = {
 	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   R  ),
 };
 
+/*
+ * For Gen9 we can still rely on the h/w to enforce cmd security, and only
+ * need to re-enforce the register access checks. We therefore only need to
+ * teach the cmdparser how to find the end of each command, and identify
+ * register accesses. The table doesn't need to reject any commands, and so
+ * the only commands listed here are:
+ *   1) Those that touch registers
+ *   2) Those that do not have the default 8-bit length
+ *
+ * Note that the default MI length mask chosen for this table is 0xFF, not
+ * the 0x3F used on older devices. This is because the vast majority of MI
+ * cmds on Gen9 use a standard 8-bit Length field.
+ * All the Gen9 blitter instructions are standard 0xFF length mask, and
+ * none allow access to non-general registers, so in fact no BLT cmds are
+ * included in the table at all.
+ *
+ */
+static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = {
+	CMD(  MI_NOOP,                          SMI,    F,  1,      S  ),
+	CMD(  MI_USER_INTERRUPT,                SMI,    F,  1,      S  ),
+	CMD(  MI_WAIT_FOR_EVENT,                SMI,    F,  1,      S  ),
+	CMD(  MI_FLUSH,                         SMI,    F,  1,      S  ),
+	CMD(  MI_ARB_CHECK,                     SMI,    F,  1,      S  ),
+	CMD(  MI_REPORT_HEAD,                   SMI,    F,  1,      S  ),
+	CMD(  MI_ARB_ON_OFF,                    SMI,    F,  1,      S  ),
+	CMD(  MI_SUSPEND_FLUSH,                 SMI,    F,  1,      S  ),
+	CMD(  MI_LOAD_SCAN_LINES_INCL,          SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_LOAD_SCAN_LINES_EXCL,          SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_STORE_DWORD_IMM,               SMI,   !F,  0x3FF,  S  ),
+	CMD(  MI_LOAD_REGISTER_IMM(1),          SMI,   !F,  0xFF,   W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 2 }    ),
+	CMD(  MI_UPDATE_GTT,                    SMI,   !F,  0x3FF,  S  ),
+	CMD(  MI_STORE_REGISTER_MEM_GEN8,       SMI,    F,  4,      W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC }               ),
+	CMD(  MI_FLUSH_DW,                      SMI,   !F,  0x3F,   S  ),
+	CMD(  MI_LOAD_REGISTER_MEM_GEN8,        SMI,    F,  4,      W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC }               ),
+	CMD(  MI_LOAD_REGISTER_REG,             SMI,    !F,  0xFF,  W,
+	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 }    ),
+};
+
 static const struct drm_i915_cmd_descriptor noop_desc =
 	CMD(MI_NOOP, SMI, F, 1, S);
 
@@ -490,6 +531,11 @@ static const struct drm_i915_cmd_table hsw_blt_ring_cmd_table[] = {
 	{ hsw_blt_cmds, ARRAY_SIZE(hsw_blt_cmds) },
 };
 
+static const struct drm_i915_cmd_table gen9_blt_cmd_table[] = {
+	{ gen9_blt_cmds, ARRAY_SIZE(gen9_blt_cmds) },
+};
+
+
 /*
  * Register whitelists, sorted by increasing register offset.
  */
@@ -605,6 +651,29 @@ static const struct drm_i915_reg_descriptor gen7_blt_regs[] = {
 	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
 };
 
+static const struct drm_i915_reg_descriptor gen9_blt_regs[] = {
+	REG64_IDX(RING_TIMESTAMP, RENDER_RING_BASE),
+	REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE),
+	REG32(BCS_SWCTRL),
+	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
+	REG64_IDX(BCS_GPR, 0),
+	REG64_IDX(BCS_GPR, 1),
+	REG64_IDX(BCS_GPR, 2),
+	REG64_IDX(BCS_GPR, 3),
+	REG64_IDX(BCS_GPR, 4),
+	REG64_IDX(BCS_GPR, 5),
+	REG64_IDX(BCS_GPR, 6),
+	REG64_IDX(BCS_GPR, 7),
+	REG64_IDX(BCS_GPR, 8),
+	REG64_IDX(BCS_GPR, 9),
+	REG64_IDX(BCS_GPR, 10),
+	REG64_IDX(BCS_GPR, 11),
+	REG64_IDX(BCS_GPR, 12),
+	REG64_IDX(BCS_GPR, 13),
+	REG64_IDX(BCS_GPR, 14),
+	REG64_IDX(BCS_GPR, 15),
+};
+
 #undef REG64
 #undef REG32
 
@@ -630,6 +699,10 @@ static const struct drm_i915_reg_table hsw_blt_reg_tables[] = {
 	{ gen7_blt_regs, ARRAY_SIZE(gen7_blt_regs) },
 };
 
+static const struct drm_i915_reg_table gen9_blt_reg_tables[] = {
+	{ gen9_blt_regs, ARRAY_SIZE(gen9_blt_regs) },
+};
+
 static u32 gen7_render_get_cmd_length_mask(u32 cmd_header)
 {
 	u32 client = cmd_header >> INSTR_CLIENT_SHIFT;
@@ -685,6 +758,17 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
 	return 0;
 }
 
+static u32 gen9_blt_get_cmd_length_mask(u32 cmd_header)
+{
+	u32 client = cmd_header >> INSTR_CLIENT_SHIFT;
+
+	if (client == INSTR_MI_CLIENT || client == INSTR_BC_CLIENT)
+		return 0xFF;
+
+	DRM_DEBUG_DRIVER("CMD: Abnormal blt cmd length! 0x%08X\n", cmd_header);
+	return 0;
+}
+
 static bool validate_cmds_sorted(const struct intel_engine_cs *engine,
 				 const struct drm_i915_cmd_table *cmd_tables,
 				 int cmd_table_count)
@@ -842,7 +926,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 	int cmd_table_count;
 	int ret;
 
-	if (!IS_GEN(engine->i915, 7))
+	if (!IS_GEN(engine->i915, 7) && !(IS_GEN(engine->i915, 9) &&
+					  engine->class == COPY_ENGINE_CLASS))
 		return;
 
 	switch (engine->class) {
@@ -863,7 +948,6 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 			engine->reg_tables = ivb_render_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(ivb_render_reg_tables);
 		}
-
 		engine->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
 		break;
 	case VIDEO_DECODE_CLASS:
@@ -872,7 +956,16 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 		engine->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
 		break;
 	case COPY_ENGINE_CLASS:
-		if (IS_HASWELL(engine->i915)) {
+		engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
+		if (IS_GEN(engine->i915, 9)) {
+			cmd_tables = gen9_blt_cmd_table;
+			cmd_table_count = ARRAY_SIZE(gen9_blt_cmd_table);
+			engine->get_cmd_length_mask =
+				gen9_blt_get_cmd_length_mask;
+
+			/* BCS Engine unsafe without parser */
+			engine->flags |= I915_ENGINE_REQUIRES_CMD_PARSER;
+		} else if (IS_HASWELL(engine->i915)) {
 			cmd_tables = hsw_blt_ring_cmd_table;
 			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmd_table);
 		} else {
@@ -880,15 +973,17 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine)
 			cmd_table_count = ARRAY_SIZE(gen7_blt_cmd_table);
 		}
 
-		if (IS_HASWELL(engine->i915)) {
+		if (IS_GEN(engine->i915, 9)) {
+			engine->reg_tables = gen9_blt_reg_tables;
+			engine->reg_table_count =
+				ARRAY_SIZE(gen9_blt_reg_tables);
+		} else if (IS_HASWELL(engine->i915)) {
 			engine->reg_tables = hsw_blt_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(hsw_blt_reg_tables);
 		} else {
 			engine->reg_tables = ivb_blt_reg_tables;
 			engine->reg_table_count = ARRAY_SIZE(ivb_blt_reg_tables);
 		}
-
-		engine->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
 		break;
 	case VIDEO_ENHANCEMENT_CLASS:
 		cmd_tables = hsw_vebox_cmd_table;
@@ -1261,9 +1356,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		}
 
 		/*
-		 * If the batch buffer contains a chained batch, return an
-		 * error that tells the caller to abort and dispatch the
-		 * workload as a non-secure batch.
+		 * We don't try to handle BATCH_BUFFER_START because it adds
+		 * non-trivial complexity. Instead we abort the scan and return
+		 * and error to indicate that the batch is unsafe.
 		 */
 		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
 			ret = -EACCES;
@@ -1342,6 +1437,7 @@ int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv)
 	 *    the parser enabled.
 	 * 9. Don't whitelist or handle oacontrol specially, as ownership
 	 *    for oacontrol state is moving to i915-perf.
+	 * 10. Support for Gen9 BCS Parsing
 	 */
-	return 9;
+	return 10;
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 2abd199093c5..e69fe05228fb 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -555,6 +555,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
  */
 #define BCS_SWCTRL _MMIO(0x22200)
 
+/* There are 16 GPR registers */
+#define BCS_GPR(n)	_MMIO(0x22600 + (n) * 8)
+#define BCS_GPR_UDW(n)	_MMIO(0x22600 + (n) * 8 + 4)
+
 #define GPGPU_THREADS_DISPATCHED        _MMIO(0x2290)
 #define GPGPU_THREADS_DISPATCHED_UDW	_MMIO(0x2290 + 4)
 #define HS_INVOCATION_COUNT             _MMIO(0x2300)

From 0546a29cd884fb8184731c79ab008927ca8859d0 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Thu, 27 Sep 2018 10:23:17 -0700
Subject: [PATCH 25/43] drm/i915/cmdparser: Use explicit goto for error paths

In the next patch we will be adding a second valid
termination condition which will require a small
amount of refactoring to share logic with the BB_END
case.

Refactor all error conditions to jump to a dedicated
exit path, with 'break' reserved only for a successful
parse.

Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 6794034c8ced..dc5bcbc3ba6e 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1338,21 +1338,15 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 	do {
 		u32 length;
 
-		if (*cmd == MI_BATCH_BUFFER_END) {
-			if (needs_clflush_after) {
-				void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
-				drm_clflush_virt_range(ptr,
-						       (void *)(cmd + 1) - ptr);
-			}
+		if (*cmd == MI_BATCH_BUFFER_END)
 			break;
-		}
 
 		desc = find_cmd(engine, *cmd, desc, &default_desc);
 		if (!desc) {
 			DRM_DEBUG_DRIVER("CMD: Unrecognized command: 0x%08X\n",
 					 *cmd);
 			ret = -EINVAL;
-			break;
+			goto err;
 		}
 
 		/*
@@ -1362,7 +1356,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		 */
 		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
 			ret = -EACCES;
-			break;
+			goto err;
 		}
 
 		if (desc->flags & CMD_DESC_FIXED)
@@ -1376,22 +1370,29 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 					 length,
 					 batch_end - cmd);
 			ret = -EINVAL;
-			break;
+			goto err;
 		}
 
 		if (!check_cmd(engine, desc, cmd, length)) {
 			ret = -EACCES;
-			break;
+			goto err;
 		}
 
 		cmd += length;
 		if  (cmd >= batch_end) {
 			DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
 			ret = -EINVAL;
-			break;
+			goto err;
 		}
 	} while (1);
 
+	if (needs_clflush_after) {
+		void *ptr = page_mask_bits(shadow_batch_obj->mm.mapping);
+
+		drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr);
+	}
+
+err:
 	i915_gem_object_unpin_map(shadow_batch_obj);
 	return ret;
 }

From f8c08d8faee5567803c8c533865296ca30286bbf Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Thu, 20 Sep 2018 09:58:36 -0700
Subject: [PATCH 26/43] drm/i915/cmdparser: Add support for backward jumps

To keep things manageable, the pre-gen9 cmdparser does not
attempt to track any form of nested BB_STARTs. This did not
prevent usermode from using nested starts, or even chained
batches, because the cmdparser is not strictly enforced pre-gen9.

Instead, the existence of a nested BB_START would cause the batch
to be emitted in insecure mode, and any privileged capabilities
would not be available.

For Gen9, the cmdparser becomes mandatory (for BCS at least), and
so not providing any form of nested BB_START support becomes
overly restrictive. Any such batch will simply not run.

We make heavy use of backward jumps in igt, and it is much easier
to add support for this restricted subset of nested jumps, than to
rewrite the whole of our test suite to avoid them.

Add the required logic to support limited backward jumps, to
instructions that have already been validated by the parser.

Note that it's not sufficient to simply approve any BB_START
that jumps backwards in the buffer because this would allow an
attacker to embed a rogue instruction sequence within the
operand words of a harmless instruction (say LRI) and jump to
that.

We introduce a bit array to track every instr offset successfully
validated, and test the target of BB_START against this. If the
target offset hits, it is re-written to the same offset in the
shadow buffer and the BB_START cmd is allowed.

Note: This patch deliberately ignores checkpatch issues in the
cmdtables, in order to match the style of the surrounding code.
We'll correct the entire file in one go in a later patch.

v2: set dispatch secure late (Mika)
v3: rebase (Mika)
v4: Clear whitelist on each parse
    Minor review updates (Chris)
v5: Correct backward jump batching
v6: fix compilation error due to struct eb shuffle (Mika)

Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   |   5 +
 .../gpu/drm/i915/gem/i915_gem_context_types.h |   7 +
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    |  32 +++-
 drivers/gpu/drm/i915/i915_cmd_parser.c        | 151 ++++++++++++++++--
 drivers/gpu/drm/i915/i915_drv.h               |   9 +-
 5 files changed, 178 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 1cdfe05514c3..e41fd94ae5a9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -319,6 +319,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 	free_engines(rcu_access_pointer(ctx->engines));
 	mutex_destroy(&ctx->engines_mutex);
 
+	kfree(ctx->jump_whitelist);
+
 	if (ctx->timeline)
 		intel_timeline_put(ctx->timeline);
 
@@ -441,6 +443,9 @@ __create_context(struct drm_i915_private *i915)
 	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
 		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
 
+	ctx->jump_whitelist = NULL;
+	ctx->jump_whitelist_cmds = 0;
+
 	return ctx;
 
 err_free:
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 260d59cc3de8..00537b9d7006 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -192,6 +192,13 @@ struct i915_gem_context {
 	 * per vm, which may be one per context or shared with the global GTT)
 	 */
 	struct radix_tree_root handles_vma;
+
+	/** jump_whitelist: Bit array for tracking cmds during cmdparsing
+	 *  Guarded by struct_mutex
+	 */
+	unsigned long *jump_whitelist;
+	/** jump_whitelist_cmds: No of cmd slots available */
+	u32 jump_whitelist_cmds;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 8237b2935b5f..e635e1e5f4d3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1972,7 +1972,6 @@ shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
 	if (CMDPARSER_USES_GGTT(dev_priv)) {
 		flags = PIN_GLOBAL;
 		vm = &dev_priv->ggtt.vm;
-		eb->batch_flags |= I915_DISPATCH_SECURE;
 	} else if (vma->vm->has_read_only) {
 		flags = PIN_USER;
 		vm = vma->vm;
@@ -1989,18 +1988,35 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 {
 	struct intel_engine_pool_node *pool;
 	struct i915_vma *vma;
+	u64 batch_start;
+	u64 shadow_batch_start;
 	int err;
 
 	pool = intel_engine_pool_get(&eb->engine->pool, eb->batch_len);
 	if (IS_ERR(pool))
 		return ERR_CAST(pool);
 
-	err = intel_engine_cmd_parser(eb->engine,
+	vma = shadow_batch_pin(eb, pool->obj);
+	if (IS_ERR(vma))
+		goto err;
+
+	batch_start = gen8_canonical_addr(eb->batch->node.start) +
+		      eb->batch_start_offset;
+
+	shadow_batch_start = gen8_canonical_addr(vma->node.start);
+
+	err = intel_engine_cmd_parser(eb->gem_context,
+				      eb->engine,
 				      eb->batch->obj,
-				      pool->obj,
+				      batch_start,
 				      eb->batch_start_offset,
-				      eb->batch_len);
+				      eb->batch_len,
+				      pool->obj,
+				      shadow_batch_start);
+
 	if (err) {
+		i915_vma_unpin(vma);
+
 		/*
 		 * Unsafe GGTT-backed buffers can still be submitted safely
 		 * as non-secure.
@@ -2015,10 +2031,6 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 		goto err;
 	}
 
-	vma = shadow_batch_pin(eb, pool->obj);
-	if (IS_ERR(vma))
-		goto err;
-
 	eb->vma[eb->buffer_count] = i915_vma_get(vma);
 	eb->flags[eb->buffer_count] =
 		__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_REF;
@@ -2027,6 +2039,10 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
 
 	eb->batch_start_offset = 0;
 	eb->batch = vma;
+
+	if (CMDPARSER_USES_GGTT(eb->i915))
+		eb->batch_flags |= I915_DISPATCH_SECURE;
+
 	/* eb->batch_len unchanged */
 
 	vma->private = pool;
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index dc5bcbc3ba6e..365eea2b95bd 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -483,6 +483,19 @@ static const struct drm_i915_cmd_descriptor gen9_blt_cmds[] = {
 	      .reg = { .offset = 1, .mask = 0x007FFFFC }               ),
 	CMD(  MI_LOAD_REGISTER_REG,             SMI,    !F,  0xFF,  W,
 	      .reg = { .offset = 1, .mask = 0x007FFFFC, .step = 1 }    ),
+
+	/*
+	 * We allow BB_START but apply further checks. We just sanitize the
+	 * basic fields here.
+	 */
+#define MI_BB_START_OPERAND_MASK   GENMASK(SMI-1, 0)
+#define MI_BB_START_OPERAND_EXPECT (MI_BATCH_PPGTT_HSW | 1)
+	CMD(  MI_BATCH_BUFFER_START_GEN8,       SMI,    !F,  0xFF,  B,
+	      .bits = {{
+			.offset = 0,
+			.mask = MI_BB_START_OPERAND_MASK,
+			.expected = MI_BB_START_OPERAND_EXPECT,
+	      }},						       ),
 };
 
 static const struct drm_i915_cmd_descriptor noop_desc =
@@ -1293,15 +1306,113 @@ static bool check_cmd(const struct intel_engine_cs *engine,
 	return true;
 }
 
+static int check_bbstart(const struct i915_gem_context *ctx,
+			 u32 *cmd, u32 offset, u32 length,
+			 u32 batch_len,
+			 u64 batch_start,
+			 u64 shadow_batch_start)
+{
+	u64 jump_offset, jump_target;
+	u32 target_cmd_offset, target_cmd_index;
+
+	/* For igt compatibility on older platforms */
+	if (CMDPARSER_USES_GGTT(ctx->i915)) {
+		DRM_DEBUG("CMD: Rejecting BB_START for ggtt based submission\n");
+		return -EACCES;
+	}
+
+	if (length != 3) {
+		DRM_DEBUG("CMD: Recursive BB_START with bad length(%u)\n",
+			  length);
+		return -EINVAL;
+	}
+
+	jump_target = *(u64*)(cmd+1);
+	jump_offset = jump_target - batch_start;
+
+	/*
+	 * Any underflow of jump_target is guaranteed to be outside the range
+	 * of a u32, so >= test catches both too large and too small
+	 */
+	if (jump_offset >= batch_len) {
+		DRM_DEBUG("CMD: BB_START to 0x%llx jumps out of BB\n",
+			  jump_target);
+		return -EINVAL;
+	}
+
+	/*
+	 * This cannot overflow a u32 because we already checked jump_offset
+	 * is within the BB, and the batch_len is a u32
+	 */
+	target_cmd_offset = lower_32_bits(jump_offset);
+	target_cmd_index = target_cmd_offset / sizeof(u32);
+
+	*(u64*)(cmd + 1) = shadow_batch_start + target_cmd_offset;
+
+	if (target_cmd_index == offset)
+		return 0;
+
+	if (ctx->jump_whitelist_cmds <= target_cmd_index) {
+		DRM_DEBUG("CMD: Rejecting BB_START - truncated whitelist array\n");
+		return -EINVAL;
+	} else if (!test_bit(target_cmd_index, ctx->jump_whitelist)) {
+		DRM_DEBUG("CMD: BB_START to 0x%llx not a previously executed cmd\n",
+			  jump_target);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
+{
+	const u32 batch_cmds = DIV_ROUND_UP(batch_len, sizeof(u32));
+	const u32 exact_size = BITS_TO_LONGS(batch_cmds);
+	u32 next_size = BITS_TO_LONGS(roundup_pow_of_two(batch_cmds));
+	unsigned long *next_whitelist;
+
+	if (CMDPARSER_USES_GGTT(ctx->i915))
+		return;
+
+	if (batch_cmds <= ctx->jump_whitelist_cmds) {
+		memset(ctx->jump_whitelist, 0, exact_size * sizeof(u32));
+		return;
+	}
+
+again:
+	next_whitelist = kcalloc(next_size, sizeof(long), GFP_KERNEL);
+	if (next_whitelist) {
+		kfree(ctx->jump_whitelist);
+		ctx->jump_whitelist = next_whitelist;
+		ctx->jump_whitelist_cmds =
+			next_size * BITS_PER_BYTE * sizeof(long);
+		return;
+	}
+
+	if (next_size > exact_size) {
+		next_size = exact_size;
+		goto again;
+	}
+
+	DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n");
+	memset(ctx->jump_whitelist, 0,
+	       BITS_TO_LONGS(ctx->jump_whitelist_cmds) * sizeof(u32));
+
+	return;
+}
+
 #define LENGTH_BIAS 2
 
 /**
  * i915_parse_cmds() - parse a submitted batch buffer for privilege violations
+ * @ctx: the context in which the batch is to execute
  * @engine: the engine on which the batch is to execute
  * @batch_obj: the batch buffer in question
- * @shadow_batch_obj: copy of the batch buffer in question
+ * @batch_start: Canonical base address of batch
  * @batch_start_offset: byte offset in the batch at which execution starts
  * @batch_len: length of the commands in batch_obj
+ * @shadow_batch_obj: copy of the batch buffer in question
+ * @shadow_batch_start: Canonical base address of shadow_batch_obj
  *
  * Parses the specified batch buffer looking for privilege violations as
  * described in the overview.
@@ -1309,13 +1420,17 @@ static bool check_cmd(const struct intel_engine_cs *engine,
  * Return: non-zero if the parser finds violations or otherwise fails; -EACCES
  * if the batch appears legal but should use hardware parsing
  */
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+
+int intel_engine_cmd_parser(struct i915_gem_context *ctx,
+			    struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
-			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 batch_start,
 			    u32 batch_start_offset,
-			    u32 batch_len)
+			    u32 batch_len,
+			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 shadow_batch_start)
 {
-	u32 *cmd, *batch_end;
+	u32 *cmd, *batch_end, offset = 0;
 	struct drm_i915_cmd_descriptor default_desc = noop_desc;
 	const struct drm_i915_cmd_descriptor *desc = &default_desc;
 	bool needs_clflush_after = false;
@@ -1329,6 +1444,8 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 		return PTR_ERR(cmd);
 	}
 
+	init_whitelist(ctx, batch_len);
+
 	/*
 	 * We use the batch length as size because the shadow object is as
 	 * large or larger and copy_batch() will write MI_NOPs to the extra
@@ -1349,16 +1466,6 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 			goto err;
 		}
 
-		/*
-		 * We don't try to handle BATCH_BUFFER_START because it adds
-		 * non-trivial complexity. Instead we abort the scan and return
-		 * and error to indicate that the batch is unsafe.
-		 */
-		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
-			ret = -EACCES;
-			goto err;
-		}
-
 		if (desc->flags & CMD_DESC_FIXED)
 			length = desc->length.fixed;
 		else
@@ -1378,7 +1485,21 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine,
 			goto err;
 		}
 
+		if (desc->cmd.value == MI_BATCH_BUFFER_START) {
+			ret = check_bbstart(ctx, cmd, offset, length,
+					    batch_len, batch_start,
+					    shadow_batch_start);
+
+			if (ret)
+				goto err;
+			break;
+		}
+
+		if (ctx->jump_whitelist_cmds > offset)
+			set_bit(offset, ctx->jump_whitelist);
+
 		cmd += length;
+		offset += length;
 		if  (cmd >= batch_end) {
 			DRM_DEBUG_DRIVER("CMD: Got to the end of the buffer w/o a BBE cmd!\n");
 			ret = -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5b338e1b79fd..b20424e66097 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2408,11 +2408,14 @@ const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
 int i915_cmd_parser_get_version(struct drm_i915_private *dev_priv);
 void intel_engine_init_cmd_parser(struct intel_engine_cs *engine);
 void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine);
-int intel_engine_cmd_parser(struct intel_engine_cs *engine,
+int intel_engine_cmd_parser(struct i915_gem_context *cxt,
+			    struct intel_engine_cs *engine,
 			    struct drm_i915_gem_object *batch_obj,
-			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 user_batch_start,
 			    u32 batch_start_offset,
-			    u32 batch_len);
+			    u32 batch_len,
+			    struct drm_i915_gem_object *shadow_batch_obj,
+			    u64 shadow_batch_start);
 
 /* intel_device_info.c */
 static inline struct intel_device_info *

From 926abff21a8f29ef159a3ac893b05c6e50e043c3 Mon Sep 17 00:00:00 2001
From: Jon Bloomfield <jon.bloomfield@intel.com>
Date: Thu, 20 Sep 2018 09:45:10 -0700
Subject: [PATCH 27/43] drm/i915/cmdparser: Ignore Length operands during
 command matching

Some of the gen instruction macros (e.g. MI_DISPLAY_FLIP) have the
length directly encoded in them. Since these are used directly in
the tables, the Length becomes part of the comparison used for
matching during parsing. Thus, if the cmd being parsed has a
different length to that in the table, it is not matched and the
cmd is accepted via the default variable length path.

Fix by masking out everything except the Opcode in the cmd tables.

Cc: Tony Luck <tony.luck@intel.com>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Tyler Hicks <tyhicks@canonical.com>
Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
Reviewed-by: Chris Wilson <chris.p.wilson@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index 365eea2b95bd..d78debed06e2 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -189,7 +189,7 @@ struct drm_i915_cmd_table {
 #define CMD(op, opm, f, lm, fl, ...)				\
 	{							\
 		.flags = (fl) | ((f) ? CMD_DESC_FIXED : 0),	\
-		.cmd = { (op), ~0u << (opm) },			\
+		.cmd = { (op & ~0u << (opm)), ~0u << (opm) },	\
 		.length = { (lm) },				\
 		__VA_ARGS__					\
 	}

From 1d85a299c4db57c55e0229615132c964d17aa765 Mon Sep 17 00:00:00 2001
From: Uma Shankar <uma.shankar@intel.com>
Date: Tue, 7 Aug 2018 21:15:35 +0530
Subject: [PATCH 28/43] drm/i915: Lower RM timeout to avoid DSI hard hangs

On BXT/APL, device 2 MMIO reads from the MIPI controller require its
PLL to be turned on. When the MIPI PLL is turned off (MIPI display is
not active or connected) and someone (host or GT engine) tries to read
MIPI registers, it causes a hard hang. This is a hardware restriction
or limitation.

The driver by itself doesn't read MIPI registers when the MIPI display
is off. But any userspace application can submit an unprivileged batch
buffer for execution. That batch buffer can contain MMIO reads, and
these reads are allowed even for unprivileged applications. If these
register reads target the MIPI DSI controller while the MIPI display is
not active, the MMIO read operation causes a system hard hang, and the
only way to recover is a hard reboot. A genuine process/application
won't submit a batch buffer like this and doesn't cause any issue. But
on a compromised system, a malicious userspace process/app can generate
such a batch buffer and trigger a system hard hang (denial of service
attack).

The fix is to lower the internal MMIO timeout value to an optimum
value of 950us, as recommended by the hardware team. If the timeout is
beyond 1ms (which will be hit for any value we choose if an MMIO read
of a DSI-specific register is performed without the PLL on), it causes
the system to hang. But if the timeout value is lower, it stays below
the threshold (even if a timeout happens) and the system will not get
into a hung state. This avoids a system hang without losing any
programming or GT interrupts, taking the worst case of lowest CDCLK
frequency and early DC5 abort into account.

Signed-off-by: Uma Shankar <uma.shankar@intel.com>
Reviewed-by: Jon Bloomfield <jon.bloomfield@intel.com>
---
 drivers/gpu/drm/i915/i915_reg.h | 4 ++++
 drivers/gpu/drm/i915/intel_pm.c | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e69fe05228fb..9b76d63cc1ac 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7215,6 +7215,10 @@ enum {
 #define TGL_DMC_DEBUG_DC5_COUNT	_MMIO(0x101084)
 #define TGL_DMC_DEBUG_DC6_COUNT	_MMIO(0x101088)
 
+/* Display Internal Timeout Register */
+#define RM_TIMEOUT		_MMIO(0x42060)
+#define  MMIO_TIMEOUT_US(us)	((us) << 0)
+
 /* interrupts */
 #define DE_MASTER_IRQ_CONTROL   (1 << 31)
 #define DE_SPRITEB_FLIP_DONE    (1 << 29)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 75ee027abb80..b5903ee25dea 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -126,6 +126,14 @@ static void bxt_init_clock_gating(struct drm_i915_private *dev_priv)
 	 */
 	I915_WRITE(GEN9_CLKGATE_DIS_0, I915_READ(GEN9_CLKGATE_DIS_0) |
 		   PWM1_GATING_DIS | PWM2_GATING_DIS);
+
+	/*
+	 * Lower the display internal timeout.
+	 * This is needed to avoid any hard hangs when DSI port PLL
+	 * is off and a MMIO access is attempted by any privilege
+	 * application, using batch buffers or any other means.
+	 */
+	I915_WRITE(RM_TIMEOUT, MMIO_TIMEOUT_US(950));
 }
 
 static void glk_init_clock_gating(struct drm_i915_private *dev_priv)

From 7e34f4e4aad3fd34c02b294a3cf2321adf5b4438 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Mon, 9 Jul 2018 18:24:27 +0300
Subject: [PATCH 29/43] drm/i915/gen8+: Add RC6 CTX corruption WA

In some circumstances the RC6 context can get corrupted. We can detect
this and take the required action, that is, disable RC6 and runtime PM.
The HW recovers from the corrupted state after a system suspend/resume
cycle, so detect the recovery and re-enable RC6 and runtime PM.

v2: rebase (Mika)
v3:
- Move intel_suspend_gt_powersave() to the end of the GEM suspend
  sequence.
- Add commit message.
v4:
- Rebased on intel_uncore_forcewake_put(i915->uncore, ...) API
  change.
v5: rebased on gem/gt split (Mika)

Signed-off-by: Imre Deak <imre.deak@intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_pm.c |   8 ++
 drivers/gpu/drm/i915/i915_drv.c       |   4 +
 drivers/gpu/drm/i915/i915_drv.h       |   8 +-
 drivers/gpu/drm/i915/i915_reg.h       |   2 +
 drivers/gpu/drm/i915/intel_pm.c       | 114 +++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_pm.h       |   3 +
 6 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index 1363e069ec83..fac75afed35b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -38,6 +38,9 @@ static int __gt_unpark(struct intel_wakeref *wf)
 	gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
 	GEM_BUG_ON(!gt->awake);
 
+	if (NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL);
+
 	intel_enable_gt_powersave(i915);
 
 	i915_update_gfx_val(i915);
@@ -67,6 +70,11 @@ static int __gt_park(struct intel_wakeref *wf)
 	if (INTEL_GEN(i915) >= 6)
 		gen6_rps_idle(i915);
 
+	if (NEEDS_RC6_CTX_CORRUPTION_WA(i915)) {
+		i915_rc6_ctx_wa_check(i915);
+		intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL);
+	}
+
 	/* Everything switched off, flush any residual interrupt just in case */
 	intel_synchronize_irq(i915);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index bb6f86c7067a..fe4d7cabfdf1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1850,6 +1850,8 @@ static int i915_drm_suspend_late(struct drm_device *dev, bool hibernation)
 
 	i915_gem_suspend_late(dev_priv);
 
+	i915_rc6_ctx_wa_suspend(dev_priv);
+
 	intel_uncore_suspend(&dev_priv->uncore);
 
 	intel_power_domains_suspend(dev_priv,
@@ -2053,6 +2055,8 @@ static int i915_drm_resume_early(struct drm_device *dev)
 
 	intel_power_domains_resume(dev_priv);
 
+	i915_rc6_ctx_wa_resume(dev_priv);
+
 	intel_gt_sanitize(&dev_priv->gt, true);
 
 	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b20424e66097..89b6112bd66b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -593,6 +593,8 @@ struct intel_rps {
 
 struct intel_rc6 {
 	bool enabled;
+	bool ctx_corrupted;
+	intel_wakeref_t ctx_corrupted_wakeref;
 	u64 prev_hw_residency[4];
 	u64 cur_residency[4];
 };
@@ -2117,10 +2119,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
 /* Early gen2 have a totally busted CS tlb and require pinned batches. */
 #define HAS_BROKEN_CS_TLB(dev_priv)	(IS_I830(dev_priv) || IS_I845G(dev_priv))
 
+#define NEEDS_RC6_CTX_CORRUPTION_WA(dev_priv)	\
+	(IS_BROADWELL(dev_priv) || IS_GEN(dev_priv, 9))
+
 /* WaRsDisableCoarsePowerGating:skl,cnl */
 #define NEEDS_WaRsDisableCoarsePowerGating(dev_priv) \
-	(IS_CANNONLAKE(dev_priv) || \
-	 IS_SKL_GT3(dev_priv) || IS_SKL_GT4(dev_priv))
+	(IS_CANNONLAKE(dev_priv) || IS_GEN(dev_priv, 9))
 
 #define HAS_GMBUS_IRQ(dev_priv) (INTEL_GEN(dev_priv) >= 4)
 #define HAS_GMBUS_BURST_READ(dev_priv) (INTEL_GEN(dev_priv) >= 10 || \
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 9b76d63cc1ac..f8ee9aba3955 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -471,6 +471,8 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   ECOCHK_PPGTT_WT_HSW		(0x2 << 3)
 #define   ECOCHK_PPGTT_WB_HSW		(0x3 << 3)
 
+#define GEN8_RC6_CTX_INFO		_MMIO(0x8504)
+
 #define GAC_ECO_BITS			_MMIO(0x14090)
 #define   ECOBITS_SNB_BIT		(1 << 13)
 #define   ECOBITS_PPGTT_CACHE64B	(3 << 8)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index b5903ee25dea..2efe1d12d5a9 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -8552,6 +8552,100 @@ static void intel_init_emon(struct drm_i915_private *dev_priv)
 	dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK);
 }
 
+static bool i915_rc6_ctx_corrupted(struct drm_i915_private *dev_priv)
+{
+	return !I915_READ(GEN8_RC6_CTX_INFO);
+}
+
+static void i915_rc6_ctx_wa_init(struct drm_i915_private *i915)
+{
+	if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		return;
+
+	if (i915_rc6_ctx_corrupted(i915)) {
+		DRM_INFO("RC6 context corrupted, disabling runtime power management\n");
+		i915->gt_pm.rc6.ctx_corrupted = true;
+		i915->gt_pm.rc6.ctx_corrupted_wakeref =
+			intel_runtime_pm_get(&i915->runtime_pm);
+	}
+}
+
+static void i915_rc6_ctx_wa_cleanup(struct drm_i915_private *i915)
+{
+	if (i915->gt_pm.rc6.ctx_corrupted) {
+		intel_runtime_pm_put(&i915->runtime_pm,
+				     i915->gt_pm.rc6.ctx_corrupted_wakeref);
+		i915->gt_pm.rc6.ctx_corrupted = false;
+	}
+}
+
+/**
+ * i915_rc6_ctx_wa_suspend - system suspend sequence for the RC6 CTX WA
+ * @i915: i915 device
+ *
+ * Perform any steps needed to clean up the RC6 CTX WA before system suspend.
+ */
+void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915)
+{
+	if (i915->gt_pm.rc6.ctx_corrupted)
+		intel_runtime_pm_put(&i915->runtime_pm,
+				     i915->gt_pm.rc6.ctx_corrupted_wakeref);
+}
+
+/**
+ * i915_rc6_ctx_wa_resume - system resume sequence for the RC6 CTX WA
+ * @i915: i915 device
+ *
+ * Perform any steps needed to re-init the RC6 CTX WA after system resume.
+ */
+void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915)
+{
+	if (!i915->gt_pm.rc6.ctx_corrupted)
+		return;
+
+	if (i915_rc6_ctx_corrupted(i915)) {
+		i915->gt_pm.rc6.ctx_corrupted_wakeref =
+			intel_runtime_pm_get(&i915->runtime_pm);
+		return;
+	}
+
+	DRM_INFO("RC6 context restored, re-enabling runtime power management\n");
+	i915->gt_pm.rc6.ctx_corrupted = false;
+}
+
+static void intel_disable_rc6(struct drm_i915_private *dev_priv);
+
+/**
+ * i915_rc6_ctx_wa_check - check for a new RC6 CTX corruption
+ * @i915: i915 device
+ *
+ * Check if an RC6 CTX corruption has happened since the last check and if so
+ * disable RC6 and runtime power management.
+ *
+ * Return false if no context corruption has happened since the last call of
+ * this function, true otherwise.
+*/
+bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915)
+{
+	if (!NEEDS_RC6_CTX_CORRUPTION_WA(i915))
+		return false;
+
+	if (i915->gt_pm.rc6.ctx_corrupted)
+		return false;
+
+	if (!i915_rc6_ctx_corrupted(i915))
+		return false;
+
+	DRM_NOTE("RC6 context corruption, disabling runtime power management\n");
+
+	intel_disable_rc6(i915);
+	i915->gt_pm.rc6.ctx_corrupted = true;
+	i915->gt_pm.rc6.ctx_corrupted_wakeref =
+		intel_runtime_pm_get_noresume(&i915->runtime_pm);
+
+	return true;
+}
+
 void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
 {
 	struct intel_rps *rps = &dev_priv->gt_pm.rps;
@@ -8565,6 +8659,8 @@ void intel_init_gt_powersave(struct drm_i915_private *dev_priv)
 		pm_runtime_get(&dev_priv->drm.pdev->dev);
 	}
 
+	i915_rc6_ctx_wa_init(dev_priv);
+
 	/* Initialize RPS limits (for userspace) */
 	if (IS_CHERRYVIEW(dev_priv))
 		cherryview_init_gt_powersave(dev_priv);
@@ -8603,6 +8699,8 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv)
 	if (IS_VALLEYVIEW(dev_priv))
 		valleyview_cleanup_gt_powersave(dev_priv);
 
+	i915_rc6_ctx_wa_cleanup(dev_priv);
+
 	if (!HAS_RC6(dev_priv))
 		pm_runtime_put(&dev_priv->drm.pdev->dev);
 }
@@ -8631,7 +8729,7 @@ static inline void intel_disable_llc_pstate(struct drm_i915_private *i915)
 	i915->gt_pm.llc_pstate.enabled = false;
 }
 
-static void intel_disable_rc6(struct drm_i915_private *dev_priv)
+static void __intel_disable_rc6(struct drm_i915_private *dev_priv)
 {
 	lockdep_assert_held(&dev_priv->gt_pm.rps.lock);
 
@@ -8650,6 +8748,15 @@ static void intel_disable_rc6(struct drm_i915_private *dev_priv)
 	dev_priv->gt_pm.rc6.enabled = false;
 }
 
+static void intel_disable_rc6(struct drm_i915_private *dev_priv)
+{
+	struct intel_rps *rps = &dev_priv->gt_pm.rps;
+
+	mutex_lock(&rps->lock);
+	__intel_disable_rc6(dev_priv);
+	mutex_unlock(&rps->lock);
+}
+
 static void intel_disable_rps(struct drm_i915_private *dev_priv)
 {
 	lockdep_assert_held(&dev_priv->gt_pm.rps.lock);
@@ -8675,7 +8782,7 @@ void intel_disable_gt_powersave(struct drm_i915_private *dev_priv)
 {
 	mutex_lock(&dev_priv->gt_pm.rps.lock);
 
-	intel_disable_rc6(dev_priv);
+	__intel_disable_rc6(dev_priv);
 	intel_disable_rps(dev_priv);
 	if (HAS_LLC(dev_priv))
 		intel_disable_llc_pstate(dev_priv);
@@ -8702,6 +8809,9 @@ static void intel_enable_rc6(struct drm_i915_private *dev_priv)
 	if (dev_priv->gt_pm.rc6.enabled)
 		return;
 
+	if (dev_priv->gt_pm.rc6.ctx_corrupted)
+		return;
+
 	if (IS_CHERRYVIEW(dev_priv))
 		cherryview_enable_rc6(dev_priv);
 	else if (IS_VALLEYVIEW(dev_priv))
diff --git a/drivers/gpu/drm/i915/intel_pm.h b/drivers/gpu/drm/i915/intel_pm.h
index e3573e1e16e3..0f7390c850ec 100644
--- a/drivers/gpu/drm/i915/intel_pm.h
+++ b/drivers/gpu/drm/i915/intel_pm.h
@@ -36,6 +36,9 @@ void intel_cleanup_gt_powersave(struct drm_i915_private *dev_priv);
 void intel_sanitize_gt_powersave(struct drm_i915_private *dev_priv);
 void intel_enable_gt_powersave(struct drm_i915_private *dev_priv);
 void intel_disable_gt_powersave(struct drm_i915_private *dev_priv);
+bool i915_rc6_ctx_wa_check(struct drm_i915_private *i915);
+void i915_rc6_ctx_wa_suspend(struct drm_i915_private *i915);
+void i915_rc6_ctx_wa_resume(struct drm_i915_private *i915);
 void gen6_rps_busy(struct drm_i915_private *dev_priv);
 void gen6_rps_idle(struct drm_i915_private *dev_priv);
 void gen6_rps_boost(struct i915_request *rq);

From edc1f5432f450b486091b79697c4ddc7f0f3845a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Sun, 27 Oct 2019 23:05:44 +0900
Subject: [PATCH 30/43] scsi: sd_zbc: Fix sd_zbc_complete()

The ILLEGAL REQUEST/INVALID FIELD IN CDB error generated by an attempt to
reset a conventional zone does not apply to the reset write pointer command
with the ALL bit set, that is, to REQ_OP_ZONE_RESET_ALL requests. Fix
sd_zbc_complete() to be quiet only in the case of REQ_OP_ZONE_RESET,
excluding REQ_OP_ZONE_RESET_ALL.

Since REQ_OP_ZONE_RESET is the only request handled by sd_zbc_complete(),
also simplify the code using a simple if statement.

[mkp: applied by hand]

Fixes: d81e9d494354 ("scsi: implement REQ_OP_ZONE_RESET_ALL")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20191027140549.26272-4-damien.lemoal@wdc.com
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sd_zbc.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index de4019dc0f0b..1efc69e194f8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -263,25 +263,16 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 	int result = cmd->result;
 	struct request *rq = cmd->request;
 
-	switch (req_op(rq)) {
-	case REQ_OP_ZONE_RESET:
-	case REQ_OP_ZONE_RESET_ALL:
-
-		if (result &&
-		    sshdr->sense_key == ILLEGAL_REQUEST &&
-		    sshdr->asc == 0x24)
-			/*
-			 * INVALID FIELD IN CDB error: reset of a conventional
-			 * zone was attempted. Nothing to worry about, so be
-			 * quiet about the error.
-			 */
-			rq->rq_flags |= RQF_QUIET;
-		break;
-
-	case REQ_OP_WRITE:
-	case REQ_OP_WRITE_ZEROES:
-	case REQ_OP_WRITE_SAME:
-		break;
+	if (req_op(rq) == REQ_OP_ZONE_RESET &&
+	    result &&
+	    sshdr->sense_key == ILLEGAL_REQUEST &&
+	    sshdr->asc == 0x24) {
+		/*
+		 * INVALID FIELD IN CDB error: reset of a conventional
+		 * zone was attempted. Nothing to worry about, so be
+		 * quiet about the error.
+		 */
+		rq->rq_flags |= RQF_QUIET;
 	}
 }
 

From 8b1062d513880b3ec696945f15dc84d179f1f3c1 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Tue, 5 Nov 2019 14:56:00 +0000
Subject: [PATCH 31/43] scsi: qla2xxx: fix NPIV tear down process

Fix two issues with commit f5187b7d1ac6 ("scsi: qla2xxx: Optimize NPIV
tear down process"): a missing negation in a wait_event_timeout()
condition, and a missing loop end condition.

Fixes: f5187b7d1ac6 ("scsi: qla2xxx: Optimize NPIV tear down process")
Link: https://lore.kernel.org/r/20191105145550.10268-1-martin.wilck@suse.com
Signed-off-by: Martin Wilck <mwilck@suse.com>
Acked-by: Himanshu Madhani <hmadhani@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/qla2xxx/qla_mid.c | 8 +++++---
 drivers/scsi/qla2xxx/qla_os.c  | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c
index 6afad68e5ba2..238240984bc1 100644
--- a/drivers/scsi/qla2xxx/qla_mid.c
+++ b/drivers/scsi/qla2xxx/qla_mid.c
@@ -76,9 +76,11 @@ qla24xx_deallocate_vp_id(scsi_qla_host_t *vha)
 	 * ensures no active vp_list traversal while the vport is removed
 	 * from the queue)
 	 */
-	for (i = 0; i < 10 && atomic_read(&vha->vref_count); i++)
-		wait_event_timeout(vha->vref_waitq,
-		    atomic_read(&vha->vref_count), HZ);
+	for (i = 0; i < 10; i++) {
+		if (wait_event_timeout(vha->vref_waitq,
+		    !atomic_read(&vha->vref_count), HZ) > 0)
+			break;
+	}
 
 	spin_lock_irqsave(&ha->vport_slock, flags);
 	if (atomic_read(&vha->vref_count)) {
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 3568031c6504..e6ff17f38178 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1119,9 +1119,11 @@ qla2x00_wait_for_sess_deletion(scsi_qla_host_t *vha)
 
 	qla2x00_mark_all_devices_lost(vha, 0);
 
-	for (i = 0; i < 10; i++)
-		wait_event_timeout(vha->fcport_waitQ, test_fcport_count(vha),
-		    HZ);
+	for (i = 0; i < 10; i++) {
+		if (wait_event_timeout(vha->fcport_waitQ,
+		    test_fcport_count(vha), HZ) > 0)
+			break;
+	}
 
 	flush_workqueue(vha->hw->wq);
 }

From 9393c8de628cf0968d81a17cc11841e42191e041 Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Tue, 5 Nov 2019 15:49:10 +1300
Subject: [PATCH 32/43] scsi: core: Handle drivers which set sg_tablesize to
 zero

In scsi_mq_setup_tags(), cmd_size is calculated based on zero size for the
scatter-gather list in case the low level driver uses SG_NONE in its host
template.

cmd_size is passed on to the block layer for calculation of the request
size, and we've seen NULL pointer dereference errors from the block layer
in drivers where SG_NONE is used and a mq IO scheduler is active,
apparently as a consequence of this (see commit 68ab2d76e4be ("scsi:
cxlflash: Set sg_tablesize to 1 instead of SG_NONE"), and a recent patch by
Finn Thain converting the three m68k NCR5380 drivers to avoid setting
SG_NONE).

Try to avoid these errors by accounting for at least one sg list entry when
calculating cmd_size, regardless of whether the low level driver set a zero
sg_tablesize.

Tested on 030 m68k with the atari_scsi driver - setting sg_tablesize to
SG_NONE no longer results in a crash when loading this driver.

CC: Finn Thain <fthain@telegraphics.com.au>
Link: https://lore.kernel.org/r/1572922150-4358-1-git-send-email-schmitzmic@gmail.com
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_lib.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index dc210b9d4896..3a352a4601b1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1882,7 +1882,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
 {
 	unsigned int cmd_size, sgl_size;
 
-	sgl_size = scsi_mq_inline_sgl_size(shost);
+	sgl_size = max_t(unsigned int, sizeof(struct scatterlist),
+				scsi_mq_inline_sgl_size(shost));
 	cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size;
 	if (scsi_host_get_prot(shost))
 		cmd_size += sizeof(struct scsi_data_buffer) +

From 012206a822a8b6ac09125bfaa210a95b9eb8f1c1 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 6 Nov 2019 20:26:46 -0600
Subject: [PATCH 33/43] x86/speculation/taa: Fix printing of TAA_MSG_SMT on
 IBRS_ALL CPUs

For new IBRS_ALL CPUs, the Enhanced IBRS check at the beginning of
cpu_bugs_smt_update() causes the function to return early, unintentionally
skipping the MDS and TAA logic.

This is not a problem for MDS, because there appears to be no overlap
between IBRS_ALL and MDS-affected CPUs.  So the MDS mitigation would be
disabled and nothing would need to be done in this function anyway.

But for TAA, the TAA_MSG_SMT string will never get printed on Cascade
Lake and newer.

The check is superfluous anyway: when 'spectre_v2_enabled' is
SPECTRE_V2_IBRS_ENHANCED, 'spectre_v2_user' is always
SPECTRE_V2_USER_NONE, and so the 'spectre_v2_user' switch statement
handles it appropriately by doing nothing.  So just remove the check.

Fixes: 1b42f017415b ("x86/speculation/taa: Add mitigation for TSX Async Abort")
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Tyler Hicks <tyhicks@canonical.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/bugs.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 850005590167..4c7b0fa15a19 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -886,10 +886,6 @@ static void update_mds_branch_idle(void)
 
 void cpu_bugs_smt_update(void)
 {
-	/* Enhanced IBRS implies STIBP. No update required. */
-	if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
-		return;
-
 	mutex_lock(&spec_ctrl_mutex);
 
 	switch (spectre_v2_user) {

From 8a44119a98bee4381d28f3ed1e41dfacf5c3aa6d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 4 Nov 2019 12:16:49 +0100
Subject: [PATCH 34/43] KVM: Fix NULL-ptr deref after kvm_create_vm fails

Reported by syzkaller:

    kasan: CONFIG_KASAN_INLINE enabled
    kasan: GPF could be caused by NULL-ptr deref or user memory access
    general protection fault: 0000 [#1] PREEMPT SMP KASAN
    CPU: 0 PID: 14727 Comm: syz-executor.3 Not tainted 5.4.0-rc4+ #0
    RIP: 0010:kvm_coalesced_mmio_init+0x5d/0x110 arch/x86/kvm/../../../virt/kvm/coalesced_mmio.c:121
    Call Trace:
     kvm_dev_ioctl_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:3446 [inline]
     kvm_dev_ioctl+0x781/0x1490 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3494
     vfs_ioctl fs/ioctl.c:46 [inline]
     file_ioctl fs/ioctl.c:509 [inline]
     do_vfs_ioctl+0x196/0x1150 fs/ioctl.c:696
     ksys_ioctl+0x62/0x90 fs/ioctl.c:713
     __do_sys_ioctl fs/ioctl.c:720 [inline]
     __se_sys_ioctl fs/ioctl.c:718 [inline]
     __x64_sys_ioctl+0x6e/0xb0 fs/ioctl.c:718
     do_syscall_64+0xca/0x5d0 arch/x86/entry/common.c:290
     entry_SYSCALL_64_after_hwframe+0x49/0xbe

Commit 9121923c457d ("kvm: Allocate memslots and buses before calling kvm_arch_init_vm")
moved the memslots and buses allocations around. However, if initialization of
kvm->srcu/irq_srcu fails, NULL is returned instead of an error code; the NULL is not
intercepted in kvm_dev_ioctl_create_vm() and is then dereferenced by
kvm_coalesced_mmio_init(). This patch fixes it.

Moving the initialization is required anyway to avoid an incorrect synchronize_srcu that
was also reported by syzkaller:

 wait_for_completion+0x29c/0x440 kernel/sched/completion.c:136
 __synchronize_srcu+0x197/0x250 kernel/rcu/srcutree.c:921
 synchronize_srcu_expedited kernel/rcu/srcutree.c:946 [inline]
 synchronize_srcu+0x239/0x3e8 kernel/rcu/srcutree.c:997
 kvm_page_track_unregister_notifier+0xe7/0x130 arch/x86/kvm/page_track.c:212
 kvm_mmu_uninit_vm+0x1e/0x30 arch/x86/kvm/mmu.c:5828
 kvm_arch_destroy_vm+0x4a2/0x5f0 arch/x86/kvm/x86.c:9579
 kvm_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:702 [inline]

so do it.

Reported-by: syzbot+89a8060879fa0bd2db4f@syzkaller.appspotmail.com
Reported-by: syzbot+e27e7027eb2b80e44225@syzkaller.appspotmail.com
Fixes: 9121923c457d ("kvm: Allocate memslots and buses before calling kvm_arch_init_vm")
Cc: Jim Mattson <jmattson@google.com>
Cc: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d6f0696d98ef..e22ff63e5b1a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -645,6 +645,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
+	if (init_srcu_struct(&kvm->srcu))
+		goto out_err_no_srcu;
+	if (init_srcu_struct(&kvm->irq_srcu))
+		goto out_err_no_irq_srcu;
+
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		struct kvm_memslots *slots = kvm_alloc_memslots();
 
@@ -675,11 +680,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
 #endif
 
-	if (init_srcu_struct(&kvm->srcu))
-		goto out_err_no_srcu;
-	if (init_srcu_struct(&kvm->irq_srcu))
-		goto out_err_no_irq_srcu;
-
 	r = kvm_init_mmu_notifier(kvm);
 	if (r)
 		goto out_err;
@@ -693,10 +693,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	return kvm;
 
 out_err:
-	cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-	cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
 	hardware_disable_all();
 out_err_no_disable:
 	kvm_arch_destroy_vm(kvm);
@@ -706,6 +702,10 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		kfree(kvm_get_bus(kvm, i));
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+	cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+	cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
 	kvm_arch_free_vm(kvm);
 	mmdrop(current->mm);
 	return ERR_PTR(r);

From e2d3fcaf939dded3da604a25ebbea9fb954c2280 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 4 Nov 2019 13:23:53 +0100
Subject: [PATCH 35/43] KVM: fix placement of refcount initialization

Reported by syzkaller:

   =============================
   WARNING: suspicious RCU usage
   -----------------------------
   ./include/linux/kvm_host.h:536 suspicious rcu_dereference_check() usage!

   other info that might help us debug this:

   rcu_scheduler_active = 2, debug_locks = 1
   no locks held by repro_11/12688.

   stack backtrace:
   Call Trace:
    dump_stack+0x7d/0xc5
    lockdep_rcu_suspicious+0x123/0x170
    kvm_dev_ioctl+0x9a9/0x1260 [kvm]
    do_vfs_ioctl+0x1a1/0xfb0
    ksys_ioctl+0x6d/0x80
    __x64_sys_ioctl+0x73/0xb0
    do_syscall_64+0x108/0xaa0
    entry_SYSCALL_64_after_hwframe+0x49/0xbe

Commit a97b0e773e49 ("kvm: call kvm_arch_destroy_vm if vm creation fails")
sets users_count to 1 before kvm_arch_init_vm(). However, if kvm_arch_init_vm()
fails, we need to decrease this count.  By moving it earlier, we can push
the decrease to out_err_no_arch_destroy_vm without introducing yet another
error label.

syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=15209b84e00000

Reported-by: syzbot+75475908cd0910f141ee@syzkaller.appspotmail.com
Fixes: a97b0e773e49 ("kvm: call kvm_arch_destroy_vm if vm creation fails")
Cc: Jim Mattson <jmattson@google.com>
Analyzed-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e22ff63e5b1a..e7a07132cd7f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -650,6 +650,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	if (init_srcu_struct(&kvm->irq_srcu))
 		goto out_err_no_irq_srcu;
 
+	refcount_set(&kvm->users_count, 1);
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		struct kvm_memslots *slots = kvm_alloc_memslots();
 
@@ -667,7 +668,6 @@ static struct kvm *kvm_create_vm(unsigned long type)
 			goto out_err_no_arch_destroy_vm;
 	}
 
-	refcount_set(&kvm->users_count, 1);
 	r = kvm_arch_init_vm(kvm, type);
 	if (r)
 		goto out_err_no_arch_destroy_vm;
@@ -696,8 +696,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	hardware_disable_all();
 out_err_no_disable:
 	kvm_arch_destroy_vm(kvm);
-	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
 out_err_no_arch_destroy_vm:
+	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
 	for (i = 0; i < KVM_NR_BUSES; i++)
 		kfree(kvm_get_bus(kvm, i));
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)

From ea0b163b13ffc52818c079adb00d55e227a6da6f Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 11 Nov 2019 08:13:24 -0800
Subject: [PATCH 36/43] drm/i915/cmdparser: Fix jump whitelist clearing

When a jump_whitelist bitmap is reused, it needs to be cleared.
Currently this is done with memset() and the size calculation assumes
bitmaps are made of 32-bit words, not longs.  So on 64-bit
architectures, only the first half of the bitmap is cleared.

If some whitelist bits are carried over between successive batches
submitted on the same context, this will presumably allow embedding
the rogue instructions that we're trying to reject.

Use bitmap_zero() instead, which gets the calculation right.

Fixes: f8c08d8faee5 ("drm/i915/cmdparser: Add support for backward jumps")
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Jon Bloomfield <jon.bloomfield@intel.com>
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index d78debed06e2..f24096e27bef 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1375,7 +1375,7 @@ static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
 		return;
 
 	if (batch_cmds <= ctx->jump_whitelist_cmds) {
-		memset(ctx->jump_whitelist, 0, exact_size * sizeof(u32));
+		bitmap_zero(ctx->jump_whitelist, batch_cmds);
 		return;
 	}
 
@@ -1395,8 +1395,7 @@ static void init_whitelist(struct i915_gem_context *ctx, u32 batch_len)
 	}
 
 	DRM_DEBUG("CMD: Failed to extend whitelist. BB_START may be disallowed\n");
-	memset(ctx->jump_whitelist, 0,
-	       BITS_TO_LONGS(ctx->jump_whitelist_cmds) * sizeof(u32));
+	bitmap_zero(ctx->jump_whitelist, ctx->jump_whitelist_cmds);
 
 	return;
 }

From 7a5ee6edb42e0bb487954806d34877995b6b8d59 Mon Sep 17 00:00:00 2001
From: Chenyi Qiang <chenyi.qiang@intel.com>
Date: Wed, 6 Nov 2019 14:35:20 +0800
Subject: [PATCH 37/43] KVM: X86: Fix initialization of MSR lists

The three MSR lists (msrs_to_save[], emulated_msrs[] and
msr_based_features[]) are global arrays of kvm.ko, which are
adjusted (supported MSRs are copied forward, overwriting the unsupported
ones) when kvm-{intel,amd}.ko is loaded, but they are not reset to their
initial values when kvm-{intel,amd}.ko is unloaded. Thus, at the next
load, kvm-{intel,amd}.ko will operate on the modified arrays, with some
MSRs lost and some MSRs duplicated.

So define three constant arrays to hold the initial MSR lists and
initialize msrs_to_save[], emulated_msrs[] and msr_based_features[]
based on the constant arrays.

Cc: stable@vger.kernel.org
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com>
[Remove now useless conditionals. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 56 +++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ff395f812719..8c8a5e20ea06 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1132,13 +1132,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
  *
- * This list is modified at module load time to reflect the
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
+ * extract the supported MSRs from the related const lists.
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
  * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
  * may depend on host virtualization features rather than host cpu features.
  */
 
-static u32 msrs_to_save[] = {
+static const u32 msrs_to_save_all[] = {
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1179,9 +1181,10 @@ static u32 msrs_to_save[] = {
 	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
 };
 
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
 static unsigned num_msrs_to_save;
 
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs_all[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1220,7 +1223,7 @@ static u32 emulated_msrs[] = {
 	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
 	 * We always support the "true" VMX control MSRs, even if the host
 	 * processor does not, so I am putting these registers here rather
-	 * than in msrs_to_save.
+	 * than in msrs_to_save_all.
 	 */
 	MSR_IA32_VMX_BASIC,
 	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
@@ -1239,13 +1242,14 @@ static u32 emulated_msrs[] = {
 	MSR_KVM_POLL_CONTROL,
 };
 
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
 static unsigned num_emulated_msrs;
 
 /*
  * List of msr numbers which are used to expose MSR-based features that
  * can be used by a hypervisor to validate requested CPU features.
  */
-static u32 msr_based_features[] = {
+static const u32 msr_based_features_all[] = {
 	MSR_IA32_VMX_BASIC,
 	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
 	MSR_IA32_VMX_PINBASED_CTLS,
@@ -1270,6 +1274,7 @@ static u32 msr_based_features[] = {
 	MSR_IA32_ARCH_CAPABILITIES,
 };
 
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
 static unsigned int num_msr_based_features;
 
 static u64 kvm_get_arch_capabilities(void)
@@ -5090,22 +5095,22 @@ static void kvm_init_msr_list(void)
 {
 	struct x86_pmu_capability x86_pmu;
 	u32 dummy[2];
-	unsigned i, j;
+	unsigned i;
 
 	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
-			 "Please update the fixed PMCs in msrs_to_save[]");
+			 "Please update the fixed PMCs in msrs_to_saved_all[]");
 
 	perf_get_x86_pmu_capability(&x86_pmu);
 
-	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
-		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
+		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
 			continue;
 
 		/*
 		 * Even MSRs that are valid in the host may not be exposed
 		 * to the guests in some cases.
 		 */
-		switch (msrs_to_save[i]) {
+		switch (msrs_to_save_all[i]) {
 		case MSR_IA32_BNDCFGS:
 			if (!kvm_mpx_supported())
 				continue;
@@ -5133,17 +5138,17 @@ static void kvm_init_msr_list(void)
 			break;
 		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
 			if (!kvm_x86_ops->pt_supported() ||
-				msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >=
+				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
 				continue;
 			break;
 		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
-			if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
 			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
 				continue;
 			break;
 		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
-			if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
 			    min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
 				continue;
 		}
@@ -5151,34 +5156,25 @@ static void kvm_init_msr_list(void)
 			break;
 		}
 
-		if (j < i)
-			msrs_to_save[j] = msrs_to_save[i];
-		j++;
+		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
 	}
-	num_msrs_to_save = j;
 
-	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
-		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+		if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
 			continue;
 
-		if (j < i)
-			emulated_msrs[j] = emulated_msrs[i];
-		j++;
+		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
 	}
-	num_emulated_msrs = j;
 
-	for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
 		struct kvm_msr_entry msr;
 
-		msr.index = msr_based_features[i];
+		msr.index = msr_based_features_all[i];
 		if (kvm_get_msr_feature(&msr))
 			continue;
 
-		if (j < i)
-			msr_based_features[j] = msr_based_features[i];
-		j++;
+		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
 	}
-	num_msr_based_features = j;
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,

From d9ff2744eea21aac43fafa22f6178541bfe2e3d8 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Mon, 11 Nov 2019 14:25:25 +0200
Subject: [PATCH 38/43] KVM: VMX: Fix comment to specify PID.ON instead of
 PIR.ON

The Outstanding Notification (ON) bit is part of the Posted Interrupt
Descriptor (PID) as opposed to the Posted Interrupts Register (PIR).
The latter is a bitmap for pending vectors.

Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5d21a4ab28cf..f53b0c74f7c8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6137,7 +6137,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	if (pi_test_on(&vmx->pi_desc)) {
 		pi_clear_on(&vmx->pi_desc);
 		/*
-		 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+		 * IOMMU can write to PID.ON, so the barrier matters even on UP.
 		 * But on x86 this is just a compiler barrier anyway.
 		 */
 		smp_mb__after_atomic();

From 9482ae458b7ae1b47c76333592bbe013d47e579f Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Mon, 11 Nov 2019 17:20:10 +0000
Subject: [PATCH 39/43] KVM: VMX: Consider PID.PIR to determine if vCPU has
 pending interrupts

Commit 17e433b54393 ("KVM: Fix leak vCPU's VMCS value into other pCPU")
introduced vmx_dy_apicv_has_pending_interrupt() in order to determine
if a vCPU has a pending posted interrupt. This routine is used by
kvm_vcpu_on_spin() when searching for a new runnable vCPU to schedule
on the pCPU instead of a vCPU doing a busy loop.

vmx_dy_apicv_has_pending_interrupt() determines if a
vCPU has a pending posted interrupt solely based on PID.ON. However,
when a vCPU is preempted, vmx_vcpu_pi_put() sets PID.SN, which causes
raised posted interrupts to only set a bit in PID.PIR without setting
PID.ON (and without sending a notification vector), as depicted in VT-d
manual section 5.2.3 "Interrupt-Posting Hardware Operation".

Therefore, checking PID.ON is insufficient to determine if a vCPU has
pending posted interrupts and instead we should also check if there is
some bit set on PID.PIR if PID.SN=1.

Fixes: 17e433b54393 ("KVM: Fix leak vCPU's VMCS value into other pCPU")
Reviewed-by: Jagannathan Raman <jag.raman@oracle.com>
Co-developed-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f53b0c74f7c8..623914dc3a3e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6167,7 +6167,11 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 
 static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
 {
-	return pi_test_on(vcpu_to_pi_desc(vcpu));
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+	return pi_test_on(pi_desc) ||
+		(pi_test_sn(pi_desc) &&
+		!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS));
 }
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)

From 132194ffa138863eac620abb3b6f983278e61b4a Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Mon, 11 Nov 2019 17:20:11 +0000
Subject: [PATCH 40/43] KVM: VMX: Do not change PID.NDST when loading a blocked
 vCPU

When a vCPU enters the block phase, pi_pre_block() inserts the vCPU
into a per-pCPU linked list of all vCPUs that are blocked on this pCPU.
Afterwards, it changes PID.NV to POSTED_INTR_WAKEUP_VECTOR, whose
handler (wakeup_handler()) is responsible for kicking (unblocking) any
vCPU on that linked list that now has pending posted interrupts.

While the vCPU is blocked (in kvm_vcpu_block()), it may be preempted,
which will cause vmx_vcpu_pi_put() to set PID.SN.  If the vCPU is later
scheduled to run on a different pCPU, vmx_vcpu_pi_load() will clear
PID.SN but will also *overwrite PID.NDST to this different pCPU*,
instead of keeping it set to the original pCPU on which the vCPU
entered the block phase.

This is a problem because, when a posted interrupt is delivered,
wakeup_handler() will be executed and fail to find the blocked vCPU on
its per-pCPU linked list of all vCPUs that are blocked on this pCPU.
This is because the vCPU was placed on a *different* per-pCPU linked
list, i.e. that of the original pCPU on which it entered the block
phase.

The regression is introduced by commit c112b5f50232 ("KVM: x86:
Recompute PID.ON when clearing PID.SN"). Therefore, partially revert
it and reintroduce the condition in vmx_vcpu_pi_load() responsible for
avoiding changing PID.NDST when loading a blocked vCPU.

Fixes: c112b5f50232 ("KVM: x86: Recompute PID.ON when clearing PID.SN")
Tested-by: Nathan Ni <nathan.ni@oracle.com>
Co-developed-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 14 ++++++++++++++
 arch/x86/kvm/vmx/vmx.h |  6 ++++++
 2 files changed, 20 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 623914dc3a3e..54458c5d5a01 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1268,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
 		return;
 
+	/*
+	 * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+	 * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+	 * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+	 * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+	 * correctly.
+	 */
+	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+		pi_clear_sn(pi_desc);
+		goto after_clear_sn;
+	}
+
 	/* The full case.  */
 	do {
 		old.control = new.control = pi_desc->control;
@@ -1283,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	} while (cmpxchg64(&pi_desc->control, old.control,
 			   new.control) != old.control);
 
+after_clear_sn:
+
 	/*
 	 * Clear SN before reading the bitmap.  The VT-d firmware
 	 * writes the bitmap and reads SN atomically (5.2.3 in the
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index bee16687dc0b..1e32ab54fc2d 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -373,6 +373,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc)
 		(unsigned long *)&pi_desc->control);
 }
 
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+	clear_bit(POSTED_INTR_SN,
+		(unsigned long *)&pi_desc->control);
+}
+
 static inline int pi_test_on(struct pi_desc *pi_desc)
 {
 	return test_bit(POSTED_INTR_ON,

From 29881b6ec6e453ff8df37ad8f44e17bf0d4e1e12 Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Mon, 11 Nov 2019 17:20:12 +0000
Subject: [PATCH 41/43] KVM: VMX: Introduce pi_is_pir_empty() helper

Streamline the PID.PIR check and change its call sites to use
the newly added helper.

Suggested-by: Liran Alon <liran.alon@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 5 ++---
 arch/x86/kvm/vmx/vmx.h | 5 +++++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 54458c5d5a01..04a8212704c1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1305,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 */
 	smp_mb__after_atomic();
 
-	if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+	if (!pi_is_pir_empty(pi_desc))
 		pi_set_on(pi_desc);
 }
 
@@ -6184,8 +6184,7 @@ static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 
 	return pi_test_on(pi_desc) ||
-		(pi_test_sn(pi_desc) &&
-		!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS));
+		(pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
 }
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 1e32ab54fc2d..5a0f34b1e226 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -355,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 }
 
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+	return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
 static inline void pi_set_sn(struct pi_desc *pi_desc)
 {
 	set_bit(POSTED_INTR_SN,

From a78986aae9b2988f8493f9f65a587ee433e83bc3 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Mon, 11 Nov 2019 14:12:27 -0800
Subject: [PATCH 42/43] KVM: MMU: Do not treat ZONE_DEVICE pages as being
 reserved

Explicitly exempt ZONE_DEVICE pages from kvm_is_reserved_pfn() and
instead manually handle ZONE_DEVICE on a case-by-case basis.  For things
like page refcounts, KVM needs to treat ZONE_DEVICE pages like normal
pages, e.g. put pages grabbed via gup().  But for flows such as setting
A/D bits or shifting refcounts for transparent huge pages, KVM needs
to avoid processing ZONE_DEVICE pages as the flows in question lack the
underlying machinery for proper handling of ZONE_DEVICE pages.

This fixes a hang reported by Adam Borowski[*] in dev_pagemap_cleanup()
when running a KVM guest backed with /dev/dax memory, as KVM straight up
doesn't put any references to ZONE_DEVICE pages acquired by gup().

Note, Dan Williams proposed an alternative solution of doing put_page()
on ZONE_DEVICE pages immediately after gup() in order to simplify the
auditing needed to ensure is_zone_device_page() is called if and only if
the backing device is pinned (via gup()).  But that approach would break
kvm_vcpu_{un}map() as KVM requires the page to be pinned from map() 'til
unmap() when accessing guest memory, unlike KVM's secondary MMU, which
coordinates with mmu_notifier invalidations to avoid creating stale
page references, i.e. doesn't rely on pages being pinned.

[*] http://lkml.kernel.org/r/20190919115547.GA17963@angband.pl

Reported-by: Adam Borowski <kilobyte@angband.pl>
Analyzed-by: David Hildenbrand <david@redhat.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: stable@vger.kernel.org
Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c       |  8 ++++----
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 26 +++++++++++++++++++++++---
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24c23c66b226..bf82b1f2e834 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3306,7 +3306,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * here.
 	 */
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-	    level == PT_PAGE_TABLE_LEVEL &&
+	    !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompoundMap(pfn_to_page(pfn)) &&
 	    !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
 		unsigned long mask;
@@ -5914,9 +5914,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		 * the guest, and the guest page table is using 4K page size
 		 * mapping if the indirect sp has level = 1.
 		 */
-		if (sp->role.direct &&
-			!kvm_is_reserved_pfn(pfn) &&
-			PageTransCompoundMap(pfn_to_page(pfn))) {
+		if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
+		    !kvm_is_zone_device_pfn(pfn) &&
+		    PageTransCompoundMap(pfn_to_page(pfn))) {
 			pte_list_remove(rmap_head, sptep);
 
 			if (kvm_available_flush_tlb_with_range())
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 719fc3e15ea4..290dbe353a47 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn);
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn);
 
 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e7a07132cd7f..0dac149ead16 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -149,10 +149,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	return 0;
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+	/*
+	 * The metadata used by is_zone_device_page() to determine whether or
+	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
+	 * page_count() is zero to help detect bad usage of this helper.
+	 */
+	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+		return false;
+
+	return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+	/*
+	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+	 * perspective they are "normal" pages, albeit with slightly different
+	 * usage rules.
+	 */
 	if (pfn_valid(pfn))
-		return PageReserved(pfn_to_page(pfn));
+		return PageReserved(pfn_to_page(pfn)) &&
+		       !kvm_is_zone_device_pfn(pfn);
 
 	return true;
 }
@@ -1857,7 +1877,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-	if (!kvm_is_reserved_pfn(pfn)) {
+	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 
 		SetPageDirty(page);
@@ -1867,7 +1887,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-	if (!kvm_is_reserved_pfn(pfn))
+	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

From 0e3f1ad80fc8cb0c517fd9a9afb22752b741fa76 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 12 Nov 2019 15:22:24 -0800
Subject: [PATCH 43/43] Remove VirtualBox guest shared folders filesystem

This went into staging in rc7.  It turns out that was a mistake, and
apparently it wasn't even supposed to go there at all, but be introduced
as a regular filesystem.

We don't try to sneak in whole new filesystems this late in the rc, just
delete the whole thing, and it can be re-introduced as a proper patch
with proper acks from actual filesystem people instead of some odd
late-rc staging back-door.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                              |   6 -
 drivers/staging/Kconfig                  |   2 -
 drivers/staging/Makefile                 |   1 -
 drivers/staging/vboxsf/Kconfig           |  10 -
 drivers/staging/vboxsf/Makefile          |   5 -
 drivers/staging/vboxsf/TODO              |   7 -
 drivers/staging/vboxsf/dir.c             | 418 -----------
 drivers/staging/vboxsf/file.c            | 370 ----------
 drivers/staging/vboxsf/shfl_hostintf.h   | 901 -----------------------
 drivers/staging/vboxsf/super.c           | 501 -------------
 drivers/staging/vboxsf/utils.c           | 551 --------------
 drivers/staging/vboxsf/vboxsf_wrappers.c | 371 ----------
 drivers/staging/vboxsf/vfsmod.h          | 137 ----
 13 files changed, 3280 deletions(-)
 delete mode 100644 drivers/staging/vboxsf/Kconfig
 delete mode 100644 drivers/staging/vboxsf/Makefile
 delete mode 100644 drivers/staging/vboxsf/TODO
 delete mode 100644 drivers/staging/vboxsf/dir.c
 delete mode 100644 drivers/staging/vboxsf/file.c
 delete mode 100644 drivers/staging/vboxsf/shfl_hostintf.h
 delete mode 100644 drivers/staging/vboxsf/super.c
 delete mode 100644 drivers/staging/vboxsf/utils.c
 delete mode 100644 drivers/staging/vboxsf/vboxsf_wrappers.c
 delete mode 100644 drivers/staging/vboxsf/vfsmod.h

diff --git a/MAINTAINERS b/MAINTAINERS
index eb19fad370d7..f77959ecf7e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17338,12 +17338,6 @@ F:	include/linux/vbox_utils.h
 F:	include/uapi/linux/vbox*.h
 F:	drivers/virt/vboxguest/
 
-VIRTUAL BOX SHARED FOLDER VFS DRIVER:
-M:	Hans de Goede <hdegoede@redhat.com>
-L:	linux-fsdevel@vger.kernel.org
-S:	Maintained
-F:	drivers/staging/vboxsf/*
-
 VIRTUAL SERIO DEVICE DRIVER
 M:	Stephen Chandler Paul <thatslyude@gmail.com>
 S:	Maintained
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 927d29eb92c6..6f1fa4c849a1 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -125,6 +125,4 @@ source "drivers/staging/exfat/Kconfig"
 
 source "drivers/staging/qlge/Kconfig"
 
-source "drivers/staging/vboxsf/Kconfig"
-
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index f01f04199073..a90f9b308c8d 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -53,4 +53,3 @@ obj-$(CONFIG_UWB)		+= uwb/
 obj-$(CONFIG_USB_WUSB)		+= wusbcore/
 obj-$(CONFIG_EXFAT_FS)		+= exfat/
 obj-$(CONFIG_QLGE)		+= qlge/
-obj-$(CONFIG_VBOXSF_FS)		+= vboxsf/
diff --git a/drivers/staging/vboxsf/Kconfig b/drivers/staging/vboxsf/Kconfig
deleted file mode 100644
index b84586ae08b3..000000000000
--- a/drivers/staging/vboxsf/Kconfig
+++ /dev/null
@@ -1,10 +0,0 @@
-config VBOXSF_FS
-	tristate "VirtualBox guest shared folder (vboxsf) support"
-	depends on X86 && VBOXGUEST
-	select NLS
-	help
-	  VirtualBox hosts can share folders with guests, this driver
-	  implements the Linux-guest side of this allowing folders exported
-	  by the host to be mounted under Linux.
-
-	  If you want to use shared folders in VirtualBox guests, answer Y or M.
diff --git a/drivers/staging/vboxsf/Makefile b/drivers/staging/vboxsf/Makefile
deleted file mode 100644
index 9e4328e79623..000000000000
--- a/drivers/staging/vboxsf/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-License-Identifier: MIT
-
-obj-$(CONFIG_VBOXSF_FS) += vboxsf.o
-
-vboxsf-y := dir.o file.o utils.o vboxsf_wrappers.o super.o
diff --git a/drivers/staging/vboxsf/TODO b/drivers/staging/vboxsf/TODO
deleted file mode 100644
index 8b9193d0d4f0..000000000000
--- a/drivers/staging/vboxsf/TODO
+++ /dev/null
@@ -1,7 +0,0 @@
-TODO:
-- Find a file-system developer to review this and give their Reviewed-By
-- Address any items coming up during review
-- Move to fs/vboxfs
-
-Please send any patches to Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-and Hans de Goede <hdegoede@redhat.com>
diff --git a/drivers/staging/vboxsf/dir.c b/drivers/staging/vboxsf/dir.c
deleted file mode 100644
index f260b5cc1646..000000000000
--- a/drivers/staging/vboxsf/dir.c
+++ /dev/null
@@ -1,418 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * VirtualBox Guest Shared Folders support: Directory inode and file operations
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#include <linux/namei.h>
-#include <linux/vbox_utils.h>
-#include "vfsmod.h"
-
-static int vboxsf_dir_open(struct inode *inode, struct file *file)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb);
-	struct shfl_createparms params = {};
-	struct vboxsf_dir_info *sf_d;
-	int err;
-
-	sf_d = vboxsf_dir_info_alloc();
-	if (!sf_d)
-		return -ENOMEM;
-
-	params.handle = SHFL_HANDLE_NIL;
-	params.create_flags = SHFL_CF_DIRECTORY | SHFL_CF_ACT_OPEN_IF_EXISTS |
-			      SHFL_CF_ACT_FAIL_IF_NEW | SHFL_CF_ACCESS_READ;
-
-	err = vboxsf_create_at_dentry(file_dentry(file), &params);
-	if (err)
-		goto err_free_dir_info;
-
-	if (params.result != SHFL_FILE_EXISTS) {
-		err = -ENOENT;
-		goto err_close;
-	}
-
-	err = vboxsf_dir_read_all(sbi, sf_d, params.handle);
-	if (err)
-		goto err_close;
-
-	vboxsf_close(sbi->root, params.handle);
-	file->private_data = sf_d;
-	return 0;
-
-err_close:
-	vboxsf_close(sbi->root, params.handle);
-err_free_dir_info:
-	vboxsf_dir_info_free(sf_d);
-	return err;
-}
-
-static int vboxsf_dir_release(struct inode *inode, struct file *file)
-{
-	if (file->private_data)
-		vboxsf_dir_info_free(file->private_data);
-
-	return 0;
-}
-
-static unsigned int vboxsf_get_d_type(u32 mode)
-{
-	unsigned int d_type;
-
-	switch (mode & SHFL_TYPE_MASK) {
-	case SHFL_TYPE_FIFO:
-		d_type = DT_FIFO;
-		break;
-	case SHFL_TYPE_DEV_CHAR:
-		d_type = DT_CHR;
-		break;
-	case SHFL_TYPE_DIRECTORY:
-		d_type = DT_DIR;
-		break;
-	case SHFL_TYPE_DEV_BLOCK:
-		d_type = DT_BLK;
-		break;
-	case SHFL_TYPE_FILE:
-		d_type = DT_REG;
-		break;
-	case SHFL_TYPE_SYMLINK:
-		d_type = DT_LNK;
-		break;
-	case SHFL_TYPE_SOCKET:
-		d_type = DT_SOCK;
-		break;
-	case SHFL_TYPE_WHITEOUT:
-		d_type = DT_WHT;
-		break;
-	default:
-		d_type = DT_UNKNOWN;
-		break;
-	}
-	return d_type;
-}
-
-static bool vboxsf_dir_emit(struct file *dir, struct dir_context *ctx)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(file_inode(dir)->i_sb);
-	struct vboxsf_dir_info *sf_d = dir->private_data;
-	struct shfl_dirinfo *info;
-	struct vboxsf_dir_buf *b;
-	unsigned int d_type;
-	loff_t i, cur = 0;
-	ino_t fake_ino;
-	size_t size;
-	int err;
-
-	list_for_each_entry(b, &sf_d->info_list, head) {
-try_next_entry:
-		if (ctx->pos >= cur + b->entries) {
-			cur += b->entries;
-			continue;
-		}
-
-		/*
-		 * Note the vboxsf_dir_info objects we are iterating over here
-		 * are variable sized, so the info pointer may end up being
-		 * unaligned. This is how we get the data from the host.
-		 * Since vboxsf is only supported on x86 machines this is not
-		 * a problem.
-		 */
-		for (i = 0, info = b->buf; i < ctx->pos - cur; i++) {
-			size = offsetof(struct shfl_dirinfo, name.string) +
-			       info->name.size;
-			info = (struct shfl_dirinfo *)((uintptr_t)info + size);
-		}
-
-		/* Info now points to the right entry, emit it. */
-		d_type = vboxsf_get_d_type(info->info.attr.mode);
-
-		/*
-		 * On 32 bit systems pos is 64 signed, while ino is 32 bit
-		 * unsigned so fake_ino may overflow, check for this.
-		 */
-		if ((ino_t)(ctx->pos + 1) != (u64)(ctx->pos + 1)) {
-			vbg_err("vboxsf: fake ino overflow, truncating dir\n");
-			return false;
-		}
-		fake_ino = ctx->pos + 1;
-
-		if (sbi->nls) {
-			char d_name[NAME_MAX];
-
-			err = vboxsf_nlscpy(sbi, d_name, NAME_MAX,
-					    info->name.string.utf8,
-					    info->name.length);
-			if (err) {
-				/* skip erroneous entry and proceed */
-				ctx->pos += 1;
-				goto try_next_entry;
-			}
-
-			return dir_emit(ctx, d_name, strlen(d_name),
-					fake_ino, d_type);
-		}
-
-		return dir_emit(ctx, info->name.string.utf8, info->name.length,
-				fake_ino, d_type);
-	}
-
-	return false;
-}
-
-static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx)
-{
-	bool keep_iterating;
-
-	for (keep_iterating = true; keep_iterating; ctx->pos += 1)
-		keep_iterating = vboxsf_dir_emit(dir, ctx);
-
-	return 0;
-}
-
-const struct file_operations vboxsf_dir_fops = {
-	.open = vboxsf_dir_open,
-	.iterate = vboxsf_dir_iterate,
-	.release = vboxsf_dir_release,
-	.read = generic_read_dir,
-	.llseek = generic_file_llseek,
-};
-
-/*
- * This is called during name resolution/lookup to check if the @dentry in
- * the cache is still valid. the job is handled by vboxsf_inode_revalidate.
- */
-static int vboxsf_dentry_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	if (d_really_is_positive(dentry))
-		return vboxsf_inode_revalidate(dentry) == 0;
-	else
-		return vboxsf_stat_dentry(dentry, NULL) == -ENOENT;
-}
-
-const struct dentry_operations vboxsf_dentry_ops = {
-	.d_revalidate = vboxsf_dentry_revalidate
-};
-
-/* iops */
-
-static struct dentry *vboxsf_dir_lookup(struct inode *parent,
-					struct dentry *dentry,
-					unsigned int flags)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
-	struct shfl_fsobjinfo fsinfo;
-	struct inode *inode;
-	int err;
-
-	dentry->d_time = jiffies;
-
-	err = vboxsf_stat_dentry(dentry, &fsinfo);
-	if (err) {
-		inode = (err == -ENOENT) ? NULL : ERR_PTR(err);
-	} else {
-		inode = vboxsf_new_inode(parent->i_sb);
-		if (!IS_ERR(inode))
-			vboxsf_init_inode(sbi, inode, &fsinfo);
-	}
-
-	return d_splice_alias(inode, dentry);
-}
-
-static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry,
-				  struct shfl_fsobjinfo *info)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
-	struct vboxsf_inode *sf_i;
-	struct inode *inode;
-
-	inode = vboxsf_new_inode(parent->i_sb);
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
-	sf_i = VBOXSF_I(inode);
-	/* The host may have given us different attr then requested */
-	sf_i->force_restat = 1;
-	vboxsf_init_inode(sbi, inode, info);
-
-	d_instantiate(dentry, inode);
-
-	return 0;
-}
-
-static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
-			     umode_t mode, int is_dir)
-{
-	struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
-	struct shfl_createparms params = {};
-	int err;
-
-	params.handle = SHFL_HANDLE_NIL;
-	params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW |
-			      SHFL_CF_ACT_FAIL_IF_EXISTS |
-			      SHFL_CF_ACCESS_READWRITE |
-			      (is_dir ? SHFL_CF_DIRECTORY : 0);
-	params.info.attr.mode = (mode & 0777) |
-				(is_dir ? SHFL_TYPE_DIRECTORY : SHFL_TYPE_FILE);
-	params.info.attr.additional = SHFLFSOBJATTRADD_NOTHING;
-
-	err = vboxsf_create_at_dentry(dentry, &params);
-	if (err)
-		return err;
-
-	if (params.result != SHFL_FILE_CREATED)
-		return -EPERM;
-
-	vboxsf_close(sbi->root, params.handle);
-
-	err = vboxsf_dir_instantiate(parent, dentry, &params.info);
-	if (err)
-		return err;
-
-	/* parent directory access/change time changed */
-	sf_parent_i->force_restat = 1;
-
-	return 0;
-}
-
-static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry,
-			     umode_t mode, bool excl)
-{
-	return vboxsf_dir_create(parent, dentry, mode, 0);
-}
-
-static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry,
-			    umode_t mode)
-{
-	return vboxsf_dir_create(parent, dentry, mode, 1);
-}
-
-static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
-	struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
-	struct inode *inode = d_inode(dentry);
-	struct shfl_string *path;
-	u32 flags;
-	int err;
-
-	if (S_ISDIR(inode->i_mode))
-		flags = SHFL_REMOVE_DIR;
-	else
-		flags = SHFL_REMOVE_FILE;
-
-	if (S_ISLNK(inode->i_mode))
-		flags |= SHFL_REMOVE_SYMLINK;
-
-	path = vboxsf_path_from_dentry(sbi, dentry);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
-	err = vboxsf_remove(sbi->root, path, flags);
-	__putname(path);
-	if (err)
-		return err;
-
-	/* parent directory access/change time changed */
-	sf_parent_i->force_restat = 1;
-
-	return 0;
-}
-
-static int vboxsf_dir_rename(struct inode *old_parent,
-			     struct dentry *old_dentry,
-			     struct inode *new_parent,
-			     struct dentry *new_dentry,
-			     unsigned int flags)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(old_parent->i_sb);
-	struct vboxsf_inode *sf_old_parent_i = VBOXSF_I(old_parent);
-	struct vboxsf_inode *sf_new_parent_i = VBOXSF_I(new_parent);
-	u32 shfl_flags = SHFL_RENAME_FILE | SHFL_RENAME_REPLACE_IF_EXISTS;
-	struct shfl_string *old_path, *new_path;
-	int err;
-
-	if (flags)
-		return -EINVAL;
-
-	old_path = vboxsf_path_from_dentry(sbi, old_dentry);
-	if (IS_ERR(old_path))
-		return PTR_ERR(old_path);
-
-	new_path = vboxsf_path_from_dentry(sbi, new_dentry);
-	if (IS_ERR(new_path)) {
-		err = PTR_ERR(new_path);
-		goto err_put_old_path;
-	}
-
-	if (d_inode(old_dentry)->i_mode & S_IFDIR)
-		shfl_flags = 0;
-
-	err = vboxsf_rename(sbi->root, old_path, new_path, shfl_flags);
-	if (err == 0) {
-		/* parent directories access/change time changed */
-		sf_new_parent_i->force_restat = 1;
-		sf_old_parent_i->force_restat = 1;
-	}
-
-	__putname(new_path);
-err_put_old_path:
-	__putname(old_path);
-	return err;
-}
-
-static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry,
-			      const char *symname)
-{
-	struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
-	int symname_size = strlen(symname) + 1;
-	struct shfl_string *path, *ssymname;
-	struct shfl_fsobjinfo info;
-	int err;
-
-	path = vboxsf_path_from_dentry(sbi, dentry);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
-	ssymname = kmalloc(SHFLSTRING_HEADER_SIZE + symname_size, GFP_KERNEL);
-	if (!ssymname) {
-		__putname(path);
-		return -ENOMEM;
-	}
-	ssymname->length = symname_size - 1;
-	ssymname->size = symname_size;
-	memcpy(ssymname->string.utf8, symname, symname_size);
-
-	err = vboxsf_symlink(sbi->root, path, ssymname, &info);
-	kfree(ssymname);
-	__putname(path);
-	if (err) {
-		/* -EROFS means symlinks are note support -> -EPERM */
-		return (err == -EROFS) ? -EPERM : err;
-	}
-
-	err = vboxsf_dir_instantiate(parent, dentry, &info);
-	if (err)
-		return err;
-
-	/* parent directory access/change time changed */
-	sf_parent_i->force_restat = 1;
-	return 0;
-}
-
-const struct inode_operations vboxsf_dir_iops = {
-	.lookup  = vboxsf_dir_lookup,
-	.create  = vboxsf_dir_mkfile,
-	.mkdir   = vboxsf_dir_mkdir,
-	.rmdir   = vboxsf_dir_unlink,
-	.unlink  = vboxsf_dir_unlink,
-	.rename  = vboxsf_dir_rename,
-	.symlink = vboxsf_dir_symlink,
-	.getattr = vboxsf_getattr,
-	.setattr = vboxsf_setattr,
-};
diff --git a/drivers/staging/vboxsf/file.c b/drivers/staging/vboxsf/file.c
deleted file mode 100644
index 4b61ccf83fca..000000000000
--- a/drivers/staging/vboxsf/file.c
+++ /dev/null
@@ -1,370 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * VirtualBox Guest Shared Folders support: Regular file inode and file ops.
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/sizes.h>
-#include "vfsmod.h"
-
-struct vboxsf_handle {
-	u64 handle;
-	u32 root;
-	u32 access_flags;
-	struct kref refcount;
-	struct list_head head;
-};
-
-static int vboxsf_file_open(struct inode *inode, struct file *file)
-{
-	struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-	struct shfl_createparms params = {};
-	struct vboxsf_handle *sf_handle;
-	u32 access_flags = 0;
-	int err;
-
-	sf_handle = kmalloc(sizeof(*sf_handle), GFP_KERNEL);
-	if (!sf_handle)
-		return -ENOMEM;
-
-	/*
-	 * We check the value of params.handle afterwards to find out if
-	 * the call succeeded or failed, as the API does not seem to cleanly
-	 * distinguish error and informational messages.
-	 *
-	 * Furthermore, we must set params.handle to SHFL_HANDLE_NIL to
-	 * make the shared folders host service use our mode parameter.
-	 */
-	params.handle = SHFL_HANDLE_NIL;
-	if (file->f_flags & O_CREAT) {
-		params.create_flags |= SHFL_CF_ACT_CREATE_IF_NEW;
-		/*
-		 * We ignore O_EXCL, as the Linux kernel seems to call create
-		 * beforehand itself, so O_EXCL should always fail.
-		 */
-		if (file->f_flags & O_TRUNC)
-			params.create_flags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
-		else
-			params.create_flags |= SHFL_CF_ACT_OPEN_IF_EXISTS;
-	} else {
-		params.create_flags |= SHFL_CF_ACT_FAIL_IF_NEW;
-		if (file->f_flags & O_TRUNC)
-			params.create_flags |= SHFL_CF_ACT_OVERWRITE_IF_EXISTS;
-	}
-
-	switch (file->f_flags & O_ACCMODE) {
-	case O_RDONLY:
-		access_flags |= SHFL_CF_ACCESS_READ;
-		break;
-
-	case O_WRONLY:
-		access_flags |= SHFL_CF_ACCESS_WRITE;
-		break;
-
-	case O_RDWR:
-		access_flags |= SHFL_CF_ACCESS_READWRITE;
-		break;
-
-	default:
-		WARN_ON(1);
-	}
-
-	if (file->f_flags & O_APPEND)
-		access_flags |= SHFL_CF_ACCESS_APPEND;
-
-	params.create_flags |= access_flags;
-	params.info.attr.mode = inode->i_mode;
-
-	err = vboxsf_create_at_dentry(file_dentry(file), &params);
-	if (err == 0 && params.handle == SHFL_HANDLE_NIL)
-		err = (params.result == SHFL_FILE_EXISTS) ? -EEXIST : -ENOENT;
-	if (err) {
-		kfree(sf_handle);
-		return err;
-	}
-
-	/* the host may have given us different attr then requested */
-	sf_i->force_restat = 1;
-
-	/* init our handle struct and add it to the inode's handles list */
-	sf_handle->handle = params.handle;
-	sf_handle->root = VBOXSF_SBI(inode->i_sb)->root;
-	sf_handle->access_flags = access_flags;
-	kref_init(&sf_handle->refcount);
-
-	mutex_lock(&sf_i->handle_list_mutex);
-	list_add(&sf_handle->head, &sf_i->handle_list);
-	mutex_unlock(&sf_i->handle_list_mutex);
-
-	file->private_data = sf_handle;
-	return 0;
-}
-
-static void vboxsf_handle_release(struct kref *refcount)
-{
-	struct vboxsf_handle *sf_handle =
-		container_of(refcount, struct vboxsf_handle, refcount);
-
-	vboxsf_close(sf_handle->root, sf_handle->handle);
-	kfree(sf_handle);
-}
-
-static int vboxsf_file_release(struct inode *inode, struct file *file)
-{
-	struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-	struct vboxsf_handle *sf_handle = file->private_data;
-
-	/*
-	 * When a file is closed on our (the guest) side, we want any subsequent
-	 * accesses done on the host side to see all changes done from our side.
-	 */
-	filemap_write_and_wait(inode->i_mapping);
-
-	mutex_lock(&sf_i->handle_list_mutex);
-	list_del(&sf_handle->head);
-	mutex_unlock(&sf_i->handle_list_mutex);
-
-	kref_put(&sf_handle->refcount, vboxsf_handle_release);
-	return 0;
-}
-
-/*
- * Write back dirty pages now, because there may not be any suitable
- * open files later
- */
-static void vboxsf_vma_close(struct vm_area_struct *vma)
-{
-	filemap_write_and_wait(vma->vm_file->f_mapping);
-}
-
-static const struct vm_operations_struct vboxsf_file_vm_ops = {
-	.close		= vboxsf_vma_close,
-	.fault		= filemap_fault,
-	.map_pages	= filemap_map_pages,
-};
-
-static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int err;
-
-	err = generic_file_mmap(file, vma);
-	if (!err)
-		vma->vm_ops = &vboxsf_file_vm_ops;
-
-	return err;
-}
-
-/*
- * Note that since we are accessing files on the host's filesystem, files
- * may always be changed underneath us by the host!
- *
- * The vboxsf API between the guest and the host does not offer any functions
- * to deal with this. There is no inode-generation to check for changes, no
- * events / callback on changes and no way to lock files.
- *
- * To avoid returning stale data when a file gets *opened* on our (the guest)
- * side, we do a "stat" on the host side, then compare the mtime with the
- * last known mtime and invalidate the page-cache if they differ.
- * This is done from vboxsf_inode_revalidate().
- *
- * When reads are done through the read_iter fop, it is possible to do
- * further cache revalidation then, there are 3 options to deal with this:
- *
- * 1)  Rely solely on the revalidation done at open time
- * 2)  Do another "stat" and compare mtime again. Unfortunately the vboxsf
- *     host API does not allow stat on handles, so we would need to use
- *     file->f_path.dentry and the stat will then fail if the file was unlinked
- *     or renamed (and there is no thing like NFS' silly-rename). So we get:
- * 2a) "stat" and compare mtime, on stat failure invalidate the cache
- * 2b) "stat" and compare mtime, on stat failure do nothing
- * 3)  Simply always call invalidate_inode_pages2_range on the range of the read
- *
- * Currently we are keeping things KISS and using option 1. this allows
- * directly using generic_file_read_iter without wrapping it.
- *
- * This means that only data written on the host side before open() on
- * the guest side is guaranteed to be seen by the guest. If necessary
- * we may provide other read-cache strategies in the future and make this
- * configurable through a mount option.
- */
-const struct file_operations vboxsf_reg_fops = {
-	.llseek = generic_file_llseek,
-	.read_iter = generic_file_read_iter,
-	.write_iter = generic_file_write_iter,
-	.mmap = vboxsf_file_mmap,
-	.open = vboxsf_file_open,
-	.release = vboxsf_file_release,
-	.fsync = noop_fsync,
-	.splice_read = generic_file_splice_read,
-};
-
-const struct inode_operations vboxsf_reg_iops = {
-	.getattr = vboxsf_getattr,
-	.setattr = vboxsf_setattr
-};
-
-static int vboxsf_readpage(struct file *file, struct page *page)
-{
-	struct vboxsf_handle *sf_handle = file->private_data;
-	loff_t off = page_offset(page);
-	u32 nread = PAGE_SIZE;
-	u8 *buf;
-	int err;
-
-	buf = kmap(page);
-
-	err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf);
-	if (err == 0) {
-		memset(&buf[nread], 0, PAGE_SIZE - nread);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	} else {
-		SetPageError(page);
-	}
-
-	kunmap(page);
-	unlock_page(page);
-	return err;
-}
-
-static struct vboxsf_handle *vboxsf_get_write_handle(struct vboxsf_inode *sf_i)
-{
-	struct vboxsf_handle *h, *sf_handle = NULL;
-
-	mutex_lock(&sf_i->handle_list_mutex);
-	list_for_each_entry(h, &sf_i->handle_list, head) {
-		if (h->access_flags == SHFL_CF_ACCESS_WRITE ||
-		    h->access_flags == SHFL_CF_ACCESS_READWRITE) {
-			kref_get(&h->refcount);
-			sf_handle = h;
-			break;
-		}
-	}
-	mutex_unlock(&sf_i->handle_list_mutex);
-
-	return sf_handle;
-}
-
-static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct inode *inode = page->mapping->host;
-	struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-	struct vboxsf_handle *sf_handle;
-	loff_t off = page_offset(page);
-	loff_t size = i_size_read(inode);
-	u32 nwrite = PAGE_SIZE;
-	u8 *buf;
-	int err;
-
-	if (off + PAGE_SIZE > size)
-		nwrite = size & ~PAGE_MASK;
-
-	sf_handle = vboxsf_get_write_handle(sf_i);
-	if (!sf_handle)
-		return -EBADF;
-
-	buf = kmap(page);
-	err = vboxsf_write(sf_handle->root, sf_handle->handle,
-			   off, &nwrite, buf);
-	kunmap(page);
-
-	kref_put(&sf_handle->refcount, vboxsf_handle_release);
-
-	if (err == 0) {
-		ClearPageError(page);
-		/* mtime changed */
-		sf_i->force_restat = 1;
-	} else {
-		ClearPageUptodate(page);
-	}
-
-	unlock_page(page);
-	return err;
-}
-
-static int vboxsf_write_end(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned int len, unsigned int copied,
-			    struct page *page, void *fsdata)
-{
-	struct inode *inode = mapping->host;
-	struct vboxsf_handle *sf_handle = file->private_data;
-	unsigned int from = pos & ~PAGE_MASK;
-	u32 nwritten = len;
-	u8 *buf;
-	int err;
-
-	buf = kmap(page);
-	err = vboxsf_write(sf_handle->root, sf_handle->handle,
-			   pos, &nwritten, buf + from);
-	kunmap(page);
-
-	if (err) {
-		nwritten = 0;
-		goto out;
-	}
-
-	/* mtime changed */
-	VBOXSF_I(inode)->force_restat = 1;
-
-	if (!PageUptodate(page) && nwritten == PAGE_SIZE)
-		SetPageUptodate(page);
-
-	pos += nwritten;
-	if (pos > inode->i_size)
-		i_size_write(inode, pos);
-
-out:
-	unlock_page(page);
-	put_page(page);
-
-	return nwritten;
-}
-
-const struct address_space_operations vboxsf_reg_aops = {
-	.readpage = vboxsf_readpage,
-	.writepage = vboxsf_writepage,
-	.set_page_dirty = __set_page_dirty_nobuffers,
-	.write_begin = simple_write_begin,
-	.write_end = vboxsf_write_end,
-};
-
-static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode,
-				   struct delayed_call *done)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb);
-	struct shfl_string *path;
-	char *link;
-	int err;
-
-	if (!dentry)
-		return ERR_PTR(-ECHILD);
-
-	path = vboxsf_path_from_dentry(sbi, dentry);
-	if (IS_ERR(path))
-		return (char *)path;
-
-	link = kzalloc(PATH_MAX, GFP_KERNEL);
-	if (!link) {
-		__putname(path);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	err = vboxsf_readlink(sbi->root, path, PATH_MAX, link);
-	__putname(path);
-	if (err) {
-		kfree(link);
-		return ERR_PTR(err);
-	}
-
-	set_delayed_call(done, kfree_link, link);
-	return link;
-}
-
-const struct inode_operations vboxsf_lnk_iops = {
-	.get_link = vboxsf_get_link
-};
diff --git a/drivers/staging/vboxsf/shfl_hostintf.h b/drivers/staging/vboxsf/shfl_hostintf.h
deleted file mode 100644
index aca829062c12..000000000000
--- a/drivers/staging/vboxsf/shfl_hostintf.h
+++ /dev/null
@@ -1,901 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * VirtualBox Shared Folders: host interface definition.
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#ifndef SHFL_HOSTINTF_H
-#define SHFL_HOSTINTF_H
-
-#include <linux/vbox_vmmdev_types.h>
-
-/* The max in/out buffer size for a FN_READ or FN_WRITE call */
-#define SHFL_MAX_RW_COUNT           (16 * SZ_1M)
-
-/*
- * Structures shared between guest and the service
- * can be relocated and use offsets to point to variable
- * length parts.
- *
- * Shared folders protocol works with handles.
- * Before doing any action on a file system object,
- * one have to obtain the object handle via a SHFL_FN_CREATE
- * request. A handle must be closed with SHFL_FN_CLOSE.
- */
-
-enum {
-	SHFL_FN_QUERY_MAPPINGS = 1,	/* Query mappings changes. */
-	SHFL_FN_QUERY_MAP_NAME = 2,	/* Query map name. */
-	SHFL_FN_CREATE = 3,		/* Open/create object. */
-	SHFL_FN_CLOSE = 4,		/* Close object handle. */
-	SHFL_FN_READ = 5,		/* Read object content. */
-	SHFL_FN_WRITE = 6,		/* Write new object content. */
-	SHFL_FN_LOCK = 7,		/* Lock/unlock a range in the object. */
-	SHFL_FN_LIST = 8,		/* List object content. */
-	SHFL_FN_INFORMATION = 9,	/* Query/set object information. */
-	/* Note function number 10 is not used! */
-	SHFL_FN_REMOVE = 11,		/* Remove object */
-	SHFL_FN_MAP_FOLDER_OLD = 12,	/* Map folder (legacy) */
-	SHFL_FN_UNMAP_FOLDER = 13,	/* Unmap folder */
-	SHFL_FN_RENAME = 14,		/* Rename object */
-	SHFL_FN_FLUSH = 15,		/* Flush file */
-	SHFL_FN_SET_UTF8 = 16,		/* Select UTF8 filename encoding */
-	SHFL_FN_MAP_FOLDER = 17,	/* Map folder */
-	SHFL_FN_READLINK = 18,		/* Read symlink dest (as of VBox 4.0) */
-	SHFL_FN_SYMLINK = 19,		/* Create symlink (as of VBox 4.0) */
-	SHFL_FN_SET_SYMLINKS = 20,	/* Ask host to show symlinks (4.0+) */
-};
-
-/* Root handles for a mapping are of type u32, Root handles are unique. */
-#define SHFL_ROOT_NIL		UINT_MAX
-
-/* Shared folders handle for an opened object are of type u64. */
-#define SHFL_HANDLE_NIL		ULLONG_MAX
-
-/* Hardcoded maximum length (in chars) of a shared folder name. */
-#define SHFL_MAX_LEN         (256)
-/* Hardcoded maximum number of shared folder mapping available to the guest. */
-#define SHFL_MAX_MAPPINGS    (64)
-
-/** Shared folder string buffer structure. */
-struct shfl_string {
-	/** Allocated size of the string member in bytes. */
-	u16 size;
-
-	/** Length of string without trailing nul in bytes. */
-	u16 length;
-
-	/** UTF-8 or UTF-16 string. Nul terminated. */
-	union {
-		u8 utf8[2];
-		u16 utf16[1];
-		u16 ucs2[1]; /* misnomer, use utf16. */
-	} string;
-};
-VMMDEV_ASSERT_SIZE(shfl_string, 6);
-
-/* The size of shfl_string w/o the string part. */
-#define SHFLSTRING_HEADER_SIZE  4
-
-/* Calculate size of the string. */
-static inline u32 shfl_string_buf_size(const struct shfl_string *string)
-{
-	return string ? SHFLSTRING_HEADER_SIZE + string->size : 0;
-}
-
-/* Set user id on execution (S_ISUID). */
-#define SHFL_UNIX_ISUID             0004000U
-/* Set group id on execution (S_ISGID). */
-#define SHFL_UNIX_ISGID             0002000U
-/* Sticky bit (S_ISVTX / S_ISTXT). */
-#define SHFL_UNIX_ISTXT             0001000U
-
-/* Owner readable (S_IRUSR). */
-#define SHFL_UNIX_IRUSR             0000400U
-/* Owner writable (S_IWUSR). */
-#define SHFL_UNIX_IWUSR             0000200U
-/* Owner executable (S_IXUSR). */
-#define SHFL_UNIX_IXUSR             0000100U
-
-/* Group readable (S_IRGRP). */
-#define SHFL_UNIX_IRGRP             0000040U
-/* Group writable (S_IWGRP). */
-#define SHFL_UNIX_IWGRP             0000020U
-/* Group executable (S_IXGRP). */
-#define SHFL_UNIX_IXGRP             0000010U
-
-/* Other readable (S_IROTH). */
-#define SHFL_UNIX_IROTH             0000004U
-/* Other writable (S_IWOTH). */
-#define SHFL_UNIX_IWOTH             0000002U
-/* Other executable (S_IXOTH). */
-#define SHFL_UNIX_IXOTH             0000001U
-
-/* Named pipe (fifo) (S_IFIFO). */
-#define SHFL_TYPE_FIFO              0010000U
-/* Character device (S_IFCHR). */
-#define SHFL_TYPE_DEV_CHAR          0020000U
-/* Directory (S_IFDIR). */
-#define SHFL_TYPE_DIRECTORY         0040000U
-/* Block device (S_IFBLK). */
-#define SHFL_TYPE_DEV_BLOCK         0060000U
-/* Regular file (S_IFREG). */
-#define SHFL_TYPE_FILE              0100000U
-/* Symbolic link (S_IFLNK). */
-#define SHFL_TYPE_SYMLINK           0120000U
-/* Socket (S_IFSOCK). */
-#define SHFL_TYPE_SOCKET            0140000U
-/* Whiteout (S_IFWHT). */
-#define SHFL_TYPE_WHITEOUT          0160000U
-/* Type mask (S_IFMT). */
-#define SHFL_TYPE_MASK              0170000U
-
-/* Checks the mode flags indicate a directory (S_ISDIR). */
-#define SHFL_IS_DIRECTORY(m)   (((m) & SHFL_TYPE_MASK) == SHFL_TYPE_DIRECTORY)
-/* Checks the mode flags indicate a symbolic link (S_ISLNK). */
-#define SHFL_IS_SYMLINK(m)     (((m) & SHFL_TYPE_MASK) == SHFL_TYPE_SYMLINK)
-
-/** The available additional information in a shfl_fsobjattr object. */
-enum shfl_fsobjattr_add {
-	/** No additional information is available / requested. */
-	SHFLFSOBJATTRADD_NOTHING = 1,
-	/**
-	 * The additional unix attributes (shfl_fsobjattr::u::unix_attr) are
-	 *  available / requested.
-	 */
-	SHFLFSOBJATTRADD_UNIX,
-	/**
-	 * The additional extended attribute size (shfl_fsobjattr::u::size) is
-	 *  available / requested.
-	 */
-	SHFLFSOBJATTRADD_EASIZE,
-	/**
-	 * The last valid item (inclusive).
-	 * The valid range is SHFLFSOBJATTRADD_NOTHING thru
-	 * SHFLFSOBJATTRADD_LAST.
-	 */
-	SHFLFSOBJATTRADD_LAST = SHFLFSOBJATTRADD_EASIZE,
-
-	/** The usual 32-bit hack. */
-	SHFLFSOBJATTRADD_32BIT_SIZE_HACK = 0x7fffffff
-};
-
-/**
- * Additional unix Attributes, these are available when
- * shfl_fsobjattr.additional == SHFLFSOBJATTRADD_UNIX.
- */
-struct shfl_fsobjattr_unix {
-	/**
-	 * The user owning the filesystem object (st_uid).
-	 * This field is ~0U if not supported.
-	 */
-	u32 uid;
-
-	/**
-	 * The group the filesystem object is assigned (st_gid).
-	 * This field is ~0U if not supported.
-	 */
-	u32 gid;
-
-	/**
-	 * Number of hard links to this filesystem object (st_nlink).
-	 * This field is 1 if the filesystem doesn't support hardlinking or
-	 * the information isn't available.
-	 */
-	u32 hardlinks;
-
-	/**
-	 * The device number of the device which this filesystem object resides
-	 * on (st_dev). This field is 0 if this information is not available.
-	 */
-	u32 inode_id_device;
-
-	/**
-	 * The unique identifier (within the filesystem) of this filesystem
-	 * object (st_ino). Together with inode_id_device, this field can be
-	 * used as a OS wide unique id, when both their values are not 0.
-	 * This field is 0 if the information is not available.
-	 */
-	u64 inode_id;
-
-	/**
-	 * User flags (st_flags).
-	 * This field is 0 if this information is not available.
-	 */
-	u32 flags;
-
-	/**
-	 * The current generation number (st_gen).
-	 * This field is 0 if this information is not available.
-	 */
-	u32 generation_id;
-
-	/**
-	 * The device number of a char. or block device type object (st_rdev).
-	 * This field is 0 if the file isn't a char. or block device or when
-	 * the OS doesn't use the major+minor device idenfication scheme.
-	 */
-	u32 device;
-} __packed;
-
-/** Extended attribute size. */
-struct shfl_fsobjattr_easize {
-	/** Size of EAs. */
-	s64 cb;
-} __packed;
-
-/** Shared folder filesystem object attributes. */
-struct shfl_fsobjattr {
-	/** Mode flags (st_mode). SHFL_UNIX_*, SHFL_TYPE_*, and SHFL_DOS_*. */
-	u32 mode;
-
-	/** The additional attributes available. */
-	enum shfl_fsobjattr_add additional;
-
-	/**
-	 * Additional attributes.
-	 *
-	 * Unless explicitly specified to an API, the API can provide additional
-	 * data as it is provided by the underlying OS.
-	 */
-	union {
-		struct shfl_fsobjattr_unix unix_attr;
-		struct shfl_fsobjattr_easize size;
-	} __packed u;
-} __packed;
-VMMDEV_ASSERT_SIZE(shfl_fsobjattr, 44);
-
-struct shfl_timespec {
-	s64 ns_relative_to_unix_epoch;
-};
-
-/** Filesystem object information structure. */
-struct shfl_fsobjinfo {
-	/**
-	 * Logical size (st_size).
-	 * For normal files this is the size of the file.
-	 * For symbolic links, this is the length of the path name contained
-	 * in the symbolic link.
-	 * For other objects this fields needs to be specified.
-	 */
-	s64 size;
-
-	/** Disk allocation size (st_blocks * DEV_BSIZE). */
-	s64 allocated;
-
-	/** Time of last access (st_atime). */
-	struct shfl_timespec access_time;
-
-	/** Time of last data modification (st_mtime). */
-	struct shfl_timespec modification_time;
-
-	/**
-	 * Time of last status change (st_ctime).
-	 * If not available this is set to modification_time.
-	 */
-	struct shfl_timespec change_time;
-
-	/**
-	 * Time of file birth (st_birthtime).
-	 * If not available this is set to change_time.
-	 */
-	struct shfl_timespec birth_time;
-
-	/** Attributes. */
-	struct shfl_fsobjattr attr;
-
-} __packed;
-VMMDEV_ASSERT_SIZE(shfl_fsobjinfo, 92);
-
-/**
- * result of an open/create request.
- * Along with handle value the result code
- * identifies what has happened while
- * trying to open the object.
- */
-enum shfl_create_result {
-	SHFL_NO_RESULT,
-	/** Specified path does not exist. */
-	SHFL_PATH_NOT_FOUND,
-	/** Path to file exists, but the last component does not. */
-	SHFL_FILE_NOT_FOUND,
-	/** File already exists and either has been opened or not. */
-	SHFL_FILE_EXISTS,
-	/** New file was created. */
-	SHFL_FILE_CREATED,
-	/** Existing file was replaced or overwritten. */
-	SHFL_FILE_REPLACED
-};
-
-/* No flags. Initialization value. */
-#define SHFL_CF_NONE                  (0x00000000)
-
-/*
- * Only lookup the object, do not return a handle. When this is set all other
- * flags are ignored.
- */
-#define SHFL_CF_LOOKUP                (0x00000001)
-
-/*
- * Open parent directory of specified object.
- * Useful for the corresponding Windows FSD flag
- * and for opening paths like \\dir\\*.* to search the 'dir'.
- */
-#define SHFL_CF_OPEN_TARGET_DIRECTORY (0x00000002)
-
-/* Create/open a directory. */
-#define SHFL_CF_DIRECTORY             (0x00000004)
-
-/*
- *  Open/create action to do if object exists
- *  and if the object does not exists.
- *  REPLACE file means atomically DELETE and CREATE.
- *  OVERWRITE file means truncating the file to 0 and
- *  setting new size.
- *  When opening an existing directory REPLACE and OVERWRITE
- *  actions are considered invalid, and cause returning
- *  FILE_EXISTS with NIL handle.
- */
-#define SHFL_CF_ACT_MASK_IF_EXISTS      (0x000000f0)
-#define SHFL_CF_ACT_MASK_IF_NEW         (0x00000f00)
-
-/* What to do if object exists. */
-#define SHFL_CF_ACT_OPEN_IF_EXISTS      (0x00000000)
-#define SHFL_CF_ACT_FAIL_IF_EXISTS      (0x00000010)
-#define SHFL_CF_ACT_REPLACE_IF_EXISTS   (0x00000020)
-#define SHFL_CF_ACT_OVERWRITE_IF_EXISTS (0x00000030)
-
-/* What to do if object does not exist. */
-#define SHFL_CF_ACT_CREATE_IF_NEW       (0x00000000)
-#define SHFL_CF_ACT_FAIL_IF_NEW         (0x00000100)
-
-/* Read/write requested access for the object. */
-#define SHFL_CF_ACCESS_MASK_RW          (0x00003000)
-
-/* No access requested. */
-#define SHFL_CF_ACCESS_NONE             (0x00000000)
-/* Read access requested. */
-#define SHFL_CF_ACCESS_READ             (0x00001000)
-/* Write access requested. */
-#define SHFL_CF_ACCESS_WRITE            (0x00002000)
-/* Read/Write access requested. */
-#define SHFL_CF_ACCESS_READWRITE	(0x00003000)
-
-/* Requested share access for the object. */
-#define SHFL_CF_ACCESS_MASK_DENY        (0x0000c000)
-
-/* Allow any access. */
-#define SHFL_CF_ACCESS_DENYNONE         (0x00000000)
-/* Do not allow read. */
-#define SHFL_CF_ACCESS_DENYREAD         (0x00004000)
-/* Do not allow write. */
-#define SHFL_CF_ACCESS_DENYWRITE        (0x00008000)
-/* Do not allow access. */
-#define SHFL_CF_ACCESS_DENYALL          (0x0000c000)
-
-/* Requested access to attributes of the object. */
-#define SHFL_CF_ACCESS_MASK_ATTR        (0x00030000)
-
-/* No access requested. */
-#define SHFL_CF_ACCESS_ATTR_NONE        (0x00000000)
-/* Read access requested. */
-#define SHFL_CF_ACCESS_ATTR_READ        (0x00010000)
-/* Write access requested. */
-#define SHFL_CF_ACCESS_ATTR_WRITE       (0x00020000)
-/* Read/Write access requested. */
-#define SHFL_CF_ACCESS_ATTR_READWRITE   (0x00030000)
-
-/*
- * The file is opened in append mode.
- * Ignored if SHFL_CF_ACCESS_WRITE is not set.
- */
-#define SHFL_CF_ACCESS_APPEND           (0x00040000)
-
-/** Create parameters buffer struct for SHFL_FN_CREATE call */
-struct shfl_createparms {
-	/** Returned handle of opened object. */
-	u64 handle;
-
-	/** Returned result of the operation */
-	enum shfl_create_result result;
-
-	/** SHFL_CF_* */
-	u32 create_flags;
-
-	/**
-	 * Attributes of object to create and
-	 * returned actual attributes of opened/created object.
-	 */
-	struct shfl_fsobjinfo info;
-} __packed;
-
-/** Shared Folder directory information */
-struct shfl_dirinfo {
-	/** Full information about the object. */
-	struct shfl_fsobjinfo info;
-	/**
-	 * The length of the short field (number of UTF16 chars).
-	 * It is 16-bit for reasons of alignment.
-	 */
-	u16 short_name_len;
-	/**
-	 * The short name for 8.3 compatibility.
-	 * Empty string if not available.
-	 */
-	u16 short_name[14];
-	struct shfl_string name;
-};
-
-/** Shared folder filesystem properties. */
-struct shfl_fsproperties {
-	/**
-	 * The maximum size of a filesystem object name.
-	 * This does not include the '\\0'.
-	 */
-	u32 max_component_len;
-
-	/**
-	 * True if the filesystem is remote.
-	 * False if the filesystem is local.
-	 */
-	bool remote;
-
-	/**
-	 * True if the filesystem is case sensitive.
-	 * False if the filesystem is case insensitive.
-	 */
-	bool case_sensitive;
-
-	/**
-	 * True if the filesystem is mounted read only.
-	 * False if the filesystem is mounted read write.
-	 */
-	bool read_only;
-
-	/**
-	 * True if the filesystem can encode unicode object names.
-	 * False if it can't.
-	 */
-	bool supports_unicode;
-
-	/**
-	 * True if the filesystem is compresses.
-	 * False if it isn't or we don't know.
-	 */
-	bool compressed;
-
-	/**
-	 * True if the filesystem compresses of individual files.
-	 * False if it doesn't or we don't know.
-	 */
-	bool file_compression;
-};
-VMMDEV_ASSERT_SIZE(shfl_fsproperties, 12);
-
-struct shfl_volinfo {
-	s64 total_allocation_bytes;
-	s64 available_allocation_bytes;
-	u32 bytes_per_allocation_unit;
-	u32 bytes_per_sector;
-	u32 serial;
-	struct shfl_fsproperties properties;
-};
-
-
-/** SHFL_FN_MAP_FOLDER Parameters structure. */
-struct shfl_map_folder {
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string buffer.
-	 */
-	struct vmmdev_hgcm_function_parameter path;
-
-	/**
-	 * pointer, out: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in: UTF16
-	 * Path delimiter
-	 */
-	struct vmmdev_hgcm_function_parameter delimiter;
-
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Case senstive flag
-	 */
-	struct vmmdev_hgcm_function_parameter case_sensitive;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_MAP_FOLDER (4)
-
-
-/** SHFL_FN_UNMAP_FOLDER Parameters structure. */
-struct shfl_unmap_folder {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_UNMAP_FOLDER (1)
-
-
-/** SHFL_FN_CREATE Parameters structure. */
-struct shfl_create {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string buffer.
-	 */
-	struct vmmdev_hgcm_function_parameter path;
-
-	/**
-	 * pointer, in/out:
-	 * Points to struct shfl_createparms buffer.
-	 */
-	struct vmmdev_hgcm_function_parameter parms;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_CREATE (3)
-
-
-/** SHFL_FN_CLOSE Parameters structure. */
-struct shfl_close {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * value64, in:
-	 * SHFLHANDLE (u64) of object to close.
-	 */
-	struct vmmdev_hgcm_function_parameter handle;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_CLOSE (2)
-
-
-/** SHFL_FN_READ Parameters structure. */
-struct shfl_read {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * value64, in:
-	 * SHFLHANDLE (u64) of object to read from.
-	 */
-	struct vmmdev_hgcm_function_parameter handle;
-
-	/**
-	 * value64, in:
-	 * Offset to read from.
-	 */
-	struct vmmdev_hgcm_function_parameter offset;
-
-	/**
-	 * value64, in/out:
-	 * Bytes to read/How many were read.
-	 */
-	struct vmmdev_hgcm_function_parameter cb;
-
-	/**
-	 * pointer, out:
-	 * Buffer to place data to.
-	 */
-	struct vmmdev_hgcm_function_parameter buffer;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_READ (5)
-
-
-/** SHFL_FN_WRITE Parameters structure. */
-struct shfl_write {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * value64, in:
-	 * SHFLHANDLE (u64) of object to write to.
-	 */
-	struct vmmdev_hgcm_function_parameter handle;
-
-	/**
-	 * value64, in:
-	 * Offset to write to.
-	 */
-	struct vmmdev_hgcm_function_parameter offset;
-
-	/**
-	 * value64, in/out:
-	 * Bytes to write/How many were written.
-	 */
-	struct vmmdev_hgcm_function_parameter cb;
-
-	/**
-	 * pointer, in:
-	 * Data to write.
-	 */
-	struct vmmdev_hgcm_function_parameter buffer;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_WRITE (5)
-
-
-/*
- * SHFL_FN_LIST
- * Listing information includes variable length RTDIRENTRY[EX] structures.
- */
-
-#define SHFL_LIST_NONE			0
-#define SHFL_LIST_RETURN_ONE		1
-
-/** SHFL_FN_LIST Parameters structure. */
-struct shfl_list {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * value64, in:
-	 * SHFLHANDLE (u64) of object to be listed.
-	 */
-	struct vmmdev_hgcm_function_parameter handle;
-
-	/**
-	 * value32, in:
-	 * List flags SHFL_LIST_*.
-	 */
-	struct vmmdev_hgcm_function_parameter flags;
-
-	/**
-	 * value32, in/out:
-	 * Bytes to be used for listing information/How many bytes were used.
-	 */
-	struct vmmdev_hgcm_function_parameter cb;
-
-	/**
-	 * pointer, in/optional
-	 * Points to struct shfl_string buffer that specifies a search path.
-	 */
-	struct vmmdev_hgcm_function_parameter path;
-
-	/**
-	 * pointer, out:
-	 * Buffer to place listing information to. (struct shfl_dirinfo)
-	 */
-	struct vmmdev_hgcm_function_parameter buffer;
-
-	/**
-	 * value32, in/out:
-	 * Indicates a key where the listing must be resumed.
-	 * in: 0 means start from begin of object.
-	 * out: 0 means listing completed.
-	 */
-	struct vmmdev_hgcm_function_parameter resume_point;
-
-	/**
-	 * pointer, out:
-	 * Number of files returned
-	 */
-	struct vmmdev_hgcm_function_parameter file_count;
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_LIST (8)
-
-
-/** SHFL_FN_READLINK Parameters structure. */
-struct shfl_readLink {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string buffer.
-	 */
-	struct vmmdev_hgcm_function_parameter path;
-
-	/**
-	 * pointer, out:
-	 * Buffer to place data to.
-	 */
-	struct vmmdev_hgcm_function_parameter buffer;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_READLINK (3)
-
-
-/* SHFL_FN_INFORMATION */
-
-/* Mask of Set/Get bit. */
-#define SHFL_INFO_MODE_MASK    (0x1)
-/* Get information */
-#define SHFL_INFO_GET          (0x0)
-/* Set information */
-#define SHFL_INFO_SET          (0x1)
-
-/* Get name of the object. */
-#define SHFL_INFO_NAME         (0x2)
-/* Set size of object (extend/trucate); only applies to file objects */
-#define SHFL_INFO_SIZE         (0x4)
-/* Get/Set file object info. */
-#define SHFL_INFO_FILE         (0x8)
-/* Get volume information. */
-#define SHFL_INFO_VOLUME       (0x10)
-
-/** SHFL_FN_INFORMATION Parameters structure. */
-struct shfl_information {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * value64, in:
-	 * SHFLHANDLE (u64) of object to be listed.
-	 */
-	struct vmmdev_hgcm_function_parameter handle;
-
-	/**
-	 * value32, in:
-	 * SHFL_INFO_*
-	 */
-	struct vmmdev_hgcm_function_parameter flags;
-
-	/**
-	 * value32, in/out:
-	 * Bytes to be used for information/How many bytes were used.
-	 */
-	struct vmmdev_hgcm_function_parameter cb;
-
-	/**
-	 * pointer, in/out:
-	 * Information to be set/get (shfl_fsobjinfo or shfl_string). Do not
-	 * forget to set the shfl_fsobjinfo::attr::additional for a get
-	 * operation as well.
-	 */
-	struct vmmdev_hgcm_function_parameter info;
-
-};
-
-/* Number of parameters */
-#define SHFL_CPARMS_INFORMATION (5)
-
-
-/* SHFL_FN_REMOVE */
-
-#define SHFL_REMOVE_FILE        (0x1)
-#define SHFL_REMOVE_DIR         (0x2)
-#define SHFL_REMOVE_SYMLINK     (0x4)
-
-/** SHFL_FN_REMOVE Parameters structure. */
-struct shfl_remove {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string buffer.
-	 */
-	struct vmmdev_hgcm_function_parameter path;
-
-	/**
-	 * value32, in:
-	 * remove flags (file/directory)
-	 */
-	struct vmmdev_hgcm_function_parameter flags;
-
-};
-
-#define SHFL_CPARMS_REMOVE  (3)
-
-
-/* SHFL_FN_RENAME */
-
-#define SHFL_RENAME_FILE                (0x1)
-#define SHFL_RENAME_DIR                 (0x2)
-#define SHFL_RENAME_REPLACE_IF_EXISTS   (0x4)
-
-/** SHFL_FN_RENAME Parameters structure. */
-struct shfl_rename {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string src.
-	 */
-	struct vmmdev_hgcm_function_parameter src;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string dest.
-	 */
-	struct vmmdev_hgcm_function_parameter dest;
-
-	/**
-	 * value32, in:
-	 * rename flags (file/directory)
-	 */
-	struct vmmdev_hgcm_function_parameter flags;
-
-};
-
-#define SHFL_CPARMS_RENAME  (4)
-
-
-/** SHFL_FN_SYMLINK Parameters structure. */
-struct shfl_symlink {
-	/**
-	 * pointer, in: SHFLROOT (u32)
-	 * Root handle of the mapping which name is queried.
-	 */
-	struct vmmdev_hgcm_function_parameter root;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string of path for the new symlink.
-	 */
-	struct vmmdev_hgcm_function_parameter new_path;
-
-	/**
-	 * pointer, in:
-	 * Points to struct shfl_string of destination for symlink.
-	 */
-	struct vmmdev_hgcm_function_parameter old_path;
-
-	/**
-	 * pointer, out:
-	 * Information about created symlink.
-	 */
-	struct vmmdev_hgcm_function_parameter info;
-
-};
-
-#define SHFL_CPARMS_SYMLINK  (4)
-
-#endif
diff --git a/drivers/staging/vboxsf/super.c b/drivers/staging/vboxsf/super.c
deleted file mode 100644
index 0bf4d724aefd..000000000000
--- a/drivers/staging/vboxsf/super.c
+++ /dev/null
@@ -1,501 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * VirtualBox Guest Shared Folders support: Virtual File System.
- *
- * Module initialization/finalization
- * File system registration/deregistration
- * Superblock reading
- * Few utility functions
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#include <linux/idr.h>
-#include <linux/fs_parser.h>
-#include <linux/magic.h>
-#include <linux/module.h>
-#include <linux/nls.h>
-#include <linux/statfs.h>
-#include <linux/vbox_utils.h>
-#include "vfsmod.h"
-
-#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */
-
-#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000')
-#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377')
-#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376')
-#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375')
-
-static int follow_symlinks;
-module_param(follow_symlinks, int, 0444);
-MODULE_PARM_DESC(follow_symlinks,
-		 "Let host resolve symlinks rather than showing them");
-
-static DEFINE_IDA(vboxsf_bdi_ida);
-static DEFINE_MUTEX(vboxsf_setup_mutex);
-static bool vboxsf_setup_done;
-static struct super_operations vboxsf_super_ops; /* forward declaration */
-static struct kmem_cache *vboxsf_inode_cachep;
-
-static char * const vboxsf_default_nls = CONFIG_NLS_DEFAULT;
-
-enum  { opt_nls, opt_uid, opt_gid, opt_ttl, opt_dmode, opt_fmode,
-	opt_dmask, opt_fmask };
-
-static const struct fs_parameter_spec vboxsf_param_specs[] = {
-	fsparam_string	("nls",		opt_nls),
-	fsparam_u32	("uid",		opt_uid),
-	fsparam_u32	("gid",		opt_gid),
-	fsparam_u32	("ttl",		opt_ttl),
-	fsparam_u32oct	("dmode",	opt_dmode),
-	fsparam_u32oct	("fmode",	opt_fmode),
-	fsparam_u32oct	("dmask",	opt_dmask),
-	fsparam_u32oct	("fmask",	opt_fmask),
-	{}
-};
-
-static const struct fs_parameter_description vboxsf_fs_parameters = {
-	.name  = "vboxsf",
-	.specs  = vboxsf_param_specs,
-};
-
-static int vboxsf_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
-	struct vboxsf_fs_context *ctx = fc->fs_private;
-	struct fs_parse_result result;
-	kuid_t uid;
-	kgid_t gid;
-	int opt;
-
-	opt = fs_parse(fc, &vboxsf_fs_parameters, param, &result);
-	if (opt < 0)
-		return opt;
-
-	switch (opt) {
-	case opt_nls:
-		if (fc->purpose != FS_CONTEXT_FOR_MOUNT) {
-			vbg_err("vboxsf: Cannot reconfigure nls option\n");
-			return -EINVAL;
-		}
-		ctx->nls_name = param->string;
-		param->string = NULL;
-		break;
-	case opt_uid:
-		uid = make_kuid(current_user_ns(), result.uint_32);
-		if (!uid_valid(uid))
-			return -EINVAL;
-		ctx->o.uid = uid;
-		break;
-	case opt_gid:
-		gid = make_kgid(current_user_ns(), result.uint_32);
-		if (!gid_valid(gid))
-			return -EINVAL;
-		ctx->o.gid = gid;
-		break;
-	case opt_ttl:
-		ctx->o.ttl = msecs_to_jiffies(result.uint_32);
-		break;
-	case opt_dmode:
-		if (result.uint_32 & ~0777)
-			return -EINVAL;
-		ctx->o.dmode = result.uint_32;
-		ctx->o.dmode_set = true;
-		break;
-	case opt_fmode:
-		if (result.uint_32 & ~0777)
-			return -EINVAL;
-		ctx->o.fmode = result.uint_32;
-		ctx->o.fmode_set = true;
-		break;
-	case opt_dmask:
-		if (result.uint_32 & ~07777)
-			return -EINVAL;
-		ctx->o.dmask = result.uint_32;
-		break;
-	case opt_fmask:
-		if (result.uint_32 & ~07777)
-			return -EINVAL;
-		ctx->o.fmask = result.uint_32;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
-{
-	struct vboxsf_fs_context *ctx = fc->fs_private;
-	struct shfl_string *folder_name, root_path;
-	struct vboxsf_sbi *sbi;
-	struct dentry *droot;
-	struct inode *iroot;
-	char *nls_name;
-	size_t size;
-	int err;
-
-	if (!fc->source)
-		return -EINVAL;
-
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
-		return -ENOMEM;
-
-	sbi->o = ctx->o;
-	idr_init(&sbi->ino_idr);
-	spin_lock_init(&sbi->ino_idr_lock);
-	sbi->next_generation = 1;
-	sbi->bdi_id = -1;
-
-	/* Load nls if not utf8 */
-	nls_name = ctx->nls_name ? ctx->nls_name : vboxsf_default_nls;
-	if (strcmp(nls_name, "utf8") != 0) {
-		if (nls_name == vboxsf_default_nls)
-			sbi->nls = load_nls_default();
-		else
-			sbi->nls = load_nls(nls_name);
-
-		if (!sbi->nls) {
-			vbg_err("vboxsf: Count not load '%s' nls\n", nls_name);
-			err = -EINVAL;
-			goto fail_free;
-		}
-	}
-
-	sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL);
-	if (sbi->bdi_id < 0) {
-		err = sbi->bdi_id;
-		goto fail_free;
-	}
-
-	err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id);
-	if (err)
-		goto fail_free;
-
-	/* Turn source into a shfl_string and map the folder */
-	size = strlen(fc->source) + 1;
-	folder_name = kmalloc(SHFLSTRING_HEADER_SIZE + size, GFP_KERNEL);
-	if (!folder_name) {
-		err = -ENOMEM;
-		goto fail_free;
-	}
-	folder_name->size = size;
-	folder_name->length = size - 1;
-	strlcpy(folder_name->string.utf8, fc->source, size);
-	err = vboxsf_map_folder(folder_name, &sbi->root);
-	kfree(folder_name);
-	if (err) {
-		vbg_err("vboxsf: Host rejected mount of '%s' with error %d\n",
-			fc->source, err);
-		goto fail_free;
-	}
-
-	root_path.length = 1;
-	root_path.size = 2;
-	root_path.string.utf8[0] = '/';
-	root_path.string.utf8[1] = 0;
-	err = vboxsf_stat(sbi, &root_path, &sbi->root_info);
-	if (err)
-		goto fail_unmap;
-
-	sb->s_magic = VBOXSF_SUPER_MAGIC;
-	sb->s_blocksize = 1024;
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_op = &vboxsf_super_ops;
-	sb->s_d_op = &vboxsf_dentry_ops;
-
-	iroot = iget_locked(sb, 0);
-	if (!iroot) {
-		err = -ENOMEM;
-		goto fail_unmap;
-	}
-	vboxsf_init_inode(sbi, iroot, &sbi->root_info);
-	unlock_new_inode(iroot);
-
-	droot = d_make_root(iroot);
-	if (!droot) {
-		err = -ENOMEM;
-		goto fail_unmap;
-	}
-
-	sb->s_root = droot;
-	sb->s_fs_info = sbi;
-	return 0;
-
-fail_unmap:
-	vboxsf_unmap_folder(sbi->root);
-fail_free:
-	if (sbi->bdi_id >= 0)
-		ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
-	if (sbi->nls)
-		unload_nls(sbi->nls);
-	idr_destroy(&sbi->ino_idr);
-	kfree(sbi);
-	return err;
-}
-
-static void vboxsf_inode_init_once(void *data)
-{
-	struct vboxsf_inode *sf_i = data;
-
-	mutex_init(&sf_i->handle_list_mutex);
-	inode_init_once(&sf_i->vfs_inode);
-}
-
-static struct inode *vboxsf_alloc_inode(struct super_block *sb)
-{
-	struct vboxsf_inode *sf_i;
-
-	sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS);
-	if (!sf_i)
-		return NULL;
-
-	sf_i->force_restat = 0;
-	INIT_LIST_HEAD(&sf_i->handle_list);
-
-	return &sf_i->vfs_inode;
-}
-
-static void vboxsf_free_inode(struct inode *inode)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb);
-	unsigned long flags;
-
-	spin_lock_irqsave(&sbi->ino_idr_lock, flags);
-	idr_remove(&sbi->ino_idr, inode->i_ino);
-	spin_unlock_irqrestore(&sbi->ino_idr_lock, flags);
-	kmem_cache_free(vboxsf_inode_cachep, VBOXSF_I(inode));
-}
-
-static void vboxsf_put_super(struct super_block *sb)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(sb);
-
-	vboxsf_unmap_folder(sbi->root);
-	if (sbi->bdi_id >= 0)
-		ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
-	if (sbi->nls)
-		unload_nls(sbi->nls);
-
-	/*
-	 * vboxsf_free_inode uses the idr, make sure all delayed rcu free
-	 * inodes are flushed.
-	 */
-	rcu_barrier();
-	idr_destroy(&sbi->ino_idr);
-	kfree(sbi);
-}
-
-static int vboxsf_statfs(struct dentry *dentry, struct kstatfs *stat)
-{
-	struct super_block *sb = dentry->d_sb;
-	struct shfl_volinfo shfl_volinfo;
-	struct vboxsf_sbi *sbi;
-	u32 buf_len;
-	int err;
-
-	sbi = VBOXSF_SBI(sb);
-	buf_len = sizeof(shfl_volinfo);
-	err = vboxsf_fsinfo(sbi->root, 0, SHFL_INFO_GET | SHFL_INFO_VOLUME,
-			    &buf_len, &shfl_volinfo);
-	if (err)
-		return err;
-
-	stat->f_type = VBOXSF_SUPER_MAGIC;
-	stat->f_bsize = shfl_volinfo.bytes_per_allocation_unit;
-
-	do_div(shfl_volinfo.total_allocation_bytes,
-	       shfl_volinfo.bytes_per_allocation_unit);
-	stat->f_blocks = shfl_volinfo.total_allocation_bytes;
-
-	do_div(shfl_volinfo.available_allocation_bytes,
-	       shfl_volinfo.bytes_per_allocation_unit);
-	stat->f_bfree  = shfl_volinfo.available_allocation_bytes;
-	stat->f_bavail = shfl_volinfo.available_allocation_bytes;
-
-	stat->f_files = 1000;
-	/*
-	 * Don't return 0 here since the guest may then think that it is not
-	 * possible to create any more files.
-	 */
-	stat->f_ffree = 1000000;
-	stat->f_fsid.val[0] = 0;
-	stat->f_fsid.val[1] = 0;
-	stat->f_namelen = 255;
-	return 0;
-}
-
-static struct super_operations vboxsf_super_ops = {
-	.alloc_inode	= vboxsf_alloc_inode,
-	.free_inode	= vboxsf_free_inode,
-	.put_super	= vboxsf_put_super,
-	.statfs		= vboxsf_statfs,
-};
-
-static int vboxsf_setup(void)
-{
-	int err;
-
-	mutex_lock(&vboxsf_setup_mutex);
-
-	if (vboxsf_setup_done)
-		goto success;
-
-	vboxsf_inode_cachep =
-		kmem_cache_create("vboxsf_inode_cache",
-				  sizeof(struct vboxsf_inode), 0,
-				  (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-				   SLAB_ACCOUNT),
-				  vboxsf_inode_init_once);
-	if (!vboxsf_inode_cachep) {
-		err = -ENOMEM;
-		goto fail_nomem;
-	}
-
-	err = vboxsf_connect();
-	if (err) {
-		vbg_err("vboxsf: err %d connecting to guest PCI-device\n", err);
-		vbg_err("vboxsf: make sure you are inside a VirtualBox VM\n");
-		vbg_err("vboxsf: and check dmesg for vboxguest errors\n");
-		goto fail_free_cache;
-	}
-
-	err = vboxsf_set_utf8();
-	if (err) {
-		vbg_err("vboxsf_setutf8 error %d\n", err);
-		goto fail_disconnect;
-	}
-
-	if (!follow_symlinks) {
-		err = vboxsf_set_symlinks();
-		if (err)
-			vbg_warn("vboxsf: Unable to show symlinks: %d\n", err);
-	}
-
-	vboxsf_setup_done = true;
-success:
-	mutex_unlock(&vboxsf_setup_mutex);
-	return 0;
-
-fail_disconnect:
-	vboxsf_disconnect();
-fail_free_cache:
-	kmem_cache_destroy(vboxsf_inode_cachep);
-fail_nomem:
-	mutex_unlock(&vboxsf_setup_mutex);
-	return err;
-}
-
-static int vboxsf_parse_monolithic(struct fs_context *fc, void *data)
-{
-	char *options = data;
-
-	if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 &&
-		       options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 &&
-		       options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 &&
-		       options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) {
-		vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n");
-		return -EINVAL;
-	}
-
-	return generic_parse_monolithic(fc, data);
-}
-
-static int vboxsf_get_tree(struct fs_context *fc)
-{
-	int err;
-
-	err = vboxsf_setup();
-	if (err)
-		return err;
-
-	return vfs_get_super(fc, vfs_get_independent_super, vboxsf_fill_super);
-}
-
-static int vboxsf_reconfigure(struct fs_context *fc)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(fc->root->d_sb);
-	struct vboxsf_fs_context *ctx = fc->fs_private;
-	struct inode *iroot;
-
-	iroot = ilookup(fc->root->d_sb, 0);
-	if (!iroot)
-		return -ENOENT;
-
-	/* Apply changed options to the root inode */
-	sbi->o = ctx->o;
-	vboxsf_init_inode(sbi, iroot, &sbi->root_info);
-
-	return 0;
-}
-
-static void vboxsf_free_fc(struct fs_context *fc)
-{
-	struct vboxsf_fs_context *ctx = fc->fs_private;
-
-	kfree(ctx->nls_name);
-	kfree(ctx);
-}
-
-static const struct fs_context_operations vboxsf_context_ops = {
-	.free			= vboxsf_free_fc,
-	.parse_param		= vboxsf_parse_param,
-	.parse_monolithic	= vboxsf_parse_monolithic,
-	.get_tree		= vboxsf_get_tree,
-	.reconfigure		= vboxsf_reconfigure,
-};
-
-static int vboxsf_init_fs_context(struct fs_context *fc)
-{
-	struct vboxsf_fs_context *ctx;
-
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	current_uid_gid(&ctx->o.uid, &ctx->o.gid);
-
-	fc->fs_private = ctx;
-	fc->ops = &vboxsf_context_ops;
-	return 0;
-}
-
-static struct file_system_type vboxsf_fs_type = {
-	.owner			= THIS_MODULE,
-	.name			= "vboxsf",
-	.init_fs_context	= vboxsf_init_fs_context,
-	.parameters		= &vboxsf_fs_parameters,
-	.kill_sb		= kill_anon_super
-};
-
-/* Module initialization/finalization handlers */
-static int __init vboxsf_init(void)
-{
-	return register_filesystem(&vboxsf_fs_type);
-}
-
-static void __exit vboxsf_fini(void)
-{
-	unregister_filesystem(&vboxsf_fs_type);
-
-	mutex_lock(&vboxsf_setup_mutex);
-	if (vboxsf_setup_done) {
-		vboxsf_disconnect();
-		/*
-		 * Make sure all delayed rcu free inodes are flushed
-		 * before we destroy the cache.
-		 */
-		rcu_barrier();
-		kmem_cache_destroy(vboxsf_inode_cachep);
-	}
-	mutex_unlock(&vboxsf_setup_mutex);
-}
-
-module_init(vboxsf_init);
-module_exit(vboxsf_fini);
-
-MODULE_DESCRIPTION("Oracle VM VirtualBox Module for Host File System Access");
-MODULE_AUTHOR("Oracle Corporation");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_FS("vboxsf");
diff --git a/drivers/staging/vboxsf/utils.c b/drivers/staging/vboxsf/utils.c
deleted file mode 100644
index 34a49e6f74fc..000000000000
--- a/drivers/staging/vboxsf/utils.c
+++ /dev/null
@@ -1,551 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * VirtualBox Guest Shared Folders support: Utility functions.
- * Mainly conversion from/to VirtualBox/Linux data structures.
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#include <linux/namei.h>
-#include <linux/nls.h>
-#include <linux/sizes.h>
-#include <linux/vfs.h>
-#include "vfsmod.h"
-
-struct inode *vboxsf_new_inode(struct super_block *sb)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(sb);
-	struct inode *inode;
-	unsigned long flags;
-	int cursor, ret;
-	u32 gen;
-
-	inode = new_inode(sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	idr_preload(GFP_KERNEL);
-	spin_lock_irqsave(&sbi->ino_idr_lock, flags);
-	cursor = idr_get_cursor(&sbi->ino_idr);
-	ret = idr_alloc_cyclic(&sbi->ino_idr, inode, 1, 0, GFP_ATOMIC);
-	if (ret >= 0 && ret < cursor)
-		sbi->next_generation++;
-	gen = sbi->next_generation;
-	spin_unlock_irqrestore(&sbi->ino_idr_lock, flags);
-	idr_preload_end();
-
-	if (ret < 0) {
-		iput(inode);
-		return ERR_PTR(ret);
-	}
-
-	inode->i_ino = ret;
-	inode->i_generation = gen;
-	return inode;
-}
-
-/* set [inode] attributes based on [info], uid/gid based on [sbi] */
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
-		       const struct shfl_fsobjinfo *info)
-{
-	const struct shfl_fsobjattr *attr;
-	s64 allocated;
-	int mode;
-
-	attr = &info->attr;
-
-#define mode_set(r) ((attr->mode & (SHFL_UNIX_##r)) ? (S_##r) : 0)
-
-	mode = mode_set(IRUSR);
-	mode |= mode_set(IWUSR);
-	mode |= mode_set(IXUSR);
-
-	mode |= mode_set(IRGRP);
-	mode |= mode_set(IWGRP);
-	mode |= mode_set(IXGRP);
-
-	mode |= mode_set(IROTH);
-	mode |= mode_set(IWOTH);
-	mode |= mode_set(IXOTH);
-
-#undef mode_set
-
-	/* We use the host-side values for these */
-	inode->i_flags |= S_NOATIME | S_NOCMTIME;
-	inode->i_mapping->a_ops = &vboxsf_reg_aops;
-
-	if (SHFL_IS_DIRECTORY(attr->mode)) {
-		inode->i_mode = sbi->o.dmode_set ? sbi->o.dmode : mode;
-		inode->i_mode &= ~sbi->o.dmask;
-		inode->i_mode |= S_IFDIR;
-		inode->i_op = &vboxsf_dir_iops;
-		inode->i_fop = &vboxsf_dir_fops;
-		/*
-		 * XXX: this probably should be set to the number of entries
-		 * in the directory plus two (. ..)
-		 */
-		set_nlink(inode, 1);
-	} else if (SHFL_IS_SYMLINK(attr->mode)) {
-		inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
-		inode->i_mode &= ~sbi->o.fmask;
-		inode->i_mode |= S_IFLNK;
-		inode->i_op = &vboxsf_lnk_iops;
-		set_nlink(inode, 1);
-	} else {
-		inode->i_mode = sbi->o.fmode_set ? sbi->o.fmode : mode;
-		inode->i_mode &= ~sbi->o.fmask;
-		inode->i_mode |= S_IFREG;
-		inode->i_op = &vboxsf_reg_iops;
-		inode->i_fop = &vboxsf_reg_fops;
-		set_nlink(inode, 1);
-	}
-
-	inode->i_uid = sbi->o.uid;
-	inode->i_gid = sbi->o.gid;
-
-	inode->i_size = info->size;
-	inode->i_blkbits = 12;
-	/* i_blocks always in units of 512 bytes! */
-	allocated = info->allocated + 511;
-	do_div(allocated, 512);
-	inode->i_blocks = allocated;
-
-	inode->i_atime = ns_to_timespec64(
-				 info->access_time.ns_relative_to_unix_epoch);
-	inode->i_ctime = ns_to_timespec64(
-				 info->change_time.ns_relative_to_unix_epoch);
-	inode->i_mtime = ns_to_timespec64(
-			   info->modification_time.ns_relative_to_unix_epoch);
-}
-
-int vboxsf_create_at_dentry(struct dentry *dentry,
-			    struct shfl_createparms *params)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
-	struct shfl_string *path;
-	int err;
-
-	path = vboxsf_path_from_dentry(sbi, dentry);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
-	err = vboxsf_create(sbi->root, path, params);
-	__putname(path);
-
-	return err;
-}
-
-int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
-		struct shfl_fsobjinfo *info)
-{
-	struct shfl_createparms params = {};
-	int err;
-
-	params.handle = SHFL_HANDLE_NIL;
-	params.create_flags = SHFL_CF_LOOKUP | SHFL_CF_ACT_FAIL_IF_NEW;
-
-	err = vboxsf_create(sbi->root, path, &params);
-	if (err)
-		return err;
-
-	if (params.result != SHFL_FILE_EXISTS)
-		return -ENOENT;
-
-	if (info)
-		*info = params.info;
-
-	return 0;
-}
-
-int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info)
-{
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
-	struct shfl_string *path;
-	int err;
-
-	path = vboxsf_path_from_dentry(sbi, dentry);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
-	err = vboxsf_stat(sbi, path, info);
-	__putname(path);
-	return err;
-}
-
-int vboxsf_inode_revalidate(struct dentry *dentry)
-{
-	struct vboxsf_sbi *sbi;
-	struct vboxsf_inode *sf_i;
-	struct shfl_fsobjinfo info;
-	struct timespec64 prev_mtime;
-	struct inode *inode;
-	int err;
-
-	if (!dentry || !d_really_is_positive(dentry))
-		return -EINVAL;
-
-	inode = d_inode(dentry);
-	prev_mtime = inode->i_mtime;
-	sf_i = VBOXSF_I(inode);
-	sbi = VBOXSF_SBI(dentry->d_sb);
-	if (!sf_i->force_restat) {
-		if (time_before(jiffies, dentry->d_time + sbi->o.ttl))
-			return 0;
-	}
-
-	err = vboxsf_stat_dentry(dentry, &info);
-	if (err)
-		return err;
-
-	dentry->d_time = jiffies;
-	sf_i->force_restat = 0;
-	vboxsf_init_inode(sbi, inode, &info);
-
-	/*
-	 * If the file was changed on the host side we need to invalidate the
-	 * page-cache for it.  Note this also gets triggered by our own writes,
-	 * this is unavoidable.
-	 */
-	if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0)
-		invalidate_inode_pages2(inode->i_mapping);
-
-	return 0;
-}
-
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int flags)
-{
-	int err;
-	struct dentry *dentry = path->dentry;
-	struct inode *inode = d_inode(dentry);
-	struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-
-	switch (flags & AT_STATX_SYNC_TYPE) {
-	case AT_STATX_DONT_SYNC:
-		err = 0;
-		break;
-	case AT_STATX_FORCE_SYNC:
-		sf_i->force_restat = 1;
-		/* fall-through */
-	default:
-		err = vboxsf_inode_revalidate(dentry);
-	}
-	if (err)
-		return err;
-
-	generic_fillattr(d_inode(dentry), kstat);
-	return 0;
-}
-
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry));
-	struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb);
-	struct shfl_createparms params = {};
-	struct shfl_fsobjinfo info = {};
-	u32 buf_len;
-	int err;
-
-	params.handle = SHFL_HANDLE_NIL;
-	params.create_flags = SHFL_CF_ACT_OPEN_IF_EXISTS |
-			      SHFL_CF_ACT_FAIL_IF_NEW |
-			      SHFL_CF_ACCESS_ATTR_WRITE;
-
-	/* this is at least required for Posix hosts */
-	if (iattr->ia_valid & ATTR_SIZE)
-		params.create_flags |= SHFL_CF_ACCESS_WRITE;
-
-	err = vboxsf_create_at_dentry(dentry, &params);
-	if (err || params.result != SHFL_FILE_EXISTS)
-		return err ? err : -ENOENT;
-
-#define mode_set(r) ((iattr->ia_mode & (S_##r)) ? SHFL_UNIX_##r : 0)
-
-	/*
-	 * Setting the file size and setting the other attributes has to
-	 * be handled separately.
-	 */
-	if (iattr->ia_valid & (ATTR_MODE | ATTR_ATIME | ATTR_MTIME)) {
-		if (iattr->ia_valid & ATTR_MODE) {
-			info.attr.mode = mode_set(IRUSR);
-			info.attr.mode |= mode_set(IWUSR);
-			info.attr.mode |= mode_set(IXUSR);
-			info.attr.mode |= mode_set(IRGRP);
-			info.attr.mode |= mode_set(IWGRP);
-			info.attr.mode |= mode_set(IXGRP);
-			info.attr.mode |= mode_set(IROTH);
-			info.attr.mode |= mode_set(IWOTH);
-			info.attr.mode |= mode_set(IXOTH);
-
-			if (iattr->ia_mode & S_IFDIR)
-				info.attr.mode |= SHFL_TYPE_DIRECTORY;
-			else
-				info.attr.mode |= SHFL_TYPE_FILE;
-		}
-
-		if (iattr->ia_valid & ATTR_ATIME)
-			info.access_time.ns_relative_to_unix_epoch =
-					    timespec64_to_ns(&iattr->ia_atime);
-
-		if (iattr->ia_valid & ATTR_MTIME)
-			info.modification_time.ns_relative_to_unix_epoch =
-					    timespec64_to_ns(&iattr->ia_mtime);
-
-		/*
-		 * Ignore ctime (inode change time) as it can't be set
-		 * from userland anyway.
-		 */
-
-		buf_len = sizeof(info);
-		err = vboxsf_fsinfo(sbi->root, params.handle,
-				    SHFL_INFO_SET | SHFL_INFO_FILE, &buf_len,
-				    &info);
-		if (err) {
-			vboxsf_close(sbi->root, params.handle);
-			return err;
-		}
-
-		/* the host may have given us different attr then requested */
-		sf_i->force_restat = 1;
-	}
-
-#undef mode_set
-
-	if (iattr->ia_valid & ATTR_SIZE) {
-		memset(&info, 0, sizeof(info));
-		info.size = iattr->ia_size;
-		buf_len = sizeof(info);
-		err = vboxsf_fsinfo(sbi->root, params.handle,
-				    SHFL_INFO_SET | SHFL_INFO_SIZE, &buf_len,
-				    &info);
-		if (err) {
-			vboxsf_close(sbi->root, params.handle);
-			return err;
-		}
-
-		/* the host may have given us different attr then requested */
-		sf_i->force_restat = 1;
-	}
-
-	vboxsf_close(sbi->root, params.handle);
-
-	/* Update the inode with what the host has actually given us. */
-	if (sf_i->force_restat)
-		vboxsf_inode_revalidate(dentry);
-
-	return 0;
-}
-
-/*
- * [dentry] contains string encoded in coding system that corresponds
- * to [sbi]->nls, we must convert it to UTF8 here.
- * Returns a shfl_string allocated through __getname (must be freed using
- * __putname), or an ERR_PTR on error.
- */
-struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi,
-					    struct dentry *dentry)
-{
-	struct shfl_string *shfl_path;
-	int path_len, out_len, nb;
-	char *buf, *path;
-	wchar_t uni;
-	u8 *out;
-
-	buf = __getname();
-	if (!buf)
-		return ERR_PTR(-ENOMEM);
-
-	path = dentry_path_raw(dentry, buf, PATH_MAX);
-	if (IS_ERR(path)) {
-		__putname(buf);
-		return (struct shfl_string *)path;
-	}
-	path_len = strlen(path);
-
-	if (sbi->nls) {
-		shfl_path = __getname();
-		if (!shfl_path) {
-			__putname(buf);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		out = shfl_path->string.utf8;
-		out_len = PATH_MAX - SHFLSTRING_HEADER_SIZE - 1;
-
-		while (path_len) {
-			nb = sbi->nls->char2uni(path, path_len, &uni);
-			if (nb < 0) {
-				__putname(shfl_path);
-				__putname(buf);
-				return ERR_PTR(-EINVAL);
-			}
-			path += nb;
-			path_len -= nb;
-
-			nb = utf32_to_utf8(uni, out, out_len);
-			if (nb < 0) {
-				__putname(shfl_path);
-				__putname(buf);
-				return ERR_PTR(-ENAMETOOLONG);
-			}
-			out += nb;
-			out_len -= nb;
-		}
-		*out = 0;
-		shfl_path->length = out - shfl_path->string.utf8;
-		shfl_path->size = shfl_path->length + 1;
-		__putname(buf);
-	} else {
-		if ((SHFLSTRING_HEADER_SIZE + path_len + 1) > PATH_MAX) {
-			__putname(buf);
-			return ERR_PTR(-ENAMETOOLONG);
-		}
-		/*
-		 * dentry_path stores the name at the end of buf, but the
-		 * shfl_string string we return must be properly aligned.
-		 */
-		shfl_path = (struct shfl_string *)buf;
-		memmove(shfl_path->string.utf8, path, path_len);
-		shfl_path->string.utf8[path_len] = 0;
-		shfl_path->length = path_len;
-		shfl_path->size = path_len + 1;
-	}
-
-	return shfl_path;
-}
-
-int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
-		  const unsigned char *utf8_name, size_t utf8_len)
-{
-	const char *in;
-	char *out;
-	size_t out_len;
-	size_t out_bound_len;
-	size_t in_bound_len;
-
-	in = utf8_name;
-	in_bound_len = utf8_len;
-
-	out = name;
-	out_len = 0;
-	/* Reserve space for terminating 0 */
-	out_bound_len = name_bound_len - 1;
-
-	while (in_bound_len) {
-		int nb;
-		unicode_t uni;
-
-		nb = utf8_to_utf32(in, in_bound_len, &uni);
-		if (nb < 0)
-			return -EINVAL;
-
-		in += nb;
-		in_bound_len -= nb;
-
-		nb = sbi->nls->uni2char(uni, out, out_bound_len);
-		if (nb < 0)
-			return nb;
-
-		out += nb;
-		out_bound_len -= nb;
-		out_len += nb;
-	}
-
-	*out = 0;
-
-	return 0;
-}
-
-static struct vboxsf_dir_buf *vboxsf_dir_buf_alloc(struct list_head *list)
-{
-	struct vboxsf_dir_buf *b;
-
-	b = kmalloc(sizeof(*b), GFP_KERNEL);
-	if (!b)
-		return NULL;
-
-	b->buf = kmalloc(DIR_BUFFER_SIZE, GFP_KERNEL);
-	if (!b->buf) {
-		kfree(b);
-		return NULL;
-	}
-
-	b->entries = 0;
-	b->used = 0;
-	b->free = DIR_BUFFER_SIZE;
-	list_add(&b->head, list);
-
-	return b;
-}
-
-static void vboxsf_dir_buf_free(struct vboxsf_dir_buf *b)
-{
-	list_del(&b->head);
-	kfree(b->buf);
-	kfree(b);
-}
-
-struct vboxsf_dir_info *vboxsf_dir_info_alloc(void)
-{
-	struct vboxsf_dir_info *p;
-
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
-	if (!p)
-		return NULL;
-
-	INIT_LIST_HEAD(&p->info_list);
-	return p;
-}
-
-void vboxsf_dir_info_free(struct vboxsf_dir_info *p)
-{
-	struct list_head *list, *pos, *tmp;
-
-	list = &p->info_list;
-	list_for_each_safe(pos, tmp, list) {
-		struct vboxsf_dir_buf *b;
-
-		b = list_entry(pos, struct vboxsf_dir_buf, head);
-		vboxsf_dir_buf_free(b);
-	}
-	kfree(p);
-}
-
-int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d,
-			u64 handle)
-{
-	struct vboxsf_dir_buf *b;
-	u32 entries, size;
-	int err = 0;
-	void *buf;
-
-	/* vboxsf_dirinfo returns 1 on end of dir */
-	while (err == 0) {
-		b = vboxsf_dir_buf_alloc(&sf_d->info_list);
-		if (!b) {
-			err = -ENOMEM;
-			break;
-		}
-
-		buf = b->buf;
-		size = b->free;
-
-		err = vboxsf_dirinfo(sbi->root, handle, NULL, 0, 0,
-				     &size, buf, &entries);
-		if (err < 0)
-			break;
-
-		b->entries += entries;
-		b->free -= size;
-		b->used += size;
-	}
-
-	if (b && b->used == 0)
-		vboxsf_dir_buf_free(b);
-
-	/* -EILSEQ means the host could not translate a filename, ignore */
-	if (err > 0 || err == -EILSEQ)
-		err = 0;
-
-	return err;
-}
diff --git a/drivers/staging/vboxsf/vboxsf_wrappers.c b/drivers/staging/vboxsf/vboxsf_wrappers.c
deleted file mode 100644
index bfc78a097dae..000000000000
--- a/drivers/staging/vboxsf/vboxsf_wrappers.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: MIT
-/*
- * Wrapper functions for the shfl host calls.
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vbox_err.h>
-#include <linux/vbox_utils.h>
-#include "vfsmod.h"
-
-#define SHFL_REQUEST \
-	(VMMDEV_REQUESTOR_KERNEL | VMMDEV_REQUESTOR_USR_DRV_OTHER | \
-	 VMMDEV_REQUESTOR_CON_DONT_KNOW | VMMDEV_REQUESTOR_TRUST_NOT_GIVEN)
-
-static u32 vboxsf_client_id;
-
-int vboxsf_connect(void)
-{
-	struct vbg_dev *gdev;
-	struct vmmdev_hgcm_service_location loc;
-	int err, vbox_status;
-
-	loc.type = VMMDEV_HGCM_LOC_LOCALHOST_EXISTING;
-	strcpy(loc.u.localhost.service_name, "VBoxSharedFolders");
-
-	gdev = vbg_get_gdev();
-	if (IS_ERR(gdev))
-		return -ENODEV;	/* No guest-device */
-
-	err = vbg_hgcm_connect(gdev, SHFL_REQUEST, &loc,
-			       &vboxsf_client_id, &vbox_status);
-	vbg_put_gdev(gdev);
-
-	return err ? err : vbg_status_code_to_errno(vbox_status);
-}
-
-void vboxsf_disconnect(void)
-{
-	struct vbg_dev *gdev;
-	int vbox_status;
-
-	gdev = vbg_get_gdev();
-	if (IS_ERR(gdev))
-		return;   /* guest-device is gone, already disconnected */
-
-	vbg_hgcm_disconnect(gdev, SHFL_REQUEST, vboxsf_client_id, &vbox_status);
-	vbg_put_gdev(gdev);
-}
-
-static int vboxsf_call(u32 function, void *parms, u32 parm_count, int *status)
-{
-	struct vbg_dev *gdev;
-	int err, vbox_status;
-
-	gdev = vbg_get_gdev();
-	if (IS_ERR(gdev))
-		return -ESHUTDOWN; /* guest-dev removed underneath us */
-
-	err = vbg_hgcm_call(gdev, SHFL_REQUEST, vboxsf_client_id, function,
-			    U32_MAX, parms, parm_count, &vbox_status);
-	vbg_put_gdev(gdev);
-
-	if (err < 0)
-		return err;
-
-	if (status)
-		*status = vbox_status;
-
-	return vbg_status_code_to_errno(vbox_status);
-}
-
-int vboxsf_map_folder(struct shfl_string *folder_name, u32 *root)
-{
-	struct shfl_map_folder parms;
-	int err, status;
-
-	parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL;
-	parms.path.u.pointer.size = shfl_string_buf_size(folder_name);
-	parms.path.u.pointer.u.linear_addr = (uintptr_t)folder_name;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = 0;
-
-	parms.delimiter.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.delimiter.u.value32 = '/';
-
-	parms.case_sensitive.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.case_sensitive.u.value32 = 1;
-
-	err = vboxsf_call(SHFL_FN_MAP_FOLDER, &parms, SHFL_CPARMS_MAP_FOLDER,
-			  &status);
-	if (err == -ENOSYS && status == VERR_NOT_IMPLEMENTED)
-		vbg_err("%s: Error host is too old\n", __func__);
-
-	*root = parms.root.u.value32;
-	return err;
-}
-
-int vboxsf_unmap_folder(u32 root)
-{
-	struct shfl_unmap_folder parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	return vboxsf_call(SHFL_FN_UNMAP_FOLDER, &parms,
-			   SHFL_CPARMS_UNMAP_FOLDER, NULL);
-}
-
-/**
- * vboxsf_create - Create a new file or folder
- * @root:         Root of the shared folder in which to create the file
- * @parsed_path:  The path of the file or folder relative to the shared folder
- * @param:        create_parms Parameters for file/folder creation.
- *
- * Create a new file or folder or open an existing one in a shared folder.
- * Note this function always returns 0 / success unless an exceptional condition
- * occurs - out of memory, invalid arguments, etc. If the file or folder could
- * not be opened or created, create_parms->handle will be set to
- * SHFL_HANDLE_NIL on return.  In this case the value in create_parms->result
- * provides information as to why (e.g. SHFL_FILE_EXISTS), create_parms->result
- * is also set on success as additional information.
- *
- * Returns:
- * 0 or negative errno value.
- */
-int vboxsf_create(u32 root, struct shfl_string *parsed_path,
-		  struct shfl_createparms *create_parms)
-{
-	struct shfl_create parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL;
-	parms.path.u.pointer.size = shfl_string_buf_size(parsed_path);
-	parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path;
-
-	parms.parms.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL;
-	parms.parms.u.pointer.size = sizeof(struct shfl_createparms);
-	parms.parms.u.pointer.u.linear_addr = (uintptr_t)create_parms;
-
-	return vboxsf_call(SHFL_FN_CREATE, &parms, SHFL_CPARMS_CREATE, NULL);
-}
-
-int vboxsf_close(u32 root, u64 handle)
-{
-	struct shfl_close parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.handle.u.value64 = handle;
-
-	return vboxsf_call(SHFL_FN_CLOSE, &parms, SHFL_CPARMS_CLOSE, NULL);
-}
-
-int vboxsf_remove(u32 root, struct shfl_string *parsed_path, u32 flags)
-{
-	struct shfl_remove parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.path.u.pointer.size = shfl_string_buf_size(parsed_path);
-	parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path;
-
-	parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.flags.u.value32 = flags;
-
-	return vboxsf_call(SHFL_FN_REMOVE, &parms, SHFL_CPARMS_REMOVE, NULL);
-}
-
-int vboxsf_rename(u32 root, struct shfl_string *src_path,
-		  struct shfl_string *dest_path, u32 flags)
-{
-	struct shfl_rename parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.src.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.src.u.pointer.size = shfl_string_buf_size(src_path);
-	parms.src.u.pointer.u.linear_addr = (uintptr_t)src_path;
-
-	parms.dest.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.dest.u.pointer.size = shfl_string_buf_size(dest_path);
-	parms.dest.u.pointer.u.linear_addr = (uintptr_t)dest_path;
-
-	parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.flags.u.value32 = flags;
-
-	return vboxsf_call(SHFL_FN_RENAME, &parms, SHFL_CPARMS_RENAME, NULL);
-}
-
-int vboxsf_read(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf)
-{
-	struct shfl_read parms;
-	int err;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.handle.u.value64 = handle;
-	parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.offset.u.value64 = offset;
-	parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.cb.u.value32 = *buf_len;
-	parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT;
-	parms.buffer.u.pointer.size = *buf_len;
-	parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	err = vboxsf_call(SHFL_FN_READ, &parms, SHFL_CPARMS_READ, NULL);
-
-	*buf_len = parms.cb.u.value32;
-	return err;
-}
-
-int vboxsf_write(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf)
-{
-	struct shfl_write parms;
-	int err;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.handle.u.value64 = handle;
-	parms.offset.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.offset.u.value64 = offset;
-	parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.cb.u.value32 = *buf_len;
-	parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.buffer.u.pointer.size = *buf_len;
-	parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	err = vboxsf_call(SHFL_FN_WRITE, &parms, SHFL_CPARMS_WRITE, NULL);
-
-	*buf_len = parms.cb.u.value32;
-	return err;
-}
-
-/* Returns 0 on success, 1 on end-of-dir, negative errno otherwise */
-int vboxsf_dirinfo(u32 root, u64 handle,
-		   struct shfl_string *parsed_path, u32 flags, u32 index,
-		   u32 *buf_len, struct shfl_dirinfo *buf, u32 *file_count)
-{
-	struct shfl_list parms;
-	int err, status;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.handle.u.value64 = handle;
-	parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.flags.u.value32 = flags;
-	parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.cb.u.value32 = *buf_len;
-	if (parsed_path) {
-		parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-		parms.path.u.pointer.size = shfl_string_buf_size(parsed_path);
-		parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path;
-	} else {
-		parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_IN;
-		parms.path.u.pointer.size = 0;
-		parms.path.u.pointer.u.linear_addr = 0;
-	}
-
-	parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT;
-	parms.buffer.u.pointer.size = *buf_len;
-	parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	parms.resume_point.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.resume_point.u.value32 = index;
-	parms.file_count.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.file_count.u.value32 = 0;	/* out parameter only */
-
-	err = vboxsf_call(SHFL_FN_LIST, &parms, SHFL_CPARMS_LIST, &status);
-	if (err == -ENODATA && status == VERR_NO_MORE_FILES)
-		err = 1;
-
-	*buf_len = parms.cb.u.value32;
-	*file_count = parms.file_count.u.value32;
-	return err;
-}
-
-int vboxsf_fsinfo(u32 root, u64 handle, u32 flags,
-		  u32 *buf_len, void *buf)
-{
-	struct shfl_information parms;
-	int err;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.handle.type = VMMDEV_HGCM_PARM_TYPE_64BIT;
-	parms.handle.u.value64 = handle;
-	parms.flags.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.flags.u.value32 = flags;
-	parms.cb.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.cb.u.value32 = *buf_len;
-	parms.info.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL;
-	parms.info.u.pointer.size = *buf_len;
-	parms.info.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	err = vboxsf_call(SHFL_FN_INFORMATION, &parms, SHFL_CPARMS_INFORMATION,
-			  NULL);
-
-	*buf_len = parms.cb.u.value32;
-	return err;
-}
-
-int vboxsf_readlink(u32 root, struct shfl_string *parsed_path,
-		    u32 buf_len, u8 *buf)
-{
-	struct shfl_readLink parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.path.u.pointer.size = shfl_string_buf_size(parsed_path);
-	parms.path.u.pointer.u.linear_addr = (uintptr_t)parsed_path;
-
-	parms.buffer.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT;
-	parms.buffer.u.pointer.size = buf_len;
-	parms.buffer.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	return vboxsf_call(SHFL_FN_READLINK, &parms, SHFL_CPARMS_READLINK,
-			   NULL);
-}
-
-int vboxsf_symlink(u32 root, struct shfl_string *new_path,
-		   struct shfl_string *old_path, struct shfl_fsobjinfo *buf)
-{
-	struct shfl_symlink parms;
-
-	parms.root.type = VMMDEV_HGCM_PARM_TYPE_32BIT;
-	parms.root.u.value32 = root;
-
-	parms.new_path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.new_path.u.pointer.size = shfl_string_buf_size(new_path);
-	parms.new_path.u.pointer.u.linear_addr = (uintptr_t)new_path;
-
-	parms.old_path.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_IN;
-	parms.old_path.u.pointer.size = shfl_string_buf_size(old_path);
-	parms.old_path.u.pointer.u.linear_addr = (uintptr_t)old_path;
-
-	parms.info.type = VMMDEV_HGCM_PARM_TYPE_LINADDR_KERNEL_OUT;
-	parms.info.u.pointer.size = sizeof(struct shfl_fsobjinfo);
-	parms.info.u.pointer.u.linear_addr = (uintptr_t)buf;
-
-	return vboxsf_call(SHFL_FN_SYMLINK, &parms, SHFL_CPARMS_SYMLINK, NULL);
-}
-
-int vboxsf_set_utf8(void)
-{
-	return vboxsf_call(SHFL_FN_SET_UTF8, NULL, 0, NULL);
-}
-
-int vboxsf_set_symlinks(void)
-{
-	return vboxsf_call(SHFL_FN_SET_SYMLINKS, NULL, 0, NULL);
-}
diff --git a/drivers/staging/vboxsf/vfsmod.h b/drivers/staging/vboxsf/vfsmod.h
deleted file mode 100644
index 18f95b00fc33..000000000000
--- a/drivers/staging/vboxsf/vfsmod.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* SPDX-License-Identifier: MIT */
-/*
- * VirtualBox Guest Shared Folders support: module header.
- *
- * Copyright (C) 2006-2018 Oracle Corporation
- */
-
-#ifndef VFSMOD_H
-#define VFSMOD_H
-
-#include <linux/backing-dev.h>
-#include <linux/idr.h>
-#include "shfl_hostintf.h"
-
-#define DIR_BUFFER_SIZE SZ_16K
-
-/* The cast is to prevent assignment of void * to pointers of arbitrary type */
-#define VBOXSF_SBI(sb)	((struct vboxsf_sbi *)(sb)->s_fs_info)
-#define VBOXSF_I(i)	container_of(i, struct vboxsf_inode, vfs_inode)
-
-struct vboxsf_options {
-	unsigned long ttl;
-	kuid_t uid;
-	kgid_t gid;
-	bool dmode_set;
-	bool fmode_set;
-	umode_t dmode;
-	umode_t fmode;
-	umode_t dmask;
-	umode_t fmask;
-};
-
-struct vboxsf_fs_context {
-	struct vboxsf_options o;
-	char *nls_name;
-};
-
-/* per-shared folder information */
-struct vboxsf_sbi {
-	struct vboxsf_options o;
-	struct shfl_fsobjinfo root_info;
-	struct idr ino_idr;
-	spinlock_t ino_idr_lock; /* This protects ino_idr */
-	struct nls_table *nls;
-	u32 next_generation;
-	u32 root;
-	int bdi_id;
-};
-
-/* per-inode information */
-struct vboxsf_inode {
-	/* some information was changed, update data on next revalidate */
-	int force_restat;
-	/* list of open handles for this inode + lock protecting it */
-	struct list_head handle_list;
-	/* This mutex protects handle_list accesses */
-	struct mutex handle_list_mutex;
-	/* The VFS inode struct */
-	struct inode vfs_inode;
-};
-
-struct vboxsf_dir_info {
-	struct list_head info_list;
-};
-
-struct vboxsf_dir_buf {
-	size_t entries;
-	size_t free;
-	size_t used;
-	void *buf;
-	struct list_head head;
-};
-
-/* globals */
-extern const struct inode_operations vboxsf_dir_iops;
-extern const struct inode_operations vboxsf_lnk_iops;
-extern const struct inode_operations vboxsf_reg_iops;
-extern const struct file_operations vboxsf_dir_fops;
-extern const struct file_operations vboxsf_reg_fops;
-extern const struct address_space_operations vboxsf_reg_aops;
-extern const struct dentry_operations vboxsf_dentry_ops;
-
-/* from utils.c */
-struct inode *vboxsf_new_inode(struct super_block *sb);
-void vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
-		       const struct shfl_fsobjinfo *info);
-int vboxsf_create_at_dentry(struct dentry *dentry,
-			    struct shfl_createparms *params);
-int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path,
-		struct shfl_fsobjinfo *info);
-int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info);
-int vboxsf_inode_revalidate(struct dentry *dentry);
-int vboxsf_getattr(const struct path *path, struct kstat *kstat,
-		   u32 request_mask, unsigned int query_flags);
-int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr);
-struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi,
-					    struct dentry *dentry);
-int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
-		  const unsigned char *utf8_name, size_t utf8_len);
-struct vboxsf_dir_info *vboxsf_dir_info_alloc(void);
-void vboxsf_dir_info_free(struct vboxsf_dir_info *p);
-int vboxsf_dir_read_all(struct vboxsf_sbi *sbi, struct vboxsf_dir_info *sf_d,
-			u64 handle);
-
-/* from vboxsf_wrappers.c */
-int vboxsf_connect(void);
-void vboxsf_disconnect(void);
-
-int vboxsf_create(u32 root, struct shfl_string *parsed_path,
-		  struct shfl_createparms *create_parms);
-
-int vboxsf_close(u32 root, u64 handle);
-int vboxsf_remove(u32 root, struct shfl_string *parsed_path, u32 flags);
-int vboxsf_rename(u32 root, struct shfl_string *src_path,
-		  struct shfl_string *dest_path, u32 flags);
-
-int vboxsf_read(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf);
-int vboxsf_write(u32 root, u64 handle, u64 offset, u32 *buf_len, u8 *buf);
-
-int vboxsf_dirinfo(u32 root, u64 handle,
-		   struct shfl_string *parsed_path, u32 flags, u32 index,
-		   u32 *buf_len, struct shfl_dirinfo *buf, u32 *file_count);
-int vboxsf_fsinfo(u32 root, u64 handle, u32 flags,
-		  u32 *buf_len, void *buf);
-
-int vboxsf_map_folder(struct shfl_string *folder_name, u32 *root);
-int vboxsf_unmap_folder(u32 root);
-
-int vboxsf_readlink(u32 root, struct shfl_string *parsed_path,
-		    u32 buf_len, u8 *buf);
-int vboxsf_symlink(u32 root, struct shfl_string *new_path,
-		   struct shfl_string *old_path, struct shfl_fsobjinfo *buf);
-
-int vboxsf_set_utf8(void);
-int vboxsf_set_symlinks(void);
-
-#endif