mirror of
https://github.com/torvalds/linux.git
synced 2026-06-07 14:04:54 +02:00
Merge branch 'linux-linaro-lsk-v4.4' into linux-linaro-lsk-v4.4-android
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Conflicts:
kernel/fork.c
Conflict due to Kaiser implementation in LTS 4.4.110.
net/ipv4/raw.c
Minor conflict due to LTS commit
be27b620a8 ("net: ipv4: fix for a race condition in raw_sendmsg")
This commit is contained in:
commit
395ae9f4e6
|
|
@ -271,3 +271,19 @@ Description: Parameters for the CPU cache attributes
|
|||
- WriteBack: data is written only to the cache line and
|
||||
the modified cache line is written to main
|
||||
memory only when it is replaced
|
||||
|
||||
What: /sys/devices/system/cpu/vulnerabilities
|
||||
/sys/devices/system/cpu/vulnerabilities/meltdown
|
||||
/sys/devices/system/cpu/vulnerabilities/spectre_v1
|
||||
/sys/devices/system/cpu/vulnerabilities/spectre_v2
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
|
||||
The files are named after the code names of CPU
|
||||
vulnerabilities. The output of those files reflects the
|
||||
state of the CPUs in the system. Possible output values:
|
||||
|
||||
"Not affected" CPU is not affected by the vulnerability
|
||||
"Vulnerable" CPU is affected and no mitigation in effect
|
||||
"Mitigation: $M" CPU is affected and mitigation $M is in effect
|
||||
|
|
|
|||
|
|
@ -2525,6 +2525,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
|
||||
nointroute [IA-64]
|
||||
|
||||
noinvpcid [X86] Disable the INVPCID cpu feature.
|
||||
|
||||
nojitter [IA-64] Disables jitter checking for ITC timers.
|
||||
|
||||
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
|
||||
|
|
@ -2559,6 +2561,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
nopat [X86] Disable PAT (page attribute table extension of
|
||||
pagetables) support.
|
||||
|
||||
nopcid [X86-64] Disable the PCID cpu feature.
|
||||
|
||||
norandmaps Don't use address space randomization. Equivalent to
|
||||
echo 0 > /proc/sys/kernel/randomize_va_space
|
||||
|
||||
|
|
@ -3056,6 +3060,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
pt. [PARIDE]
|
||||
See Documentation/blockdev/paride.txt.
|
||||
|
||||
pti= [X86_64] Control Page Table Isolation of user and
|
||||
kernel address spaces. Disabling this feature
|
||||
removes hardening, but improves performance of
|
||||
system calls and interrupts.
|
||||
|
||||
on - unconditionally enable
|
||||
off - unconditionally disable
|
||||
auto - kernel detects whether your CPU model is
|
||||
vulnerable to issues that PTI mitigates
|
||||
|
||||
Not specifying this option is equivalent to pti=auto.
|
||||
|
||||
nopti [X86_64]
|
||||
Equivalent to pti=off
|
||||
|
||||
pty.legacy_count=
|
||||
[KNL] Number of legacy ptys. Overrides the compiled-in
|
||||
default number.
|
||||
|
|
|
|||
186
Documentation/x86/pti.txt
Normal file
186
Documentation/x86/pti.txt
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
Overview
|
||||
========
|
||||
|
||||
Page Table Isolation (pti, previously known as KAISER[1]) is a
|
||||
countermeasure against attacks on the shared user/kernel address
|
||||
space such as the "Meltdown" approach[2].
|
||||
|
||||
To mitigate this class of attacks, we create an independent set of
|
||||
page tables for use only when running userspace applications. When
|
||||
the kernel is entered via syscalls, interrupts or exceptions, the
|
||||
page tables are switched to the full "kernel" copy. When the system
|
||||
switches back to user mode, the user copy is used again.
|
||||
|
||||
The userspace page tables contain only a minimal amount of kernel
|
||||
data: only what is needed to enter/exit the kernel such as the
|
||||
entry/exit functions themselves and the interrupt descriptor table
|
||||
(IDT). There are a few strictly unnecessary things that get mapped
|
||||
such as the first C function when entering an interrupt (see
|
||||
comments in pti.c).
|
||||
|
||||
This approach helps to ensure that side-channel attacks leveraging
|
||||
the paging structures do not function when PTI is enabled. It can be
|
||||
enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
|
||||
Once enabled at compile-time, it can be disabled at boot with the
|
||||
'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
|
||||
|
||||
Page Table Management
|
||||
=====================
|
||||
|
||||
When PTI is enabled, the kernel manages two sets of page tables.
|
||||
The first set is very similar to the single set which is present in
|
||||
kernels without PTI. This includes a complete mapping of userspace
|
||||
that the kernel can use for things like copy_to_user().
|
||||
|
||||
Although _complete_, the user portion of the kernel page tables is
|
||||
crippled by setting the NX bit in the top level. This ensures
|
||||
that any missed kernel->user CR3 switch will immediately crash
|
||||
userspace upon executing its first instruction.
|
||||
|
||||
The userspace page tables map only the kernel data needed to enter
|
||||
and exit the kernel. This data is entirely contained in the 'struct
|
||||
cpu_entry_area' structure which is placed in the fixmap which gives
|
||||
each CPU's copy of the area a compile-time-fixed virtual address.
|
||||
|
||||
For new userspace mappings, the kernel makes the entries in its
|
||||
page tables like normal. The only difference is when the kernel
|
||||
makes entries in the top (PGD) level. In addition to setting the
|
||||
entry in the main kernel PGD, a copy of the entry is made in the
|
||||
userspace page tables' PGD.
|
||||
|
||||
This sharing at the PGD level also inherently shares all the lower
|
||||
layers of the page tables. This leaves a single, shared set of
|
||||
userspace page tables to manage. One PTE to lock, one set of
|
||||
accessed bits, dirty bits, etc...
|
||||
|
||||
Overhead
|
||||
========
|
||||
|
||||
Protection against side-channel attacks is important. But,
|
||||
this protection comes at a cost:
|
||||
|
||||
1. Increased Memory Use
|
||||
a. Each process now needs an order-1 PGD instead of order-0.
|
||||
(Consumes an additional 4k per process).
|
||||
b. The 'cpu_entry_area' structure must be 2MB in size and 2MB
|
||||
aligned so that it can be mapped by setting a single PMD
|
||||
entry. This consumes nearly 2MB of RAM once the kernel
|
||||
is decompressed, but no space in the kernel image itself.
|
||||
|
||||
2. Runtime Cost
|
||||
a. CR3 manipulation to switch between the page table copies
|
||||
must be done at interrupt, syscall, and exception entry
|
||||
and exit (it can be skipped when the kernel is interrupted,
|
||||
though). Moves to CR3 are on the order of a hundred
|
||||
cycles, and are required at every entry and exit.
|
||||
b. A "trampoline" must be used for SYSCALL entry. This
|
||||
trampoline depends on a smaller set of resources than the
|
||||
non-PTI SYSCALL entry code, so requires mapping fewer
|
||||
things into the userspace page tables. The downside is
|
||||
that stacks must be switched at entry time.
|
||||
c. Global pages are disabled for all kernel structures not
|
||||
mapped into both kernel and userspace page tables. This
|
||||
feature of the MMU allows different processes to share TLB
|
||||
entries mapping the kernel. Losing the feature means more
|
||||
TLB misses after a context switch. The actual loss of
|
||||
performance is very small, however, never exceeding 1%.
|
||||
d. Process Context IDentifiers (PCID) is a CPU feature that
|
||||
allows us to skip flushing the entire TLB when switching page
|
||||
tables by setting a special bit in CR3 when the page tables
|
||||
are changed. This makes switching the page tables (at context
|
||||
switch, or kernel entry/exit) cheaper. But, on systems with
|
||||
PCID support, the context switch code must flush both the user
|
||||
and kernel entries out of the TLB. The user PCID TLB flush is
|
||||
deferred until the exit to userspace, minimizing the cost.
|
||||
See intel.com/sdm for the gory PCID/INVPCID details.
|
||||
e. The userspace page tables must be populated for each new
|
||||
process. Even without PTI, the shared kernel mappings
|
||||
are created by copying top-level (PGD) entries into each
|
||||
new process. But, with PTI, there are now *two* kernel
|
||||
mappings: one in the kernel page tables that maps everything
|
||||
and one for the entry/exit structures. At fork(), we need to
|
||||
copy both.
|
||||
f. In addition to the fork()-time copying, there must also
|
||||
be an update to the userspace PGD any time a set_pgd() is done
|
||||
on a PGD used to map userspace. This ensures that the kernel
|
||||
and userspace copies always map the same userspace
|
||||
memory.
|
||||
g. On systems without PCID support, each CR3 write flushes
|
||||
the entire TLB. That means that each syscall, interrupt
|
||||
or exception flushes the TLB.
|
||||
h. INVPCID is a TLB-flushing instruction which allows flushing
|
||||
of TLB entries for non-current PCIDs. Some systems support
|
||||
PCIDs, but do not support INVPCID. On these systems, addresses
|
||||
can only be flushed from the TLB for the current PCID. When
|
||||
flushing a kernel address, we need to flush all PCIDs, so a
|
||||
single kernel address flush will require a TLB-flushing CR3
|
||||
write upon the next use of every PCID.
|
||||
|
||||
Possible Future Work
|
||||
====================
|
||||
1. We can be more careful about not actually writing to CR3
|
||||
unless its value is actually changed.
|
||||
2. Allow PTI to be enabled/disabled at runtime in addition to the
|
||||
boot-time switching.
|
||||
|
||||
Testing
|
||||
=======
|
||||
|
||||
To test stability of PTI, the following test procedure is recommended,
|
||||
ideally doing all of these in parallel:
|
||||
|
||||
1. Set CONFIG_DEBUG_ENTRY=y
|
||||
2. Run several copies of all of the tools/testing/selftests/x86/ tests
|
||||
(excluding MPX and protection_keys) in a loop on multiple CPUs for
|
||||
several minutes. These tests frequently uncover corner cases in the
|
||||
kernel entry code. In general, old kernels might cause these tests
|
||||
themselves to crash, but they should never crash the kernel.
|
||||
3. Run the 'perf' tool in a mode (top or record) that generates many
|
||||
frequent performance monitoring non-maskable interrupts (see "NMI"
|
||||
in /proc/interrupts). This exercises the NMI entry/exit code which
|
||||
is known to trigger bugs in code paths that did not expect to be
|
||||
interrupted, including nested NMIs. Using "-c" boosts the rate of
|
||||
NMIs, and using two -c with separate counters encourages nested NMIs
|
||||
and less deterministic behavior.
|
||||
|
||||
while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done
|
||||
|
||||
4. Launch a KVM virtual machine.
|
||||
5. Run 32-bit binaries on systems supporting the SYSCALL instruction.
|
||||
This has been a lightly-tested code path and needs extra scrutiny.
|
||||
|
||||
Debugging
|
||||
=========
|
||||
|
||||
Bugs in PTI cause a few different signatures of crashes
|
||||
that are worth noting here.
|
||||
|
||||
* Failures of the selftests/x86 code. Usually a bug in one of the
|
||||
more obscure corners of entry_64.S.
|
||||
* Crashes in early boot, especially around CPU bringup. Bugs
|
||||
in the trampoline code or mappings cause these.
|
||||
* Crashes at the first interrupt. Caused by bugs in entry_64.S,
|
||||
like screwing up a page table switch. Also caused by
|
||||
incorrectly mapping the IRQ handler entry code.
|
||||
* Crashes at the first NMI. The NMI code is separate from main
|
||||
interrupt handlers and can have bugs that do not affect
|
||||
normal interrupts. Also caused by incorrectly mapping NMI
|
||||
code. NMIs that interrupt the entry code must be very
|
||||
careful and can be the cause of crashes that show up when
|
||||
running perf.
|
||||
* Kernel crashes at the first exit to userspace. entry_64.S
|
||||
bugs, or failing to map some of the exit code.
|
||||
* Crashes at first interrupt that interrupts userspace. The paths
|
||||
in entry_64.S that return to userspace are sometimes separate
|
||||
from the ones that return to the kernel.
|
||||
* Double faults: overflowing the kernel stack because of page
|
||||
faults upon page faults. Caused by touching non-pti-mapped
|
||||
data in the entry code, or forgetting to switch to kernel
|
||||
CR3 before calling into C functions which are not pti-mapped.
|
||||
* Userspace segfaults early in boot, sometimes manifesting
|
||||
as mount(8) failing to mount the rootfs. These have
|
||||
tended to be TLB invalidation issues. Usually invalidating
|
||||
the wrong PCID, or otherwise missing an invalidation.
|
||||
|
||||
1. https://gruss.cc/files/kaiser.pdf
|
||||
2. https://meltdownattack.com/meltdown.pdf
|
||||
5
Makefile
5
Makefile
|
|
@ -1,6 +1,6 @@
|
|||
VERSION = 4
|
||||
PATCHLEVEL = 4
|
||||
SUBLEVEL = 107
|
||||
SUBLEVEL = 112
|
||||
EXTRAVERSION =
|
||||
NAME = Blurry Fish Butt
|
||||
|
||||
|
|
@ -795,6 +795,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
|
|||
# disable invalid "can't wrap" optimizations for signed / pointers
|
||||
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
|
||||
|
||||
# Make sure -fstack-check isn't enabled (like gentoo apparently did)
|
||||
KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
|
||||
|
||||
# conserve stack if available
|
||||
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
* Copyright (C) 1996, Linus Torvalds
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <asm/machvec.h>
|
||||
#include <asm/compiler.h>
|
||||
#include <asm-generic/mm_hooks.h>
|
||||
|
|
|
|||
|
|
@ -673,6 +673,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
|
|||
return 0;
|
||||
|
||||
__asm__ __volatile__(
|
||||
" mov lp_count, %5 \n"
|
||||
" lp 3f \n"
|
||||
"1: ldb.ab %3, [%2, 1] \n"
|
||||
" breq.d %3, 0, 3f \n"
|
||||
|
|
@ -689,8 +690,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count)
|
|||
" .word 1b, 4b \n"
|
||||
" .previous \n"
|
||||
: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
|
||||
: "g"(-EFAULT), "l"(count)
|
||||
: "memory");
|
||||
: "g"(-EFAULT), "r"(count)
|
||||
: "lp_count", "lp_start", "lp_end", "memory");
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -668,6 +668,7 @@ &mmc2 {
|
|||
ti,non-removable;
|
||||
bus-width = <4>;
|
||||
cap-power-off-card;
|
||||
keep-power-in-suspend;
|
||||
pinctrl-names = "default";
|
||||
pinctrl-0 = <&mmc2_pins>;
|
||||
|
||||
|
|
|
|||
|
|
@ -227,6 +227,7 @@ pcie1: pcie@51000000 {
|
|||
device_type = "pci";
|
||||
ranges = <0x81000000 0 0 0x03000 0 0x00010000
|
||||
0x82000000 0 0x20013000 0x13000 0 0xffed000>;
|
||||
bus-range = <0x00 0xff>;
|
||||
#interrupt-cells = <1>;
|
||||
num-lanes = <1>;
|
||||
ti,hwmods = "pcie1";
|
||||
|
|
@ -262,6 +263,7 @@ pcie@51000000 {
|
|||
device_type = "pci";
|
||||
ranges = <0x81000000 0 0 0x03000 0 0x00010000
|
||||
0x82000000 0 0x30013000 0x13000 0 0xffed000>;
|
||||
bus-range = <0x00 0xff>;
|
||||
#interrupt-cells = <1>;
|
||||
num-lanes = <1>;
|
||||
ti,hwmods = "pcie2";
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ static inline void check_and_switch_context(struct mm_struct *mm,
|
|||
cpu_switch_mm(mm->pgd, mm);
|
||||
}
|
||||
|
||||
#ifndef MODULE
|
||||
#define finish_arch_post_lock_switch \
|
||||
finish_arch_post_lock_switch
|
||||
static inline void finish_arch_post_lock_switch(void)
|
||||
|
|
@ -82,6 +83,7 @@ static inline void finish_arch_post_lock_switch(void)
|
|||
preempt_enable_no_resched();
|
||||
}
|
||||
}
|
||||
#endif /* !MODULE */
|
||||
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
|
|||
}
|
||||
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
|
||||
data);
|
||||
&data);
|
||||
data = vcpu_data_host_to_guest(vcpu, data, len);
|
||||
vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
|
||||
}
|
||||
|
|
@ -189,14 +189,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
|
|||
data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
|
||||
len);
|
||||
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
|
||||
mmio_write_buf(data_buf, len, data);
|
||||
|
||||
ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
|
||||
data_buf);
|
||||
} else {
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
|
||||
fault_ipa, 0);
|
||||
fault_ipa, NULL);
|
||||
|
||||
ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
|
||||
data_buf);
|
||||
|
|
|
|||
|
|
@ -774,13 +774,31 @@ static void arm_coherent_dma_free(struct device *dev, size_t size, void *cpu_add
|
|||
__arm_dma_free(dev, size, cpu_addr, handle, attrs, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
|
||||
* that the intention is to allow exporting memory allocated via the
|
||||
* coherent DMA APIs through the dma_buf API, which only accepts a
|
||||
* scattertable. This presents a couple of problems:
|
||||
* 1. Not all memory allocated via the coherent DMA APIs is backed by
|
||||
* a struct page
|
||||
* 2. Passing coherent DMA memory into the streaming APIs is not allowed
|
||||
* as we will try to flush the memory through a different alias to that
|
||||
* actually being used (and the flushes are redundant.)
|
||||
*/
|
||||
int arm_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
|
||||
void *cpu_addr, dma_addr_t handle, size_t size,
|
||||
struct dma_attrs *attrs)
|
||||
{
|
||||
struct page *page = pfn_to_page(dma_to_pfn(dev, handle));
|
||||
unsigned long pfn = dma_to_pfn(dev, handle);
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
/* If the PFN is not valid, we do not have a struct page */
|
||||
if (!pfn_valid(pfn))
|
||||
return -ENXIO;
|
||||
|
||||
page = pfn_to_page(pfn);
|
||||
|
||||
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
|
|
|||
|
|
@ -433,6 +433,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
|
|||
struct hlist_node *tmp;
|
||||
unsigned long flags, orig_ret_address = 0;
|
||||
unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
|
||||
kprobe_opcode_t *correct_ret_addr = NULL;
|
||||
|
||||
INIT_HLIST_HEAD(&empty_rp);
|
||||
kretprobe_hash_lock(current, &head, &flags);
|
||||
|
|
@ -455,15 +456,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
|
|||
/* another task is sharing our hash bucket */
|
||||
continue;
|
||||
|
||||
if (ri->rp && ri->rp->handler) {
|
||||
__this_cpu_write(current_kprobe, &ri->rp->kp);
|
||||
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
|
||||
ri->rp->handler(ri, regs);
|
||||
__this_cpu_write(current_kprobe, NULL);
|
||||
}
|
||||
|
||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
||||
recycle_rp_inst(ri, &empty_rp);
|
||||
|
||||
if (orig_ret_address != trampoline_address)
|
||||
/*
|
||||
|
|
@ -475,6 +468,33 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
|
|||
}
|
||||
|
||||
kretprobe_assert(ri, orig_ret_address, trampoline_address);
|
||||
|
||||
correct_ret_addr = ri->ret_addr;
|
||||
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
|
||||
if (ri->task != current)
|
||||
/* another task is sharing our hash bucket */
|
||||
continue;
|
||||
|
||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
||||
if (ri->rp && ri->rp->handler) {
|
||||
__this_cpu_write(current_kprobe, &ri->rp->kp);
|
||||
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
|
||||
ri->ret_addr = correct_ret_addr;
|
||||
ri->rp->handler(ri, regs);
|
||||
__this_cpu_write(current_kprobe, NULL);
|
||||
}
|
||||
|
||||
recycle_rp_inst(ri, &empty_rp);
|
||||
|
||||
if (orig_ret_address != trampoline_address)
|
||||
/*
|
||||
* This is the real return address. Any other
|
||||
* instances associated with this task are for
|
||||
* other calls deeper on the call stack
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
kretprobe_hash_unlock(current, &flags);
|
||||
|
||||
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
|
||||
|
|
|
|||
|
|
@ -976,7 +976,10 @@ static void coverage_end(void)
|
|||
void __naked __kprobes_test_case_start(void)
|
||||
{
|
||||
__asm__ __volatile__ (
|
||||
"stmdb sp!, {r4-r11} \n\t"
|
||||
"mov r2, sp \n\t"
|
||||
"bic r3, r2, #7 \n\t"
|
||||
"mov sp, r3 \n\t"
|
||||
"stmdb sp!, {r2-r11} \n\t"
|
||||
"sub sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
|
||||
"bic r0, lr, #1 @ r0 = inline data \n\t"
|
||||
"mov r1, sp \n\t"
|
||||
|
|
@ -996,7 +999,8 @@ void __naked __kprobes_test_case_end_32(void)
|
|||
"movne pc, r0 \n\t"
|
||||
"mov r0, r4 \n\t"
|
||||
"add sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
|
||||
"ldmia sp!, {r4-r11} \n\t"
|
||||
"ldmia sp!, {r2-r11} \n\t"
|
||||
"mov sp, r2 \n\t"
|
||||
"mov pc, r0 \n\t"
|
||||
);
|
||||
}
|
||||
|
|
@ -1012,7 +1016,8 @@ void __naked __kprobes_test_case_end_16(void)
|
|||
"bxne r0 \n\t"
|
||||
"mov r0, r4 \n\t"
|
||||
"add sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t"
|
||||
"ldmia sp!, {r4-r11} \n\t"
|
||||
"ldmia sp!, {r2-r11} \n\t"
|
||||
"mov sp, r2 \n\t"
|
||||
"bx r0 \n\t"
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -427,6 +427,7 @@ void __init arm64_memblock_init(void)
|
|||
|
||||
reserve_elfcorehdr();
|
||||
|
||||
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
|
||||
dma_contiguous_reserve(arm64_dma_phys_limit);
|
||||
|
||||
memblock_allow_resize();
|
||||
|
|
@ -451,7 +452,6 @@ void __init bootmem_init(void)
|
|||
sparse_init();
|
||||
zone_sizes_init(min, max);
|
||||
|
||||
high_memory = __va((max << PAGE_SHIFT) - 1) + 1;
|
||||
max_pfn = max_low_pfn = max;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -664,6 +664,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value)
|
|||
unsigned long switch_count;
|
||||
struct task_struct *t;
|
||||
|
||||
/* If nothing to change, return right away, successfully. */
|
||||
if (value == mips_get_process_fp_mode(task))
|
||||
return 0;
|
||||
|
||||
/* Only accept a mode change if 64-bit FP enabled for o32. */
|
||||
if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* And only for o32 tasks. */
|
||||
if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* Check the value is valid */
|
||||
if (value & ~known_bits)
|
||||
return -EOPNOTSUPP;
|
||||
|
|
|
|||
|
|
@ -439,25 +439,38 @@ static int gpr64_set(struct task_struct *target,
|
|||
|
||||
#endif /* CONFIG_64BIT */
|
||||
|
||||
static int fpr_get(struct task_struct *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
void *kbuf, void __user *ubuf)
|
||||
/*
|
||||
* Copy the floating-point context to the supplied NT_PRFPREG buffer,
|
||||
* !CONFIG_CPU_HAS_MSA variant. FP context's general register slots
|
||||
* correspond 1:1 to buffer slots. Only general registers are copied.
|
||||
*/
|
||||
static int fpr_get_fpa(struct task_struct *target,
|
||||
unsigned int *pos, unsigned int *count,
|
||||
void **kbuf, void __user **ubuf)
|
||||
{
|
||||
unsigned i;
|
||||
int err;
|
||||
return user_regset_copyout(pos, count, kbuf, ubuf,
|
||||
&target->thread.fpu,
|
||||
0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the floating-point context to the supplied NT_PRFPREG buffer,
|
||||
* CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's
|
||||
* general register slots are copied to buffer slots. Only general
|
||||
* registers are copied.
|
||||
*/
|
||||
static int fpr_get_msa(struct task_struct *target,
|
||||
unsigned int *pos, unsigned int *count,
|
||||
void **kbuf, void __user **ubuf)
|
||||
{
|
||||
unsigned int i;
|
||||
u64 fpr_val;
|
||||
int err;
|
||||
|
||||
/* XXX fcr31 */
|
||||
|
||||
if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
|
||||
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||
&target->thread.fpu,
|
||||
0, sizeof(elf_fpregset_t));
|
||||
|
||||
BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
|
||||
for (i = 0; i < NUM_FPU_REGS; i++) {
|
||||
fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0);
|
||||
err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||
err = user_regset_copyout(pos, count, kbuf, ubuf,
|
||||
&fpr_val, i * sizeof(elf_fpreg_t),
|
||||
(i + 1) * sizeof(elf_fpreg_t));
|
||||
if (err)
|
||||
|
|
@ -467,27 +480,64 @@ static int fpr_get(struct task_struct *target,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int fpr_set(struct task_struct *target,
|
||||
/*
|
||||
* Copy the floating-point context to the supplied NT_PRFPREG buffer.
|
||||
* Choose the appropriate helper for general registers, and then copy
|
||||
* the FCSR register separately.
|
||||
*/
|
||||
static int fpr_get(struct task_struct *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
const void *kbuf, const void __user *ubuf)
|
||||
void *kbuf, void __user *ubuf)
|
||||
{
|
||||
unsigned i;
|
||||
const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
|
||||
int err;
|
||||
|
||||
if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
|
||||
err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf);
|
||||
else
|
||||
err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||
&target->thread.fpu.fcr31,
|
||||
fcr31_pos, fcr31_pos + sizeof(u32));
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the supplied NT_PRFPREG buffer to the floating-point context,
|
||||
* !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP
|
||||
* context's general register slots. Only general registers are copied.
|
||||
*/
|
||||
static int fpr_set_fpa(struct task_struct *target,
|
||||
unsigned int *pos, unsigned int *count,
|
||||
const void **kbuf, const void __user **ubuf)
|
||||
{
|
||||
return user_regset_copyin(pos, count, kbuf, ubuf,
|
||||
&target->thread.fpu,
|
||||
0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the supplied NT_PRFPREG buffer to the floating-point context,
|
||||
* CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64
|
||||
* bits only of FP context's general register slots. Only general
|
||||
* registers are copied.
|
||||
*/
|
||||
static int fpr_set_msa(struct task_struct *target,
|
||||
unsigned int *pos, unsigned int *count,
|
||||
const void **kbuf, const void __user **ubuf)
|
||||
{
|
||||
unsigned int i;
|
||||
u64 fpr_val;
|
||||
|
||||
/* XXX fcr31 */
|
||||
|
||||
init_fp_ctx(target);
|
||||
|
||||
if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
|
||||
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
&target->thread.fpu,
|
||||
0, sizeof(elf_fpregset_t));
|
||||
int err;
|
||||
|
||||
BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
|
||||
for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) {
|
||||
err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) {
|
||||
err = user_regset_copyin(pos, count, kbuf, ubuf,
|
||||
&fpr_val, i * sizeof(elf_fpreg_t),
|
||||
(i + 1) * sizeof(elf_fpreg_t));
|
||||
if (err)
|
||||
|
|
@ -498,6 +548,53 @@ static int fpr_set(struct task_struct *target,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the supplied NT_PRFPREG buffer to the floating-point context.
|
||||
* Choose the appropriate helper for general registers, and then copy
|
||||
* the FCSR register separately.
|
||||
*
|
||||
* We optimize for the case where `count % sizeof(elf_fpreg_t) == 0',
|
||||
* which is supposed to have been guaranteed by the kernel before
|
||||
* calling us, e.g. in `ptrace_regset'. We enforce that requirement,
|
||||
* so that we can safely avoid preinitializing temporaries for
|
||||
* partial register writes.
|
||||
*/
|
||||
static int fpr_set(struct task_struct *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
const void *kbuf, const void __user *ubuf)
|
||||
{
|
||||
const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
|
||||
u32 fcr31;
|
||||
int err;
|
||||
|
||||
BUG_ON(count % sizeof(elf_fpreg_t));
|
||||
|
||||
if (pos + count > sizeof(elf_fpregset_t))
|
||||
return -EIO;
|
||||
|
||||
init_fp_ctx(target);
|
||||
|
||||
if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
|
||||
err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf);
|
||||
else
|
||||
err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (count > 0) {
|
||||
err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
&fcr31,
|
||||
fcr31_pos, fcr31_pos + sizeof(u32));
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ptrace_setfcr31(target, fcr31);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
enum mips_regset {
|
||||
REGSET_GPR,
|
||||
REGSET_FPR,
|
||||
|
|
|
|||
|
|
@ -1777,7 +1777,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
SPFROMREG(fd, MIPSInst_FD(ir));
|
||||
rv.s = ieee754sp_maddf(fd, fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmsubf_op: {
|
||||
|
|
@ -1790,7 +1790,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
SPFROMREG(fd, MIPSInst_FD(ir));
|
||||
rv.s = ieee754sp_msubf(fd, fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case frint_op: {
|
||||
|
|
@ -1814,7 +1814,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.w = ieee754sp_2008class(fs);
|
||||
rfmt = w_fmt;
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmin_op: {
|
||||
|
|
@ -1826,7 +1826,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(ft, MIPSInst_FT(ir));
|
||||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.s = ieee754sp_fmin(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmina_op: {
|
||||
|
|
@ -1838,7 +1838,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(ft, MIPSInst_FT(ir));
|
||||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.s = ieee754sp_fmina(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmax_op: {
|
||||
|
|
@ -1850,7 +1850,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(ft, MIPSInst_FT(ir));
|
||||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.s = ieee754sp_fmax(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmaxa_op: {
|
||||
|
|
@ -1862,7 +1862,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
SPFROMREG(ft, MIPSInst_FT(ir));
|
||||
SPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.s = ieee754sp_fmaxa(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fabs_op:
|
||||
|
|
@ -2095,7 +2095,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
DPFROMREG(fd, MIPSInst_FD(ir));
|
||||
rv.d = ieee754dp_maddf(fd, fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmsubf_op: {
|
||||
|
|
@ -2108,7 +2108,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
DPFROMREG(fd, MIPSInst_FD(ir));
|
||||
rv.d = ieee754dp_msubf(fd, fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case frint_op: {
|
||||
|
|
@ -2132,7 +2132,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.w = ieee754dp_2008class(fs);
|
||||
rfmt = w_fmt;
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmin_op: {
|
||||
|
|
@ -2144,7 +2144,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(ft, MIPSInst_FT(ir));
|
||||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.d = ieee754dp_fmin(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmina_op: {
|
||||
|
|
@ -2156,7 +2156,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(ft, MIPSInst_FT(ir));
|
||||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.d = ieee754dp_fmina(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmax_op: {
|
||||
|
|
@ -2168,7 +2168,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(ft, MIPSInst_FT(ir));
|
||||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.d = ieee754dp_fmax(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fmaxa_op: {
|
||||
|
|
@ -2180,7 +2180,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
|
|||
DPFROMREG(ft, MIPSInst_FT(ir));
|
||||
DPFROMREG(fs, MIPSInst_FS(ir));
|
||||
rv.d = ieee754dp_fmaxa(fs, ft);
|
||||
break;
|
||||
goto copcsr;
|
||||
}
|
||||
|
||||
case fabs_op:
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
for the semaphore. */
|
||||
|
||||
#define __PA_LDCW_ALIGNMENT 16
|
||||
#define __PA_LDCW_ALIGN_ORDER 4
|
||||
#define __ldcw_align(a) ({ \
|
||||
unsigned long __ret = (unsigned long) &(a)->lock[0]; \
|
||||
__ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \
|
||||
|
|
@ -28,6 +29,7 @@
|
|||
ldcd). */
|
||||
|
||||
#define __PA_LDCW_ALIGNMENT 4
|
||||
#define __PA_LDCW_ALIGN_ORDER 2
|
||||
#define __ldcw_align(a) (&(a)->slock)
|
||||
#define __LDCW "ldcw,co"
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@
|
|||
#include <asm/pgtable.h>
|
||||
#include <asm/signal.h>
|
||||
#include <asm/unistd.h>
|
||||
#include <asm/ldcw.h>
|
||||
#include <asm/thread_info.h>
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
|
@ -46,6 +47,14 @@
|
|||
#endif
|
||||
|
||||
.import pa_tlb_lock,data
|
||||
.macro load_pa_tlb_lock reg
|
||||
#if __PA_LDCW_ALIGNMENT > 4
|
||||
load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg
|
||||
depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg
|
||||
#else
|
||||
load32 PA(pa_tlb_lock), \reg
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* space_to_prot macro creates a prot id from a space id */
|
||||
|
||||
|
|
@ -457,7 +466,7 @@
|
|||
.macro tlb_lock spc,ptp,pte,tmp,tmp1,fault
|
||||
#ifdef CONFIG_SMP
|
||||
cmpib,COND(=),n 0,\spc,2f
|
||||
load32 PA(pa_tlb_lock),\tmp
|
||||
load_pa_tlb_lock \tmp
|
||||
1: LDCW 0(\tmp),\tmp1
|
||||
cmpib,COND(=) 0,\tmp1,1b
|
||||
nop
|
||||
|
|
@ -480,7 +489,7 @@
|
|||
/* Release pa_tlb_lock lock. */
|
||||
.macro tlb_unlock1 spc,tmp
|
||||
#ifdef CONFIG_SMP
|
||||
load32 PA(pa_tlb_lock),\tmp
|
||||
load_pa_tlb_lock \tmp
|
||||
tlb_unlock0 \spc,\tmp
|
||||
#endif
|
||||
.endm
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@
|
|||
#include <asm/assembly.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/ldcw.h>
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.text
|
||||
|
|
@ -333,8 +334,12 @@ ENDPROC(flush_data_cache_local)
|
|||
|
||||
.macro tlb_lock la,flags,tmp
|
||||
#ifdef CONFIG_SMP
|
||||
ldil L%pa_tlb_lock,%r1
|
||||
ldo R%pa_tlb_lock(%r1),\la
|
||||
#if __PA_LDCW_ALIGNMENT > 4
|
||||
load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la
|
||||
depi 0,31,__PA_LDCW_ALIGN_ORDER, \la
|
||||
#else
|
||||
load32 pa_tlb_lock, \la
|
||||
#endif
|
||||
rsm PSW_SM_I,\flags
|
||||
1: LDCW 0(\la),\tmp
|
||||
cmpib,<>,n 0,\tmp,3f
|
||||
|
|
|
|||
|
|
@ -401,8 +401,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
|
|||
int ret;
|
||||
__u64 target;
|
||||
|
||||
if (is_kernel_addr(addr))
|
||||
return branch_target((unsigned int *)addr);
|
||||
if (is_kernel_addr(addr)) {
|
||||
if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
|
||||
return 0;
|
||||
|
||||
return branch_target(&instr);
|
||||
}
|
||||
|
||||
/* Userspace: need copy instruction here then translate it */
|
||||
pagefault_disable();
|
||||
|
|
|
|||
|
|
@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
|
|||
return retval;
|
||||
}
|
||||
|
||||
groups_sort(group_info);
|
||||
retval = set_current_groups(group_info);
|
||||
put_group_info(group_info);
|
||||
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ config X86
|
|||
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_USE_QUEUED_SPINLOCKS
|
||||
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
|
||||
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
|
||||
select ARCH_WANT_FRAME_POINTERS
|
||||
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
|
||||
|
|
@ -64,6 +64,7 @@ config X86
|
|||
select GENERIC_CLOCKEVENTS_MIN_ADJUST
|
||||
select GENERIC_CMOS_UPDATE
|
||||
select GENERIC_CPU_AUTOPROBE
|
||||
select GENERIC_CPU_VULNERABILITIES
|
||||
select GENERIC_EARLY_IOREMAP
|
||||
select GENERIC_FIND_FIRST_BIT
|
||||
select GENERIC_IOMAP
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
*/
|
||||
#undef CONFIG_PARAVIRT
|
||||
#undef CONFIG_PARAVIRT_SPINLOCKS
|
||||
#undef CONFIG_PAGE_TABLE_ISOLATION
|
||||
#undef CONFIG_KASAN
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@
|
|||
#include <asm/asm.h>
|
||||
#include <asm/smap.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#include <asm/kaiser.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
|
||||
|
|
@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
|
|||
* it is too small to ever cause noticeable irq latency.
|
||||
*/
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
/*
|
||||
* A hypervisor implementation might want to use a label
|
||||
* after the swapgs, so that it can do the swapgs
|
||||
|
|
@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
|
|||
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
|
||||
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
|
||||
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
movq RIP(%rsp), %rcx
|
||||
movq EFLAGS(%rsp), %r11
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
movq RSP(%rsp), %rsp
|
||||
/*
|
||||
* 64-bit SYSRET restores rip from rcx,
|
||||
|
|
@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
|
|||
syscall_return_via_sysret:
|
||||
/* rcx and r11 are already restored (see code above) */
|
||||
RESTORE_C_REGS_EXCEPT_RCX_R11
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
movq RSP(%rsp), %rsp
|
||||
USERGS_SYSRET64
|
||||
|
||||
opportunistic_sysret_failed:
|
||||
/*
|
||||
* This opens a window where we have a user CR3, but are
|
||||
* running in the kernel. This makes using the CS
|
||||
* register useless for telling whether or not we need to
|
||||
* switch CR3 in NMIs. Normal interrupts are OK because
|
||||
* they are off here.
|
||||
*/
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_c_regs_and_iret
|
||||
END(entry_SYSCALL_64)
|
||||
|
|
@ -509,6 +535,7 @@ END(irq_entries_start)
|
|||
* tracking that we're in kernel mode.
|
||||
*/
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3
|
||||
|
||||
/*
|
||||
* We need to tell lockdep that IRQs are off. We can't do this until
|
||||
|
|
@ -568,6 +595,7 @@ GLOBAL(retint_user)
|
|||
mov %rsp,%rdi
|
||||
call prepare_exit_to_usermode
|
||||
TRACE_IRQS_IRETQ
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_regs_and_iret
|
||||
|
||||
|
|
@ -625,6 +653,7 @@ native_irq_return_ldt:
|
|||
pushq %rax
|
||||
pushq %rdi
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3
|
||||
movq PER_CPU_VAR(espfix_waddr), %rdi
|
||||
movq %rax, (0*8)(%rdi) /* RAX */
|
||||
movq (2*8)(%rsp), %rax /* RIP */
|
||||
|
|
@ -640,6 +669,7 @@ native_irq_return_ldt:
|
|||
andl $0xffff0000, %eax
|
||||
popq %rdi
|
||||
orq PER_CPU_VAR(espfix_stack), %rax
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
movq %rax, %rsp
|
||||
popq %rax
|
||||
|
|
@ -995,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
|
|||
/*
|
||||
* Save all registers in pt_regs, and switch gs if needed.
|
||||
* Use slow, but surefire "are we in kernel?" check.
|
||||
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
|
||||
*
|
||||
* Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
|
||||
* ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
|
||||
*/
|
||||
ENTRY(paranoid_entry)
|
||||
cld
|
||||
|
|
@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry)
|
|||
js 1f /* negative -> in kernel */
|
||||
SWAPGS
|
||||
xorl %ebx, %ebx
|
||||
1: ret
|
||||
1:
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* We might have come in between a swapgs and a SWITCH_KERNEL_CR3
|
||||
* on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
|
||||
* Do a conditional SWITCH_KERNEL_CR3: this could safely be done
|
||||
* unconditionally, but we need to find out whether the reverse
|
||||
* should be done on return (conveyed to paranoid_exit in %ebx).
|
||||
*/
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
testl $KAISER_SHADOW_PGD_OFFSET, %eax
|
||||
jz 2f
|
||||
orl $2, %ebx
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
ret
|
||||
END(paranoid_entry)
|
||||
|
||||
/*
|
||||
|
|
@ -1021,19 +1074,26 @@ END(paranoid_entry)
|
|||
* be complicated. Fortunately, there's no good reason
|
||||
* to try to handle preemption here.
|
||||
*
|
||||
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
|
||||
* On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
|
||||
* ebx=1: needs neither swapgs nor SWITCH_USER_CR3
|
||||
* ebx=2: needs both swapgs and SWITCH_USER_CR3
|
||||
* ebx=3: needs SWITCH_USER_CR3 but not swapgs
|
||||
*/
|
||||
ENTRY(paranoid_exit)
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
TRACE_IRQS_OFF_DEBUG
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz paranoid_exit_no_swapgs
|
||||
TRACE_IRQS_IRETQ
|
||||
SWAPGS_UNSAFE_STACK
|
||||
jmp paranoid_exit_restore
|
||||
paranoid_exit_no_swapgs:
|
||||
TRACE_IRQS_IRETQ_DEBUG
|
||||
paranoid_exit_restore:
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
|
||||
testl $2, %ebx /* SWITCH_USER_CR3 needed? */
|
||||
jz paranoid_exit_no_switch
|
||||
SWITCH_USER_CR3
|
||||
paranoid_exit_no_switch:
|
||||
#endif
|
||||
testl $1, %ebx /* swapgs needed? */
|
||||
jnz paranoid_exit_no_swapgs
|
||||
SWAPGS_UNSAFE_STACK
|
||||
paranoid_exit_no_swapgs:
|
||||
RESTORE_EXTRA_REGS
|
||||
RESTORE_C_REGS
|
||||
REMOVE_PT_GPREGS_FROM_STACK 8
|
||||
|
|
@ -1048,6 +1108,13 @@ ENTRY(error_entry)
|
|||
cld
|
||||
SAVE_C_REGS 8
|
||||
SAVE_EXTRA_REGS 8
|
||||
/*
|
||||
* error_entry() always returns with a kernel gsbase and
|
||||
* CR3. We must also have a kernel CR3/gsbase before
|
||||
* calling TRACE_IRQS_*. Just unconditionally switch to
|
||||
* the kernel CR3 here.
|
||||
*/
|
||||
SWITCH_KERNEL_CR3
|
||||
xorl %ebx, %ebx
|
||||
testb $3, CS+8(%rsp)
|
||||
jz .Lerror_kernelspace
|
||||
|
|
@ -1210,6 +1277,10 @@ ENTRY(nmi)
|
|||
*/
|
||||
|
||||
SWAPGS_UNSAFE_STACK
|
||||
/*
|
||||
* percpu variables are mapped with user CR3, so no need
|
||||
* to switch CR3 here.
|
||||
*/
|
||||
cld
|
||||
movq %rsp, %rdx
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
|
@ -1243,12 +1314,34 @@ ENTRY(nmi)
|
|||
|
||||
movq %rsp, %rdi
|
||||
movq $-1, %rsi
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* Unconditionally use kernel CR3 for do_nmi() */
|
||||
/* %rax is saved above, so OK to clobber here */
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
pushq %rax
|
||||
/* mask off "user" bit of pgd address and 12 PCID bits: */
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
call do_nmi
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Unconditionally restore CR3. I know we return to
|
||||
* kernel code that needs user CR3, but do we ever return
|
||||
* to "user mode" where we need the kernel CR3?
|
||||
*/
|
||||
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Return back to user mode. We must *not* do the normal exit
|
||||
* work, because we don't want to enable interrupts. Fortunately,
|
||||
* do_nmi doesn't modify pt_regs.
|
||||
* work, because we don't want to enable interrupts. Do not
|
||||
* switch to user CR3: we might be going back to kernel code
|
||||
* that had a user CR3 set.
|
||||
*/
|
||||
SWAPGS
|
||||
jmp restore_c_regs_and_iret
|
||||
|
|
@ -1445,22 +1538,55 @@ end_repeat_nmi:
|
|||
ALLOC_PT_GPREGS_ON_STACK
|
||||
|
||||
/*
|
||||
* Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
|
||||
* as we should not be calling schedule in NMI context.
|
||||
* Even with normal interrupts enabled. An NMI should not be
|
||||
* setting NEED_RESCHED or anything that normal interrupts and
|
||||
* exceptions might do.
|
||||
* Use the same approach as paranoid_entry to handle SWAPGS, but
|
||||
* without CR3 handling since we do that differently in NMIs. No
|
||||
* need to use paranoid_exit as we should not be calling schedule
|
||||
* in NMI context. Even with normal interrupts enabled. An NMI
|
||||
* should not be setting NEED_RESCHED or anything that normal
|
||||
* interrupts and exceptions might do.
|
||||
*/
|
||||
call paranoid_entry
|
||||
|
||||
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
|
||||
cld
|
||||
SAVE_C_REGS
|
||||
SAVE_EXTRA_REGS
|
||||
movl $1, %ebx
|
||||
movl $MSR_GS_BASE, %ecx
|
||||
rdmsr
|
||||
testl %edx, %edx
|
||||
js 1f /* negative -> in kernel */
|
||||
SWAPGS
|
||||
xorl %ebx, %ebx
|
||||
1:
|
||||
movq %rsp, %rdi
|
||||
movq $-1, %rsi
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/* Unconditionally use kernel CR3 for do_nmi() */
|
||||
/* %rax is saved above, so OK to clobber here */
|
||||
ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
|
||||
/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
|
||||
ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
|
||||
pushq %rax
|
||||
/* mask off "user" bit of pgd address and 12 PCID bits: */
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
|
||||
movq %rax, %cr3
|
||||
2:
|
||||
#endif
|
||||
|
||||
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
|
||||
call do_nmi
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Unconditionally restore CR3. We might be returning to
|
||||
* kernel code that needs user CR3, like just before
|
||||
* a sysret.
|
||||
*/
|
||||
ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
|
||||
#endif
|
||||
|
||||
testl %ebx, %ebx /* swapgs needed? */
|
||||
jnz nmi_restore
|
||||
nmi_swapgs:
|
||||
/* We fixed up CR3 above, so no need to switch it here */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
nmi_restore:
|
||||
RESTORE_EXTRA_REGS
|
||||
|
|
|
|||
|
|
@ -13,6 +13,8 @@
|
|||
#include <asm/irqflags.h>
|
||||
#include <asm/asm.h>
|
||||
#include <asm/smap.h>
|
||||
#include <asm/pgtable_types.h>
|
||||
#include <asm/kaiser.h>
|
||||
#include <linux/linkage.h>
|
||||
#include <linux/err.h>
|
||||
|
||||
|
|
@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
|
|||
ENTRY(entry_SYSENTER_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
|
||||
|
||||
/*
|
||||
|
|
@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
|
|||
ENTRY(entry_SYSCALL_compat)
|
||||
/* Interrupts are off on entry. */
|
||||
SWAPGS_UNSAFE_STACK
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
|
||||
/* Stash user ESP and switch to the kernel stack. */
|
||||
movl %esp, %r8d
|
||||
|
|
@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
|
|||
/* Opportunistic SYSRET */
|
||||
sysret32_from_system_call:
|
||||
TRACE_IRQS_ON /* User mode traces as IRQs on. */
|
||||
SWITCH_USER_CR3
|
||||
movq RBX(%rsp), %rbx /* pt_regs->rbx */
|
||||
movq RBP(%rsp), %rbp /* pt_regs->rbp */
|
||||
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
|
||||
|
|
@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
|
|||
PARAVIRT_ADJUST_EXCEPTION_FRAME
|
||||
ASM_CLAC /* Do this early to minimize exposure */
|
||||
SWAPGS
|
||||
SWITCH_KERNEL_CR3_NO_STACK
|
||||
|
||||
/*
|
||||
* User tracing code (ptrace or signal handlers) might assume that
|
||||
|
|
@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
|
|||
|
||||
/* Go back to user mode. */
|
||||
TRACE_IRQS_ON
|
||||
SWITCH_USER_CR3
|
||||
SWAPGS
|
||||
jmp restore_regs_and_iret
|
||||
END(entry_INT80_compat)
|
||||
|
|
|
|||
|
|
@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PARAVIRT_CLOCK
|
||||
extern u8 pvclock_page
|
||||
__attribute__((visibility("hidden")));
|
||||
#endif
|
||||
|
||||
#ifndef BUILD_VDSO32
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
|
@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
|
|||
|
||||
#ifdef CONFIG_PARAVIRT_CLOCK
|
||||
|
||||
static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
|
||||
static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
|
||||
{
|
||||
const struct pvclock_vsyscall_time_info *pvti_base;
|
||||
int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
|
||||
int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
|
||||
|
||||
BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
|
||||
|
||||
pvti_base = (struct pvclock_vsyscall_time_info *)
|
||||
__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
|
||||
|
||||
return &pvti_base[offset];
|
||||
return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
|
||||
}
|
||||
|
||||
static notrace cycle_t vread_pvclock(int *mode)
|
||||
{
|
||||
const struct pvclock_vsyscall_time_info *pvti;
|
||||
const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
|
||||
cycle_t ret;
|
||||
u64 last;
|
||||
u32 version;
|
||||
u8 flags;
|
||||
unsigned cpu, cpu1;
|
||||
|
||||
u64 tsc, pvti_tsc;
|
||||
u64 last, delta, pvti_system_time;
|
||||
u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
|
||||
|
||||
/*
|
||||
* Note: hypervisor must guarantee that:
|
||||
* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
|
||||
* 2. that per-CPU pvclock time info is updated if the
|
||||
* underlying CPU changes.
|
||||
* 3. that version is increased whenever underlying CPU
|
||||
* changes.
|
||||
* Note: The kernel and hypervisor must guarantee that cpu ID
|
||||
* number maps 1:1 to per-CPU pvclock time info.
|
||||
*
|
||||
* Because the hypervisor is entirely unaware of guest userspace
|
||||
* preemption, it cannot guarantee that per-CPU pvclock time
|
||||
* info is updated if the underlying CPU changes or that the
|
||||
* version is increased whenever underlying CPU changes.
|
||||
*
|
||||
* On KVM, we are guaranteed that pvti updates for any vCPU are
|
||||
* atomic as seen by *all* vCPUs. This is an even stronger
|
||||
* guarantee than we get with a normal seqlock.
|
||||
*
|
||||
* On Xen, we don't appear to have that guarantee, but Xen still
|
||||
* supplies a valid seqlock using the version field.
|
||||
|
||||
* We only do pvclock vdso timing at all if
|
||||
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
|
||||
* mean that all vCPUs have matching pvti and that the TSC is
|
||||
* synced, so we can just look at vCPU 0's pvti.
|
||||
*/
|
||||
do {
|
||||
cpu = __getcpu() & VGETCPU_CPU_MASK;
|
||||
/* TODO: We can put vcpu id into higher bits of pvti.version.
|
||||
* This will save a couple of cycles by getting rid of
|
||||
* __getcpu() calls (Gleb).
|
||||
*/
|
||||
|
||||
pvti = get_pvti(cpu);
|
||||
|
||||
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
|
||||
|
||||
/*
|
||||
* Test we're still on the cpu as well as the version.
|
||||
* We could have been migrated just after the first
|
||||
* vgetcpu but before fetching the version, so we
|
||||
* wouldn't notice a version change.
|
||||
*/
|
||||
cpu1 = __getcpu() & VGETCPU_CPU_MASK;
|
||||
} while (unlikely(cpu != cpu1 ||
|
||||
(pvti->pvti.version & 1) ||
|
||||
pvti->pvti.version != version));
|
||||
|
||||
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
|
||||
if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
|
||||
*mode = VCLOCK_NONE;
|
||||
return 0;
|
||||
}
|
||||
|
||||
do {
|
||||
version = pvti->version;
|
||||
|
||||
/* This is also a read barrier, so we'll read version first. */
|
||||
tsc = rdtsc_ordered();
|
||||
|
||||
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
|
||||
pvti_tsc_shift = pvti->tsc_shift;
|
||||
pvti_system_time = pvti->system_time;
|
||||
pvti_tsc = pvti->tsc_timestamp;
|
||||
|
||||
/* Make sure that the version double-check is last. */
|
||||
smp_rmb();
|
||||
} while (unlikely((version & 1) || version != pvti->version));
|
||||
|
||||
delta = tsc - pvti_tsc;
|
||||
ret = pvti_system_time +
|
||||
pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
|
||||
pvti_tsc_shift);
|
||||
|
||||
/* refer to tsc.c read_tsc() comment for rationale */
|
||||
last = gtod->cycle_last;
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ SECTIONS
|
|||
* segment.
|
||||
*/
|
||||
|
||||
vvar_start = . - 2 * PAGE_SIZE;
|
||||
vvar_start = . - 3 * PAGE_SIZE;
|
||||
vvar_page = vvar_start;
|
||||
|
||||
/* Place all vvars at the offsets in asm/vvar.h. */
|
||||
|
|
@ -36,6 +36,7 @@ SECTIONS
|
|||
#undef EMIT_VVAR
|
||||
|
||||
hpet_page = vvar_start + PAGE_SIZE;
|
||||
pvclock_page = vvar_start + 2 * PAGE_SIZE;
|
||||
|
||||
. = SIZEOF_HEADERS;
|
||||
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ enum {
|
|||
sym_vvar_start,
|
||||
sym_vvar_page,
|
||||
sym_hpet_page,
|
||||
sym_pvclock_page,
|
||||
sym_VDSO_FAKE_SECTION_TABLE_START,
|
||||
sym_VDSO_FAKE_SECTION_TABLE_END,
|
||||
};
|
||||
|
|
@ -80,6 +81,7 @@ enum {
|
|||
const int special_pages[] = {
|
||||
sym_vvar_page,
|
||||
sym_hpet_page,
|
||||
sym_pvclock_page,
|
||||
};
|
||||
|
||||
struct vdso_sym {
|
||||
|
|
@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
|
|||
[sym_vvar_start] = {"vvar_start", true},
|
||||
[sym_vvar_page] = {"vvar_page", true},
|
||||
[sym_hpet_page] = {"hpet_page", true},
|
||||
[sym_pvclock_page] = {"pvclock_page", true},
|
||||
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
|
||||
"VDSO_FAKE_SECTION_TABLE_START", false
|
||||
},
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
#include <linux/random.h>
|
||||
#include <linux/elf.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <asm/pvclock.h>
|
||||
#include <asm/vgtod.h>
|
||||
#include <asm/proto.h>
|
||||
#include <asm/vdso.h>
|
||||
|
|
@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
|||
.name = "[vvar]",
|
||||
.pages = no_pages,
|
||||
};
|
||||
struct pvclock_vsyscall_time_info *pvti;
|
||||
|
||||
if (calculate_addr) {
|
||||
addr = vdso_addr(current->mm->start_stack,
|
||||
|
|
@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
|
|||
}
|
||||
#endif
|
||||
|
||||
pvti = pvclock_pvti_cpu0_va();
|
||||
if (pvti && image->sym_pvclock_page) {
|
||||
ret = remap_pfn_range(vma,
|
||||
text_start + image->sym_pvclock_page,
|
||||
__pa(pvti) >> PAGE_SHIFT,
|
||||
PAGE_SIZE,
|
||||
PAGE_READONLY);
|
||||
|
||||
if (ret)
|
||||
goto up_fail;
|
||||
}
|
||||
|
||||
up_fail:
|
||||
if (ret)
|
||||
current->mm->context.vdso = NULL;
|
||||
|
|
|
|||
|
|
@ -66,6 +66,11 @@ static int __init vsyscall_setup(char *str)
|
|||
}
|
||||
early_param("vsyscall", vsyscall_setup);
|
||||
|
||||
bool vsyscall_enabled(void)
|
||||
{
|
||||
return vsyscall_mode != NONE;
|
||||
}
|
||||
|
||||
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
|
||||
const char *message)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
|
|||
".popsection\n" \
|
||||
".pushsection .altinstr_replacement, \"ax\"\n" \
|
||||
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
|
||||
".popsection"
|
||||
".popsection\n"
|
||||
|
||||
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
|
||||
OLDINSTR_2(oldinstr, 1, 2) \
|
||||
|
|
@ -149,7 +149,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
|
|||
".pushsection .altinstr_replacement, \"ax\"\n" \
|
||||
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
|
||||
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
|
||||
".popsection"
|
||||
".popsection\n"
|
||||
|
||||
/*
|
||||
* This must be included *after* the definition of ALTERNATIVE due to
|
||||
|
|
|
|||
|
|
@ -2,5 +2,7 @@
|
|||
#define _ASM_X86_CMDLINE_H
|
||||
|
||||
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
|
||||
int cmdline_find_option(const char *cmdline_ptr, const char *option,
|
||||
char *buffer, int bufsize);
|
||||
|
||||
#endif /* _ASM_X86_CMDLINE_H */
|
||||
|
|
|
|||
|
|
@ -187,6 +187,7 @@
|
|||
#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
|
||||
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
|
||||
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
|
||||
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
|
||||
#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
|
||||
#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
|
||||
#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
|
||||
|
|
@ -199,6 +200,9 @@
|
|||
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
|
||||
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
|
||||
|
||||
/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
|
||||
#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
|
||||
|
||||
/* Virtualization flags: Linux defined, word 8 */
|
||||
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
|
||||
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
|
||||
|
|
@ -273,6 +277,9 @@
|
|||
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
|
||||
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
|
||||
#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
|
||||
#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
|
||||
#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
|
||||
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
|
||||
|
||||
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
|
||||
|
||||
|
|
@ -355,6 +362,8 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
|
|||
set_bit(bit, (unsigned long *)cpu_caps_set); \
|
||||
} while (0)
|
||||
|
||||
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
|
||||
|
||||
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
|
||||
#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
|
||||
#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ struct gdt_page {
|
|||
struct desc_struct gdt[GDT_ENTRIES];
|
||||
} __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
|
||||
DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
|
||||
|
||||
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -21,11 +21,13 @@
|
|||
# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
|
||||
# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
|
||||
# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
|
||||
# define DISABLE_PCID 0
|
||||
#else
|
||||
# define DISABLE_VME 0
|
||||
# define DISABLE_K6_MTRR 0
|
||||
# define DISABLE_CYRIX_ARR 0
|
||||
# define DISABLE_CENTAUR_MCR 0
|
||||
# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
|
||||
#endif /* CONFIG_X86_64 */
|
||||
|
||||
/*
|
||||
|
|
@ -35,7 +37,7 @@
|
|||
#define DISABLED_MASK1 0
|
||||
#define DISABLED_MASK2 0
|
||||
#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
|
||||
#define DISABLED_MASK4 0
|
||||
#define DISABLED_MASK4 (DISABLE_PCID)
|
||||
#define DISABLED_MASK5 0
|
||||
#define DISABLED_MASK6 0
|
||||
#define DISABLED_MASK7 0
|
||||
|
|
|
|||
|
|
@ -22,12 +22,8 @@ typedef struct {
|
|||
#ifdef CONFIG_SMP
|
||||
unsigned int irq_resched_count;
|
||||
unsigned int irq_call_count;
|
||||
/*
|
||||
* irq_tlb_count is double-counted in irq_call_count, so it must be
|
||||
* subtracted from irq_call_count when displaying irq_call_count
|
||||
*/
|
||||
unsigned int irq_tlb_count;
|
||||
#endif
|
||||
unsigned int irq_tlb_count;
|
||||
#ifdef CONFIG_X86_THERMAL_VECTOR
|
||||
unsigned int irq_thermal_count;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -187,7 +187,7 @@ extern char irq_entries_start[];
|
|||
#define VECTOR_RETRIGGERED ((void *)~0UL)
|
||||
|
||||
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
|
||||
DECLARE_PER_CPU(vector_irq_t, vector_irq);
|
||||
DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
|
||||
|
||||
#endif /* !ASSEMBLY_ */
|
||||
|
||||
|
|
|
|||
151
arch/x86/include/asm/kaiser.h
Normal file
151
arch/x86/include/asm/kaiser.h
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
#ifndef _ASM_X86_KAISER_H
|
||||
#define _ASM_X86_KAISER_H
|
||||
|
||||
#include <uapi/asm/processor-flags.h> /* For PCID constants */
|
||||
|
||||
/*
|
||||
* This file includes the definitions for the KAISER feature.
|
||||
* KAISER is a countermeasure against x86_64 side channel attacks on
|
||||
* the kernel virtual memory. It has a shadow pgd for every process: the
|
||||
* shadow pgd has a minimalistic kernel-set mapped, but includes the whole
|
||||
* user memory. Within a kernel context switch, or when an interrupt is handled,
|
||||
* the pgd is switched to the normal one. When the system switches to user mode,
|
||||
* the shadow pgd is enabled. By this, the virtual memory caches are freed,
|
||||
* and the user may not attack the whole kernel memory.
|
||||
*
|
||||
* A minimalistic kernel mapping holds the parts needed to be mapped in user
|
||||
* mode, such as the entry/exit functions of the user space, or the stacks.
|
||||
*/
|
||||
|
||||
#define KAISER_SHADOW_PGD_OFFSET 0x1000
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* A page table address must have this alignment to stay the same when
|
||||
* KAISER_SHADOW_PGD_OFFSET mask is applied
|
||||
*/
|
||||
#define KAISER_KERNEL_PGD_ALIGNMENT (KAISER_SHADOW_PGD_OFFSET << 1)
|
||||
#else
|
||||
#define KAISER_KERNEL_PGD_ALIGNMENT PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
|
||||
.macro _SWITCH_TO_KERNEL_CR3 reg
|
||||
movq %cr3, \reg
|
||||
andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
|
||||
/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
|
||||
ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
|
||||
movq \reg, %cr3
|
||||
.endm
|
||||
|
||||
.macro _SWITCH_TO_USER_CR3 reg regb
|
||||
/*
|
||||
* regb must be the low byte portion of reg: because we have arranged
|
||||
* for the low byte of the user PCID to serve as the high byte of NOFLUSH
|
||||
* (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
|
||||
* not enabled): so that the one register can update both memory and cr3.
|
||||
*/
|
||||
movq %cr3, \reg
|
||||
orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
|
||||
js 9f
|
||||
/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
|
||||
movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
|
||||
9:
|
||||
movq \reg, %cr3
|
||||
.endm
|
||||
|
||||
.macro SWITCH_KERNEL_CR3
|
||||
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
|
||||
_SWITCH_TO_KERNEL_CR3 %rax
|
||||
popq %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_USER_CR3
|
||||
ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
|
||||
_SWITCH_TO_USER_CR3 %rax %al
|
||||
popq %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
.macro SWITCH_KERNEL_CR3_NO_STACK
|
||||
ALTERNATIVE "jmp 8f", \
|
||||
__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
|
||||
X86_FEATURE_KAISER
|
||||
_SWITCH_TO_KERNEL_CR3 %rax
|
||||
movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
|
||||
8:
|
||||
.endm
|
||||
|
||||
#else /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
.macro SWITCH_KERNEL_CR3
|
||||
.endm
|
||||
.macro SWITCH_USER_CR3
|
||||
.endm
|
||||
.macro SWITCH_KERNEL_CR3_NO_STACK
|
||||
.endm
|
||||
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
#else /* __ASSEMBLY__ */
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Upon kernel/user mode switch, it may happen that the address
|
||||
* space has to be switched before the registers have been
|
||||
* stored. To change the address space, another register is
|
||||
* needed. A register therefore has to be stored/restored.
|
||||
*/
|
||||
DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
|
||||
|
||||
DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
|
||||
|
||||
extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
|
||||
|
||||
extern int kaiser_enabled;
|
||||
extern void __init kaiser_check_boottime_disable(void);
|
||||
#else
|
||||
#define kaiser_enabled 0
|
||||
static inline void __init kaiser_check_boottime_disable(void) {}
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
/*
|
||||
* Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
|
||||
* so as to build with tests on kaiser_enabled instead of #ifdefs.
|
||||
*/
|
||||
|
||||
/**
|
||||
* kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
|
||||
* @addr: the start address of the range
|
||||
* @size: the size of the range
|
||||
* @flags: The mapping flags of the pages
|
||||
*
|
||||
* The mapping is done on a global scope, so no bigger
|
||||
* synchronization has to be done. The pages have to be
|
||||
* manually unmapped again when they are not needed any longer.
|
||||
*/
|
||||
extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
|
||||
|
||||
/**
|
||||
* kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
|
||||
* @start: the start address of the range
|
||||
* @size: the size of the range
|
||||
*/
|
||||
extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
|
||||
|
||||
/**
|
||||
* kaiser_init - Initialize the shadow mapping
|
||||
*
|
||||
* Most parts of the shadow mapping can be mapped upon boot
|
||||
* time. Only per-process things like the thread stacks
|
||||
* or a new LDT have to be mapped at runtime. These boot-
|
||||
* time mappings are permanent and never unmapped.
|
||||
*/
|
||||
extern void kaiser_init(void);
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_KAISER_H */
|
||||
|
|
@ -24,12 +24,6 @@ typedef struct {
|
|||
atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
|
||||
} mm_context_t;
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
void leave_mm(int cpu);
|
||||
#else
|
||||
static inline void leave_mm(int cpu)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_MMU_H */
|
||||
|
|
|
|||
|
|
@ -98,109 +98,16 @@ static inline void load_mm_ldt(struct mm_struct *mm)
|
|||
|
||||
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
unsigned cpu = smp_processor_id();
|
||||
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk);
|
||||
|
||||
if (likely(prev != next)) {
|
||||
#ifdef CONFIG_SMP
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
this_cpu_write(cpu_tlbstate.active_mm, next);
|
||||
#endif
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* Re-load page tables.
|
||||
*
|
||||
* This logic has an ordering constraint:
|
||||
*
|
||||
* CPU 0: Write to a PTE for 'next'
|
||||
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
||||
* CPU 1: set bit 1 in next's mm_cpumask
|
||||
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
||||
*
|
||||
* We need to prevent an outcome in which CPU 1 observes
|
||||
* the new PTE value and CPU 0 observes bit 1 clear in
|
||||
* mm_cpumask. (If that occurs, then the IPI will never
|
||||
* be sent, and CPU 0's TLB will contain a stale entry.)
|
||||
*
|
||||
* The bad outcome can occur if either CPU's load is
|
||||
* reordered before that CPU's store, so both CPUs must
|
||||
* execute full barriers to prevent this from happening.
|
||||
*
|
||||
* Thus, switch_mm needs a full barrier between the
|
||||
* store to mm_cpumask and any operation that could load
|
||||
* from next->pgd. TLB fills are special and can happen
|
||||
* due to instruction fetches or for no reason at all,
|
||||
* and neither LOCK nor MFENCE orders them.
|
||||
* Fortunately, load_cr3() is serializing and gives the
|
||||
* ordering guarantee we need.
|
||||
*
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
|
||||
/* Stop flush ipis for the previous mm */
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(prev));
|
||||
|
||||
/* Load per-mm CR4 state */
|
||||
load_mm_cr4(next);
|
||||
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
/*
|
||||
* Load the LDT, if the LDT is different.
|
||||
*
|
||||
* It's possible that prev->context.ldt doesn't match
|
||||
* the LDT register. This can happen if leave_mm(prev)
|
||||
* was called and then modify_ldt changed
|
||||
* prev->context.ldt but suppressed an IPI to this CPU.
|
||||
* In this case, prev->context.ldt != NULL, because we
|
||||
* never set context.ldt to NULL while the mm still
|
||||
* exists. That means that next->context.ldt !=
|
||||
* prev->context.ldt, because mms never share an LDT.
|
||||
*/
|
||||
if (unlikely(prev->context.ldt != next->context.ldt))
|
||||
load_mm_ldt(next);
|
||||
#endif
|
||||
}
|
||||
#ifdef CONFIG_SMP
|
||||
else {
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
|
||||
|
||||
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
||||
/*
|
||||
* On established mms, the mm_cpumask is only changed
|
||||
* from irq context, from ptep_clear_flush() while in
|
||||
* lazy tlb mode, and here. Irqs are blocked during
|
||||
* schedule, protecting us from simultaneous changes.
|
||||
*/
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* We were in lazy tlb mode and leave_mm disabled
|
||||
* tlb flush IPI delivery. We must reload CR3
|
||||
* to make sure to use no freed page tables.
|
||||
*
|
||||
* As above, load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask write.
|
||||
*/
|
||||
load_cr3(next->pgd);
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
load_mm_cr4(next);
|
||||
load_mm_ldt(next);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk);
|
||||
#define switch_mm_irqs_off switch_mm_irqs_off
|
||||
|
||||
#define activate_mm(prev, next) \
|
||||
do { \
|
||||
|
|
|
|||
|
|
@ -18,6 +18,12 @@
|
|||
#ifndef __ASSEMBLY__
|
||||
#include <asm/x86_init.h>
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
extern int kaiser_enabled;
|
||||
#else
|
||||
#define kaiser_enabled 0
|
||||
#endif
|
||||
|
||||
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
|
||||
void ptdump_walk_pgd_level_checkwx(void);
|
||||
|
||||
|
|
@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
|
|||
|
||||
static inline int pgd_bad(pgd_t pgd)
|
||||
{
|
||||
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
|
||||
pgdval_t ignore_flags = _PAGE_USER;
|
||||
/*
|
||||
* We set NX on KAISER pgds that map userspace memory so
|
||||
* that userspace can not meaningfully use the kernel
|
||||
* page table by accident; it will fault on the first
|
||||
* instruction it tries to run. See native_set_pgd().
|
||||
*/
|
||||
if (kaiser_enabled)
|
||||
ignore_flags |= _PAGE_NX;
|
||||
|
||||
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
|
||||
}
|
||||
|
||||
static inline int pgd_none(pgd_t pgd)
|
||||
|
|
@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
|
|||
*/
|
||||
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
|
||||
{
|
||||
memcpy(dst, src, count * sizeof(pgd_t));
|
||||
memcpy(dst, src, count * sizeof(pgd_t));
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
if (kaiser_enabled) {
|
||||
/* Clone the shadow pgd part as well */
|
||||
memcpy(native_get_shadow_pgd(dst),
|
||||
native_get_shadow_pgd(src),
|
||||
count * sizeof(pgd_t));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
|
||||
|
|
|
|||
|
|
@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
|
|||
native_set_pud(pud, native_make_pud(0));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
|
||||
|
||||
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
/* linux/mmdebug.h may not have been included at this point */
|
||||
BUG_ON(!kaiser_enabled);
|
||||
#endif
|
||||
return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
|
||||
}
|
||||
#else
|
||||
static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
return pgd;
|
||||
}
|
||||
static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
|
||||
{
|
||||
BUILD_BUG_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
|
||||
|
||||
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
|
||||
{
|
||||
*pgdp = pgd;
|
||||
*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
|
||||
}
|
||||
|
||||
static inline void native_pgd_clear(pgd_t *pgd)
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@
|
|||
#define _PAGE_NX (_AT(pteval_t, 0))
|
||||
#endif
|
||||
|
||||
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
|
||||
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
|
||||
|
||||
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
|
||||
_PAGE_ACCESSED | _PAGE_DIRTY)
|
||||
|
|
@ -102,6 +102,33 @@
|
|||
_PAGE_SOFT_DIRTY)
|
||||
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
|
||||
|
||||
/* The ASID is the lower 12 bits of CR3 */
|
||||
#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
|
||||
|
||||
/* Mask for all the PCID-related bits in CR3: */
|
||||
#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
|
||||
#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
|
||||
|
||||
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
|
||||
/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
|
||||
#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
|
||||
|
||||
#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
|
||||
#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
|
||||
#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
|
||||
#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
|
||||
#else
|
||||
#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
|
||||
/*
|
||||
* PCIDs are unsupported on 32-bit and none of these bits can be
|
||||
* set in CR3:
|
||||
*/
|
||||
#define X86_CR3_PCID_KERN_FLUSH (0)
|
||||
#define X86_CR3_PCID_USER_FLUSH (0)
|
||||
#define X86_CR3_PCID_KERN_NOFLUSH (0)
|
||||
#define X86_CR3_PCID_USER_NOFLUSH (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The cache modes defined here are used to translate between pure SW usage
|
||||
* and the HW defined cache mode bits and/or PAT entries.
|
||||
|
|
|
|||
|
|
@ -156,8 +156,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
|
|||
extern struct cpuinfo_x86 new_cpu_data;
|
||||
|
||||
extern struct tss_struct doublefault_tss;
|
||||
extern __u32 cpu_caps_cleared[NCAPINTS];
|
||||
extern __u32 cpu_caps_set[NCAPINTS];
|
||||
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
|
||||
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
|
||||
|
|
@ -305,7 +305,7 @@ struct tss_struct {
|
|||
|
||||
} ____cacheline_aligned;
|
||||
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
|
||||
|
|
|
|||
|
|
@ -4,6 +4,15 @@
|
|||
#include <linux/clocksource.h>
|
||||
#include <asm/pvclock-abi.h>
|
||||
|
||||
#ifdef CONFIG_KVM_GUEST
|
||||
extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
|
||||
#else
|
||||
static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* some helper functions for xen and kvm pv clock sources */
|
||||
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
|
||||
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
|
||||
|
|
|
|||
|
|
@ -6,6 +6,55 @@
|
|||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/special_insns.h>
|
||||
#include <asm/smp.h>
|
||||
|
||||
static inline void __invpcid(unsigned long pcid, unsigned long addr,
|
||||
unsigned long type)
|
||||
{
|
||||
struct { u64 d[2]; } desc = { { pcid, addr } };
|
||||
|
||||
/*
|
||||
* The memory clobber is because the whole point is to invalidate
|
||||
* stale TLB entries and, especially if we're flushing global
|
||||
* mappings, we don't want the compiler to reorder any subsequent
|
||||
* memory accesses before the TLB flush.
|
||||
*
|
||||
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
|
||||
* invpcid (%rcx), %rax in long mode.
|
||||
*/
|
||||
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
|
||||
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
|
||||
}
|
||||
|
||||
#define INVPCID_TYPE_INDIV_ADDR 0
|
||||
#define INVPCID_TYPE_SINGLE_CTXT 1
|
||||
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
|
||||
#define INVPCID_TYPE_ALL_NON_GLOBAL 3
|
||||
|
||||
/* Flush all mappings for a given pcid and addr, not including globals. */
|
||||
static inline void invpcid_flush_one(unsigned long pcid,
|
||||
unsigned long addr)
|
||||
{
|
||||
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
|
||||
}
|
||||
|
||||
/* Flush all mappings for a given PCID, not including globals. */
|
||||
static inline void invpcid_flush_single_context(unsigned long pcid)
|
||||
{
|
||||
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
|
||||
}
|
||||
|
||||
/* Flush all mappings, including globals, for all PCIDs. */
|
||||
static inline void invpcid_flush_all(void)
|
||||
{
|
||||
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
|
||||
}
|
||||
|
||||
/* Flush all mappings for all PCIDs except globals. */
|
||||
static inline void invpcid_flush_all_nonglobals(void)
|
||||
{
|
||||
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PARAVIRT
|
||||
#include <asm/paravirt.h>
|
||||
|
|
@ -16,10 +65,8 @@
|
|||
#endif
|
||||
|
||||
struct tlb_state {
|
||||
#ifdef CONFIG_SMP
|
||||
struct mm_struct *active_mm;
|
||||
int state;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Access to this CR4 shadow and to H/W CR4 is protected by
|
||||
|
|
@ -84,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
|
|||
cr4_set_bits(mask);
|
||||
}
|
||||
|
||||
/*
|
||||
* Declare a couple of kaiser interfaces here for convenience,
|
||||
* to avoid the need for asm/kaiser.h in unexpected places.
|
||||
*/
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
extern int kaiser_enabled;
|
||||
extern void kaiser_setup_pcid(void);
|
||||
extern void kaiser_flush_tlb_on_return_to_user(void);
|
||||
#else
|
||||
#define kaiser_enabled 0
|
||||
static inline void kaiser_setup_pcid(void)
|
||||
{
|
||||
}
|
||||
static inline void kaiser_flush_tlb_on_return_to_user(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void __native_flush_tlb(void)
|
||||
{
|
||||
/*
|
||||
|
|
@ -92,6 +157,8 @@ static inline void __native_flush_tlb(void)
|
|||
* back:
|
||||
*/
|
||||
preempt_disable();
|
||||
if (kaiser_enabled)
|
||||
kaiser_flush_tlb_on_return_to_user();
|
||||
native_write_cr3(native_read_cr3());
|
||||
preempt_enable();
|
||||
}
|
||||
|
|
@ -101,39 +168,84 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
|
|||
unsigned long cr4;
|
||||
|
||||
cr4 = this_cpu_read(cpu_tlbstate.cr4);
|
||||
/* clear PGE */
|
||||
native_write_cr4(cr4 & ~X86_CR4_PGE);
|
||||
/* write old PGE again and flush TLBs */
|
||||
native_write_cr4(cr4);
|
||||
if (cr4 & X86_CR4_PGE) {
|
||||
/* clear PGE and flush TLB of all entries */
|
||||
native_write_cr4(cr4 & ~X86_CR4_PGE);
|
||||
/* restore PGE as it was before */
|
||||
native_write_cr4(cr4);
|
||||
} else {
|
||||
/* do it with cr3, letting kaiser flush user PCID */
|
||||
__native_flush_tlb();
|
||||
}
|
||||
}
|
||||
|
||||
static inline void __native_flush_tlb_global(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (this_cpu_has(X86_FEATURE_INVPCID)) {
|
||||
/*
|
||||
* Using INVPCID is considerably faster than a pair of writes
|
||||
* to CR4 sandwiched inside an IRQ flag save/restore.
|
||||
*
|
||||
* Note, this works with CR4.PCIDE=0 or 1.
|
||||
*/
|
||||
invpcid_flush_all();
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read-modify-write to CR4 - protect it from preemption and
|
||||
* from interrupts. (Use the raw variant because this code can
|
||||
* be called from deep inside debugging code.)
|
||||
*/
|
||||
raw_local_irq_save(flags);
|
||||
|
||||
__native_flush_tlb_global_irq_disabled();
|
||||
|
||||
raw_local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static inline void __native_flush_tlb_single(unsigned long addr)
|
||||
{
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
/*
|
||||
* SIMICS #GP's if you run INVPCID with type 2/3
|
||||
* and X86_CR4_PCIDE clear. Shame!
|
||||
*
|
||||
* The ASIDs used below are hard-coded. But, we must not
|
||||
* call invpcid(type=1/2) before CR4.PCIDE=1. Just call
|
||||
* invlpg in the case we are called early.
|
||||
*/
|
||||
|
||||
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
|
||||
if (kaiser_enabled)
|
||||
kaiser_flush_tlb_on_return_to_user();
|
||||
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
|
||||
return;
|
||||
}
|
||||
/* Flush the address out of both PCIDs. */
|
||||
/*
|
||||
* An optimization here might be to determine addresses
|
||||
* that are only kernel-mapped and only flush the kernel
|
||||
* ASID. But, userspace flushes are probably much more
|
||||
* important performance-wise.
|
||||
*
|
||||
* Make sure to do only a single invpcid when KAISER is
|
||||
* disabled and we have only a single ASID.
|
||||
*/
|
||||
if (kaiser_enabled)
|
||||
invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
|
||||
invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
|
||||
}
|
||||
|
||||
static inline void __flush_tlb_all(void)
|
||||
{
|
||||
if (cpu_has_pge)
|
||||
__flush_tlb_global();
|
||||
else
|
||||
__flush_tlb();
|
||||
__flush_tlb_global();
|
||||
/*
|
||||
* Note: if we somehow had PCID but not PGE, then this wouldn't work --
|
||||
* we'd end up flushing kernel translations for the current ASID but
|
||||
* we might fail to flush kernel translations for other cached ASIDs.
|
||||
*
|
||||
* To avoid this issue, we force PCID off if PGE is off.
|
||||
*/
|
||||
}
|
||||
|
||||
static inline void __flush_tlb_one(unsigned long addr)
|
||||
|
|
@ -147,7 +259,6 @@ static inline void __flush_tlb_one(unsigned long addr)
|
|||
/*
|
||||
* TLB flushing:
|
||||
*
|
||||
* - flush_tlb() flushes the current mm struct TLBs
|
||||
* - flush_tlb_all() flushes all processes TLBs
|
||||
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
|
||||
* - flush_tlb_page(vma, vmaddr) flushes one page
|
||||
|
|
@ -159,84 +270,6 @@ static inline void __flush_tlb_one(unsigned long addr)
|
|||
* and page-granular flushes are available only on i486 and up.
|
||||
*/
|
||||
|
||||
#ifndef CONFIG_SMP
|
||||
|
||||
/* "_up" is for UniProcessor.
|
||||
*
|
||||
* This is a helper for other header functions. *Not* intended to be called
|
||||
* directly. All global TLB flushes need to either call this, or to bump the
|
||||
* vm statistics themselves.
|
||||
*/
|
||||
static inline void __flush_tlb_up(void)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
__flush_tlb();
|
||||
}
|
||||
|
||||
static inline void flush_tlb_all(void)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
__flush_tlb_all();
|
||||
}
|
||||
|
||||
static inline void flush_tlb(void)
|
||||
{
|
||||
__flush_tlb_up();
|
||||
}
|
||||
|
||||
static inline void local_flush_tlb(void)
|
||||
{
|
||||
__flush_tlb_up();
|
||||
}
|
||||
|
||||
static inline void flush_tlb_mm(struct mm_struct *mm)
|
||||
{
|
||||
if (mm == current->active_mm)
|
||||
__flush_tlb_up();
|
||||
}
|
||||
|
||||
static inline void flush_tlb_page(struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
if (vma->vm_mm == current->active_mm)
|
||||
__flush_tlb_one(addr);
|
||||
}
|
||||
|
||||
static inline void flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
if (vma->vm_mm == current->active_mm)
|
||||
__flush_tlb_up();
|
||||
}
|
||||
|
||||
static inline void flush_tlb_mm_range(struct mm_struct *mm,
|
||||
unsigned long start, unsigned long end, unsigned long vmflag)
|
||||
{
|
||||
if (mm == current->active_mm)
|
||||
__flush_tlb_up();
|
||||
}
|
||||
|
||||
static inline void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
struct mm_struct *mm,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void reset_lazy_tlbstate(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void flush_tlb_kernel_range(unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
flush_tlb_all();
|
||||
}
|
||||
|
||||
#else /* SMP */
|
||||
|
||||
#include <asm/smp.h>
|
||||
|
||||
#define local_flush_tlb() __flush_tlb()
|
||||
|
||||
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
|
||||
|
|
@ -245,13 +278,14 @@ static inline void flush_tlb_kernel_range(unsigned long start,
|
|||
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
|
||||
|
||||
extern void flush_tlb_all(void);
|
||||
extern void flush_tlb_current_task(void);
|
||||
extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
|
||||
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned long vmflag);
|
||||
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
|
||||
|
||||
#define flush_tlb() flush_tlb_current_task()
|
||||
static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
|
||||
{
|
||||
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
|
||||
}
|
||||
|
||||
void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
struct mm_struct *mm,
|
||||
|
|
@ -266,14 +300,6 @@ static inline void reset_lazy_tlbstate(void)
|
|||
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
|
||||
}
|
||||
|
||||
#endif /* SMP */
|
||||
|
||||
/* Not inlined due to inc_irq_stat not being defined yet */
|
||||
#define flush_tlb_local() { \
|
||||
inc_irq_stat(irq_tlb_count); \
|
||||
local_flush_tlb(); \
|
||||
}
|
||||
|
||||
#ifndef CONFIG_PARAVIRT
|
||||
#define flush_tlb_others(mask, mm, start, end) \
|
||||
native_flush_tlb_others(mask, mm, start, end)
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ struct vdso_image {
|
|||
|
||||
long sym_vvar_page;
|
||||
long sym_hpet_page;
|
||||
long sym_pvclock_page;
|
||||
long sym_VDSO32_NOTE_MASK;
|
||||
long sym___kernel_sigreturn;
|
||||
long sym___kernel_rt_sigreturn;
|
||||
|
|
|
|||
|
|
@ -12,12 +12,14 @@ extern void map_vsyscall(void);
|
|||
* Returns true if handled.
|
||||
*/
|
||||
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
|
||||
extern bool vsyscall_enabled(void);
|
||||
#else
|
||||
static inline void map_vsyscall(void) {}
|
||||
static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline bool vsyscall_enabled(void) { return false; }
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_X86_VSYSCALL_H */
|
||||
|
|
|
|||
|
|
@ -77,7 +77,8 @@
|
|||
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
|
||||
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
|
||||
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
|
||||
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
|
||||
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
|
||||
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
|
||||
|
||||
/*
|
||||
* Intel CPU features in CR4
|
||||
|
|
|
|||
|
|
@ -321,13 +321,12 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
|
|||
#ifdef CONFIG_X86_IO_APIC
|
||||
#define MP_ISA_BUS 0
|
||||
|
||||
static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
|
||||
u8 trigger, u32 gsi);
|
||||
|
||||
static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
|
||||
u32 gsi)
|
||||
{
|
||||
int ioapic;
|
||||
int pin;
|
||||
struct mpc_intsrc mp_irq;
|
||||
|
||||
/*
|
||||
* Check bus_irq boundary.
|
||||
*/
|
||||
|
|
@ -336,14 +335,6 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
|
|||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert 'gsi' to 'ioapic.pin'.
|
||||
*/
|
||||
ioapic = mp_find_ioapic(gsi);
|
||||
if (ioapic < 0)
|
||||
return;
|
||||
pin = mp_find_ioapic_pin(ioapic, gsi);
|
||||
|
||||
/*
|
||||
* TBD: This check is for faulty timer entries, where the override
|
||||
* erroneously sets the trigger to level, resulting in a HUGE
|
||||
|
|
@ -352,16 +343,8 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
|
|||
if ((bus_irq == 0) && (trigger == 3))
|
||||
trigger = 1;
|
||||
|
||||
mp_irq.type = MP_INTSRC;
|
||||
mp_irq.irqtype = mp_INT;
|
||||
mp_irq.irqflag = (trigger << 2) | polarity;
|
||||
mp_irq.srcbus = MP_ISA_BUS;
|
||||
mp_irq.srcbusirq = bus_irq; /* IRQ */
|
||||
mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
|
||||
mp_irq.dstirq = pin; /* INTIN# */
|
||||
|
||||
mp_save_irq(&mp_irq);
|
||||
|
||||
if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0)
|
||||
return;
|
||||
/*
|
||||
* Reset default identity mapping if gsi is also a legacy IRQ,
|
||||
* otherwise there will be more than one entry with the same GSI
|
||||
|
|
@ -408,6 +391,34 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
|
||||
u8 trigger, u32 gsi)
|
||||
{
|
||||
struct mpc_intsrc mp_irq;
|
||||
int ioapic, pin;
|
||||
|
||||
/* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
|
||||
ioapic = mp_find_ioapic(gsi);
|
||||
if (ioapic < 0) {
|
||||
pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
|
||||
return ioapic;
|
||||
}
|
||||
|
||||
pin = mp_find_ioapic_pin(ioapic, gsi);
|
||||
|
||||
mp_irq.type = MP_INTSRC;
|
||||
mp_irq.irqtype = mp_INT;
|
||||
mp_irq.irqflag = (trigger << 2) | polarity;
|
||||
mp_irq.srcbus = MP_ISA_BUS;
|
||||
mp_irq.srcbusirq = bus_irq;
|
||||
mp_irq.dstapic = mpc_ioapic_id(ioapic);
|
||||
mp_irq.dstirq = pin;
|
||||
|
||||
mp_save_irq(&mp_irq);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __init
|
||||
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
|
||||
{
|
||||
|
|
@ -452,7 +463,11 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
|
|||
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
|
||||
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
|
||||
|
||||
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
|
||||
if (bus_irq < NR_IRQS_LEGACY)
|
||||
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
|
||||
else
|
||||
mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
|
||||
|
||||
acpi_penalize_sci_irq(bus_irq, trigger, polarity);
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -339,9 +339,12 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
|
|||
static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
|
||||
{
|
||||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
if (instr[0] != 0x90)
|
||||
return;
|
||||
for (i = 0; i < a->padlen; i++) {
|
||||
if (instr[i] != 0x90)
|
||||
return;
|
||||
}
|
||||
|
||||
local_irq_save(flags);
|
||||
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
|
||||
|
|
|
|||
|
|
@ -16,13 +16,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
|
|||
obj-y += common.o
|
||||
obj-y += rdrand.o
|
||||
obj-y += match.o
|
||||
obj-y += bugs.o
|
||||
|
||||
obj-$(CONFIG_PROC_FS) += proc.o
|
||||
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
|
||||
|
||||
obj-$(CONFIG_X86_32) += bugs.o
|
||||
obj-$(CONFIG_X86_64) += bugs_64.o
|
||||
|
||||
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
|
||||
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
|
||||
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
*/
|
||||
#include <linux/init.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <asm/bugs.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/processor-flags.h>
|
||||
|
|
@ -16,15 +17,27 @@
|
|||
#include <asm/msr.h>
|
||||
#include <asm/paravirt.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/cacheflush.h>
|
||||
|
||||
void __init check_bugs(void)
|
||||
{
|
||||
identify_boot_cpu();
|
||||
#ifndef CONFIG_SMP
|
||||
pr_info("CPU: ");
|
||||
print_cpu_info(&boot_cpu_data);
|
||||
#ifdef CONFIG_X86_32
|
||||
/*
|
||||
* Regardless of whether PCID is enumerated, the SDM says
|
||||
* that it can't be enabled in 32-bit mode.
|
||||
*/
|
||||
setup_clear_cpu_cap(X86_FEATURE_PCID);
|
||||
#endif
|
||||
|
||||
identify_boot_cpu();
|
||||
|
||||
if (!IS_ENABLED(CONFIG_SMP)) {
|
||||
pr_info("CPU: ");
|
||||
print_cpu_info(&boot_cpu_data);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
/*
|
||||
* Check whether we are able to run this kernel safely on SMP.
|
||||
*
|
||||
|
|
@ -40,4 +53,46 @@ void __init check_bugs(void)
|
|||
alternative_instructions();
|
||||
|
||||
fpu__init_check_bugs();
|
||||
#else /* CONFIG_X86_64 */
|
||||
alternative_instructions();
|
||||
|
||||
/*
|
||||
* Make sure the first 2MB area is not mapped by huge pages
|
||||
* There are typically fixed size MTRRs in there and overlapping
|
||||
* MTRRs into large pages causes slow downs.
|
||||
*
|
||||
* Right now we don't do that with gbpages because there seems
|
||||
* very little benefit for that case.
|
||||
*/
|
||||
if (!direct_gbpages)
|
||||
set_memory_4k((unsigned long)__va(0), 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
/*
 * Report the boot CPU's Meltdown status as one line of text in @buf.
 *
 * Writes "Not affected" when X86_BUG_CPU_MELTDOWN is not set on the boot
 * CPU, "Mitigation: PTI" when the bug is set and X86_FEATURE_KAISER is
 * present, and "Vulnerable" otherwise.  @dev and @attr are not used.
 * Returns the value returned by sprintf().
 */
ssize_t cpu_show_meltdown(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
|
||||
return sprintf(buf, "Not affected\n");
|
||||
if (boot_cpu_has(X86_FEATURE_KAISER))
|
||||
return sprintf(buf, "Mitigation: PTI\n");
|
||||
return sprintf(buf, "Vulnerable\n");
|
||||
}
|
||||
|
||||
/*
 * Report the boot CPU's Spectre v1 status as one line of text in @buf.
 *
 * Writes "Not affected" when X86_BUG_SPECTRE_V1 is not set on the boot
 * CPU and "Vulnerable" otherwise; no mitigation state is reported here.
 * @dev and @attr are not used.  Returns the value returned by sprintf().
 */
ssize_t cpu_show_spectre_v1(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
|
||||
return sprintf(buf, "Not affected\n");
|
||||
return sprintf(buf, "Vulnerable\n");
|
||||
}
|
||||
|
||||
/*
 * Report the boot CPU's Spectre v2 status as one line of text in @buf.
 *
 * Writes "Not affected" when X86_BUG_SPECTRE_V2 is not set on the boot
 * CPU and "Vulnerable" otherwise; no mitigation state is reported here.
 * @dev and @attr are not used.  Returns the value returned by sprintf().
 */
ssize_t cpu_show_spectre_v2(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
|
||||
return sprintf(buf, "Not affected\n");
|
||||
return sprintf(buf, "Vulnerable\n");
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,33 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) 1994 Linus Torvalds
|
||||
* Copyright (C) 2000 SuSE
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/bugs.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/mtrr.h>
|
||||
#include <asm/cacheflush.h>
|
||||
|
||||
void __init check_bugs(void)
|
||||
{
|
||||
identify_boot_cpu();
|
||||
#if !defined(CONFIG_SMP)
|
||||
printk(KERN_INFO "CPU: ");
|
||||
print_cpu_info(&boot_cpu_data);
|
||||
#endif
|
||||
alternative_instructions();
|
||||
|
||||
/*
|
||||
* Make sure the first 2MB area is not mapped by huge pages
|
||||
* There are typically fixed size MTRRs in there and overlapping
|
||||
* MTRRs into large pages causes slow downs.
|
||||
*
|
||||
* Right now we don't do that with gbpages because there seems
|
||||
* very little benefit for that case.
|
||||
*/
|
||||
if (!direct_gbpages)
|
||||
set_memory_4k((unsigned long)__va(0), 1);
|
||||
}
|
||||
|
|
@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
|
|||
|
||||
static const struct cpu_dev *this_cpu = &default_cpu;
|
||||
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* We need valid kernel segments for data and code in long mode too
|
||||
|
|
@ -162,6 +162,40 @@ static int __init x86_mpx_setup(char *s)
|
|||
}
|
||||
__setup("nompx", x86_mpx_setup);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
 * Handler for the "nopcid" boot option (registered with __setup() right
 * after this function).
 *
 * @s is the text following "nopcid" on the command line.  If it is not
 * empty the option is rejected and 0 is returned.  Otherwise 1 is
 * returned, and X86_FEATURE_PCID is force-cleared (with a log message)
 * only when the boot CPU actually advertises it.
 * NOTE(review): 0/1 presumably mean "not handled"/"handled" to the
 * __setup() machinery - confirm against its documentation.
 */
static int __init x86_pcid_setup(char *s)
|
||||
{
|
||||
/* require an exact match without trailing characters */
|
||||
if (strlen(s))
|
||||
return 0;
|
||||
|
||||
/* do not emit a message if the feature is not present */
|
||||
if (!boot_cpu_has(X86_FEATURE_PCID))
|
||||
return 1;
|
||||
|
||||
setup_clear_cpu_cap(X86_FEATURE_PCID);
|
||||
pr_info("nopcid: PCID feature disabled\n");
|
||||
return 1;
|
||||
}
|
||||
__setup("nopcid", x86_pcid_setup);
|
||||
#endif
|
||||
|
||||
/*
 * Handler for the "noinvpcid" boot option (registered with early_param()
 * right after this function).
 *
 * Returns -EINVAL when a value was supplied (@s is non-NULL).  Otherwise
 * returns 0, and X86_FEATURE_INVPCID is force-cleared (with a log
 * message) only when the boot CPU actually advertises it.
 */
static int __init x86_noinvpcid_setup(char *s)
|
||||
{
|
||||
/* noinvpcid doesn't accept parameters */
|
||||
if (s)
|
||||
return -EINVAL;
|
||||
|
||||
/* do not emit a message if the feature is not present */
|
||||
if (!boot_cpu_has(X86_FEATURE_INVPCID))
|
||||
return 0;
|
||||
|
||||
setup_clear_cpu_cap(X86_FEATURE_INVPCID);
|
||||
pr_info("noinvpcid: INVPCID feature disabled\n");
|
||||
return 0;
|
||||
}
|
||||
early_param("noinvpcid", x86_noinvpcid_setup);
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
static int cachesize_override = -1;
|
||||
static int disable_x86_serial_nr = 1;
|
||||
|
|
@ -287,6 +321,39 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Enable or disable PCID for the CPU described by @c.
 *
 * When @c has X86_FEATURE_PCID:
 *  - if it also has X86_FEATURE_PGE, or kaiser_enabled is set, the
 *    X86_CR4_PCIDE bit is set in CR4 and, when INVPCID is available,
 *    X86_FEATURE_INVPCID_SINGLE is set on @c;
 *  - otherwise X86_FEATURE_PCID is cleared on @c.
 * kaiser_setup_pcid() is called in every case, including when @c has no
 * PCID at all.
 */
static void setup_pcid(struct cpuinfo_x86 *c)
|
||||
{
|
||||
if (cpu_has(c, X86_FEATURE_PCID)) {
|
||||
if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
|
||||
cr4_set_bits(X86_CR4_PCIDE);
|
||||
/*
|
||||
* INVPCID has two "groups" of types:
|
||||
* 1/2: Invalidate an individual address
|
||||
* 3/4: Invalidate all contexts
|
||||
*
|
||||
* 1/2 take a PCID, but 3/4 do not. So, 3/4
|
||||
* ignore the PCID argument in the descriptor.
|
||||
* But, we have to be careful not to call 1/2
|
||||
* with an actual non-zero PCID in them before
|
||||
* we do the above cr4_set_bits().
|
||||
*/
|
||||
if (cpu_has(c, X86_FEATURE_INVPCID))
|
||||
set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
|
||||
} else {
|
||||
/*
|
||||
* flush_tlb_all(), as currently implemented, won't
|
||||
* work if PCID is on but PGE is not. Since that
|
||||
* combination doesn't exist on real hardware, there's
|
||||
* no reason to try to fully support it, but it's
|
||||
* polite to avoid corrupting data if we're on
|
||||
* an improperly configured VM.
|
||||
*/
|
||||
clear_cpu_cap(c, X86_FEATURE_PCID);
|
||||
}
|
||||
}
|
||||
/* Runs unconditionally, whether or not PCID ended up enabled. */
kaiser_setup_pcid();
|
||||
}
|
||||
|
||||
/*
|
||||
* Some CPU features depend on higher CPUID levels, which may not always
|
||||
* be available due to CPUID level capping or broken virtualization
|
||||
|
|
@ -365,8 +432,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
|
|||
return NULL; /* Not found */
|
||||
}
|
||||
|
||||
__u32 cpu_caps_cleared[NCAPINTS];
|
||||
__u32 cpu_caps_set[NCAPINTS];
|
||||
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
|
||||
__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
|
||||
|
||||
void load_percpu_segment(int cpu)
|
||||
{
|
||||
|
|
@ -597,6 +664,16 @@ void cpu_detect(struct cpuinfo_x86 *c)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Apply the forced capability overrides to @c.
 *
 * For each of the NCAPINTS + NBUGINTS words of c->x86_capability, first
 * mask out the bits recorded in cpu_caps_cleared[], then OR in the bits
 * recorded in cpu_caps_set[].  Because the set step runs last, a bit
 * present in both arrays ends up set.
 */
static void apply_forced_caps(struct cpuinfo_x86 *c)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
|
||||
c->x86_capability[i] &= ~cpu_caps_cleared[i];
|
||||
c->x86_capability[i] |= cpu_caps_set[i];
|
||||
}
|
||||
}
|
||||
|
||||
void get_cpu_cap(struct cpuinfo_x86 *c)
|
||||
{
|
||||
u32 tfms, xlvl;
|
||||
|
|
@ -753,6 +830,13 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
|
|||
}
|
||||
|
||||
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
|
||||
|
||||
/* Assume for now that ALL x86 CPUs are insecure */
|
||||
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
|
||||
|
||||
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
|
||||
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
|
||||
|
||||
fpu__init_system(c);
|
||||
}
|
||||
|
||||
|
|
@ -888,11 +972,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|||
if (this_cpu->c_identify)
|
||||
this_cpu->c_identify(c);
|
||||
|
||||
/* Clear/Set all flags overriden by options, after probe */
|
||||
for (i = 0; i < NCAPINTS; i++) {
|
||||
c->x86_capability[i] &= ~cpu_caps_cleared[i];
|
||||
c->x86_capability[i] |= cpu_caps_set[i];
|
||||
}
|
||||
/* Clear/Set all flags overridden by options, after probe */
|
||||
apply_forced_caps(c);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
|
||||
|
|
@ -918,6 +999,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|||
setup_smep(c);
|
||||
setup_smap(c);
|
||||
|
||||
/* Set up PCID */
|
||||
setup_pcid(c);
|
||||
|
||||
/*
|
||||
* The vendor-specific functions might have changed features.
|
||||
* Now we do "generic changes."
|
||||
|
|
@ -950,10 +1034,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
|
|||
* Clear/Set all flags overridden by options, need to do it
|
||||
* before following smp all cpus cap AND.
|
||||
*/
|
||||
for (i = 0; i < NCAPINTS; i++) {
|
||||
c->x86_capability[i] &= ~cpu_caps_cleared[i];
|
||||
c->x86_capability[i] |= cpu_caps_set[i];
|
||||
}
|
||||
apply_forced_caps(c);
|
||||
|
||||
/*
|
||||
* On SMP, boot_cpu_data holds the common feature set between
|
||||
|
|
@ -1173,7 +1254,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
|
|||
[DEBUG_STACK - 1] = DEBUG_STKSZ
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
|
||||
DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
|
||||
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
|
||||
|
||||
/* May not be marked __init: used by software suspend */
|
||||
|
|
@ -1336,6 +1417,14 @@ void cpu_init(void)
|
|||
* try to read it.
|
||||
*/
|
||||
cr4_init_shadow();
|
||||
if (!kaiser_enabled) {
|
||||
/*
|
||||
* secondary_startup_64() deferred setting PGE in cr4:
|
||||
* probe_page_size_mask() sets it on the boot cpu,
|
||||
* but it needs to be set on each secondary cpu.
|
||||
*/
|
||||
cr4_set_bits(X86_CR4_PGE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load microcode on this cpu if a valid microcode is available.
|
||||
|
|
|
|||
|
|
@ -580,6 +580,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
|
|||
#define F14H_MPB_MAX_SIZE 1824
|
||||
#define F15H_MPB_MAX_SIZE 4096
|
||||
#define F16H_MPB_MAX_SIZE 3458
|
||||
#define F17H_MPB_MAX_SIZE 3200
|
||||
|
||||
switch (family) {
|
||||
case 0x14:
|
||||
|
|
@ -591,6 +592,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
|
|||
case 0x16:
|
||||
max_size = F16H_MPB_MAX_SIZE;
|
||||
break;
|
||||
case 0x17:
|
||||
max_size = F17H_MPB_MAX_SIZE;
|
||||
break;
|
||||
default:
|
||||
max_size = F1XH_MPB_MAX_SIZE;
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -994,9 +994,17 @@ static bool is_blacklisted(unsigned int cpu)
|
|||
{
|
||||
struct cpuinfo_x86 *c = &cpu_data(cpu);
|
||||
|
||||
if (c->x86 == 6 && c->x86_model == 79) {
|
||||
pr_err_once("late loading on model 79 is disabled.\n");
|
||||
return true;
|
||||
/*
|
||||
* Late loading on model 79 with microcode revision less than 0x0b000021
|
||||
* may result in a system hang. This behavior is documented in item
|
||||
* BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
|
||||
*/
|
||||
if (c->x86 == 6 &&
|
||||
c->x86_model == 79 &&
|
||||
c->x86_mask == 0x01 &&
|
||||
c->microcode < 0x0b000021) {
|
||||
pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
|
||||
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
|
||||
}
|
||||
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,15 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/kaiser.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/insn.h>
|
||||
|
||||
#include "perf_event.h"
|
||||
|
||||
static
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
|
||||
|
||||
/* The size of a BTS record in bytes: */
|
||||
#define BTS_RECORD_SIZE 24
|
||||
|
||||
|
|
@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
|
|||
|
||||
static DEFINE_PER_CPU(void *, insn_buffer);
|
||||
|
||||
/*
 * Allocate a zeroed buffer of @size bytes; @flags and @node are passed
 * through to the underlying allocator (with __GFP_ZERO added).
 *
 * With CONFIG_PAGE_TABLE_ISOLATION the buffer is a block of whole pages
 * of order get_order(@size), and its address is handed to
 * kaiser_add_mapping(); if that call returns a negative value the pages
 * are freed again and NULL is returned.  Without the option this is a
 * plain kmalloc_node() call.
 *
 * Returns the buffer, or NULL on failure.
 */
static void *dsalloc(size_t size, gfp_t flags, int node)
|
||||
{
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
unsigned int order = get_order(size);
|
||||
struct page *page;
|
||||
unsigned long addr;
|
||||
|
||||
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
|
||||
if (!page)
|
||||
return NULL;
|
||||
addr = (unsigned long)page_address(page);
|
||||
if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
|
||||
__free_pages(page, order);
|
||||
/* addr == 0 makes the return below yield a NULL pointer */
addr = 0;
|
||||
}
|
||||
return (void *)addr;
|
||||
#else
|
||||
return kmalloc_node(size, flags | __GFP_ZERO, node);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Release a buffer obtained from dsalloc().
 *
 * With CONFIG_PAGE_TABLE_ISOLATION a NULL @buffer is ignored; otherwise
 * the kaiser mapping covering @size bytes is removed and the pages of
 * order get_order(@size) are freed.  @size is therefore expected to be
 * the same value that was passed to dsalloc() for this buffer.
 * Without the option @size is unused and the buffer is simply kfree()d.
 */
static void dsfree(const void *buffer, size_t size)
|
||||
{
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
if (!buffer)
|
||||
return;
|
||||
kaiser_remove_mapping((unsigned long)buffer, size);
|
||||
free_pages((unsigned long)buffer, get_order(size));
|
||||
#else
|
||||
kfree(buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int alloc_pebs_buffer(int cpu)
|
||||
{
|
||||
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
|
||||
|
|
@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
|
|||
if (!x86_pmu.pebs)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
|
||||
buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
|
||||
if (unlikely(!buffer))
|
||||
return -ENOMEM;
|
||||
|
||||
|
|
@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
|
|||
if (x86_pmu.intel_cap.pebs_format < 2) {
|
||||
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
|
||||
if (!ibuffer) {
|
||||
kfree(buffer);
|
||||
dsfree(buffer, x86_pmu.pebs_buffer_size);
|
||||
return -ENOMEM;
|
||||
}
|
||||
per_cpu(insn_buffer, cpu) = ibuffer;
|
||||
|
|
@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
|
|||
kfree(per_cpu(insn_buffer, cpu));
|
||||
per_cpu(insn_buffer, cpu) = NULL;
|
||||
|
||||
kfree((void *)(unsigned long)ds->pebs_buffer_base);
|
||||
dsfree((void *)(unsigned long)ds->pebs_buffer_base,
|
||||
x86_pmu.pebs_buffer_size);
|
||||
ds->pebs_buffer_base = 0;
|
||||
}
|
||||
|
||||
|
|
@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
|
|||
if (!x86_pmu.bts)
|
||||
return 0;
|
||||
|
||||
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
|
||||
buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
|
||||
if (unlikely(!buffer)) {
|
||||
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
|
||||
return -ENOMEM;
|
||||
|
|
@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
|
|||
if (!ds || !x86_pmu.bts)
|
||||
return;
|
||||
|
||||
kfree((void *)(unsigned long)ds->bts_buffer_base);
|
||||
dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
|
||||
ds->bts_buffer_base = 0;
|
||||
}
|
||||
|
||||
static int alloc_ds_buffer(int cpu)
|
||||
{
|
||||
int node = cpu_to_node(cpu);
|
||||
struct debug_store *ds;
|
||||
|
||||
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
|
||||
if (unlikely(!ds))
|
||||
return -ENOMEM;
|
||||
struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
|
||||
|
||||
memset(ds, 0, sizeof(*ds));
|
||||
per_cpu(cpu_hw_events, cpu).ds = ds;
|
||||
|
||||
return 0;
|
||||
|
|
@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
|
|||
return;
|
||||
|
||||
per_cpu(cpu_hw_events, cpu).ds = NULL;
|
||||
kfree(ds);
|
||||
}
|
||||
|
||||
void release_ds_buffers(void)
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@
|
|||
#include <asm/pgalloc.h>
|
||||
#include <asm/setup.h>
|
||||
#include <asm/espfix.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
|
||||
|
|
@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
|
|||
/* Install the espfix pud into the kernel page directory */
|
||||
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
|
||||
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
|
||||
/*
|
||||
* Just copy the top-level PGD that is mapping the espfix
|
||||
* area to ensure it is mapped into the shadow user page
|
||||
* tables.
|
||||
*/
|
||||
if (kaiser_enabled) {
|
||||
set_pgd(native_get_shadow_pgd(pgd_p),
|
||||
__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
|
||||
}
|
||||
|
||||
/* Randomize the locations */
|
||||
init_espfix_random();
|
||||
|
|
|
|||
|
|
@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
|
|||
movq $(init_level4_pgt - __START_KERNEL_map), %rax
|
||||
1:
|
||||
|
||||
/* Enable PAE mode and PGE */
|
||||
movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
|
||||
/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
|
||||
movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
|
||||
movq %rcx, %cr4
|
||||
|
||||
/* Setup early boot stage 4 level pagetables. */
|
||||
|
|
@ -441,6 +441,27 @@ early_idt_ripmsg:
|
|||
.balign PAGE_SIZE; \
|
||||
GLOBAL(name)
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_ISOLATION
|
||||
/*
|
||||
* Each PGD needs to be 8k long and 8k aligned. We do not
|
||||
* ever go out to userspace with these, so we do not
|
||||
* strictly *need* the second page, but this allows us to
|
||||
* have a single set_pgd() implementation that does not
|
||||
* need to worry about whether it has 4k or 8k to work
|
||||
* with.
|
||||
*
|
||||
* This ensures PGDs are 8k long:
|
||||
*/
|
||||
#define KAISER_USER_PGD_FILL 512
|
||||
/* This ensures they are 8k-aligned: */
|
||||
#define NEXT_PGD_PAGE(name) \
|
||||
.balign 2 * PAGE_SIZE; \
|
||||
GLOBAL(name)
|
||||
#else
|
||||
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
|
||||
#define KAISER_USER_PGD_FILL 0
|
||||
#endif
|
||||
|
||||
/* Automate the creation of 1 to 1 mapping pmd entries */
|
||||
#define PMDS(START, PERM, COUNT) \
|
||||
i = 0 ; \
|
||||
|
|
@ -450,9 +471,10 @@ GLOBAL(name)
|
|||
.endr
|
||||
|
||||
__INITDATA
|
||||
NEXT_PAGE(early_level4_pgt)
|
||||
NEXT_PGD_PAGE(early_level4_pgt)
|
||||
.fill 511,8,0
|
||||
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(early_dynamic_pgts)
|
||||
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
|
||||
|
|
@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
|
|||
.data
|
||||
|
||||
#ifndef CONFIG_XEN
|
||||
NEXT_PAGE(init_level4_pgt)
|
||||
NEXT_PGD_PAGE(init_level4_pgt)
|
||||
.fill 512,8,0
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
#else
|
||||
NEXT_PAGE(init_level4_pgt)
|
||||
NEXT_PGD_PAGE(init_level4_pgt)
|
||||
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
|
||||
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
.org init_level4_pgt + L4_START_KERNEL*8, 0
|
||||
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
|
||||
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(level3_ident_pgt)
|
||||
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
|
||||
|
|
@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
|
|||
*/
|
||||
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
|
||||
#endif
|
||||
.fill KAISER_USER_PGD_FILL,8,0
|
||||
|
||||
NEXT_PAGE(level3_kernel_pgt)
|
||||
.fill L3_START_KERNEL,8,0
|
||||
|
|
|
|||
|
|
@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
|
|||
seq_puts(p, " Rescheduling interrupts\n");
|
||||
seq_printf(p, "%*s: ", prec, "CAL");
|
||||
for_each_online_cpu(j)
|
||||
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
|
||||
irq_stats(j)->irq_tlb_count);
|
||||
seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
|
||||
seq_puts(p, " Function call interrupts\n");
|
||||
seq_printf(p, "%*s: ", prec, "TLB");
|
||||
for_each_online_cpu(j)
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ static struct irqaction irq2 = {
|
|||
.flags = IRQF_NO_THREAD,
|
||||
};
|
||||
|
||||
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
|
||||
DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
|
||||
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock);
|
|||
static struct pvclock_vsyscall_time_info *hv_clock;
|
||||
static struct pvclock_wall_clock wall_clock;
|
||||
|
||||
/*
 * Accessor for the file-local hv_clock pointer, so code outside this
 * file can reach it.  Returns the pointer as-is.
 * NOTE(review): the name suggests this is the virtual address of CPU 0's
 * pvclock time info; where hv_clock is initialised is not visible here,
 * so callers should presumably be prepared for NULL - confirm.
 */
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
|
||||
{
|
||||
return hv_clock;
|
||||
}
|
||||
|
||||
/*
|
||||
* The wallclock is the time of day when we booted. Since then, some time may
|
||||
* have elapsed since the hypervisor wrote the data. So we try to account for
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/kaiser.h>
|
||||
|
||||
#include <asm/ldt.h>
|
||||
#include <asm/desc.h>
|
||||
|
|
@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
|
|||
set_ldt(pc->ldt->entries, pc->ldt->size);
|
||||
}
|
||||
|
||||
/*
 * Free an ldt_struct and its entries array.
 *
 * The entries are released with vfree() when ldt->size * LDT_ENTRY_SIZE
 * exceeds PAGE_SIZE and with free_page() otherwise; the struct itself is
 * then kfree()d.  @ldt is dereferenced unconditionally, so it must not
 * be NULL.
 */
static void __free_ldt_struct(struct ldt_struct *ldt)
|
||||
{
|
||||
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
|
||||
vfree(ldt->entries);
|
||||
else
|
||||
free_page((unsigned long)ldt->entries);
|
||||
kfree(ldt);
|
||||
}
|
||||
|
||||
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
|
||||
static struct ldt_struct *alloc_ldt_struct(int size)
|
||||
{
|
||||
struct ldt_struct *new_ldt;
|
||||
int alloc_size;
|
||||
int ret;
|
||||
|
||||
if (size > LDT_ENTRIES)
|
||||
return NULL;
|
||||
|
|
@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
|
||||
__PAGE_KERNEL);
|
||||
new_ldt->size = size;
|
||||
if (ret) {
|
||||
__free_ldt_struct(new_ldt);
|
||||
return NULL;
|
||||
}
|
||||
return new_ldt;
|
||||
}
|
||||
|
||||
|
|
@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
|
|||
if (likely(!ldt))
|
||||
return;
|
||||
|
||||
kaiser_remove_mapping((unsigned long)ldt->entries,
|
||||
ldt->size * LDT_ENTRY_SIZE);
|
||||
paravirt_free_ldt(ldt->entries, ldt->size);
|
||||
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
|
||||
vfree(ldt->entries);
|
||||
else
|
||||
free_page((unsigned long)ldt->entries);
|
||||
kfree(ldt);
|
||||
__free_ldt_struct(ldt);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
|
|||
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
|
||||
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
|
||||
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
|
||||
DEF_NATIVE(pv_cpu_ops, clts, "clts");
|
||||
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
|
||||
|
||||
|
|
@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
|
|||
PATCH_SITE(pv_mmu_ops, read_cr3);
|
||||
PATCH_SITE(pv_mmu_ops, write_cr3);
|
||||
PATCH_SITE(pv_cpu_ops, clts);
|
||||
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
|
||||
PATCH_SITE(pv_cpu_ops, wbinvd);
|
||||
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
|
||||
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@
|
|||
* section. Since TSS's are completely CPU-local, we want them
|
||||
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
|
||||
*/
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
|
||||
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
|
||||
.x86_tss = {
|
||||
.sp0 = TOP_OF_INIT_STACK,
|
||||
#ifdef CONFIG_X86_32
|
||||
|
|
|
|||
|
|
@ -93,6 +93,10 @@ void __noreturn machine_real_restart(unsigned int type)
|
|||
load_cr3(initial_page_table);
|
||||
#else
|
||||
write_cr3(real_mode_header->trampoline_pgd);
|
||||
|
||||
/* Exiting long mode will fail if CR4.PCIDE is set. */
|
||||
if (static_cpu_has(X86_FEATURE_PCID))
|
||||
cr4_clear_bits(X86_CR4_PCIDE);
|
||||
#endif
|
||||
|
||||
/* Jump to the identity-mapped low memory code */
|
||||
|
|
|
|||
|
|
@ -112,6 +112,7 @@
|
|||
#include <asm/alternative.h>
|
||||
#include <asm/prom.h>
|
||||
#include <asm/microcode.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
|
||||
|
|
@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
|
|||
*/
|
||||
init_hypervisor_platform();
|
||||
|
||||
/*
|
||||
* This needs to happen right after XENPV is set on xen and
|
||||
* kaiser_enabled is checked below in cleanup_highmap().
|
||||
*/
|
||||
kaiser_check_boottime_disable();
|
||||
|
||||
x86_init.resources.probe_roms();
|
||||
|
||||
/* after parse_early_param, so could debug it */
|
||||
|
|
|
|||
|
|
@ -104,25 +104,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
|
|||
spin_lock_irqsave(&rtc_lock, flags);
|
||||
CMOS_WRITE(0xa, 0xf);
|
||||
spin_unlock_irqrestore(&rtc_lock, flags);
|
||||
local_flush_tlb();
|
||||
pr_debug("1.\n");
|
||||
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
|
||||
start_eip >> 4;
|
||||
pr_debug("2.\n");
|
||||
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
|
||||
start_eip & 0xf;
|
||||
pr_debug("3.\n");
|
||||
}
|
||||
|
||||
static inline void smpboot_restore_warm_reset_vector(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Install writable page 0 entry to set BIOS data area.
|
||||
*/
|
||||
local_flush_tlb();
|
||||
|
||||
/*
|
||||
* Paranoid: Set warm reset code and vector here back
|
||||
* to default values.
|
||||
|
|
|
|||
|
|
@ -9,10 +9,12 @@
|
|||
#include <linux/atomic.h>
|
||||
|
||||
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
|
||||
__aligned(PAGE_SIZE)
|
||||
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
|
||||
(unsigned long) trace_idt_table };
|
||||
|
||||
/* No need to be aligned, but done to keep all IDTs defined the same way. */
|
||||
__aligned(PAGE_SIZE)
|
||||
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
|
||||
|
||||
static int trace_irq_vector_refcount;
|
||||
|
|
|
|||
|
|
@ -187,7 +187,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
|
|||
pte_unmap_unlock(pte, ptl);
|
||||
out:
|
||||
up_write(&mm->mmap_sem);
|
||||
flush_tlb();
|
||||
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2383,9 +2383,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
|
|||
}
|
||||
|
||||
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
|
||||
u64 cr0, u64 cr4)
|
||||
u64 cr0, u64 cr3, u64 cr4)
|
||||
{
|
||||
int bad;
|
||||
u64 pcid;
|
||||
|
||||
/* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
|
||||
pcid = 0;
|
||||
if (cr4 & X86_CR4_PCIDE) {
|
||||
pcid = cr3 & 0xfff;
|
||||
cr3 &= ~0xfff;
|
||||
}
|
||||
|
||||
bad = ctxt->ops->set_cr(ctxt, 3, cr3);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
|
||||
/*
|
||||
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
|
||||
|
|
@ -2404,6 +2416,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
|
|||
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
if (pcid) {
|
||||
bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
|
||||
if (bad)
|
||||
return X86EMUL_UNHANDLEABLE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return X86EMUL_CONTINUE;
|
||||
|
|
@ -2414,11 +2432,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
|
|||
struct desc_struct desc;
|
||||
struct desc_ptr dt;
|
||||
u16 selector;
|
||||
u32 val, cr0, cr4;
|
||||
u32 val, cr0, cr3, cr4;
|
||||
int i;
|
||||
|
||||
cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
|
||||
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
|
||||
cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
|
||||
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
|
||||
ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
|
||||
|
||||
|
|
@ -2460,14 +2478,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
|
|||
|
||||
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
|
||||
|
||||
return rsm_enter_protected_mode(ctxt, cr0, cr4);
|
||||
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
|
||||
}
|
||||
|
||||
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
|
||||
{
|
||||
struct desc_struct desc;
|
||||
struct desc_ptr dt;
|
||||
u64 val, cr0, cr4;
|
||||
u64 val, cr0, cr3, cr4;
|
||||
u32 base3;
|
||||
u16 selector;
|
||||
int i, r;
|
||||
|
|
@ -2484,7 +2502,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
|
|||
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
|
||||
|
||||
cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
|
||||
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
|
||||
cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
|
||||
cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
|
||||
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
|
||||
val = GET_SMSTATE(u64, smbase, 0x7ed0);
|
||||
|
|
@ -2512,7 +2530,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
|
|||
dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
|
||||
ctxt->ops->set_gdt(ctxt, &dt);
|
||||
|
||||
r = rsm_enter_protected_mode(ctxt, cr0, cr4);
|
||||
r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
|
||||
if (r != X86EMUL_CONTINUE)
|
||||
return r;
|
||||
|
||||
|
|
|
|||
|
|
@ -3855,6 +3855,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
"mov %%r13, %c[r13](%[svm]) \n\t"
|
||||
"mov %%r14, %c[r14](%[svm]) \n\t"
|
||||
"mov %%r15, %c[r15](%[svm]) \n\t"
|
||||
#endif
|
||||
/*
|
||||
* Clear host registers marked as clobbered to prevent
|
||||
* speculative use.
|
||||
*/
|
||||
"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
|
||||
"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
|
||||
"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
|
||||
"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
|
||||
"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
|
||||
#ifdef CONFIG_X86_64
|
||||
"xor %%r8, %%r8 \n\t"
|
||||
"xor %%r9, %%r9 \n\t"
|
||||
"xor %%r10, %%r10 \n\t"
|
||||
"xor %%r11, %%r11 \n\t"
|
||||
"xor %%r12, %%r12 \n\t"
|
||||
"xor %%r13, %%r13 \n\t"
|
||||
"xor %%r14, %%r14 \n\t"
|
||||
"xor %%r15, %%r15 \n\t"
|
||||
#endif
|
||||
"pop %%" _ASM_BP
|
||||
:
|
||||
|
|
|
|||
|
|
@ -828,8 +828,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
|
|||
{
|
||||
BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
|
||||
|
||||
if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
|
||||
vmcs_field_to_offset_table[field] == 0)
|
||||
if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
|
||||
return -ENOENT;
|
||||
|
||||
/*
|
||||
* FIXME: Mitigation for CVE-2017-5753. To be replaced with a
|
||||
* generic mechanism.
|
||||
*/
|
||||
asm("lfence");
|
||||
|
||||
if (vmcs_field_to_offset_table[field] == 0)
|
||||
return -ENOENT;
|
||||
|
||||
return vmcs_field_to_offset_table[field];
|
||||
|
|
@ -1107,6 +1115,11 @@ static inline bool cpu_has_vmx_invvpid_global(void)
|
|||
return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
|
||||
}
|
||||
|
||||
/*
 * Returns true when VMX_VPID_INVVPID_BIT is set in vmx_capability.vpid.
 * NOTE(review): presumably this bit means the INVVPID instruction is
 * supported - confirm against the VMX capability definitions.
 */
static inline bool cpu_has_vmx_invvpid(void)
|
||||
{
|
||||
return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
|
||||
}
|
||||
|
||||
static inline bool cpu_has_vmx_ept(void)
|
||||
{
|
||||
return vmcs_config.cpu_based_2nd_exec_ctrl &
|
||||
|
|
@ -6199,8 +6212,10 @@ static __init int hardware_setup(void)
|
|||
if (boot_cpu_has(X86_FEATURE_NX))
|
||||
kvm_enable_efer_bits(EFER_NX);
|
||||
|
||||
if (!cpu_has_vmx_vpid())
|
||||
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
|
||||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
|
||||
enable_vpid = 0;
|
||||
|
||||
if (!cpu_has_vmx_shadow_vmcs())
|
||||
enable_shadow_vmcs = 0;
|
||||
if (enable_shadow_vmcs)
|
||||
|
|
@ -8616,6 +8631,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
/* Save guest registers, load host registers, keep flags */
|
||||
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
|
||||
"pop %0 \n\t"
|
||||
"setbe %c[fail](%0)\n\t"
|
||||
"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
|
||||
"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
|
||||
__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
|
||||
|
|
@ -8632,12 +8648,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|||
"mov %%r13, %c[r13](%0) \n\t"
|
||||
"mov %%r14, %c[r14](%0) \n\t"
|
||||
"mov %%r15, %c[r15](%0) \n\t"
|
||||
"xor %%r8d, %%r8d \n\t"
|
||||
"xor %%r9d, %%r9d \n\t"
|
||||
"xor %%r10d, %%r10d \n\t"
|
||||
"xor %%r11d, %%r11d \n\t"
|
||||
"xor %%r12d, %%r12d \n\t"
|
||||
"xor %%r13d, %%r13d \n\t"
|
||||
"xor %%r14d, %%r14d \n\t"
|
||||
"xor %%r15d, %%r15d \n\t"
|
||||
#endif
|
||||
"mov %%cr2, %%" _ASM_AX " \n\t"
|
||||
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
|
||||
|
||||
"xor %%eax, %%eax \n\t"
|
||||
"xor %%ebx, %%ebx \n\t"
|
||||
"xor %%esi, %%esi \n\t"
|
||||
"xor %%edi, %%edi \n\t"
|
||||
"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
|
||||
"setbe %c[fail](%0) \n\t"
|
||||
".pushsection .rodata \n\t"
|
||||
".global vmx_return \n\t"
|
||||
"vmx_return: " _ASM_PTR " 2b \n\t"
|
||||
|
|
|
|||
|
|
@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
|
|||
return 1;
|
||||
|
||||
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
|
||||
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
|
||||
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
|
||||
!is_long_mode(vcpu))
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -4113,7 +4114,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
|
|||
addr, n, v))
|
||||
&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
|
||||
break;
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
|
||||
handled += n;
|
||||
addr += n;
|
||||
len -= n;
|
||||
|
|
@ -4361,7 +4362,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
|
|||
{
|
||||
if (vcpu->mmio_read_completed) {
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
|
||||
vcpu->mmio_fragments[0].gpa, *(u64 *)val);
|
||||
vcpu->mmio_fragments[0].gpa, val);
|
||||
vcpu->mmio_read_completed = 0;
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -4383,14 +4384,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
|
|||
|
||||
static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
|
||||
{
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
|
||||
return vcpu_mmio_write(vcpu, gpa, bytes, val);
|
||||
}
|
||||
|
||||
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
|
||||
void *val, int bytes)
|
||||
{
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
|
||||
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
|
||||
return X86EMUL_IO_NEEDED;
|
||||
}
|
||||
|
||||
|
|
@ -6941,7 +6942,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
|
|||
#endif
|
||||
|
||||
kvm_rip_write(vcpu, regs->rip);
|
||||
kvm_set_rflags(vcpu, regs->rflags);
|
||||
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
|
||||
|
||||
vcpu->arch.exception.pending = false;
|
||||
|
||||
|
|
@ -8230,11 +8231,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
|
|||
{
|
||||
struct x86_exception fault;
|
||||
|
||||
trace_kvm_async_pf_ready(work->arch.token, work->gva);
|
||||
if (work->wakeup_all)
|
||||
work->arch.token = ~0; /* broadcast wakeup */
|
||||
else
|
||||
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
|
||||
trace_kvm_async_pf_ready(work->arch.token, work->gva);
|
||||
|
||||
if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
|
||||
!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
|
||||
|
|
|
|||
|
|
@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
|
|||
|
||||
return 0; /* Buffer overrun */
|
||||
}
|
||||
|
||||
/*
 * Look up a non-boolean option ("option=argument") on a command line.
 * Following usual Linux practice, when the option is given more than
 * once the last occurrence on the command line is the one reported.
 *
 * @cmdline: the cmdline string (may be NULL)
 * @max_cmdline_size: the maximum number of bytes read from cmdline
 * @option: option string to look for, without the '='
 * @buffer: memory buffer that receives the option argument
 * @bufsize: size of the supplied memory buffer
 *
 * Returns the length of the argument (even if it had to be truncated
 * to fit in the buffer), or -1 if the option was not found.
 */
static int
__cmdline_find_option(const char *cmdline, int max_cmdline_size,
		      const char *option, char *buffer, int bufsize)
{
	enum {
		st_wordstart = 0,	/* Start of word/after whitespace */
		st_wordcmp,		/* Comparing this word */
		st_wordskip,		/* Miscompare, skip */
		st_bufcpy,		/* Copying this to buffer */
	} state = st_wordstart;
	const char *match = NULL;	/* cursor into 'option' while comparing */
	char *out = buffer;		/* next free byte in 'buffer' */
	int arglen = -1;		/* -1 until the option has been seen */
	int scanned;
	char ch;

	if (!cmdline)
		return -1;      /* No command line */

	/*
	 * Bounding the index by max_cmdline_size keeps us from running
	 * off the end of a 'cmdline' that is not NUL-terminated.
	 */
	for (scanned = 0; scanned < max_cmdline_size; scanned++) {
		ch = cmdline[scanned];
		if (ch == '\0')
			break;

		switch (state) {
		case st_wordstart:
			if (myisspace(ch))
				break;

			state = st_wordcmp;
			match = option;
			/* fall through */

		case st_wordcmp:
			if (ch == '=' && *match == '\0') {
				/*
				 * The whole option name matched and is
				 * followed by '=': restart the output so
				 * that a later occurrence overrides an
				 * earlier one, then copy the argument.
				 */
				arglen = 0;
				out = buffer;
				state = st_bufcpy;
				break;
			}
			if (ch == *match++) {
				/* Still matching: go on to the next character. */
				break;
			}
			state = st_wordskip;
			/* fall through */

		case st_wordskip:
			if (myisspace(ch))
				state = st_wordstart;
			break;

		case st_bufcpy:
			if (myisspace(ch)) {
				state = st_wordstart;
			} else if (++arglen < bufsize) {
				/*
				 * Always count the character, but only
				 * store it while there is still room for
				 * it plus the terminating NUL.
				 */
				*out++ = ch;
			}
			break;
		}
	}

	if (bufsize)
		*out = '\0';

	return arglen;
}
|
||||
|
||||
int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
|
||||
int bufsize)
|
||||
{
|
||||
return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
|
||||
buffer, bufsize);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
|
||||
pat.o pgtable.o physaddr.o gup.o setup_nx.o
|
||||
pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
|
||||
|
||||
# Make sure __phys_addr has no stackprotector
|
||||
nostackp := $(call cc-option, -fno-stack-protector)
|
||||
|
|
@ -9,7 +9,6 @@ CFLAGS_setup_nx.o := $(nostackp)
|
|||
CFLAGS_fault.o := -I$(src)/../include/asm/trace
|
||||
|
||||
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
|
||||
obj-$(CONFIG_SMP) += tlb.o
|
||||
|
||||
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
|
||||
|
||||
|
|
@ -33,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
|
|||
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
|
||||
|
||||
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
|
||||
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
|
||||
|
|
|
|||
|
|
@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
|
|||
cr4_set_bits_and_update_boot(X86_CR4_PSE);
|
||||
|
||||
/* Enable PGE if available */
|
||||
if (cpu_has_pge) {
|
||||
if (cpu_has_pge && !kaiser_enabled) {
|
||||
cr4_set_bits_and_update_boot(X86_CR4_PGE);
|
||||
__supported_pte_mask |= _PAGE_GLOBAL;
|
||||
} else
|
||||
|
|
@ -753,13 +753,11 @@ void __init zone_sizes_init(void)
|
|||
}
|
||||
|
||||
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
|
||||
#ifdef CONFIG_SMP
|
||||
.active_mm = &init_mm,
|
||||
.state = 0,
|
||||
#endif
|
||||
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(cpu_tlbstate);
|
||||
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
|
||||
|
||||
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
|
|||
continue;
|
||||
if (vaddr < (unsigned long) _text || vaddr > end)
|
||||
set_pmd(pmd, __pmd(0));
|
||||
else if (kaiser_enabled) {
|
||||
/*
|
||||
* level2_kernel_pgt is initialized with _PAGE_GLOBAL:
|
||||
* clear that now. This is not important, so long as
|
||||
* CR4.PGE remains clear, but it removes an anomaly.
|
||||
* Physical mapping setup below avoids _PAGE_GLOBAL
|
||||
* by use of massage_pgprot() inside pfn_pte() etc.
|
||||
*/
|
||||
set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
484
arch/x86/mm/kaiser.c
Normal file
484
arch/x86/mm/kaiser.c
Normal file
|
|
@ -0,0 +1,484 @@
|
|||
#include <linux/bug.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/ftrace.h>
|
||||
|
||||
#undef pr_fmt
|
||||
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
|
||||
|
||||
#include <asm/kaiser.h>
|
||||
#include <asm/tlbflush.h> /* to verify its kaiser declarations */
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/desc.h>
|
||||
#include <asm/cmdline.h>
|
||||
#include <asm/vsyscall.h>
|
||||
|
||||
int kaiser_enabled __read_mostly = 1;
|
||||
EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
|
||||
|
||||
__visible
|
||||
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
|
||||
|
||||
/*
|
||||
* These can have bit 63 set, so we can not just use a plain "or"
|
||||
* instruction to get their value or'd into CR3. It would take
|
||||
* another register. So, we use a memory reference to these instead.
|
||||
*
|
||||
* This is also handy because systems that do not support PCIDs
|
||||
* just end up or'ing a 0 into their CR3, which does no harm.
|
||||
*/
|
||||
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
|
||||
|
||||
/*
|
||||
* At runtime, the only things we map are some things for CPU
|
||||
* hotplug, and stacks for new processes. No two CPUs will ever
|
||||
* be populating the same addresses, so we only need to ensure
|
||||
* that we protect between two CPUs trying to allocate and
|
||||
* populate the same page table page.
|
||||
*
|
||||
* Only take this lock when doing a set_p[4um]d(), but it is not
|
||||
* needed for doing a set_pte(). We assume that only the *owner*
|
||||
* of a given allocation will be doing this for _their_
|
||||
* allocation.
|
||||
*
|
||||
* This ensures that once a system has been running for a while
|
||||
* and there have been stacks all over and these page tables
|
||||
* are fully populated, there will be no further acquisitions of
|
||||
* this lock.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
|
||||
|
||||
/*
|
||||
* Returns -1 on error.
|
||||
*/
|
||||
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pte_t *pte;
|
||||
|
||||
pgd = pgd_offset_k(vaddr);
|
||||
/*
|
||||
* We made all the kernel PGDs present in kaiser_init().
|
||||
* We expect them to stay that way.
|
||||
*/
|
||||
BUG_ON(pgd_none(*pgd));
|
||||
/*
|
||||
* PGDs are either 512GB or 128TB on all x86_64
|
||||
* configurations. We don't handle these.
|
||||
*/
|
||||
BUG_ON(pgd_large(*pgd));
|
||||
|
||||
pud = pud_offset(pgd, vaddr);
|
||||
if (pud_none(*pud)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pud_large(*pud))
|
||||
return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
|
||||
|
||||
pmd = pmd_offset(pud, vaddr);
|
||||
if (pmd_none(*pmd)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (pmd_large(*pmd))
|
||||
return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
|
||||
|
||||
pte = pte_offset_kernel(pmd, vaddr);
|
||||
if (pte_none(*pte)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a relatively normal page table walk, except that it
|
||||
* also tries to allocate page tables pages along the way.
|
||||
*
|
||||
* Returns a pointer to a PTE on success, or NULL on failure.
|
||||
*/
|
||||
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
pud_t *pud;
|
||||
pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
|
||||
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
|
||||
unsigned long prot = _KERNPG_TABLE;
|
||||
|
||||
if (pgd_none(*pgd)) {
|
||||
WARN_ONCE(1, "All shadow pgds should have been populated");
|
||||
return NULL;
|
||||
}
|
||||
BUILD_BUG_ON(pgd_large(*pgd) != 0);
|
||||
|
||||
if (user) {
|
||||
/*
|
||||
* The vsyscall page is the only page that will have
|
||||
* _PAGE_USER set. Catch everything else.
|
||||
*/
|
||||
BUG_ON(address != VSYSCALL_ADDR);
|
||||
|
||||
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
|
||||
prot = _PAGE_TABLE;
|
||||
}
|
||||
|
||||
pud = pud_offset(pgd, address);
|
||||
/* The shadow page tables do not use large mappings: */
|
||||
if (pud_large(*pud)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
if (pud_none(*pud)) {
|
||||
unsigned long new_pmd_page = __get_free_page(gfp);
|
||||
if (!new_pmd_page)
|
||||
return NULL;
|
||||
spin_lock(&shadow_table_allocation_lock);
|
||||
if (pud_none(*pud)) {
|
||||
set_pud(pud, __pud(prot | __pa(new_pmd_page)));
|
||||
__inc_zone_page_state(virt_to_page((void *)
|
||||
new_pmd_page), NR_KAISERTABLE);
|
||||
} else
|
||||
free_page(new_pmd_page);
|
||||
spin_unlock(&shadow_table_allocation_lock);
|
||||
}
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
/* The shadow page tables do not use large mappings: */
|
||||
if (pmd_large(*pmd)) {
|
||||
WARN_ON(1);
|
||||
return NULL;
|
||||
}
|
||||
if (pmd_none(*pmd)) {
|
||||
unsigned long new_pte_page = __get_free_page(gfp);
|
||||
if (!new_pte_page)
|
||||
return NULL;
|
||||
spin_lock(&shadow_table_allocation_lock);
|
||||
if (pmd_none(*pmd)) {
|
||||
set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
|
||||
__inc_zone_page_state(virt_to_page((void *)
|
||||
new_pte_page), NR_KAISERTABLE);
|
||||
} else
|
||||
free_page(new_pte_page);
|
||||
spin_unlock(&shadow_table_allocation_lock);
|
||||
}
|
||||
|
||||
return pte_offset_kernel(pmd, address);
|
||||
}
|
||||
|
||||
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
|
||||
unsigned long flags)
|
||||
{
|
||||
int ret = 0;
|
||||
pte_t *pte;
|
||||
unsigned long start_addr = (unsigned long )__start_addr;
|
||||
unsigned long address = start_addr & PAGE_MASK;
|
||||
unsigned long end_addr = PAGE_ALIGN(start_addr + size);
|
||||
unsigned long target_address;
|
||||
|
||||
/*
|
||||
* It is convenient for callers to pass in __PAGE_KERNEL etc,
|
||||
* and there is no actual harm from setting _PAGE_GLOBAL, so
|
||||
* long as CR4.PGE is not set. But it is nonetheless troubling
|
||||
* to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
|
||||
* requires that not to be #defined to 0): so mask it off here.
|
||||
*/
|
||||
flags &= ~_PAGE_GLOBAL;
|
||||
if (!(__supported_pte_mask & _PAGE_NX))
|
||||
flags &= ~_PAGE_NX;
|
||||
|
||||
for (; address < end_addr; address += PAGE_SIZE) {
|
||||
target_address = get_pa_from_mapping(address);
|
||||
if (target_address == -1) {
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
|
||||
if (!pte) {
|
||||
ret = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
if (pte_none(*pte)) {
|
||||
set_pte(pte, __pte(flags | target_address));
|
||||
} else {
|
||||
pte_t tmp;
|
||||
set_pte(&tmp, __pte(flags | target_address));
|
||||
WARN_ON_ONCE(!pte_same(*pte, tmp));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Same as kaiser_add_user_map(), but the range is given by its start
 * and end pointers instead of a start and a size.
 */
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	return kaiser_add_user_map(start, end - start, flags);
}
|
||||
|
||||
/*
|
||||
* Ensure that the top level of the (shadow) page tables are
|
||||
* entirely populated. This ensures that all processes that get
|
||||
* forked have the same entries. This way, we do not have to
|
||||
* ever go set up new entries in older processes.
|
||||
*
|
||||
* Note: we never free these, so there are no updates to them
|
||||
* after this.
|
||||
*/
|
||||
static void __init kaiser_init_all_pgds(void)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
int i = 0;
|
||||
|
||||
pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
|
||||
for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
|
||||
pgd_t new_pgd;
|
||||
pud_t *pud = pud_alloc_one(&init_mm,
|
||||
PAGE_OFFSET + i * PGDIR_SIZE);
|
||||
if (!pud) {
|
||||
WARN_ON(1);
|
||||
break;
|
||||
}
|
||||
inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
|
||||
new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
|
||||
/*
|
||||
* Make sure not to stomp on some other pgd entry.
|
||||
*/
|
||||
if (!pgd_none(pgd[i])) {
|
||||
WARN_ON(1);
|
||||
continue;
|
||||
}
|
||||
set_pgd(pgd + i, new_pgd);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Boot-time variants of the mapping helpers: a failure cannot be
 * propagated to anybody, so it is reported through WARN_ON() instead.
 */
#define kaiser_add_user_map_early(start, size, flags) do {		\
		WARN_ON(kaiser_add_user_map(start, size, flags));	\
	} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
		WARN_ON(kaiser_add_user_map_ptrs(start, end, flags));	\
	} while (0)
|
||||
|
||||
/*
 * Decide at boot whether kernel/user page table isolation stays on.
 *
 * Order of evaluation:
 *   - Xen PV guests: disabled without a log message.
 *   - "pti=on" forces it on, "pti=off" turns it off, "pti=auto" falls
 *     through to the vendor check below.
 *   - "nopti" turns it off.
 *   - Otherwise it is turned off on AMD CPUs and left on elsewhere.
 *
 * Disabling clears kaiser_enabled and X86_FEATURE_KAISER; enabling
 * forces X86_FEATURE_KAISER.
 */
void __init kaiser_check_boottime_disable(void)
{
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	/*
	 * cmdline_find_option() returns the full length of the argument,
	 * even when it was truncated to fit 'arg'.  Compare that length
	 * too, so that only the exact keywords are accepted: a plain
	 * prefix match would treat e.g. "pti=offline" as "pti=off".
	 */
	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (ret == 2 && !strncmp(arg, "on", 2))
			goto enable;

		if (ret == 3 && !strncmp(arg, "off", 3))
			goto disable;

		if (ret == 4 && !strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
|
||||
|
||||
/*
|
||||
* If anything in here fails, we will likely die on one of the
|
||||
* first kernel->user transitions and init will die. But, we
|
||||
* will have most of the kernel up by then and should be able to
|
||||
* get a clean warning out of it. If we BUG_ON() here, we run
|
||||
* the risk of being before we have good console output.
|
||||
*/
|
||||
void __init kaiser_init(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
if (!kaiser_enabled)
|
||||
return;
|
||||
|
||||
kaiser_init_all_pgds();
|
||||
|
||||
/*
|
||||
* Note that this sets _PAGE_USER and it needs to happen when the
|
||||
* pagetable hierarchy gets created, i.e., early. Otherwise
|
||||
* kaiser_pagetable_walk() will encounter initialized PTEs in the
|
||||
* hierarchy and not set the proper permissions, leading to the
|
||||
* pagefaults with page-protection violations when trying to read the
|
||||
* vsyscall page. For example.
|
||||
*/
|
||||
if (vsyscall_enabled())
|
||||
kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
|
||||
PAGE_SIZE,
|
||||
__PAGE_KERNEL_VSYSCALL);
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
void *percpu_vaddr = __per_cpu_user_mapped_start +
|
||||
per_cpu_offset(cpu);
|
||||
unsigned long percpu_sz = __per_cpu_user_mapped_end -
|
||||
__per_cpu_user_mapped_start;
|
||||
kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
|
||||
__PAGE_KERNEL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Map the entry/exit text section, which is needed at
|
||||
* switches from user to and from kernel.
|
||||
*/
|
||||
kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
|
||||
__PAGE_KERNEL_RX);
|
||||
|
||||
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
|
||||
kaiser_add_user_map_ptrs_early(__irqentry_text_start,
|
||||
__irqentry_text_end,
|
||||
__PAGE_KERNEL_RX);
|
||||
#endif
|
||||
kaiser_add_user_map_early((void *)idt_descr.address,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL_RO);
|
||||
#ifdef CONFIG_TRACING
|
||||
kaiser_add_user_map_early(&trace_idt_descr,
|
||||
sizeof(trace_idt_descr),
|
||||
__PAGE_KERNEL);
|
||||
kaiser_add_user_map_early(&trace_idt_table,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL);
|
||||
#endif
|
||||
kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
|
||||
__PAGE_KERNEL);
|
||||
kaiser_add_user_map_early(&debug_idt_table,
|
||||
sizeof(gate_desc) * NR_VECTORS,
|
||||
__PAGE_KERNEL);
|
||||
|
||||
pr_info("enabled\n");
|
||||
}
|
||||
|
||||
/* Add a mapping to the shadow mapping, and synchronize the mappings */
|
||||
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
|
||||
{
|
||||
if (!kaiser_enabled)
|
||||
return 0;
|
||||
return kaiser_add_user_map((const void *)addr, size, flags);
|
||||
}
|
||||
|
||||
void kaiser_remove_mapping(unsigned long start, unsigned long size)
|
||||
{
|
||||
extern void unmap_pud_range_nofree(pgd_t *pgd,
|
||||
unsigned long start, unsigned long end);
|
||||
unsigned long end = start + size;
|
||||
unsigned long addr, next;
|
||||
pgd_t *pgd;
|
||||
|
||||
if (!kaiser_enabled)
|
||||
return;
|
||||
pgd = native_get_shadow_pgd(pgd_offset_k(start));
|
||||
for (addr = start; addr < end; pgd++, addr = next) {
|
||||
next = pgd_addr_end(addr, end);
|
||||
unmap_pud_range_nofree(pgd, addr, next);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Page table pages are page-aligned. The lower half of the top
|
||||
* level is used for userspace and the top half for the kernel.
|
||||
* This returns true for user pages that need to get copied into
|
||||
* both the user and kernel copies of the page tables, and false
|
||||
* for kernel pages that should only be in the kernel copy.
|
||||
*/
|
||||
static inline bool is_userspace_pgd(pgd_t *pgdp)
|
||||
{
|
||||
return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
|
||||
}
|
||||
|
||||
/*
 * Propagate a pgd entry written at 'pgdp' into the shadow pgd where
 * needed, and return the value to store in the kernel pgd (possibly
 * with _PAGE_NX added).  Returns 'pgd' untouched when Kaiser is
 * disabled.
 */
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;

	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (!is_userspace_pgd(pgdp))
			return pgd;

		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
		/*
		 * Even if the entry is *mapping* userspace, ensure
		 * that userspace can not use it.  This way, if we
		 * get out to userspace running on the kernel CR3,
		 * userspace will crash instead of running.
		 */
		if (__supported_pte_mask & _PAGE_NX)
			pgd.pgd |= _PAGE_NX;
		return pgd;
	}

	/* A non-zero entry without _PAGE_USER is not mirrored. */
	if (pgd.pgd)
		return pgd;

	/*
	 * pgd_clear() cannot check _PAGE_USER, and is even used to
	 * clear corrupted pgd entries: so just rely on cases like
	 * kexec and EFI never to be using pgd_clear().
	 */
	if (WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE))
		return pgd;

	if (is_userspace_pgd(pgdp))
		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;

	return pgd;
}
|
||||
|
||||
void kaiser_setup_pcid(void)
|
||||
{
|
||||
unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
|
||||
|
||||
if (this_cpu_has(X86_FEATURE_PCID))
|
||||
user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
|
||||
/*
|
||||
* These variables are used by the entry/exit
|
||||
* code to change PCID and pgd and TLB flushing.
|
||||
*/
|
||||
this_cpu_write(x86_cr3_pcid_user, user_cr3);
|
||||
}
|
||||
|
||||
/*
|
||||
* Make a note that this cpu will need to flush USER tlb on return to user.
|
||||
* If cpu does not have PCID, then the NOFLUSH bit will never have been set.
|
||||
*/
|
||||
void kaiser_flush_tlb_on_return_to_user(void)
|
||||
{
|
||||
if (this_cpu_has(X86_FEATURE_PCID))
|
||||
this_cpu_write(x86_cr3_pcid_user,
|
||||
X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
|
||||
}
|
||||
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
|
||||
|
|
@ -121,11 +121,22 @@ void __init kasan_init(void)
|
|||
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
|
||||
(void *)KASAN_SHADOW_END);
|
||||
|
||||
memset(kasan_zero_page, 0, PAGE_SIZE);
|
||||
|
||||
load_cr3(init_level4_pgt);
|
||||
__flush_tlb_all();
|
||||
init_task.kasan_depth = 0;
|
||||
|
||||
/*
|
||||
* kasan_zero_page has been used as early shadow memory, thus it may
|
||||
* contain some garbage. Now we can clear and write protect it, since
|
||||
* after the TLB flush no one should write to it.
|
||||
*/
|
||||
memset(kasan_zero_page, 0, PAGE_SIZE);
|
||||
for (i = 0; i < PTRS_PER_PTE; i++) {
|
||||
pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
|
||||
set_pte(&kasan_zero_pte[i], pte);
|
||||
}
|
||||
/* Flush TLBs again to be sure that write protection applied. */
|
||||
__flush_tlb_all();
|
||||
|
||||
init_task.kasan_depth = 0;
|
||||
pr_info("KernelAddressSanitizer initialized\n");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
|
|||
#define CPA_FLUSHTLB 1
|
||||
#define CPA_ARRAY 2
|
||||
#define CPA_PAGES_ARRAY 4
|
||||
#define CPA_FREE_PAGETABLES 8
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
static unsigned long direct_pages_count[PG_LEVEL_NUM];
|
||||
|
|
@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static bool try_to_free_pte_page(pte_t *pte)
|
||||
static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!(cpa->flags & CPA_FREE_PAGETABLES))
|
||||
return false;
|
||||
|
||||
for (i = 0; i < PTRS_PER_PTE; i++)
|
||||
if (!pte_none(pte[i]))
|
||||
return false;
|
||||
|
|
@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool try_to_free_pmd_page(pmd_t *pmd)
|
||||
static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!(cpa->flags & CPA_FREE_PAGETABLES))
|
||||
return false;
|
||||
|
||||
for (i = 0; i < PTRS_PER_PMD; i++)
|
||||
if (!pmd_none(pmd[i]))
|
||||
return false;
|
||||
|
|
@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
|
||||
static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
pte_t *pte = pte_offset_kernel(pmd, start);
|
||||
|
||||
|
|
@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
|
|||
pte++;
|
||||
}
|
||||
|
||||
if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
|
||||
if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
|
||||
pmd_clear(pmd);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
|
||||
static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
if (unmap_pte_range(pmd, start, end))
|
||||
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
|
||||
if (unmap_pte_range(cpa, pmd, start, end))
|
||||
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
|
||||
pud_clear(pud);
|
||||
}
|
||||
|
||||
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
||||
static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
pmd_t *pmd = pmd_offset(pud, start);
|
||||
|
||||
|
|
@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
|
||||
unsigned long pre_end = min_t(unsigned long, end, next_page);
|
||||
|
||||
__unmap_pmd_range(pud, pmd, start, pre_end);
|
||||
__unmap_pmd_range(cpa, pud, pmd, start, pre_end);
|
||||
|
||||
start = pre_end;
|
||||
pmd++;
|
||||
|
|
@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
if (pmd_large(*pmd))
|
||||
pmd_clear(pmd);
|
||||
else
|
||||
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
|
||||
__unmap_pmd_range(cpa, pud, pmd,
|
||||
start, start + PMD_SIZE);
|
||||
|
||||
start += PMD_SIZE;
|
||||
pmd++;
|
||||
|
|
@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
|
|||
* 4K leftovers?
|
||||
*/
|
||||
if (start < end)
|
||||
return __unmap_pmd_range(pud, pmd, start, end);
|
||||
return __unmap_pmd_range(cpa, pud, pmd, start, end);
|
||||
|
||||
/*
|
||||
* Try again to free the PMD page if haven't succeeded above.
|
||||
*/
|
||||
if (!pud_none(*pud))
|
||||
if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
|
||||
if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
|
||||
pud_clear(pud);
|
||||
}
|
||||
|
||||
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
||||
static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
{
|
||||
pud_t *pud = pud_offset(pgd, start);
|
||||
|
||||
|
|
@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
|
||||
unsigned long pre_end = min_t(unsigned long, end, next_page);
|
||||
|
||||
unmap_pmd_range(pud, start, pre_end);
|
||||
unmap_pmd_range(cpa, pud, start, pre_end);
|
||||
|
||||
start = pre_end;
|
||||
pud++;
|
||||
|
|
@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
if (pud_large(*pud))
|
||||
pud_clear(pud);
|
||||
else
|
||||
unmap_pmd_range(pud, start, start + PUD_SIZE);
|
||||
unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
|
||||
|
||||
start += PUD_SIZE;
|
||||
pud++;
|
||||
|
|
@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
* 2M leftovers?
|
||||
*/
|
||||
if (start < end)
|
||||
unmap_pmd_range(pud, start, end);
|
||||
unmap_pmd_range(cpa, pud, start, end);
|
||||
|
||||
/*
|
||||
* No need to try to free the PUD page because we'll free it in
|
||||
|
|
@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
|
|||
*/
|
||||
}
|
||||
|
||||
/*
 * Unmap [start, end) below 'pgd' with CPA_FREE_PAGETABLES set, so the
 * try_to_free_*_page() helpers may release page table pages that
 * become empty.
 */
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data free_tables = { .flags = CPA_FREE_PAGETABLES };

	__unmap_pud_range(&free_tables, pgd, start, end);
}
|
||||
|
||||
/*
 * Unmap [start, end) below 'pgd' without CPA_FREE_PAGETABLES, so the
 * try_to_free_*_page() helpers leave emptied page table pages alone.
 */
void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
{
	struct cpa_data keep_tables = { .flags = 0 };

	__unmap_pud_range(&keep_tables, pgd, start, end);
}
|
||||
|
||||
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
|
||||
{
|
||||
pgd_t *pgd_entry = root + pgd_index(addr);
|
||||
|
|
|
|||
|
|
@ -750,11 +750,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
|
|||
return 1;
|
||||
|
||||
while (cursor < to) {
|
||||
if (!devmem_is_allowed(pfn)) {
|
||||
pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
|
||||
current->comm, from, to - 1);
|
||||
if (!devmem_is_allowed(pfn))
|
||||
return 0;
|
||||
}
|
||||
cursor += PAGE_SIZE;
|
||||
pfn++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@
|
|||
#include <asm/fixmap.h>
|
||||
#include <asm/mtrr.h>
|
||||
|
||||
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
|
||||
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
|
||||
|
||||
#ifdef CONFIG_HIGHPTE
|
||||
#define PGALLOC_USER_GFP __GFP_HIGHMEM
|
||||
|
|
@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
|
|||
kmem_cache_free(pgd_cache, pgd);
|
||||
}
|
||||
#else
|
||||
|
||||
/*
|
||||
* Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
|
||||
* both 8k in size and 8k-aligned. That lets us just flip bit 12
|
||||
* in a pointer to swap between the two 4k halves.
|
||||
*/
|
||||
#define PGD_ALLOCATION_ORDER kaiser_enabled
|
||||
|
||||
static inline pgd_t *_pgd_alloc(void)
|
||||
{
|
||||
return (pgd_t *)__get_free_page(PGALLOC_GFP);
|
||||
/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
|
||||
return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
|
||||
PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
|
||||
static inline void _pgd_free(pgd_t *pgd)
|
||||
{
|
||||
free_page((unsigned long)pgd);
|
||||
free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
|
||||
}
|
||||
#endif /* CONFIG_X86_PAE */
|
||||
|
||||
|
|
|
|||
|
|
@ -6,16 +6,17 @@
|
|||
#include <linux/interrupt.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/debugfs.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/mmu_context.h>
|
||||
#include <asm/cache.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/uv/uv.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
/*
|
||||
* Smarter SMP flushing macros.
|
||||
* TLB flushing, formerly SMP-only
|
||||
* c/o Linus Torvalds.
|
||||
*
|
||||
* These mean you can really definitely utterly forget about
|
||||
|
|
@ -34,6 +35,36 @@ struct flush_tlb_info {
|
|||
unsigned long flush_end;
|
||||
};
|
||||
|
||||
static void load_new_mm_cr3(pgd_t *pgdir)
|
||||
{
|
||||
unsigned long new_mm_cr3 = __pa(pgdir);
|
||||
|
||||
if (kaiser_enabled) {
|
||||
/*
|
||||
* We reuse the same PCID for different tasks, so we must
|
||||
* flush all the entries for the PCID out when we change tasks.
|
||||
* Flush KERN below, flush USER when returning to userspace in
|
||||
* kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
|
||||
*
|
||||
* invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
|
||||
* do it here, but can only be used if X86_FEATURE_INVPCID is
|
||||
* available - and many machines support pcid without invpcid.
|
||||
*
|
||||
* If X86_CR3_PCID_KERN_FLUSH actually added something, then it
|
||||
* would be needed in the write_cr3() below - if PCIDs enabled.
|
||||
*/
|
||||
BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
|
||||
kaiser_flush_tlb_on_return_to_user();
|
||||
}
|
||||
|
||||
/*
|
||||
* Caution: many callers of this function expect
|
||||
* that load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask writes.
|
||||
*/
|
||||
write_cr3(new_mm_cr3);
|
||||
}
|
||||
|
||||
/*
|
||||
* We cannot call mmdrop() because we are in interrupt context,
|
||||
* instead update mm->cpu_vm_mask.
|
||||
|
|
@ -45,7 +76,7 @@ void leave_mm(int cpu)
|
|||
BUG();
|
||||
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
|
||||
load_cr3(swapper_pg_dir);
|
||||
load_new_mm_cr3(swapper_pg_dir);
|
||||
/*
|
||||
* This gets called in the idle path where RCU
|
||||
* functions differently. Tracing normally
|
||||
|
|
@ -57,6 +88,109 @@ void leave_mm(int cpu)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(leave_mm);
|
||||
|
||||
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
switch_mm_irqs_off(prev, next, tsk);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
||||
struct task_struct *tsk)
|
||||
{
|
||||
unsigned cpu = smp_processor_id();
|
||||
|
||||
if (likely(prev != next)) {
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
this_cpu_write(cpu_tlbstate.active_mm, next);
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* Re-load page tables.
|
||||
*
|
||||
* This logic has an ordering constraint:
|
||||
*
|
||||
* CPU 0: Write to a PTE for 'next'
|
||||
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
||||
* CPU 1: set bit 1 in next's mm_cpumask
|
||||
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
||||
*
|
||||
* We need to prevent an outcome in which CPU 1 observes
|
||||
* the new PTE value and CPU 0 observes bit 1 clear in
|
||||
* mm_cpumask. (If that occurs, then the IPI will never
|
||||
* be sent, and CPU 0's TLB will contain a stale entry.)
|
||||
*
|
||||
* The bad outcome can occur if either CPU's load is
|
||||
* reordered before that CPU's store, so both CPUs must
|
||||
* execute full barriers to prevent this from happening.
|
||||
*
|
||||
* Thus, switch_mm needs a full barrier between the
|
||||
* store to mm_cpumask and any operation that could load
|
||||
* from next->pgd. TLB fills are special and can happen
|
||||
* due to instruction fetches or for no reason at all,
|
||||
* and neither LOCK nor MFENCE orders them.
|
||||
* Fortunately, load_cr3() is serializing and gives the
|
||||
* ordering guarantee we need.
|
||||
*
|
||||
*/
|
||||
load_new_mm_cr3(next->pgd);
|
||||
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
|
||||
/* Stop flush ipis for the previous mm */
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(prev));
|
||||
|
||||
/* Load per-mm CR4 state */
|
||||
load_mm_cr4(next);
|
||||
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
/*
|
||||
* Load the LDT, if the LDT is different.
|
||||
*
|
||||
* It's possible that prev->context.ldt doesn't match
|
||||
* the LDT register. This can happen if leave_mm(prev)
|
||||
* was called and then modify_ldt changed
|
||||
* prev->context.ldt but suppressed an IPI to this CPU.
|
||||
* In this case, prev->context.ldt != NULL, because we
|
||||
* never set context.ldt to NULL while the mm still
|
||||
* exists. That means that next->context.ldt !=
|
||||
* prev->context.ldt, because mms never share an LDT.
|
||||
*/
|
||||
if (unlikely(prev->context.ldt != next->context.ldt))
|
||||
load_mm_ldt(next);
|
||||
#endif
|
||||
} else {
|
||||
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
||||
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
|
||||
|
||||
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
||||
/*
|
||||
* On established mms, the mm_cpumask is only changed
|
||||
* from irq context, from ptep_clear_flush() while in
|
||||
* lazy tlb mode, and here. Irqs are blocked during
|
||||
* schedule, protecting us from simultaneous changes.
|
||||
*/
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
/*
|
||||
* We were in lazy tlb mode and leave_mm disabled
|
||||
* tlb flush IPI delivery. We must reload CR3
|
||||
* to make sure to use no freed page tables.
|
||||
*
|
||||
* As above, load_cr3() is serializing and orders TLB
|
||||
* fills with respect to the mm_cpumask write.
|
||||
*/
|
||||
load_new_mm_cr3(next->pgd);
|
||||
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
load_mm_cr4(next);
|
||||
load_mm_ldt(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The flush IPI assumes that a thread switch happens in this order:
|
||||
* [cpu0: the cpu that switches]
|
||||
|
|
@ -104,7 +238,7 @@ static void flush_tlb_func(void *info)
|
|||
|
||||
inc_irq_stat(irq_tlb_count);
|
||||
|
||||
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
|
||||
if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
|
||||
return;
|
||||
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
||||
|
|
@ -158,23 +292,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
|||
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
|
||||
}
|
||||
|
||||
void flush_tlb_current_task(void)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
|
||||
/* This is an implicit full barrier that synchronizes with switch_mm. */
|
||||
local_flush_tlb();
|
||||
|
||||
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
|
||||
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
* See Documentation/x86/tlb.txt for details. We choose 33
|
||||
* because it is large enough to cover the vast majority (at
|
||||
|
|
@ -195,6 +312,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
|
||||
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
|
||||
if (base_pages_to_flush > tlb_single_page_flush_ceiling)
|
||||
base_pages_to_flush = TLB_FLUSH_ALL;
|
||||
|
||||
if (current->active_mm != mm) {
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
|
|
@ -211,15 +334,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||
goto out;
|
||||
}
|
||||
|
||||
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
|
||||
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* Both branches below are implicit full barriers (MOV to CR or
|
||||
* INVLPG) that synchronize with switch_mm.
|
||||
*/
|
||||
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
|
||||
base_pages_to_flush = TLB_FLUSH_ALL;
|
||||
if (base_pages_to_flush == TLB_FLUSH_ALL) {
|
||||
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
|
||||
local_flush_tlb();
|
||||
} else {
|
||||
|
|
@ -240,33 +359,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||
preempt_enable();
|
||||
}
|
||||
|
||||
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
if (current->active_mm == mm) {
|
||||
if (current->mm) {
|
||||
/*
|
||||
* Implicit full barrier (INVLPG) that synchronizes
|
||||
* with switch_mm.
|
||||
*/
|
||||
__flush_tlb_one(start);
|
||||
} else {
|
||||
leave_mm(smp_processor_id());
|
||||
|
||||
/* Synchronize with switch_mm. */
|
||||
smp_mb();
|
||||
}
|
||||
}
|
||||
|
||||
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
|
||||
flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void do_flush_tlb_all(void *info)
|
||||
{
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include <asm/cacheflush.h>
|
||||
#include <asm/pgtable.h>
|
||||
#include <asm/realmode.h>
|
||||
#include <asm/kaiser.h>
|
||||
|
||||
struct real_mode_header *real_mode_header;
|
||||
u32 *trampoline_cr4_features;
|
||||
|
|
@ -15,7 +16,8 @@ void __init reserve_real_mode(void)
|
|||
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
|
||||
|
||||
/* Has to be under 1M so we can execute real-mode AP code. */
|
||||
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
|
||||
mem = memblock_find_in_range(0, 1 << 20, size,
|
||||
KAISER_KERNEL_PGD_ALIGNMENT);
|
||||
if (!mem)
|
||||
panic("Cannot allocate trampoline\n");
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@
|
|||
#include <asm/msr.h>
|
||||
#include <asm/segment.h>
|
||||
#include <asm/processor-flags.h>
|
||||
#include <asm/kaiser.h>
|
||||
#include "realmode.h"
|
||||
|
||||
.text
|
||||
|
|
@ -139,7 +140,7 @@ tr_gdt:
|
|||
tr_gdt_end:
|
||||
|
||||
.bss
|
||||
.balign PAGE_SIZE
|
||||
.balign KAISER_KERNEL_PGD_ALIGNMENT
|
||||
GLOBAL(trampoline_pgd) .space PAGE_SIZE
|
||||
|
||||
.balign 8
|
||||
|
|
|
|||
|
|
@ -433,6 +433,12 @@ static void __init xen_init_cpuid_mask(void)
|
|||
~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
|
||||
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
|
||||
|
||||
/*
|
||||
* Xen PV would need some work to support PCID: CR3 handling as well
|
||||
* as xen_flush_tlb_others() would need updating.
|
||||
*/
|
||||
cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */
|
||||
|
||||
if (!xen_initial_domain())
|
||||
cpuid_leaf1_edx_mask &=
|
||||
~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
|
||||
|
|
|
|||
|
|
@ -168,6 +168,18 @@ void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
|
|||
|
||||
spawn->alg = NULL;
|
||||
spawns = &inst->alg.cra_users;
|
||||
|
||||
/*
|
||||
* We may encounter an unregistered instance here, since
|
||||
* an instance's spawns are set up prior to the instance
|
||||
* being registered. An unregistered instance will have
|
||||
* NULL ->cra_users.next, since ->cra_users isn't
|
||||
* properly initialized until registration. But an
|
||||
* unregistered instance cannot have any users, so treat
|
||||
* it the same as ->cra_users being empty.
|
||||
*/
|
||||
if (spawns->next == NULL)
|
||||
break;
|
||||
}
|
||||
} while ((spawns = crypto_more_spawns(alg, &stack, &top,
|
||||
&secondary_spawns)));
|
||||
|
|
|
|||
|
|
@ -600,6 +600,11 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
|
|||
CRYPTO_ALG_TYPE_AHASH_MASK);
|
||||
if (IS_ERR(poly))
|
||||
return PTR_ERR(poly);
|
||||
poly_hash = __crypto_hash_alg_common(poly);
|
||||
|
||||
err = -EINVAL;
|
||||
if (poly_hash->digestsize != POLY1305_DIGEST_SIZE)
|
||||
goto out_put_poly;
|
||||
|
||||
err = -ENOMEM;
|
||||
inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
|
||||
|
|
@ -608,7 +613,6 @@ static int chachapoly_create(struct crypto_template *tmpl, struct rtattr **tb,
|
|||
|
||||
ctx = aead_instance_ctx(inst);
|
||||
ctx->saltlen = CHACHAPOLY_IV_SIZE - ivsize;
|
||||
poly_hash = __crypto_hash_alg_common(poly);
|
||||
err = crypto_init_ahash_spawn(&ctx->poly, poly_hash,
|
||||
aead_crypto_instance(inst));
|
||||
if (err)
|
||||
|
|
|
|||
|
|
@ -80,6 +80,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue,
|
|||
pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
|
||||
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
|
||||
INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
|
||||
spin_lock_init(&cpu_queue->q_lock);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -103,15 +104,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
|
|||
int cpu, err;
|
||||
struct mcryptd_cpu_queue *cpu_queue;
|
||||
|
||||
cpu = get_cpu();
|
||||
cpu_queue = this_cpu_ptr(queue->cpu_queue);
|
||||
rctx->tag.cpu = cpu;
|
||||
cpu_queue = raw_cpu_ptr(queue->cpu_queue);
|
||||
spin_lock(&cpu_queue->q_lock);
|
||||
cpu = smp_processor_id();
|
||||
rctx->tag.cpu = smp_processor_id();
|
||||
|
||||
err = crypto_enqueue_request(&cpu_queue->queue, request);
|
||||
pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
|
||||
cpu, cpu_queue, request);
|
||||
spin_unlock(&cpu_queue->q_lock);
|
||||
queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
|
||||
put_cpu();
|
||||
|
||||
return err;
|
||||
}
|
||||
|
|
@ -164,16 +166,11 @@ static void mcryptd_queue_worker(struct work_struct *work)
|
|||
cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
|
||||
i = 0;
|
||||
while (i < MCRYPTD_BATCH || single_task_running()) {
|
||||
/*
|
||||
* preempt_disable/enable is used to prevent
|
||||
* being preempted by mcryptd_enqueue_request()
|
||||
*/
|
||||
local_bh_disable();
|
||||
preempt_disable();
|
||||
|
||||
spin_lock_bh(&cpu_queue->q_lock);
|
||||
backlog = crypto_get_backlog(&cpu_queue->queue);
|
||||
req = crypto_dequeue_request(&cpu_queue->queue);
|
||||
preempt_enable();
|
||||
local_bh_enable();
|
||||
spin_unlock_bh(&cpu_queue->q_lock);
|
||||
|
||||
if (!req) {
|
||||
mcryptd_opportunistic_flush();
|
||||
|
|
@ -188,7 +185,7 @@ static void mcryptd_queue_worker(struct work_struct *work)
|
|||
++i;
|
||||
}
|
||||
if (cpu_queue->queue.qlen)
|
||||
queue_work(kcrypto_wq, &cpu_queue->work);
|
||||
queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
|
||||
}
|
||||
|
||||
void mcryptd_flusher(struct work_struct *__work)
|
||||
|
|
|
|||
|
|
@ -254,6 +254,14 @@ static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm)
|
|||
crypto_free_aead(ctx->child);
|
||||
}
|
||||
|
||||
static void pcrypt_free(struct aead_instance *inst)
|
||||
{
|
||||
struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst);
|
||||
|
||||
crypto_drop_aead(&ctx->spawn);
|
||||
kfree(inst);
|
||||
}
|
||||
|
||||
static int pcrypt_init_instance(struct crypto_instance *inst,
|
||||
struct crypto_alg *alg)
|
||||
{
|
||||
|
|
@ -319,6 +327,8 @@ static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb,
|
|||
inst->alg.encrypt = pcrypt_aead_encrypt;
|
||||
inst->alg.decrypt = pcrypt_aead_decrypt;
|
||||
|
||||
inst->free = pcrypt_free;
|
||||
|
||||
err = aead_register_instance(tmpl, inst);
|
||||
if (err)
|
||||
goto out_drop_aead;
|
||||
|
|
@ -349,14 +359,6 @@ static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void pcrypt_free(struct crypto_instance *inst)
|
||||
{
|
||||
struct pcrypt_instance_ctx *ctx = crypto_instance_ctx(inst);
|
||||
|
||||
crypto_drop_aead(&ctx->spawn);
|
||||
kfree(inst);
|
||||
}
|
||||
|
||||
static int pcrypt_cpumask_change_notify(struct notifier_block *self,
|
||||
unsigned long val, void *data)
|
||||
{
|
||||
|
|
@ -469,7 +471,6 @@ static void pcrypt_fini_padata(struct padata_pcrypt *pcrypt)
|
|||
static struct crypto_template pcrypt_tmpl = {
|
||||
.name = "pcrypt",
|
||||
.create = pcrypt_create,
|
||||
.free = pcrypt_free,
|
||||
.module = THIS_MODULE,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1020,7 +1020,7 @@ static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count,
|
|||
/* The record may be cleared by others, try read next record */
|
||||
if (len == -ENOENT)
|
||||
goto skip;
|
||||
else if (len < sizeof(*rcd)) {
|
||||
else if (len < 0 || len < sizeof(*rcd)) {
|
||||
rc = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -223,6 +223,9 @@ config GENERIC_CPU_DEVICES
|
|||
config GENERIC_CPU_AUTOPROBE
|
||||
bool
|
||||
|
||||
config GENERIC_CPU_VULNERABILITIES
|
||||
bool
|
||||
|
||||
config SOC_BUS
|
||||
bool
|
||||
|
||||
|
|
|
|||
|
|
@ -498,10 +498,58 @@ static void __init cpu_dev_register_generic(void)
|
|||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_GENERIC_CPU_VULNERABILITIES
|
||||
|
||||
ssize_t __weak cpu_show_meltdown(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "Not affected\n");
|
||||
}
|
||||
|
||||
ssize_t __weak cpu_show_spectre_v1(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "Not affected\n");
|
||||
}
|
||||
|
||||
ssize_t __weak cpu_show_spectre_v2(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
return sprintf(buf, "Not affected\n");
|
||||
}
|
||||
|
||||
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
|
||||
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
|
||||
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
|
||||
|
||||
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
|
||||
&dev_attr_meltdown.attr,
|
||||
&dev_attr_spectre_v1.attr,
|
||||
&dev_attr_spectre_v2.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
static const struct attribute_group cpu_root_vulnerabilities_group = {
|
||||
.name = "vulnerabilities",
|
||||
.attrs = cpu_root_vulnerabilities_attrs,
|
||||
};
|
||||
|
||||
static void __init cpu_register_vulnerabilities(void)
|
||||
{
|
||||
if (sysfs_create_group(&cpu_subsys.dev_root->kobj,
|
||||
&cpu_root_vulnerabilities_group))
|
||||
pr_err("Unable to register CPU vulnerabilities\n");
|
||||
}
|
||||
|
||||
#else
|
||||
static inline void cpu_register_vulnerabilities(void) { }
|
||||
#endif
|
||||
|
||||
void __init cpu_dev_init(void)
|
||||
{
|
||||
if (subsys_system_register(&cpu_subsys, cpu_root_attr_groups))
|
||||
panic("Failed to register CPU subsystem");
|
||||
|
||||
cpu_dev_register_generic();
|
||||
cpu_register_vulnerabilities();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3767,7 +3767,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
|||
segment_size = rbd_obj_bytes(&rbd_dev->header);
|
||||
blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
|
||||
q->limits.max_sectors = queue_max_hw_sectors(q);
|
||||
blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
|
||||
blk_queue_max_segments(q, USHRT_MAX);
|
||||
blk_queue_max_segment_size(q, segment_size);
|
||||
blk_queue_io_min(q, segment_size);
|
||||
blk_queue_io_opt(q, segment_size);
|
||||
|
|
|
|||
|
|
@ -1050,10 +1050,6 @@ static int btusb_open(struct hci_dev *hdev)
|
|||
return err;
|
||||
|
||||
data->intf->needs_remote_wakeup = 1;
|
||||
/* device specific wakeup source enabled and required for USB
|
||||
* remote wakeup while host is suspended
|
||||
*/
|
||||
device_wakeup_enable(&data->udev->dev);
|
||||
|
||||
if (test_and_set_bit(BTUSB_INTR_RUNNING, &data->flags))
|
||||
goto done;
|
||||
|
|
@ -1117,7 +1113,6 @@ static int btusb_close(struct hci_dev *hdev)
|
|||
goto failed;
|
||||
|
||||
data->intf->needs_remote_wakeup = 0;
|
||||
device_wakeup_disable(&data->udev->dev);
|
||||
usb_autopm_put_interface(data->intf);
|
||||
|
||||
failed:
|
||||
|
|
|
|||
|
|
@ -178,6 +178,7 @@ static struct bus_type sunxi_rsb_bus = {
|
|||
.match = sunxi_rsb_device_match,
|
||||
.probe = sunxi_rsb_device_probe,
|
||||
.remove = sunxi_rsb_device_remove,
|
||||
.uevent = of_device_uevent_modalias,
|
||||
};
|
||||
|
||||
static void sunxi_rsb_dev_release(struct device *dev)
|
||||
|
|
|
|||
|
|
@ -238,7 +238,10 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
|
|||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&reading_mutex);
|
||||
if (mutex_lock_interruptible(&reading_mutex)) {
|
||||
err = -ERESTARTSYS;
|
||||
goto out_put;
|
||||
}
|
||||
if (!data_avail) {
|
||||
bytes_read = rng_get_data(rng, rng_buffer,
|
||||
rng_buffer_size(),
|
||||
|
|
@ -288,6 +291,7 @@ static ssize_t rng_dev_read(struct file *filp, char __user *buf,
|
|||
|
||||
out_unlock_reading:
|
||||
mutex_unlock(&reading_mutex);
|
||||
out_put:
|
||||
put_rng(rng);
|
||||
goto out;
|
||||
}
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user