From 0547e0bd9b95da18747009c2091846fcb5691a6f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 23 Oct 2023 21:16:41 +0200 Subject: [PATCH 01/16] s390/mm: add missing conversion to use ptdescs Commit 6326c26c1514 ("s390: convert various pgalloc functions to use ptdescs") missed to convert tlb_remove_table() into tlb_remove_ptdesc() in few locations. Reviewed-by: Heiko Carstens Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/tlb.h | 4 ++-- arch/s390/mm/pgalloc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 383b1f91442c..a0089b2e5e67 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -112,7 +112,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; - tlb_remove_table(tlb, p4d); + tlb_remove_ptdesc(tlb, p4d); } /* @@ -130,7 +130,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_p4ds = 1; - tlb_remove_table(tlb, pud); + tlb_remove_ptdesc(tlb, pud); } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 5488ae17318e..15f6a3ef40e8 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -412,7 +412,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); } void __tlb_remove_table(void *_table) From 01c89ab7f81b76de00daccf6a856a00eb63a17d7 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 23 Oct 2023 14:50:11 +0200 Subject: [PATCH 02/16] s390/ap: rework to use irq info from ap queue status This patch reworks the irq handling and reporting code for the AP queue interrupt handling to always use the irq info from the queue status. Until now the interrupt status of an AP queue was stored into a bool variable within the ap_queue struct. This variable was set on a successful interrupt enablement and cleared with kicking a reset. However, it may be that the interrupt state is manipulated outband for example by a hypervisor. This patch removes this variable and instead the irq bit from the AP queue status which is always reflecting the current irq state is used. Reviewed-by: Tony Krowiak Reviewed-by: Holger Dengler Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.h | 1 - drivers/s390/crypto/ap_queue.c | 22 ++++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 359a35f894d5..b0771ca0849b 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -206,7 +206,6 @@ struct ap_queue { bool config; /* configured state */ bool chkstop; /* checkstop state */ ap_qid_t qid; /* AP queue id. */ - bool interrupt; /* indicate if interrupts are enabled */ bool se_bound; /* SE bound state */ unsigned int assoc_idx; /* SE association index */ int queue_count; /* # messages currently on AP queue. */ diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 993240370ecf..f1abd21661dc 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -203,7 +203,7 @@ static enum ap_sm_wait ap_sm_read(struct ap_queue *aq) return AP_SM_WAIT_NONE; case AP_RESPONSE_NO_PENDING_REPLY: if (aq->queue_count > 0) - return aq->interrupt ? + return status.irq_enabled ? AP_SM_WAIT_INTERRUPT : AP_SM_WAIT_HIGH_TIMEOUT; aq->sm_state = AP_SM_STATE_IDLE; return AP_SM_WAIT_NONE; @@ -254,7 +254,7 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq) fallthrough; case AP_RESPONSE_Q_FULL: aq->sm_state = AP_SM_STATE_QUEUE_FULL; - return aq->interrupt ? + return status.irq_enabled ? AP_SM_WAIT_INTERRUPT : AP_SM_WAIT_HIGH_TIMEOUT; case AP_RESPONSE_RESET_IN_PROGRESS: aq->sm_state = AP_SM_STATE_RESET_WAIT; @@ -307,7 +307,6 @@ static enum ap_sm_wait ap_sm_reset(struct ap_queue *aq) case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: aq->sm_state = AP_SM_STATE_RESET_WAIT; - aq->interrupt = false; aq->rapq_fbit = 0; aq->se_bound = false; return AP_SM_WAIT_LOW_TIMEOUT; @@ -383,7 +382,6 @@ static enum ap_sm_wait ap_sm_setirq_wait(struct ap_queue *aq) if (status.irq_enabled == 1) { /* Irqs are now enabled */ - aq->interrupt = true; aq->sm_state = (aq->queue_count > 0) ? AP_SM_STATE_WORKING : AP_SM_STATE_IDLE; } @@ -626,16 +624,21 @@ static ssize_t interrupt_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ap_queue *aq = to_ap_queue(dev); + struct ap_queue_status status; int rc = 0; spin_lock_bh(&aq->lock); - if (aq->sm_state == AP_SM_STATE_SETIRQ_WAIT) + if (aq->sm_state == AP_SM_STATE_SETIRQ_WAIT) { rc = sysfs_emit(buf, "Enable Interrupt pending.\n"); - else if (aq->interrupt) - rc = sysfs_emit(buf, "Interrupts enabled.\n"); - else - rc = sysfs_emit(buf, "Interrupts disabled.\n"); + } else { + status = ap_tapq(aq->qid, NULL); + if (status.irq_enabled) + rc = sysfs_emit(buf, "Interrupts enabled.\n"); + else + rc = sysfs_emit(buf, "Interrupts disabled.\n"); + } spin_unlock_bh(&aq->lock); + return rc; } @@ -1032,7 +1035,6 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type) if (ap_sb_available() && is_prot_virt_guest()) aq->ap_dev.device.groups = ap_queue_dev_sb_attr_groups; aq->qid = qid; - aq->interrupt = false; spin_lock_init(&aq->lock); INIT_LIST_HEAD(&aq->pendingq); INIT_LIST_HEAD(&aq->requestq); From c40284b3642554d6cf519525872f2f5784933d8d Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 23 Oct 2023 15:42:21 +0200 Subject: [PATCH 03/16] s390/ap: re-enable interrupt for AP queues This patch introduces some code lines which check for interrupt support enabled on an AP queue after a reply has been received. This invocation has been chosen as there is a good chance to have the queue empty at that time. As the enablement of the irq imples a state machine change the queue should not have any pending requests or unreceived replies. Reviewed-by: Tony Krowiak Reviewed-by: Holger Dengler Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_queue.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index f1abd21661dc..3934a0cc13e7 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -200,13 +200,13 @@ static enum ap_sm_wait ap_sm_read(struct ap_queue *aq) return AP_SM_WAIT_AGAIN; } aq->sm_state = AP_SM_STATE_IDLE; - return AP_SM_WAIT_NONE; + break; case AP_RESPONSE_NO_PENDING_REPLY: if (aq->queue_count > 0) return status.irq_enabled ? AP_SM_WAIT_INTERRUPT : AP_SM_WAIT_HIGH_TIMEOUT; aq->sm_state = AP_SM_STATE_IDLE; - return AP_SM_WAIT_NONE; + break; default: aq->dev_state = AP_DEV_STATE_ERROR; aq->last_err_rc = status.response_code; @@ -215,6 +215,16 @@ static enum ap_sm_wait ap_sm_read(struct ap_queue *aq) AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid)); return AP_SM_WAIT_NONE; } + /* Check and maybe enable irq support (again) on this queue */ + if (!status.irq_enabled && status.queue_empty) { + void *lsi_ptr = ap_airq_ptr(); + + if (lsi_ptr && ap_queue_enable_irq(aq, lsi_ptr) == 0) { + aq->sm_state = AP_SM_STATE_SETIRQ_WAIT; + return AP_SM_WAIT_AGAIN; + } + } + return AP_SM_WAIT_NONE; } /** From e14aec23025eeb1f2159ba34dbc1458467c4c347 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Mon, 23 Oct 2023 09:57:10 +0200 Subject: [PATCH 04/16] s390/ap: fix AP bus crash on early config change callback invocation Fix kernel crash in AP bus code caused by very early invocation of the config change callback function via SCLP. After a fresh IML of the machine the crypto cards are still offline and will get switched online only with activation of any LPAR which has the card in it's configuration. A crypto card coming online is reported to the LPAR via SCLP and the AP bus offers a callback function to get this kind of information. However, it may happen that the callback is invoked before the AP bus init function is complete. As the callback triggers a synchronous AP bus scan, the scan may already run but some internal states are not initialized by the AP bus init function resulting in a crash like this: [ 11.635859] Unable to handle kernel pointer dereference in virtual kernel address space [ 11.635861] Failing address: 0000000000000000 TEID: 0000000000000887 [ 11.635862] Fault in home space mode while using kernel ASCE. [ 11.635864] AS:00000000894c4007 R3:00000001fece8007 S:00000001fece7800 P:000000000000013d [ 11.635879] Oops: 0004 ilc:1 [#1] SMP [ 11.635882] Modules linked in: [ 11.635884] CPU: 5 PID: 42 Comm: kworker/5:0 Not tainted 6.6.0-rc3-00003-g4dbf7cdc6b42 #12 [ 11.635886] Hardware name: IBM 3931 A01 751 (LPAR) [ 11.635887] Workqueue: events_long ap_scan_bus [ 11.635891] Krnl PSW : 0704c00180000000 0000000000000000 (0x0) [ 11.635895] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 [ 11.635897] Krnl GPRS: 0000000001000a00 0000000000000000 0000000000000006 0000000089591940 [ 11.635899] 0000000080000000 0000000000000a00 0000000000000000 0000000000000000 [ 11.635901] 0000000081870c00 0000000089591000 000000008834e4e2 0000000002625a00 [ 11.635903] 0000000081734200 0000038000913c18 000000008834c6d6 0000038000913ac8 [ 11.635906] Krnl Code:>0000000000000000: 0000 illegal [ 11.635906] 0000000000000002: 0000 illegal [ 11.635906] 0000000000000004: 0000 illegal [ 11.635906] 0000000000000006: 0000 illegal [ 11.635906] 0000000000000008: 0000 illegal [ 11.635906] 000000000000000a: 0000 illegal [ 11.635906] 000000000000000c: 0000 illegal [ 11.635906] 000000000000000e: 0000 illegal [ 11.635915] Call Trace: [ 11.635916] [<0000000000000000>] 0x0 [ 11.635918] [<000000008834e4e2>] ap_queue_init_state+0x82/0xb8 [ 11.635921] [<000000008834ba1c>] ap_scan_domains+0x6fc/0x740 [ 11.635923] [<000000008834c092>] ap_scan_adapter+0x632/0x8b0 [ 11.635925] [<000000008834c3e4>] ap_scan_bus+0xd4/0x288 [ 11.635927] [<00000000879a33ba>] process_one_work+0x19a/0x410 [ 11.635930] Discipline DIAG cannot be used without z/VM [ 11.635930] [<00000000879a3a2c>] worker_thread+0x3fc/0x560 [ 11.635933] [<00000000879aea60>] kthread+0x120/0x128 [ 11.635936] [<000000008792afa4>] __ret_from_fork+0x3c/0x58 [ 11.635938] [<00000000885ebe62>] ret_from_fork+0xa/0x30 [ 11.635942] Last Breaking-Event-Address: [ 11.635942] [<000000008834c6d4>] ap_wait+0xcc/0x148 This patch improves the ap_bus_force_rescan() function which is invoked by the config change callback by checking if a first initial AP bus scan has been done. If not, the force rescan request is simple ignored. Anyhow it does not make sense to trigger AP bus re-scans even before the very first bus scan is complete. Cc: stable@vger.kernel.org Reviewed-by: Holger Dengler Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index d09e08b71cfb..d6ad437883fa 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -1022,6 +1022,10 @@ EXPORT_SYMBOL(ap_driver_unregister); void ap_bus_force_rescan(void) { + /* Only trigger AP bus scans after the initial scan is done */ + if (atomic64_read(&ap_scan_bus_count) <= 0) + return; + /* processing a asynchronous bus rescan */ del_timer(&ap_config_timer); queue_work(system_long_wq, &ap_scan_work); From 504b73d00a55a68c0d1bb0e7c12e56e5f908e906 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 30 Oct 2023 16:50:46 +0100 Subject: [PATCH 05/16] s390/perf: implement perf_callchain_user() Daan De Meyer and Neal Gompa reported that s390 does not support perf user stack unwinding. This was never implemented since this requires user space to be compiled with the -mbackchain compile option, which until now no distribution did. However this is going to change with Fedora. Therefore provide a perf_callchain_user() implementation. Note that due to the way s390 sets up stack frames the provided call chains can contain invalid values. This is especially true for the first stack frame, where it is not possible to tell if the return address has been written to the stack already or not. Reported-by: Daan De Meyer Reported-by: Neal Gompa Closes: https://lore.kernel.org/all/CAO8sHcn3+_qrnvp0580aK7jN0Wion5F7KYeBAa4MnCY4mqABPA@mail.gmail.com/ Link: https://lore.kernel.org/all/20231030123558.10816-A-hca@linux.ibm.com Reviewed-by: Neal Gompa Acked-by: Ilya Leoshkevich Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/stacktrace.h | 7 +++++ arch/s390/kernel/perf_event.c | 41 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index 78f7b729b65f..31ec4f545e03 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -6,6 +6,13 @@ #include #include +struct stack_frame_user { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned long empty2[4]; +}; + enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index c27321cb0969..dfa77da2fd2e 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -15,7 +15,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -212,6 +215,44 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + perf_callchain_store(entry, instruction_pointer(regs)); + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (entry->nr < entry->max_stack) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + perf_callchain_store(entry, ip); + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} + /* Perf definitions for PMU event attributes in sysfs */ ssize_t cpumf_events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) From aa44433ac4ee2ae59b4b11e01eddb6241ae24ef5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 30 Oct 2023 16:50:47 +0100 Subject: [PATCH 06/16] s390: add USER_STACKTRACE support Use the perf_callchain_user() code as blueprint to also add support for USER_STACKTRACE. To describe how to use this cite the commit message of the LoongArch implementation which came with commit 4d7bf939df08 ("LoongArch: Add USER_STACKTRACE support"), but replace -fno-omit-frame-pointer option with the s390 specific -mbackchain option: ====================================================================== To get the best stacktrace output, you can compile your userspace programs with frame pointers (at least glibc + the app you are tracing). 1, export "CC = gcc -mbackchain"; 2, compile your programs with "CC"; 3, use uprobe to get stacktrace output. ... echo 'p:malloc /usr/lib64/libc.so.6:0x0a4704 size=%r2:u64' > uprobe_events echo 'p:free /usr/lib64/libc.so.6:0x0a4d50 ptr=%r2:u64' >> uprobe_events echo 'comm == "demo"' > ./events/uprobes/malloc/filter echo 'comm == "demo"' > ./events/uprobes/free/filter echo 1 > ./options/userstacktrace echo 1 > ./options/sym-userobj ... ====================================================================== Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/kernel/stacktrace.c | 43 +++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b0d67ac8695f..3bec98d20283 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -236,6 +236,7 @@ config S390 select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select TTY + select USER_STACKTRACE_SUPPORT select VIRT_CPU_ACCOUNTING select ZONE_DMA # Note: keep the above list sorted alphabetically diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 0787010139f7..94f440e38303 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -6,9 +6,12 @@ */ #include +#include +#include #include #include #include +#include void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) @@ -58,3 +61,43 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; return 0; } + +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) +{ + struct stack_frame_user __user *sf; + unsigned long ip, sp; + bool first = true; + + if (is_compat_task()) + return; + if (!consume_entry(cookie, instruction_pointer(regs))) + return; + sf = (void __user *)user_stack_pointer(regs); + pagefault_disable(); + while (1) { + if (__get_user(sp, &sf->back_chain)) + break; + if (__get_user(ip, &sf->gprs[8])) + break; + if (ip & 0x1) { + /* + * If the instruction address is invalid, and this + * is the first stack frame, assume r14 has not + * been written to the stack yet. Otherwise exit. + */ + if (first && !(regs->gprs[14] & 0x1)) + ip = regs->gprs[14]; + else + break; + } + if (!consume_entry(cookie, ip)) + break; + /* Sanity check: ABI requires SP to be aligned 8 bytes. */ + if (!sp || sp & 0x7) + break; + sf = (void __user *)sp; + first = false; + } + pagefault_enable(); +} From cfaef6516e9ac64e36967bd74d173b67fef6eee4 Mon Sep 17 00:00:00 2001 From: Ingo Franzki Date: Thu, 26 Oct 2023 11:24:03 +0200 Subject: [PATCH 07/16] s390/zcrypt: don't report online if card or queue is in check-stop state If a card or a queue is in check-stop state, it can't be used by applications to perform crypto operations. Applications check the 'online' sysfs attribute to check if a card or queue is usable. Report a card or queue as offline in case it is in check-stop state. Furthermore, don't allow to set a card or queue online, if it is in check-stop state. This is similar to when the card or the queue is in deconfigured state, there it is also reported as being offline, and it can't be set online. Reviewed-by: Harald Freudenberger Signed-off-by: Ingo Franzki Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/zcrypt_card.c | 4 ++-- drivers/s390/crypto/zcrypt_queue.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_card.c b/drivers/s390/crypto/zcrypt_card.c index c815722d0ac8..050462d95222 100644 --- a/drivers/s390/crypto/zcrypt_card.c +++ b/drivers/s390/crypto/zcrypt_card.c @@ -52,7 +52,7 @@ static ssize_t online_show(struct device *dev, { struct zcrypt_card *zc = dev_get_drvdata(dev); struct ap_card *ac = to_ap_card(dev); - int online = ac->config && zc->online ? 1 : 0; + int online = ac->config && !ac->chkstop && zc->online ? 1 : 0; return sysfs_emit(buf, "%d\n", online); } @@ -70,7 +70,7 @@ static ssize_t online_store(struct device *dev, if (sscanf(buf, "%d\n", &online) != 1 || online < 0 || online > 1) return -EINVAL; - if (online && !ac->config) + if (online && (!ac->config || ac->chkstop)) return -ENODEV; zc->online = online; diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index 112a80e8e6c2..67d8e0ae0eec 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -42,7 +42,7 @@ static ssize_t online_show(struct device *dev, { struct zcrypt_queue *zq = dev_get_drvdata(dev); struct ap_queue *aq = to_ap_queue(dev); - int online = aq->config && zq->online ? 1 : 0; + int online = aq->config && !aq->chkstop && zq->online ? 1 : 0; return sysfs_emit(buf, "%d\n", online); } @@ -59,7 +59,8 @@ static ssize_t online_store(struct device *dev, if (sscanf(buf, "%d\n", &online) != 1 || online < 0 || online > 1) return -EINVAL; - if (online && (!aq->config || !aq->card->config)) + if (online && (!aq->config || !aq->card->config || + aq->chkstop || aq->card->chkstop)) return -ENODEV; if (online && !zc->online) return -EINVAL; From 5cf1a563a3284dc5c9f4ee8917ad2ebd51ff3def Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 26 Oct 2023 09:33:31 +0200 Subject: [PATCH 08/16] s390/ap: fix vanishing crypto cards in SE environment A secure execution (SE, also known as confidential computing) guest may see asynchronous errors on a crypto firmware queue. The current implementation to gather information about cards and queues in ap_queue_info() simple returns if an asynchronous error is hanging on the firmware queue. If such a situation happened and it was the only queue visible for a crypto card within an SE guest, then the card vanished from sysfs as the AP bus scan function refuses to hold a card without any type information. As lszcrypt evaluates the sysfs such a card vanished from the lszcrypt card listing and the user is baffled and has no way to reset and thus clear the pending asynchronous error. This patch improves the named function to also evaluate GR2 of the TAPQ in case of asynchronous error pending. If there is a not-null value stored in, the info is processed now. In the end, a queue with pending asynchronous error does not lead to a vanishing card any more. Reviewed-by: Holger Dengler Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.c | 43 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index d6ad437883fa..5dd33155d5d5 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -352,7 +352,7 @@ EXPORT_SYMBOL(ap_test_config_ctrl_domain); /* * ap_queue_info(): Check and get AP queue info. * Returns: 1 if APQN exists and info is filled, - * 0 if APQN seems to exit but there is no info + * 0 if APQN seems to exist but there is no info * available (eg. caused by an asynch pending error) * -1 invalid APQN, TAPQ error or AP queue status which * indicates there is no APQN. @@ -373,36 +373,33 @@ static int ap_queue_info(ap_qid_t qid, int *q_type, unsigned int *q_fac, /* call TAPQ on this APQN */ status = ap_test_queue(qid, ap_apft_available(), &tapq_info); - /* handle pending async error with return 'no info available' */ - if (status.async) - return 0; - switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_RESET_IN_PROGRESS: case AP_RESPONSE_DECONFIGURED: case AP_RESPONSE_CHECKSTOPPED: case AP_RESPONSE_BUSY: - /* - * According to the architecture in all these cases the - * info should be filled. All bits 0 is not possible as - * there is at least one of the mode bits set. - */ - if (WARN_ON_ONCE(!tapq_info.value)) - return 0; - *q_type = tapq_info.at; - *q_fac = tapq_info.fac; - *q_depth = tapq_info.qd; - *q_ml = tapq_info.ml; - *q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED; - *q_cstop = status.response_code == AP_RESPONSE_CHECKSTOPPED; - return 1; + /* For all these RCs the tapq info should be available */ + break; default: - /* - * A response code which indicates, there is no info available. - */ - return -1; + /* On a pending async error the info should be available */ + if (!status.async) + return -1; + break; } + + /* There should be at least one of the mode bits set */ + if (WARN_ON_ONCE(!tapq_info.value)) + return 0; + + *q_type = tapq_info.at; + *q_fac = tapq_info.fac; + *q_depth = tapq_info.qd; + *q_ml = tapq_info.ml; + *q_decfg = status.response_code == AP_RESPONSE_DECONFIGURED; + *q_cstop = status.response_code == AP_RESPONSE_CHECKSTOPPED; + + return 1; } void ap_wait(enum ap_sm_wait wait) From 92b519f3bc1c90e098bb3f12d8e739fc56c2d444 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 27 Oct 2023 14:12:35 +0200 Subject: [PATCH 09/16] s390/cmma: cleanup inline assemblies Cleanup cmma related inline assemblies: - consolidate inline assemblies - use symbolic names - add same white space where missing - add braces to for-loops which contain a multi-line statement Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/mm/page-states.c | 48 +++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index a31acb2c4ef2..202dacdf17db 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -40,7 +40,7 @@ static inline int cmma_test_essa(void) " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" "0: la %[rc],0\n" "1:\n" - EX_TABLE(0b,1b) + EX_TABLE(0b, 1b) : [rc] "+&d" (rc), [tmp] "+&d" (tmp) : [cmd] "i" (ESSA_GET_STATE)); return rc; @@ -58,37 +58,41 @@ void __init cmma_init(void) cmma_flag = 2; } +static __always_inline void essa(unsigned long paddr, unsigned char cmd) +{ + unsigned long rc; + + asm volatile( + " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" + : [rc] "=d" (rc) + : [paddr] "d" (paddr), + [cmd] "i" (cmd)); +} + +static __always_inline void __set_page_state(struct page *page, int order, unsigned char cmd) +{ + unsigned long paddr = page_to_phys(page); + unsigned long num_pages = 1UL << order; + + while (num_pages--) { + essa(paddr, cmd); + paddr += PAGE_SIZE; + } +} + static inline void set_page_unused(struct page *page, int order) { - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); + __set_page_state(page, order, ESSA_SET_UNUSED); } static inline void set_page_stable_dat(struct page *page, int order) { - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); + __set_page_state(page, order, ESSA_SET_STABLE); } static inline void set_page_stable_nodat(struct page *page, int order) { - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); + __set_page_state(page, order, ESSA_SET_STABLE_NODAT); } static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) From 468a3bc2b7b955a7cf97d47c6022bf1ae4a538a3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 27 Oct 2023 14:12:36 +0200 Subject: [PATCH 10/16] s390/cmma: move parsing of cmma kernel parameter to early boot code The "cmma=" kernel command line parameter needs to be parsed early for upcoming changes. Therefore move the parsing code. Note that EX_TABLE handling of cmma_test_essa() needs to be open-coded, since the early boot code doesn't have infrastructure for handling expected exceptions. Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/boot/ipl_parm.c | 8 ++++++ arch/s390/boot/startup.c | 44 +++++++++++++++++++++++++++++ arch/s390/include/asm/page-states.h | 4 +++ arch/s390/include/asm/setup.h | 1 - arch/s390/kernel/early.c | 1 + arch/s390/mm/init.c | 2 -- arch/s390/mm/page-states.c | 40 +------------------------- 7 files changed, 58 insertions(+), 42 deletions(-) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 7b7521762633..2ab4872fbee1 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ unsigned int __bootdata_preserved(zlib_dfltcc_support) = ZLIB_DFLTCC_FULL; struct ipl_parameter_block __bootdata_preserved(ipl_block); int __bootdata_preserved(ipl_block_valid); int __bootdata_preserved(__kaslr_enabled); +int __bootdata_preserved(cmma_flag) = 1; unsigned long vmalloc_size = VMALLOC_DEFAULT_SIZE; unsigned long memory_limit; @@ -295,6 +297,12 @@ void parse_boot_command_line(void) if (!strcmp(param, "nokaslr")) __kaslr_enabled = 0; + if (!strcmp(param, "cmma")) { + rc = kstrtobool(val, &enabled); + if (!rc && !enabled) + cmma_flag = 0; + } + #if IS_ENABLED(CONFIG_KVM) if (!strcmp(param, "prot_virt")) { rc = kstrtobool(val, &enabled); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 8826c4f18645..8104e0e3d188 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -57,6 +58,48 @@ static void detect_facilities(void) machine.has_nx = 1; } +static int cmma_test_essa(void) +{ + unsigned long reg1, reg2, tmp = 0; + int rc = 1; + psw_t old; + + /* Test ESSA_GET_STATE */ + asm volatile( + " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n" + " epsw %[reg1],%[reg2]\n" + " st %[reg1],0(%[psw_pgm])\n" + " st %[reg2],4(%[psw_pgm])\n" + " larl %[reg1],1f\n" + " stg %[reg1],8(%[psw_pgm])\n" + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + " la %[rc],0\n" + "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n" + : [reg1] "=&d" (reg1), + [reg2] "=&a" (reg2), + [rc] "+&d" (rc), + [tmp] "=&d" (tmp), + "+Q" (S390_lowcore.program_new_psw), + "=Q" (old) + : [psw_old] "a" (&old), + [psw_pgm] "a" (&S390_lowcore.program_new_psw), + [cmd] "i" (ESSA_GET_STATE) + : "cc", "memory"); + return rc; +} + +static void cmma_init(void) +{ + if (!cmma_flag) + return; + if (cmma_test_essa()) { + cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; +} + static void setup_lpp(void) { S390_lowcore.current_pid = 0; @@ -306,6 +349,7 @@ void startup_kernel(void) setup_boot_command_line(); parse_boot_command_line(); detect_facilities(); + cmma_init(); sanitize_prot_virt_host(); max_physmem_end = detect_max_physmem_end(); setup_ident_map_size(max_physmem_end); diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index c33c4deb545f..659e3c963ce6 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -7,6 +7,8 @@ #ifndef PAGE_STATES_H #define PAGE_STATES_H +#include + #define ESSA_GET_STATE 0 #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 @@ -18,4 +20,6 @@ #define ESSA_MAX ESSA_SET_STABLE_NODAT +extern int __bootdata_preserved(cmma_flag); + #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 25cadc2b9cff..ad6641245658 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -125,7 +125,6 @@ static inline void vmcp_cma_reserve(void) { } void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -void cmma_init(void); void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index ff1f02b54771..eb43e5922a25 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -46,6 +46,7 @@ decompressor_handled_param(vmalloc); decompressor_handled_param(dfltcc); decompressor_handled_param(facilities); decompressor_handled_param(nokaslr); +decompressor_handled_param(cmma); #if IS_ENABLED(CONFIG_KVM) decompressor_handled_param(prot_virt); #endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7eca10c32caa..c322d09624bc 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -164,8 +164,6 @@ void __init mem_init(void) pv_init(); kfence_split_mapping(); - /* Setup guest page hinting */ - cmma_init(); /* this will put all low memory onto the freelists */ memblock_free_all(); diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 202dacdf17db..b68fef3d1230 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -18,45 +18,7 @@ #include #include -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - unsigned long tmp = 0; - int rc = -EOPNOTSUPP; - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - "0: la %[rc],0\n" - "1:\n" - EX_TABLE(0b, 1b) - : [rc] "+&d" (rc), [tmp] "+&d" (tmp) - : [cmd] "i" (ESSA_GET_STATE)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} +int __bootdata_preserved(cmma_flag); static __always_inline void essa(unsigned long paddr, unsigned char cmd) { From a3e89e20fe00209779eb3e419621070b9158acdb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 27 Oct 2023 14:12:37 +0200 Subject: [PATCH 11/16] s390/cmma: move set_page_stable() and friends to header file In order to be usable for early boot code move the simple set_page_xxx() function to header file. Also change the parameters, and the function names slightly. This is required since there aren't any struct pages available in early boot code, and renaming of functions is done to make sure that all users are converted to the new API. Instead of a pointer to a struct page a virtual address is passed, and instead of an order the number of pages for which the page state needs be set. Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/page-states.h | 38 +++++++++++++++++++++++ arch/s390/mm/page-states.c | 47 +++-------------------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 659e3c963ce6..1eae4046c07d 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -8,6 +8,7 @@ #define PAGE_STATES_H #include +#include #define ESSA_GET_STATE 0 #define ESSA_SET_STABLE 1 @@ -22,4 +23,41 @@ extern int __bootdata_preserved(cmma_flag); +static __always_inline unsigned long essa(unsigned long paddr, unsigned char cmd) +{ + unsigned long rc; + + asm volatile( + " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" + : [rc] "=d" (rc) + : [paddr] "d" (paddr), + [cmd] "i" (cmd)); + return rc; +} + +static __always_inline void __set_page_state(void *addr, unsigned long num_pages, unsigned char cmd) +{ + unsigned long paddr = __pa(addr) & PAGE_MASK; + + while (num_pages--) { + essa(paddr, cmd); + paddr += PAGE_SIZE; + } +} + +static inline void __set_page_unused(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_UNUSED); +} + +static inline void __set_page_stable_dat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE); +} + +static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages) +{ + __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT); +} + #endif diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index b68fef3d1230..7dc75dd05f48 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -20,43 +20,6 @@ int __bootdata_preserved(cmma_flag); -static __always_inline void essa(unsigned long paddr, unsigned char cmd) -{ - unsigned long rc; - - asm volatile( - " .insn rrf,0xb9ab0000,%[rc],%[paddr],%[cmd],0" - : [rc] "=d" (rc) - : [paddr] "d" (paddr), - [cmd] "i" (cmd)); -} - -static __always_inline void __set_page_state(struct page *page, int order, unsigned char cmd) -{ - unsigned long paddr = page_to_phys(page); - unsigned long num_pages = 1UL << order; - - while (num_pages--) { - essa(paddr, cmd); - paddr += PAGE_SIZE; - } -} - -static inline void set_page_unused(struct page *page, int order) -{ - __set_page_state(page, order, ESSA_SET_UNUSED); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - __set_page_state(page, order, ESSA_SET_STABLE); -} - -static inline void set_page_stable_nodat(struct page *page, int order) -{ - __set_page_state(page, order, ESSA_SET_STABLE_NODAT); -} - static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) { unsigned long next; @@ -169,7 +132,7 @@ void __init cmma_init_nodat(void) continue; /* skip page table pages */ if (!list_empty(&page->lru)) continue; /* skip free pages */ - set_page_stable_nodat(page, 0); + __set_page_stable_nodat(page_to_virt(page), 1); } } } @@ -178,7 +141,7 @@ void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -186,14 +149,14 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } void arch_set_page_dat(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); } From 65d37f163add1c6ead3a63788acb2f9590159f94 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 27 Oct 2023 14:12:38 +0200 Subject: [PATCH 12/16] s390/cmma: move arch_set_page_dat() to header file In order to be usable for early boot code move the simple arch_set_page_dat() function to header file, and add its counter-part arch_set_page_nodat(). Also change the parameters, and the function name slightly. This is required since there aren't any struct pages available in early boot code, and renaming of functions is done to make sure that all users are converted to the new API. Instead of a pointer to a struct page a virtual address is passed, and instead of an order the number of pages for which the page state needs be set. Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/page-states.h | 17 +++++++++++++++++ arch/s390/include/asm/page.h | 1 - arch/s390/mm/gmap.c | 4 ++-- arch/s390/mm/page-states.c | 7 ------- arch/s390/mm/pgalloc.c | 13 ++++++++----- arch/s390/mm/vmem.c | 2 +- 6 files changed, 28 insertions(+), 16 deletions(-) diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 1eae4046c07d..08fcbd628120 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -60,4 +60,21 @@ static inline void __set_page_stable_nodat(void *addr, unsigned long num_pages) __set_page_state(addr, num_pages, ESSA_SET_STABLE_NODAT); } +static inline void __arch_set_page_nodat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + if (cmma_flag < 2) + __set_page_stable_dat(addr, num_pages); + else + __set_page_stable_nodat(addr, num_pages); +} + +static inline void __arch_set_page_dat(void *addr, unsigned long num_pages) +{ + if (!cmma_flag) + return; + __set_page_stable_dat(addr, num_pages); +} + #endif diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index cfec0743314e..73b9c3bf377f 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -164,7 +164,6 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); -void arch_set_page_dat(struct page *page, int order); static inline int devmem_is_allowed(unsigned long pfn) { diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 20786f6883b2..6f96b5a71c63 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -18,7 +18,7 @@ #include #include #include - +#include #include #include #include @@ -33,7 +33,7 @@ static struct page *gmap_alloc_crst(void) page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); return page; } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 7dc75dd05f48..511c43aad5df 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -153,10 +153,3 @@ void arch_alloc_page(struct page *page, int order) else __set_page_stable_nodat(page_to_virt(page), 1UL << order); } - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - __set_page_stable_dat(page_to_virt(page), 1UL << order); -} diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 15f6a3ef40e8..e7f0deef71c7 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -43,11 +44,13 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + unsigned long *table; if (!ptdesc) return NULL; - arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); - return (unsigned long *) ptdesc_to_virt(ptdesc); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) @@ -145,7 +148,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm) ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); - arch_set_page_dat(virt_to_page(table), 0); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } @@ -285,9 +288,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) ptdesc_to_virt(ptdesc); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ INIT_LIST_HEAD(&ptdesc->pt_list); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 2e8a1064f103..b13990776fae 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -51,7 +51,7 @@ void *vmem_crst_alloc(unsigned long val) return NULL; crst_table_init(table, val); if (slab_is_available()) - arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } From a51324c430db3fcf3e7d77c265491322c251a396 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 27 Oct 2023 14:12:39 +0200 Subject: [PATCH 13/16] s390/cmma: rework no-dat handling Rework the way physical pages are set no-dat / dat: The old way is: - Rely on that all pages are initially marked "dat" - Allocate page tables for the kernel mapping - Enable dat - Walk the whole kernel mapping and set PG_arch_1 bit in all struct pages that belong to pages of kernel page tables - Walk all struct pages and test and clear the PG_arch_1 bit. If the bit is not set, set the page state to no-dat - For all subsequent page table allocations, set the page state to dat (remove the no-dat state) on allocation time Change this rather complex logic to a simpler approach: - Set the whole physical memory (all pages) to "no-dat" - Explicitly set those page table pages to "dat" which are part of the kernel image (e.g. swapper_pg_dir) - For all subsequent page table allocations, set the page state to dat (remove the no-dat state) on allocation time In result the code is simpler, and this also allows to get rid of one odd usage of the PG_arch_1 bit. Reviewed-by: Claudio Imbrenda Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/boot/vmem.c | 17 +++++ arch/s390/include/asm/setup.h | 2 - arch/s390/mm/init.c | 2 - arch/s390/mm/page-states.c | 127 +--------------------------------- arch/s390/mm/vmem.c | 4 +- 5 files changed, 21 insertions(+), 131 deletions(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 3075d65e112c..e3a4500a5a75 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,10 @@ static void kasan_populate_shadow(void) crst_table_init((unsigned long *)kasan_early_shadow_pud, pud_val(pud_z)); crst_table_init((unsigned long *)kasan_early_shadow_pmd, pmd_val(pmd_z)); memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); + __arch_set_page_dat(kasan_early_shadow_p4d, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pud, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pmd, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(kasan_early_shadow_pte, 1); /* * Current memory layout: @@ -223,6 +228,7 @@ static void *boot_crst_alloc(unsigned long val) table = (unsigned long *)physmem_alloc_top_down(RR_VMEM, size, size); crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -238,6 +244,7 @@ static pte_t *boot_pte_alloc(void) if (!pte_leftover) { pte_leftover = (void *)physmem_alloc_top_down(RR_VMEM, PAGE_SIZE, PAGE_SIZE); pte = pte_leftover + _PAGE_TABLE_SIZE; + __arch_set_page_dat(pte, 1); } else { pte = pte_leftover; pte_leftover = NULL; @@ -418,6 +425,14 @@ void setup_vmem(unsigned long asce_limit) unsigned long asce_bits; int i; + /* + * Mark whole memory as no-dat. This must be done before any + * page tables are allocated, or kernel image builtin pages + * are marked as dat tables. + */ + for_each_physmem_online_range(i, &start, &end) + __arch_set_page_nodat((void *)start, (end - start) >> PAGE_SHIFT); + if (asce_limit == _REGION1_SIZE) { asce_type = _REGION2_ENTRY_EMPTY; asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; @@ -429,6 +444,8 @@ void setup_vmem(unsigned long asce_limit) crst_table_init((unsigned long *)swapper_pg_dir, asce_type); crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); + __arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER); /* * To allow prefixing the lowcore must be mapped with 4KB pages. diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index ad6641245658..df316436d2e1 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -125,8 +125,6 @@ static inline void vmcp_cma_reserve(void) { } void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -void cmma_init_nodat(void); - extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index c322d09624bc..43e612bc2bcd 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,8 +168,6 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); } void free_initmem(void) diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 511c43aad5df..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,136 +7,13 @@ * Author(s): Martin Schwidefsky */ -#include -#include -#include #include -#include -#include -#include -#include -#include #include +#include +#include int __bootdata_preserved(cmma_flag); -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = phys_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = phys_to_page(pud_val(*pud)); - for (i = 0; i < 4; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = phys_to_page(p4d_val(*p4d)); - for (i = 0; i < 4; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next, max_addr; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - /* - * Figure out maximum virtual address accessible with the - * kernel ASCE. This is required to keep the page table walker - * from accessing non-existent entries. - */ - max_addr = (S390_lowcore.kernel_asce.val & _ASCE_TYPE_MASK) >> 2; - max_addr = 1UL << (max_addr * 11 + 31); - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, max_addr); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = phys_to_page(pgd_val(*pgd)); - for (i = 0; i < 4; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != max_addr); -} - -void __init cmma_init_nodat(void) -{ - struct page *page; - unsigned long start, end, ix; - int i; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - page = virt_to_page(&swapper_pg_dir); - for (i = 0; i < 4; i++) - set_bit(PG_arch_1, &page[i].flags); - page = virt_to_page(&invalid_pg_dir); - for (i = 0; i < 4; i++) - set_bit(PG_arch_1, &page[i].flags); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - __set_page_stable_nodat(page_to_virt(page), 1); - } - } -} - void arch_free_page(struct page *page, int order) { if (!cmma_flag) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b13990776fae..186a020857cf 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -50,8 +50,7 @@ void *vmem_crst_alloc(unsigned long val) if (!table) return NULL; crst_table_init(table, val); - if (slab_is_available()) - __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -67,6 +66,7 @@ pte_t __ref *vmem_pte_alloc(void) if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } From d08d4e7cd6bffe333f09853005aa549a8d57614b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 12 Oct 2023 20:28:51 +0200 Subject: [PATCH 14/16] s390/mm: use full 4KB page for 2KB PTE Cease using 4KB pages to host two 2KB PTEs. That greatly simplifies the memory management code at the expense of page tables memory footprint. Instead of two PTEs per 4KB page use only upper half of the parent page for a single PTE. With that the list of half-used pages pgtable_list becomes unneeded. Further, the upper byte of the parent page _refcount counter does not need to be used for fragments tracking and could be left alone. Commit 8211dad62798 ("s390: add pte_free_defer() for pgtables sharing page") introduced the use of PageActive flag to coordinate a deferred free with 2KB page table fragments tracking. Since there is no tracking anymore, there is no need for using PageActive flag. Reviewed-by: Heiko Carstens Reviewed-by: Gerald Schaefer Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/mmu.h | 2 - arch/s390/include/asm/mmu_context.h | 1 - arch/s390/include/asm/tlb.h | 5 - arch/s390/mm/pgalloc.c | 281 +++------------------------- 4 files changed, 29 insertions(+), 260 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 829d68e2c685..bb1b4bef1878 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -11,7 +11,6 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - struct list_head pgtable_list; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; @@ -39,7 +38,6 @@ typedef struct { #define INIT_MM_CONTEXT(name) \ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), #endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 757fe6f0d802..929af18b0908 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -22,7 +22,6 @@ static inline int init_new_context(struct task_struct *tsk, unsigned long asce_type, init_entry; spin_lock_init(&mm->context.lock); - INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index a0089b2e5e67..3f0fec0a1e3c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -69,11 +69,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - /* - * page_table_free_rcu takes care of the allocation bit masks - * of the 2K table fragments in the 4K page table page, - * then calls tlb_remove_table. - */ page_table_free_rcu(tlb, (unsigned long *) pte, address); } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e7f0deef71c7..5b15b45941c6 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -133,11 +133,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) return -ENOMEM; } -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - return atomic_fetch_xor(bits, v) ^ bits; -} - #ifdef CONFIG_PGSTE struct page *page_table_alloc_pgste(struct mm_struct *mm) @@ -162,125 +157,11 @@ void page_table_free_pgste(struct page *page) #endif /* CONFIG_PGSTE */ -/* - * A 2KB-pgtable is either upper or lower half of a normal page. - * The second half of the page may be unused or used as another - * 2KB-pgtable. - * - * Whenever possible the parent page for a new 2KB-pgtable is picked - * from the list of partially allocated pages mm_context_t::pgtable_list. - * In case the list is empty a new parent page is allocated and added to - * the list. - * - * When a parent page gets fully allocated it contains 2KB-pgtables in both - * upper and lower halves and is removed from mm_context_t::pgtable_list. - * - * When 2KB-pgtable is freed from to fully allocated parent page that - * page turns partially allocated and added to mm_context_t::pgtable_list. - * - * If 2KB-pgtable is freed from the partially allocated parent page that - * page turns unused and gets removed from mm_context_t::pgtable_list. - * Furthermore, the unused parent page is released. - * - * As follows from the above, no unallocated or fully allocated parent - * pages are contained in mm_context_t::pgtable_list. - * - * The upper byte (bits 24-31) of the parent page _refcount is used - * for tracking contained 2KB-pgtables and has the following format: - * - * PP AA - * 01234567 upper byte (bits 24-31) of struct page::_refcount - * || || - * || |+--- upper 2KB-pgtable is allocated - * || +---- lower 2KB-pgtable is allocated - * |+------- upper 2KB-pgtable is pending for removal - * +-------- lower 2KB-pgtable is pending for removal - * - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why - * using _refcount is possible). - * - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still unallocated; - * - removed from mm_context_t::pgtable_list in case both hales of the - * parent page are allocated; - * These operations are protected with mm_context_t::lock. - * - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 - * and the corresponding PP bit is set to 1 in a single atomic operation. - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually - * exclusive and may never be both set to 1! - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still allocated; - * - removed from mm_context_t::pgtable_list in case the second half of - * the parent page is unallocated; - * These operations are protected with mm_context_t::lock. - * - * It is important to understand that mm_context_t::lock only protects - * mm_context_t::pgtable_list and AA bits, but not the parent page itself - * and PP bits. - * - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, - * while both AA bits and the second PP bit are already unset. Then the - * parent page does not contain any 2KB-pgtable fragment anymore, and it has - * also been removed from mm_context_t::pgtable_list. It is safe to release - * the page therefore. - * - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable - * while the PP bits are never used, nor such a page is added to or removed - * from mm_context_t::pgtable_list. - * - * pte_free_defer() overrides those rules: it takes the page off pgtable_list, - * and prevents both 2K fragments from being reused. pte_free_defer() has to - * guarantee that its pgtable cannot be reused before the RCU grace period - * has elapsed (which page_table_free_rcu() does not actually guarantee). - * But for simplicity, because page->rcu_head overlays page->lru, and because - * the RCU callback might not be called before the mm_context_t has been freed, - * pte_free_defer() in this implementation prevents both fragments from being - * reused, and delays making the call to RCU until both fragments are freed. - */ unsigned long *page_table_alloc(struct mm_struct *mm) { - unsigned long *table; struct ptdesc *ptdesc; - unsigned int mask, bit; + unsigned long *table; - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - ptdesc = list_first_entry(&mm->context.pgtable_list, - struct ptdesc, pt_list); - mask = atomic_read(&ptdesc->_refcount) >> 24; - /* - * The pending removal bits must also be checked. - * Failure to do so might lead to an impossible - * value of (i.e 0x13 or 0x23) written to _refcount. - * Such values violate the assumption that pending and - * allocation bits are mutually exclusive, and the rest - * of the code unrails as result. That could lead to - * a whole bunch of races and corruptions. - */ - mask = (mask | (mask >> 4)) & 0x03U; - if (mask != 0x03U) { - table = (unsigned long *) ptdesc_to_virt(ptdesc); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&ptdesc->_refcount, - 0x01U << (bit + 24)); - list_del_init(&ptdesc->pt_list); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (!ptdesc) return NULL; @@ -288,177 +169,73 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pagetable_free(ptdesc); return NULL; } - /* Initialize page table */ table = ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - INIT_LIST_HEAD(&ptdesc->pt_list); - atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&ptdesc->pt_list, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + /* pt_list is used by gmap only */ + INIT_LIST_HEAD(&ptdesc->pt_list); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } -static void page_table_release_check(struct page *page, void *table, - unsigned int half, unsigned int mask) +static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) { - char msg[128]; - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return; - if (!mask && list_empty(&page->lru)) - return; - snprintf(msg, sizeof(msg), - "Invalid pgtable %p release half 0x%02x mask 0x%02x", - table, half, mask); - dump_page(page, msg); -} - -static void pte_free_now(struct rcu_head *head) -{ - struct ptdesc *ptdesc; - - ptdesc = container_of(head, struct ptdesc, pt_rcu_head); pagetable_pte_dtor(ptdesc); pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { - unsigned int mask, bit, half; struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release - * will happen outside of the critical section from this - * function or from __tlb_remove_table() - */ - mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { - /* - * Other half is allocated, and neither half has had - * its free deferred: add page to head of list, to make - * this freed half available for immediate reuse. - */ - list_add(&ptdesc->pt_list, &mm->context.pgtable_list); - } else { - /* If page is on list, now remove it. */ - list_del_init(&ptdesc->pt_list); - } - spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - half = 0x01U << bit; - } else { - half = 0x03U; - mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); - mask >>= 24; - } - - page_table_release_check(ptdesc_page(ptdesc), table, half, mask); - if (folio_test_clear_active(ptdesc_folio(ptdesc))) - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); - else - pte_free_now(&ptdesc->pt_rcu_head); + pagetable_pte_dtor_free(ptdesc); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - unsigned int bit, mask; - struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - if (mm_alloc_pgste(mm)) { + if (mm_alloc_pgste(mm)) gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_ptdesc(tlb, table); - return; - } - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release will happen - * outside of the critical section from __tlb_remove_table() or from - * page_table_free() - */ - mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { - /* - * Other half is allocated, and neither half has had - * its free deferred: add page to end of list, to make - * this freed half available for reuse once its pending - * bit has been cleared by __tlb_remove_table(). - */ - list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); - } else { - /* If page is on list, now remove it. */ - list_del_init(&ptdesc->pt_list); - } - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); + table = (unsigned long *)((unsigned long)table | 0x01U); tlb_remove_ptdesc(tlb, table); } void __tlb_remove_table(void *_table) { - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; - void *table = (void *)((unsigned long) _table ^ mask); - struct ptdesc *ptdesc = virt_to_ptdesc(table); + struct ptdesc *ptdesc; + unsigned int mask; + void *table; - switch (half) { - case 0x00U: /* pmd, pud, or p4d */ + mask = (unsigned long)_table & 0x01U; + table = (void *)((unsigned long)_table ^ mask); + ptdesc = virt_to_ptdesc(table); + if (!mask) { + /* pmd, pud, or p4d */ pagetable_free(ptdesc); return; - case 0x01U: /* lower 2K of a 4K page table */ - case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - break; - case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); - mask >>= 24; - break; } - - page_table_release_check(ptdesc_page(ptdesc), table, half, mask); - if (folio_test_clear_active(ptdesc_folio(ptdesc))) - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); - else - pte_free_now(&ptdesc->pt_rcu_head); + pagetable_pte_dtor_free(ptdesc); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) +{ + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + + pagetable_pte_dtor_free(ptdesc); +} + void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) { - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - page = virt_to_page(pgtable); - SetPageActive(page); - page_table_free(mm, (unsigned long *)pgtable); + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); /* - * page_table_free() does not do the pgste gmap_unlink() which - * page_table_free_rcu() does: warn us if pgste ever reaches here. + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. + * Turn to the generic pte_free_defer() version once gmap is removed. */ WARN_ON_ONCE(mm_has_pgste(mm)); } From 0031f1c7cf2632a068a80261071b9b1f2d32d836 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 3 Nov 2023 16:40:13 +0100 Subject: [PATCH 15/16] s390/mm: use compound page order to distinguish page tables CRSTs always have size of four pages, while 2KB-size page tables always occupy a single page. Use that information to distinguish page tables from CRSTs. Reviewed-by: Heiko Carstens Reviewed-by: Gerald Schaefer Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/mm/pgalloc.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 5b15b45941c6..8159f4fade3f 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -199,20 +199,15 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, mm = tlb->mm; if (mm_alloc_pgste(mm)) gmap_unlink(mm, table, vmaddr); - table = (unsigned long *)((unsigned long)table | 0x01U); tlb_remove_ptdesc(tlb, table); } -void __tlb_remove_table(void *_table) +void __tlb_remove_table(void *table) { - struct ptdesc *ptdesc; - unsigned int mask; - void *table; + struct ptdesc *ptdesc = virt_to_ptdesc(table); + struct page *page = ptdesc_page(ptdesc); - mask = (unsigned long)_table & 0x01U; - table = (void *)((unsigned long)_table ^ mask); - ptdesc = virt_to_ptdesc(table); - if (!mask) { + if (compound_order(page) == CRST_ALLOC_ORDER) { /* pmd, pud, or p4d */ pagetable_free(ptdesc); return; From 02e790ee3077c0571794d0ab8f71413edbe129cc Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 3 Nov 2023 16:43:52 +0100 Subject: [PATCH 16/16] s390/mm: make pte_free_tlb() similar to pXd_free_tlb() Make pte_free_tlb() look similar to pXd_free_tlb() family functions. Reviewed-by: Heiko Carstens Reviewed-by: Gerald Schaefer Signed-off-by: Alexander Gordeev Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/pgalloc.h | 1 - arch/s390/include/asm/tlb.h | 4 +++- arch/s390/mm/pgalloc.c | 11 ----------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 376b4b23bdaa..502d655fe6ae 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -25,7 +25,6 @@ void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); -void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3f0fec0a1e3c..d1455a601adc 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -69,7 +69,9 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_pmds = 1; - page_table_free_rcu(tlb, (unsigned long *) pte, address); + if (mm_alloc_pgste(tlb->mm)) + gmap_unlink(tlb->mm, (unsigned long *)pte, address); + tlb_remove_ptdesc(tlb, pte); } /* diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 8159f4fade3f..008e487c94a6 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -191,17 +191,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) pagetable_pte_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) -{ - struct mm_struct *mm; - - mm = tlb->mm; - if (mm_alloc_pgste(mm)) - gmap_unlink(mm, table, vmaddr); - tlb_remove_ptdesc(tlb, table); -} - void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = virt_to_ptdesc(table);