From 3ec8a8330a1aa846dffbf1d64479213366c55b54 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 16 May 2025 14:39:44 +0200 Subject: [PATCH 1/7] s390/uv: Don't return 0 from make_hva_secure() if the operation was not successful If s390_wiggle_split_folio() returns 0 because splitting a large folio succeeded, we will return 0 from make_hva_secure() even though a retry is required. Return -EAGAIN in that case. Otherwise, we'll return 0 from gmap_make_secure(), and consequently from unpack_one(). In kvm_s390_pv_unpack(), we assume that unpacking succeeded and skip unpacking this page. Later on, we run into issues and fail booting the VM. So far, this issue was only observed with follow-up patches where we split large pagecache XFS folios. Maybe it can also be triggered with shmem? We'll cleanup s390_wiggle_split_folio() a bit next, to also return 0 if no split was required. Fixes: d8dfda5af0be ("KVM: s390: pv: fix race when making a page secure") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250516123946.1648026-2-david@redhat.com Message-ID: <20250516123946.1648026-2-david@redhat.com> Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- arch/s390/kernel/uv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 9a5d5be8acf4..2cc3b599c7fe 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -393,8 +393,11 @@ int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header folio_walk_end(&fw, vma); mmap_read_unlock(mm); - if (rc == -E2BIG || rc == -EBUSY) + if (rc == -E2BIG || rc == -EBUSY) { rc = s390_wiggle_split_folio(mm, folio, rc == -E2BIG); + if (!rc) + rc = -EAGAIN; + } folio_put(folio); return rc; From bd428b8c79ed8e8658570e70c62c0092500e2eac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 16 May 2025 14:39:45 +0200 Subject: [PATCH 2/7] s390/uv: Always return 0 from s390_wiggle_split_folio() if successful Let's consistently return 0 if the operation was successful, and just detect ourselves whether splitting is required -- folio_test_large() is a cheap operation. Update the documentation. Should we simply always return -EAGAIN instead of 0, so we don't have to handle it in the caller? Not sure, staring at the documentation, this way looks a bit cleaner. Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250516123946.1648026-3-david@redhat.com Message-ID: <20250516123946.1648026-3-david@redhat.com> Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- arch/s390/kernel/uv.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 2cc3b599c7fe..f6ddb2b54032 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -324,34 +324,36 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u } /** - * s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split. + * s390_wiggle_split_folio() - try to drain extra references to a folio and + * split the folio if it is large. * @mm: the mm containing the folio to work on * @folio: the folio - * @split: whether to split a large folio * * Context: Must be called while holding an extra reference to the folio; * the mm lock should not be held. - * Return: 0 if the folio was split successfully; - * -EAGAIN if the folio was not split successfully but another attempt - * can be made, or if @split was set to false; - * -EINVAL in case of other errors. See split_folio(). + * Return: 0 if the operation was successful; + * -EAGAIN if splitting the large folio was not successful, + * but another attempt can be made; + * -EINVAL in case of other folio splitting errors. See split_folio(). */ -static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split) +static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) { int rc; lockdep_assert_not_held(&mm->mmap_lock); folio_wait_writeback(folio); lru_add_drain_all(); - if (split) { + + if (folio_test_large(folio)) { folio_lock(folio); rc = split_folio(folio); folio_unlock(folio); if (rc != -EBUSY) return rc; + return -EAGAIN; } - return -EAGAIN; + return 0; } int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) @@ -394,7 +396,7 @@ int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header mmap_read_unlock(mm); if (rc == -E2BIG || rc == -EBUSY) { - rc = s390_wiggle_split_folio(mm, folio, rc == -E2BIG); + rc = s390_wiggle_split_folio(mm, folio); if (!rc) rc = -EAGAIN; } From ab73b29efd36f8916c6cc9954e912c4723c9a1b0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 16 May 2025 14:39:46 +0200 Subject: [PATCH 3/7] s390/uv: Improve splitting of large folios that cannot be split while dirty Currently, starting a PV VM on an iomap-based filesystem with large folio support, such as XFS, will not work. We'll be stuck in unpack_one()->gmap_make_secure(), because we can't seem to make progress splitting the large folio. The problem is that we require a writable PTE but a writable PTE under such filesystems will imply a dirty folio. So whenever we have a writable PTE, we'll have a dirty folio, and dirty iomap folios cannot currently get split, because split_folio()->split_huge_page_to_list_to_order()->filemap_release_folio() will fail in iomap_release_folio(). So we will not make any progress splitting such large folios. Until dirty folios can be split more reliably, let's manually trigger writeback of the problematic folio using filemap_write_and_wait_range(), and retry the split immediately afterwards exactly once, before looking up the folio again. Should this logic be part of split_folio()? Likely not; most split users don't have to split so eagerly to make any progress. For now, this seems to affect xfs, zonefs and erofs, and this patch makes it work again (tested on xfs only). While this could be considered a fix for commit 6795801366da ("xfs: Support large folios"), commit df2f9708ff1f ("zonefs: enable support for large folios") and commit ce529cc25b18 ("erofs: enable large folios for iomap mode"), before commit eef88fe45ac9 ("s390/uv: Split large folios in gmap_make_secure()"), we did not try splitting large folios at all. So it's all rather part of making SE compatible with file systems that support large folios. But to have some "Fixes:" tag, let's just use eef88fe45ac9. Not CCing stable, because there are a lot of dependencies, and it simply not working is not critical in stable kernels. Reported-by: Sebastian Mitterle Closes: https://issues.redhat.com/browse/RHEL-58218 Fixes: eef88fe45ac9 ("s390/uv: Split large folios in gmap_make_secure()") Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250516123946.1648026-4-david@redhat.com Message-ID: <20250516123946.1648026-4-david@redhat.com> Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda --- arch/s390/kernel/uv.c | 66 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index f6ddb2b54032..d278bf0c09d1 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -338,22 +339,75 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u */ static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio) { - int rc; + int rc, tried_splits; lockdep_assert_not_held(&mm->mmap_lock); folio_wait_writeback(folio); lru_add_drain_all(); - if (folio_test_large(folio)) { + if (!folio_test_large(folio)) + return 0; + + for (tried_splits = 0; tried_splits < 2; tried_splits++) { + struct address_space *mapping; + loff_t lstart, lend; + struct inode *inode; + folio_lock(folio); rc = split_folio(folio); + if (rc != -EBUSY) { + folio_unlock(folio); + return rc; + } + + /* + * Splitting with -EBUSY can fail for various reasons, but we + * have to handle one case explicitly for now: some mappings + * don't allow for splitting dirty folios; writeback will + * mark them clean again, including marking all page table + * entries mapping the folio read-only, to catch future write + * attempts. + * + * While the system should be writing back dirty folios in the + * background, we obtained this folio by looking up a writable + * page table entry. On these problematic mappings, writable + * page table entries imply dirty folios, preventing the + * split in the first place. + * + * To prevent a livelock when trigger writeback manually and + * letting the caller look up the folio again in the page + * table (turning it dirty), immediately try to split again. + * + * This is only a problem for some mappings (e.g., XFS); + * mappings that do not support writeback (e.g., shmem) do not + * apply. + */ + if (!folio_test_dirty(folio) || folio_test_anon(folio) || + !folio->mapping || !mapping_can_writeback(folio->mapping)) { + folio_unlock(folio); + break; + } + + /* + * Ideally, we'd only trigger writeback on this exact folio. But + * there is no easy way to do that, so we'll stabilize the + * mapping while we still hold the folio lock, so we can drop + * the folio lock to trigger writeback on the range currently + * covered by the folio instead. + */ + mapping = folio->mapping; + lstart = folio_pos(folio); + lend = lstart + folio_size(folio) - 1; + inode = igrab(mapping->host); folio_unlock(folio); - if (rc != -EBUSY) - return rc; - return -EAGAIN; + if (unlikely(!inode)) + break; + + filemap_write_and_wait_range(mapping, lstart, lend); + iput(mapping->host); } - return 0; + return -EAGAIN; } int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) From af941f3dd8d75d0f6ae911f25a1e68cdc5db2cfd Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 28 May 2025 11:54:59 +0200 Subject: [PATCH 4/7] s390: Remove unneeded includes Many files don't need to include asm/tlb.h or asm/gmap.h. On the other hand, asm/tlb.h does need to include asm/gmap.h. Remove all unneeded includes so that asm/tlb.h is not directly used by s390 arch code anymore. Remove asm/gmap.h from a few other files as well, so that now only KVM code, mm/gmap.c, and asm/tlb.h include it. Reviewed-by: Christoph Schlameuss Reviewed-by: Steffen Eiden Acked-by: Heiko Carstens Link: https://lore.kernel.org/r/20250528095502.226213-2-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250528095502.226213-2-imbrenda@linux.ibm.com> --- arch/s390/include/asm/tlb.h | 1 + arch/s390/include/asm/uv.h | 1 - arch/s390/kvm/intercept.c | 1 + arch/s390/mm/fault.c | 1 - arch/s390/mm/gmap.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pgalloc.c | 2 -- arch/s390/mm/pgtable.c | 1 - 8 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index f20601995bb0..56d5f9e0eb2e 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -36,6 +36,7 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb, #include #include +#include /* * Release the page cache reference for a pte removed by diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 46fb0ef6f984..eeb2db4783e6 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #define UVC_CC_OK 0 diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a06a000f196c..b4834bd4d216 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index da84ff6770de..3829521450dd 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a94bd4870c65..4869555ff403 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -24,7 +24,6 @@ #include #include #include -#include /* * The address is saved in a radix tree directly; NULL would be ambiguous, diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index afa085e8186c..074bf4fb4ce2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e3a6f8ae156c..ddab36875370 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include unsigned long *crst_table_alloc(struct mm_struct *mm) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9901934284ec..7df70cd8f739 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -20,7 +20,6 @@ #include #include -#include #include #include #include From 7e42ad66fb5d0902b1f8d4d9a8fee898b685046a Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 28 May 2025 11:55:00 +0200 Subject: [PATCH 5/7] KVM: s390: Remove unneeded srcu lock All paths leading to handle_essa() already hold the kvm->srcu. Remove unneeded srcu locking from handle_essa(). Add lockdep assertion to make sure we will always be holding kvm->srcu when entering handle_essa(). Reviewed-by: Nina Schoetterl-Glausch Reviewed-by: Christian Borntraeger Reviewed-by: Christoph Schlameuss Reviewed-by: Steffen Eiden Link: https://lore.kernel.org/r/20250528095502.226213-3-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250528095502.226213-3-imbrenda@linux.ibm.com> --- arch/s390/kvm/priv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 1a49b89706f8..9253c70897a8 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1248,6 +1248,8 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) static int handle_essa(struct kvm_vcpu *vcpu) { + lockdep_assert_held(&vcpu->kvm->srcu); + /* entries expected to be 1FF */ int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; unsigned long *cbrlo; @@ -1297,12 +1299,8 @@ static int handle_essa(struct kvm_vcpu *vcpu) /* Retry the ESSA instruction */ kvm_s390_retry_instr(vcpu); } else { - int srcu_idx; - mmap_read_lock(vcpu->kvm->mm); - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); i = __do_essa(vcpu, orc); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); mmap_read_unlock(vcpu->kvm->mm); if (i < 0) return i; From 200197908dc4afe03a75234478e6a14634a0a788 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 28 May 2025 11:55:01 +0200 Subject: [PATCH 6/7] KVM: s390: Refactor and split some gmap helpers Refactor some gmap functions; move the implementation into a separate file with only helper functions. The new helper functions work on vm addresses, leaving all gmap logic in the gmap functions, which mostly become just wrappers. The whole gmap handling is going to be moved inside KVM soon, but the helper functions need to touch core mm functions, and thus need to stay in the core of kernel. Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20250528095502.226213-4-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250528095502.226213-4-imbrenda@linux.ibm.com> --- MAINTAINERS | 2 + arch/s390/include/asm/gmap.h | 2 - arch/s390/include/asm/gmap_helpers.h | 15 ++ arch/s390/kvm/diag.c | 30 +++- arch/s390/kvm/kvm-s390.c | 5 +- arch/s390/mm/Makefile | 2 + arch/s390/mm/gmap.c | 184 +--------------------- arch/s390/mm/gmap_helpers.c | 221 +++++++++++++++++++++++++++ 8 files changed, 274 insertions(+), 187 deletions(-) create mode 100644 arch/s390/include/asm/gmap_helpers.h create mode 100644 arch/s390/mm/gmap_helpers.c diff --git a/MAINTAINERS b/MAINTAINERS index c59316109e3f..afeb47c5e0a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13075,12 +13075,14 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git F: Documentation/virt/kvm/s390* F: arch/s390/include/asm/gmap.h +F: arch/s390/include/asm/gmap_helpers.h F: arch/s390/include/asm/kvm* F: arch/s390/include/uapi/asm/kvm* F: arch/s390/include/uapi/asm/uvdevice.h F: arch/s390/kernel/uv.c F: arch/s390/kvm/ F: arch/s390/mm/gmap.c +F: arch/s390/mm/gmap_helpers.c F: drivers/s390/char/uvdevice.c F: tools/testing/selftests/drivers/s390x/uvdevice/ F: tools/testing/selftests/kvm/*/s390/ diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 9f2814d0e1e9..66c5808fd011 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -110,7 +110,6 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); unsigned long __gmap_translate(struct gmap *, unsigned long gaddr); int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr); -void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); @@ -134,7 +133,6 @@ int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); -int s390_disable_cow_sharing(void); int s390_replace_asce(struct gmap *gmap); void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h new file mode 100644 index 000000000000..5356446a61c4 --- /dev/null +++ b/arch/s390/include/asm/gmap_helpers.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2025 + */ + +#ifndef _ASM_S390_GMAP_HELPERS_H +#define _ASM_S390_GMAP_HELPERS_H + +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr); +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end); +int gmap_helper_disable_cow_sharing(void); + +#endif /* _ASM_S390_GMAP_HELPERS_H */ diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 74f73141f9b9..53233dec8cad 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -11,12 +11,30 @@ #include #include #include +#include #include #include "kvm-s390.h" #include "trace.h" #include "trace-s390.h" #include "gaccess.h" +static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslot_iter iter; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned long start, end; + + slots = kvm_vcpu_memslots(vcpu); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + slot = iter.slot; + start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn)); + end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages)); + gmap_helper_discard(vcpu->kvm->mm, start, end); + } +} + static int diag_release_pages(struct kvm_vcpu *vcpu) { unsigned long start, end; @@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); + mmap_read_lock(vcpu->kvm->mm); /* * We checked for start >= end above, so lets check for the * fast path (no prefix swap page involved) */ if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) { - gmap_discard(vcpu->arch.gmap, start, end); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end)); } else { /* * This is slow path. gmap_discard will check for start @@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) * prefix and let gmap_discard make some of these calls * NOPs. */ - gmap_discard(vcpu->arch.gmap, start, prefix); + do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix)); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + do_discard_gfn_range(vcpu, 0, 1); if (end > prefix + PAGE_SIZE) - gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); - gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); + do_discard_gfn_range(vcpu, 1, 2); + do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end)); } + mmap_read_unlock(vcpu->kvm->mm); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3f3175193fd7..10cfc047525d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -2674,7 +2675,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) if (r) break; - r = s390_disable_cow_sharing(); + mmap_write_lock(kvm->mm); + r = gmap_helper_disable_cow_sharing(); + mmap_write_unlock(kvm->mm); if (r) break; diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 9726b91fe7e4..bd0401cc7ca5 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -12,3 +12,5 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o obj-$(CONFIG_PFAULT) += pfault.o + +obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4869555ff403..012a4366a2ad 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -619,63 +620,20 @@ EXPORT_SYMBOL(__gmap_link); */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { - struct vm_area_struct *vma; unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; + + mmap_assert_locked(gmap->mm); /* Find the vm address for the guest address */ vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; - - vma = vma_lookup(gmap->mm, vmaddr); - if (!vma || is_vm_hugetlb_page(vma)) - return; - - /* Get pointer to the page table entry */ - ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) { - ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); - } + gmap_helper_zap_one_page(gmap->mm, vmaddr); } } EXPORT_SYMBOL_GPL(__gmap_zap); -void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) -{ - unsigned long gaddr, vmaddr, size; - struct vm_area_struct *vma; - - mmap_read_lock(gmap->mm); - for (gaddr = from; gaddr < to; - gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { - /* Find the vm address for the guest address */ - vmaddr = (unsigned long) - radix_tree_lookup(&gmap->guest_to_host, - gaddr >> PMD_SHIFT); - if (!vmaddr) - continue; - vmaddr |= gaddr & ~PMD_MASK; - /* Find vma in the parent mm */ - vma = find_vma(gmap->mm, vmaddr); - if (!vma) - continue; - /* - * We do not discard pages that are backed by - * hugetlbfs, so we don't have to refault them. - */ - if (is_vm_hugetlb_page(vma)) - continue; - size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range_single(vma, vmaddr, size, NULL); - } - mmap_read_unlock(gmap->mm); -} -EXPORT_SYMBOL_GPL(gmap_discard); - static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); @@ -2268,138 +2226,6 @@ int s390_enable_sie(void) } EXPORT_SYMBOL_GPL(s390_enable_sie); -static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - unsigned long *found_addr = walk->private; - - /* Return 1 of the page is a zeropage. */ - if (is_zero_pfn(pte_pfn(*pte))) { - /* - * Shared zeropage in e.g., a FS DAX mapping? We cannot do the - * right thing and likely don't care: FAULT_FLAG_UNSHARE - * currently only works in COW mappings, which is also where - * mm_forbids_zeropage() is checked. - */ - if (!is_cow_mapping(walk->vma->vm_flags)) - return -EFAULT; - - *found_addr = addr; - return 1; - } - return 0; -} - -static const struct mm_walk_ops find_zeropage_ops = { - .pte_entry = find_zeropage_pte_entry, - .walk_lock = PGWALK_WRLOCK, -}; - -/* - * Unshare all shared zeropages, replacing them by anonymous pages. Note that - * we cannot simply zap all shared zeropages, because this could later - * trigger unexpected userfaultfd missing events. - * - * This must be called after mm->context.allow_cow_sharing was - * set to 0, to avoid future mappings of shared zeropages. - * - * mm contracts with s390, that even if mm were to remove a page table, - * and racing with walk_page_range_vma() calling pte_offset_map_lock() - * would fail, it will never insert a page table containing empty zero - * pages once mm_forbids_zeropage(mm) i.e. - * mm->context.allow_cow_sharing is set to 0. - */ -static int __s390_unshare_zeropages(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - unsigned long addr; - vm_fault_t fault; - int rc; - - for_each_vma(vmi, vma) { - /* - * We could only look at COW mappings, but it's more future - * proof to catch unexpected zeropages in other mappings and - * fail. - */ - if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) - continue; - addr = vma->vm_start; - -retry: - rc = walk_page_range_vma(vma, addr, vma->vm_end, - &find_zeropage_ops, &addr); - if (rc < 0) - return rc; - else if (!rc) - continue; - - /* addr was updated by find_zeropage_pte_entry() */ - fault = handle_mm_fault(vma, addr, - FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, - NULL); - if (fault & VM_FAULT_OOM) - return -ENOMEM; - /* - * See break_ksm(): even after handle_mm_fault() returned 0, we - * must start the lookup from the current address, because - * handle_mm_fault() may back out if there's any difficulty. - * - * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but - * maybe they could trigger in the future on concurrent - * truncation. In that case, the shared zeropage would be gone - * and we can simply retry and make progress. - */ - cond_resched(); - goto retry; - } - - return 0; -} - -static int __s390_disable_cow_sharing(struct mm_struct *mm) -{ - int rc; - - if (!mm->context.allow_cow_sharing) - return 0; - - mm->context.allow_cow_sharing = 0; - - /* Replace all shared zeropages by anonymous pages. */ - rc = __s390_unshare_zeropages(mm); - /* - * Make sure to disable KSM (if enabled for the whole process or - * individual VMAs). Note that nothing currently hinders user space - * from re-enabling it. - */ - if (!rc) - rc = ksm_disable(mm); - if (rc) - mm->context.allow_cow_sharing = 1; - return rc; -} - -/* - * Disable most COW-sharing of memory pages for the whole process: - * (1) Disable KSM and unmerge/unshare any KSM pages. - * (2) Disallow shared zeropages and unshare any zerpages that are mapped. - * - * Not that we currently don't bother with COW-shared pages that are shared - * with parent/child processes due to fork(). - */ -int s390_disable_cow_sharing(void) -{ - int rc; - - mmap_write_lock(current->mm); - rc = __s390_disable_cow_sharing(current->mm); - mmap_write_unlock(current->mm); - return rc; -} -EXPORT_SYMBOL_GPL(s390_disable_cow_sharing); - /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2467,7 +2293,7 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - rc = __s390_disable_cow_sharing(mm); + rc = gmap_helper_disable_cow_sharing(); if (rc) { mm->context.uses_skeys = 0; goto out_up; diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c new file mode 100644 index 000000000000..a45d417ad951 --- /dev/null +++ b/arch/s390/mm/gmap_helpers.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helper functions for KVM guest address space mapping code + * + * Copyright IBM Corp. 2007, 2025 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * ptep_zap_swap_entry() - discard a swap entry. + * @mm: the mm + * @entry: the swap entry that needs to be zapped + * + * Discards the given swap entry. If the swap entry was an actual swap + * entry (and not a migration entry, for example), the actual swapped + * page is also discarded from swap. + */ +static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) +{ + if (!non_swap_entry(entry)) + dec_mm_counter(mm, MM_SWAPENTS); + else if (is_migration_entry(entry)) + dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry))); + free_swap_and_cache(entry); +} + +/** + * gmap_helper_zap_one_page() - discard a page if it was swapped. + * @mm: the mm + * @vmaddr: the userspace virtual address that needs to be discarded + * + * If the given address maps to a swap entry, discard it. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) +{ + struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + + mmap_assert_locked(mm); + + /* Find the vm address for the guest address */ + vma = vma_lookup(mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + + /* Get pointer to the page table entry */ + ptep = get_locked_pte(mm, vmaddr, &ptl); + if (unlikely(!ptep)) + return; + if (pte_swap(*ptep)) + ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep)); + pte_unmap_unlock(ptep, ptl); +} +EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page); + +/** + * gmap_helper_discard() - discard user pages in the given range + * @mm: the mm + * @vmaddr: starting userspace address + * @end: end address (first address outside the range) + * + * All userpace pages in the range [@vamddr, @end) are discarded and unmapped. + * + * Context: needs to be called while holding the mmap lock. + */ +void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + + while (vmaddr < end) { + vma = find_vma_intersection(mm, vmaddr, end); + if (!vma) + return; + if (!is_vm_hugetlb_page(vma)) + zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL); + vmaddr = vma->vm_end; + } +} +EXPORT_SYMBOL_GPL(gmap_helper_discard); + +static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + unsigned long *found_addr = walk->private; + + /* Return 1 of the page is a zeropage. */ + if (is_zero_pfn(pte_pfn(*pte))) { + /* + * Shared zeropage in e.g., a FS DAX mapping? We cannot do the + * right thing and likely don't care: FAULT_FLAG_UNSHARE + * currently only works in COW mappings, which is also where + * mm_forbids_zeropage() is checked. + */ + if (!is_cow_mapping(walk->vma->vm_flags)) + return -EFAULT; + + *found_addr = addr; + return 1; + } + return 0; +} + +static const struct mm_walk_ops find_zeropage_ops = { + .pte_entry = find_zeropage_pte_entry, + .walk_lock = PGWALK_WRLOCK, +}; + +/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages + * @mm: the mm whose zero pages are to be unshared + * + * Unshare all shared zeropages, replacing them by anonymous pages. Note that + * we cannot simply zap all shared zeropages, because this could later + * trigger unexpected userfaultfd missing events. + * + * This must be called after mm->context.allow_cow_sharing was + * set to 0, to avoid future mappings of shared zeropages. + * + * mm contracts with s390, that even if mm were to remove a page table, + * and racing with walk_page_range_vma() calling pte_offset_map_lock() + * would fail, it will never insert a page table containing empty zero + * pages once mm_forbids_zeropage(mm) i.e. + * mm->context.allow_cow_sharing is set to 0. + */ +static int __gmap_helper_unshare_zeropages(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + unsigned long addr; + vm_fault_t fault; + int rc; + + for_each_vma(vmi, vma) { + /* + * We could only look at COW mappings, but it's more future + * proof to catch unexpected zeropages in other mappings and + * fail. + */ + if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma)) + continue; + addr = vma->vm_start; + +retry: + rc = walk_page_range_vma(vma, addr, vma->vm_end, + &find_zeropage_ops, &addr); + if (rc < 0) + return rc; + else if (!rc) + continue; + + /* addr was updated by find_zeropage_pte_entry() */ + fault = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + if (fault & VM_FAULT_OOM) + return -ENOMEM; + /* + * See break_ksm(): even after handle_mm_fault() returned 0, we + * must start the lookup from the current address, because + * handle_mm_fault() may back out if there's any difficulty. + * + * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but + * maybe they could trigger in the future on concurrent + * truncation. In that case, the shared zeropage would be gone + * and we can simply retry and make progress. + */ + cond_resched(); + goto retry; + } + + return 0; +} + +/** + * gmap_helper_disable_cow_sharing() - disable all COW sharing + * + * Disable most COW-sharing of memory pages for the whole process: + * (1) Disable KSM and unmerge/unshare any KSM pages. + * (2) Disallow shared zeropages and unshare any zerpages that are mapped. + * + * Not that we currently don't bother with COW-shared pages that are shared + * with parent/child processes due to fork(). + */ +int gmap_helper_disable_cow_sharing(void) +{ + struct mm_struct *mm = current->mm; + int rc; + + mmap_assert_write_locked(mm); + + if (!mm->context.allow_cow_sharing) + return 0; + + mm->context.allow_cow_sharing = 0; + + /* Replace all shared zeropages by anonymous pages. */ + rc = __gmap_helper_unshare_zeropages(mm); + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). Note that nothing currently hinders user space + * from re-enabling it. + */ + if (!rc) + rc = ksm_disable(mm); + if (rc) + mm->context.allow_cow_sharing = 1; + return rc; +} +EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing); From d6c8097803cbc3bb8d875baef542e6d77d10c203 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 28 May 2025 11:55:02 +0200 Subject: [PATCH 7/7] KVM: s390: Simplify and move pv code All functions in kvm/gmap.c fit better in kvm/pv.c instead. Move and rename them appropriately, then delete the now empty kvm/gmap.c and kvm/gmap.h. Reviewed-by: Nina Schoetterl-Glausch Reviewed-by: Steffen Eiden Reviewed-by: Christoph Schlameuss Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20250528095502.226213-5-imbrenda@linux.ibm.com Signed-off-by: Claudio Imbrenda Message-ID: <20250528095502.226213-5-imbrenda@linux.ibm.com> --- arch/s390/kernel/uv.c | 12 ++-- arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/gaccess.c | 3 +- arch/s390/kvm/gmap-vsie.c | 1 - arch/s390/kvm/gmap.c | 121 -------------------------------------- arch/s390/kvm/gmap.h | 39 ------------ arch/s390/kvm/intercept.c | 10 +--- arch/s390/kvm/kvm-s390.c | 5 +- arch/s390/kvm/kvm-s390.h | 42 +++++++++++++ arch/s390/kvm/pv.c | 61 ++++++++++++++++++- arch/s390/kvm/vsie.c | 19 +++++- 11 files changed, 133 insertions(+), 182 deletions(-) delete mode 100644 arch/s390/kvm/gmap.c delete mode 100644 arch/s390/kvm/gmap.h diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index d278bf0c09d1..cdbdc1fc8964 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -136,7 +136,7 @@ int uv_destroy_folio(struct folio *folio) { int rc; - /* See gmap_make_secure(): large folios cannot be secure */ + /* Large folios cannot be secure */ if (unlikely(folio_test_large(folio))) return 0; @@ -185,7 +185,7 @@ int uv_convert_from_secure_folio(struct folio *folio) { int rc; - /* See gmap_make_secure(): large folios cannot be secure */ + /* Large folios cannot be secure */ if (unlikely(folio_test_large(folio))) return 0; @@ -462,15 +462,15 @@ EXPORT_SYMBOL_GPL(make_hva_secure); /* * To be called with the folio locked or with an extra reference! This will - * prevent gmap_make_secure from touching the folio concurrently. Having 2 - * parallel arch_make_folio_accessible is fine, as the UV calls will become a - * no-op if the folio is already exported. + * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. + * Having 2 parallel arch_make_folio_accessible is fine, as the UV calls will + * become a no-op if the folio is already exported. */ int arch_make_folio_accessible(struct folio *folio) { int rc = 0; - /* See gmap_make_secure(): large folios cannot be secure */ + /* Large folios cannot be secure */ if (unlikely(folio_test_large(folio))) return 0; diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index f0ffe874adc2..9a723c48b05a 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index f6fded15633a..e23670e1949c 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -16,9 +16,10 @@ #include #include #include "kvm-s390.h" -#include "gmap.h" #include "gaccess.h" +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /* * vaddress union in order to easily decode a virtual address into its * region first index, region second index etc. parts. diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c index a6d1dbb04c97..56ef153eb8fe 100644 --- a/arch/s390/kvm/gmap-vsie.c +++ b/arch/s390/kvm/gmap-vsie.c @@ -22,7 +22,6 @@ #include #include "kvm-s390.h" -#include "gmap.h" /** * gmap_find_shadow - find a specific asce in the list of shadow tables diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c deleted file mode 100644 index 6d8944d1b4a0..000000000000 --- a/arch/s390/kvm/gmap.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Guest memory management for KVM/s390 - * - * Copyright IBM Corp. 2008, 2020, 2024 - * - * Author(s): Claudio Imbrenda - * Martin Schwidefsky - * David Hildenbrand - * Janosch Frank - */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "gmap.h" - -/** - * gmap_make_secure() - make one guest page secure - * @gmap: the guest gmap - * @gaddr: the guest address that needs to be made secure - * @uvcb: the UVCB specifying which operation needs to be performed - * - * Context: needs to be called with kvm->srcu held. - * Return: 0 on success, < 0 in case of error. - */ -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) -{ - struct kvm *kvm = gmap->private; - unsigned long vmaddr; - - lockdep_assert_held(&kvm->srcu); - - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); - if (kvm_is_error_hva(vmaddr)) - return -EFAULT; - return make_hva_secure(gmap->mm, vmaddr, uvcb); -} - -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) -{ - struct uv_cb_cts uvcb = { - .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, - .header.len = sizeof(uvcb), - .guest_handle = gmap->guest_handle, - .gaddr = gaddr, - }; - - return gmap_make_secure(gmap, gaddr, &uvcb); -} - -/** - * __gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @page: the page to destroy - * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. - * - * Context: must be called holding the mm lock for gmap->mm - */ -static int __gmap_destroy_page(struct gmap *gmap, struct page *page) -{ - struct folio *folio = page_folio(page); - int rc; - - /* - * See gmap_make_secure(): large folios cannot be secure. Small - * folio implies FW_LEVEL_PTE. - */ - if (folio_test_large(folio)) - return -EFAULT; - - rc = uv_destroy_folio(folio); - /* - * Fault handlers can race; it is possible that two CPUs will fault - * on the same secure page. One CPU can destroy the page, reboot, - * re-enter secure mode and import it, while the second CPU was - * stuck at the beginning of the handler. At some point the second - * CPU will be able to progress, and it will not be able to destroy - * the page. In that case we do not want to terminate the process, - * we instead try to export the page. - */ - if (rc) - rc = uv_convert_from_secure_folio(folio); - - return rc; -} - -/** - * gmap_destroy_page() - Destroy a guest page. - * @gmap: the gmap of the guest - * @gaddr: the guest address to destroy - * - * An attempt will be made to destroy the given guest page. If the attempt - * fails, an attempt is made to export the page. If both attempts fail, an - * appropriate error is returned. - * - * Context: may sleep. - */ -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) -{ - struct page *page; - int rc = 0; - - mmap_read_lock(gmap->mm); - page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr)); - if (page) - rc = __gmap_destroy_page(gmap, page); - kvm_release_page_clean(page); - mmap_read_unlock(gmap->mm); - return rc; -} diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h deleted file mode 100644 index c8f031c9ea5f..000000000000 --- a/arch/s390/kvm/gmap.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * KVM guest address space mapping code - * - * Copyright IBM Corp. 2007, 2016, 2025 - * Author(s): Martin Schwidefsky - * Claudio Imbrenda - */ - -#ifndef ARCH_KVM_S390_GMAP_H -#define ARCH_KVM_S390_GMAP_H - -#define GMAP_SHADOW_FAKE_TABLE 1ULL - -int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); -int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); -int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); - -/** - * gmap_shadow_valid - check if a shadow guest address space matches the - * given properties and is still valid - * @sg: pointer to the shadow guest address space structure - * @asce: ASCE for which the shadow table is requested - * @edat_level: edat level to be used for the shadow translation - * - * Returns 1 if the gmap shadow is still valid and matches the given - * properties, the caller can continue using it. Returns 0 otherwise, the - * caller has to request a new shadow gmap in this case. - * - */ -static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) -{ - if (sg->removed) - return 0; - return sg->orig_asce == asce && sg->edat_level == edat_level; -} - -#endif diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index b4834bd4d216..c7908950c1f4 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -16,13 +16,11 @@ #include #include #include -#include #include "kvm-s390.h" #include "gaccess.h" #include "trace.h" #include "trace-s390.h" -#include "gmap.h" u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { @@ -546,7 +544,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) guest_uvcb->header.cmd); return 0; } - rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb); /* * If the unpin did not succeed, the guest will exit again for the UVC * and we will retry the unpin. @@ -654,10 +652,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) break; case ICPT_PV_PREF: rc = 0; - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu)); - gmap_convert_to_secure(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu)); + kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 10cfc047525d..d5ad10791c25 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -53,7 +53,6 @@ #include "kvm-s390.h" #include "gaccess.h" #include "pci.h" -#include "gmap.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -4976,7 +4975,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) * previous protected guest. The old pages need to be destroyed * so the new guest can use them. */ - if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) { + if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) { /* * Either KVM messed up the secure guest mapping or the * same page is mapped into multiple secure guests. @@ -4998,7 +4997,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) * guest has not been imported yet. Try to import the page into * the protected guest. */ - rc = gmap_convert_to_secure(vcpu->arch.gmap, gaddr); + rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr); if (rc == -EINVAL) send_sig(SIGSEGV, current, 0); if (rc != -ENXIO) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8d3bbb2dd8d2..c44fe0c3a097 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -308,6 +308,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, u16 *rc, u16 *rrc); +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr); +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb); static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) { @@ -319,6 +322,41 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) return vcpu->arch.pv.handle; } +/** + * __kvm_s390_pv_destroy_page() - Destroy a guest page. + * @page: the page to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: must be called holding the mm lock for gmap->mm + */ +static inline int __kvm_s390_pv_destroy_page(struct page *page) +{ + struct folio *folio = page_folio(page); + int rc; + + /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */ + if (folio_test_large(folio)) + return -EFAULT; + + rc = uv_destroy_folio(folio); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_from_secure_folio(folio); + + return rc; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -398,6 +436,10 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); void kvm_s390_vsie_init(struct kvm *kvm); void kvm_s390_vsie_destroy(struct kvm *kvm); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); + +/* implemented in gmap-vsie.c */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index 22c012aa5206..14c330ec8ceb 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -17,7 +17,6 @@ #include #include #include "kvm-s390.h" -#include "gmap.h" bool kvm_s390_pv_is_protected(struct kvm *kvm) { @@ -33,6 +32,64 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); +/** + * kvm_s390_pv_make_secure() - make one guest page secure + * @kvm: the guest + * @gaddr: the guest address that needs to be made secure + * @uvcb: the UVCB specifying which operation needs to be performed + * + * Context: needs to be called with kvm->srcu held. + * Return: 0 on success, < 0 in case of error. + */ +int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) +{ + unsigned long vmaddr; + + lockdep_assert_held(&kvm->srcu); + + vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); + if (kvm_is_error_hva(vmaddr)) + return -EFAULT; + return make_hva_secure(kvm->mm, vmaddr, uvcb); +} + +int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = gaddr, + }; + + return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb); +} + +/** + * kvm_s390_pv_destroy_page() - Destroy a guest page. + * @kvm: the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + * + * Context: may sleep. + */ +int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr) +{ + struct page *page; + int rc = 0; + + mmap_read_lock(kvm->mm); + page = gfn_to_page(kvm, gpa_to_gfn(gaddr)); + if (page) + rc = __kvm_s390_pv_destroy_page(page); + kvm_release_page_clean(page); + mmap_read_unlock(kvm->mm); + return rc; +} + /** * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to * be destroyed @@ -638,7 +695,7 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, .tweak[0] = tweak, .tweak[1] = offset, }; - int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); unsigned long vmaddr; bool unlocked; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a78df3a4f353..13a9661d2b28 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -23,7 +23,6 @@ #include #include "kvm-s390.h" #include "gaccess.h" -#include "gmap.h" enum vsie_page_flags { VSIE_PAGE_IN_USE = 0, @@ -68,6 +67,24 @@ struct vsie_page { __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ }; +/** + * gmap_shadow_valid() - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise; the + * caller has to request a new shadow gmap in this case. + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} + /* trigger a validity icpt for the given scb */ static int set_validity_icpt(struct kvm_s390_sie_block *scb, __u16 reason_code)