From fce09ea314505a52f2436397608fa0a5d0934fb1 Mon Sep 17 00:00:00 2001
From: Xiao Liang <shaw.leon@gmail.com>
Date: Sat, 2 Sep 2023 08:48:38 +0800
Subject: [PATCH 001/346] apparmor: Fix null pointer deref when receiving skb
 during sock creation

The panic below is observed when receiving ICMP packets with secmark set
while an ICMP raw socket is being created. SK_CTX(sk)->label is updated
in apparmor_socket_post_create(), but the packet is delivered to the
socket before that, causing the null pointer dereference.
Drop the packet if label context is not set.

    BUG: kernel NULL pointer dereference, address: 000000000000004c
    #PF: supervisor read access in kernel mode
    #PF: error_code(0x0000) - not-present page
    PGD 0 P4D 0
    Oops: 0000 [#1] PREEMPT SMP NOPTI
    CPU: 0 PID: 407 Comm: a.out Not tainted 6.4.12-arch1-1 #1 3e6fa2753a2d75925c34ecb78e22e85a65d083df
    Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/28/2020
    RIP: 0010:aa_label_next_confined+0xb/0x40
    Code: 00 00 48 89 ef e8 d5 25 0c 00 e9 66 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 89 f0 <8b> 77 4c 39 c6 7e 1f 48 63 d0 48 8d 14 d7 eb 0b 83 c0 01 48 83 c2
    RSP: 0018:ffffa92940003b08 EFLAGS: 00010246
    RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000000e
    RDX: ffffa92940003be8 RSI: 0000000000000000 RDI: 0000000000000000
    RBP: ffff8b57471e7800 R08: ffff8b574c642400 R09: 0000000000000002
    R10: ffffffffbd820eeb R11: ffffffffbeb7ff00 R12: ffff8b574c642400
    R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
    FS:  00007fb092ea7640(0000) GS:ffff8b577bc00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 000000000000004c CR3: 00000001020f2005 CR4: 00000000007706f0
    PKRU: 55555554
    Call Trace:
     <IRQ>
     ? __die+0x23/0x70
     ? page_fault_oops+0x171/0x4e0
     ? exc_page_fault+0x7f/0x180
     ? asm_exc_page_fault+0x26/0x30
     ? aa_label_next_confined+0xb/0x40
     apparmor_secmark_check+0xec/0x330
     security_sock_rcv_skb+0x35/0x50
     sk_filter_trim_cap+0x47/0x250
     sock_queue_rcv_skb_reason+0x20/0x60
     raw_rcv+0x13c/0x210
     raw_local_deliver+0x1f3/0x250
     ip_protocol_deliver_rcu+0x4f/0x2f0
     ip_local_deliver_finish+0x76/0xa0
     __netif_receive_skb_one_core+0x89/0xa0
     netif_receive_skb+0x119/0x170
     ? __netdev_alloc_skb+0x3d/0x140
     vmxnet3_rq_rx_complete+0xb23/0x1010 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a]
     vmxnet3_poll_rx_only+0x36/0xb0 [vmxnet3 56a84f9c97178c57a43a24ec073b45a9d6f01f3a]
     __napi_poll+0x28/0x1b0
     net_rx_action+0x2a4/0x380
     __do_softirq+0xd1/0x2c8
     __irq_exit_rcu+0xbb/0xf0
     common_interrupt+0x86/0xa0
     </IRQ>
     <TASK>
     asm_common_interrupt+0x26/0x40
    RIP: 0010:apparmor_socket_post_create+0xb/0x200
    Code: 08 48 85 ff 75 a1 eb b1 0f 1f 80 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 41 54 <55> 48 89 fd 53 45 85 c0 0f 84 b2 00 00 00 48 8b 1d 80 56 3f 02 48
    RSP: 0018:ffffa92940ce7e50 EFLAGS: 00000286
    RAX: ffffffffbc756440 RBX: 0000000000000000 RCX: 0000000000000001
    RDX: 0000000000000003 RSI: 0000000000000002 RDI: ffff8b574eaab740
    RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000
    R10: ffff8b57444cec70 R11: 0000000000000000 R12: 0000000000000003
    R13: 0000000000000002 R14: ffff8b574eaab740 R15: ffffffffbd8e4748
     ? __pfx_apparmor_socket_post_create+0x10/0x10
     security_socket_post_create+0x4b/0x80
     __sock_create+0x176/0x1f0
     __sys_socket+0x89/0x100
     __x64_sys_socket+0x17/0x20
     do_syscall_64+0x5d/0x90
     ? do_syscall_64+0x6c/0x90
     ? do_syscall_64+0x6c/0x90
     ? do_syscall_64+0x6c/0x90
     entry_SYSCALL_64_after_hwframe+0x72/0xdc

Fixes: ab9f2115081a ("apparmor: Allow filtering based on secmark policy")
Signed-off-by: Xiao Liang <shaw.leon@gmail.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/lsm.c | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index cef8c466af80..c2f73d281724 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1304,6 +1304,13 @@ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (!skb->secmark)
 		return 0;
 
+	/*
+	 * If reach here before socket_post_create hook is called, in which
+	 * case label is null, drop the packet.
+	 */
+	if (!ctx->label)
+		return -EACCES;
+
 	return apparmor_secmark_check(ctx->label, OP_RECVMSG, AA_MAY_RECEIVE,
 				      skb->secmark, sk);
 }

From 2bc73505a5cd2a18a7a542022722f136c19e3b87 Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin <pchelkin@ispras.ru>
Date: Thu, 1 Feb 2024 17:24:48 +0300
Subject: [PATCH 002/346] apparmor: use kvfree_sensitive to free data->data

Inside unpack_profile() data->data is allocated using kvmemdup() so it
should be freed with the corresponding kvfree_sensitive().

Also add missing data->data release for rhashtable insertion failure path
in unpack_profile().

Found by Linux Verification Center (linuxtesting.org).

Fixes: e025be0f26d5 ("apparmor: support querying extended trusted helper extra data")
Cc: stable@vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/policy.c        | 2 +-
 security/apparmor/policy_unpack.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c
index 957654d253dd..14df15e35695 100644
--- a/security/apparmor/policy.c
+++ b/security/apparmor/policy.c
@@ -225,7 +225,7 @@ static void aa_free_data(void *ptr, void *arg)
 {
 	struct aa_data *data = ptr;
 
-	kfree_sensitive(data->data);
+	kvfree_sensitive(data->data, data->size);
 	kfree_sensitive(data->key);
 	kfree_sensitive(data);
 }
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 5e578ef0ddff..75452acd0e35 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -1071,6 +1071,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e, char **ns_name)
 
 			if (rhashtable_insert_fast(profile->data, &data->head,
 						   profile->data->p)) {
+				kvfree_sensitive(data->data, data->size);
 				kfree_sensitive(data->key);
 				kfree_sensitive(data);
 				info = "failed to insert data to table";

From 4a8db3678403e34c31df6b99a26414d5e183538c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Mon, 4 Mar 2024 16:36:55 +0000
Subject: [PATCH 003/346] apparmor: remove useless static inline function
 is_deleted

The inlined function is_deleted is redundant, it is not called at all
from any function in security/apparmor/file.c and so it can be removed.

Cleans up clang scan build warning:
security/apparmor/file.c:153:20: warning: unused function
'is_deleted' [-Wunused-function]

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/file.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/security/apparmor/file.c b/security/apparmor/file.c
index c03eb7c19f16..d52a5b14dad4 100644
--- a/security/apparmor/file.c
+++ b/security/apparmor/file.c
@@ -144,19 +144,6 @@ int aa_audit_file(const struct cred *subj_cred,
 	return aa_audit(type, profile, &ad, file_audit_cb);
 }
 
-/**
- * is_deleted - test if a file has been completely unlinked
- * @dentry: dentry of file to test for deletion  (NOT NULL)
- *
- * Returns: true if deleted else false
- */
-static inline bool is_deleted(struct dentry *dentry)
-{
-	if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0)
-		return true;
-	return false;
-}
-
 static int path_name(const char *op, const struct cred *subj_cred,
 		     struct aa_label *label,
 		     const struct path *path, int flags, char *buffer,

From b2c858148acf96290b9a9af259a04e080a169f51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= <cgzones@googlemail.com>
Date: Fri, 15 Mar 2024 13:54:09 +0100
Subject: [PATCH 004/346] apparmor: fix typo in kernel doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the typo in the function documentation to please kernel doc
warnings.

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/lsm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index c2f73d281724..717204fba938 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -1124,7 +1124,7 @@ static int apparmor_socket_create(int family, int type, int protocol, int kern)
  * @sock: socket that is being setup
  * @family: family of socket being created
  * @type: type of the socket
- * @ptotocol: protocol of the socket
+ * @protocol: protocol of the socket
  * @kern: socket is a special kernel socket
  *
  * Note:

From 3dd384108d53834002be5630132ad5c3f32166ad Mon Sep 17 00:00:00 2001
From: Leesoo Ahn <lsahn@ooseel.net>
Date: Wed, 8 May 2024 01:12:29 +0900
Subject: [PATCH 005/346] apparmor: fix possible NULL pointer dereference

profile->parent->dents[AAFS_PROF_DIR] could be NULL only if its parent is made
from __create_missing_ancestors(..) and 'ent->old' is NULL in
aa_replace_profiles(..).
In that case, it must return an error code and the code, -ENOENT represents
its state that the path of its parent is not existed yet.

BUG: kernel NULL pointer dereference, address: 0000000000000030
PGD 0 P4D 0
PREEMPT SMP PTI
CPU: 4 PID: 3362 Comm: apparmor_parser Not tainted 6.8.0-24-generic #24
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
RIP: 0010:aafs_create.constprop.0+0x7f/0x130
Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae
RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82baac10
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS:  00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 ? show_regs+0x6d/0x80
 ? __die+0x24/0x80
 ? page_fault_oops+0x99/0x1b0
 ? kernelmode_fixup_or_oops+0xb2/0x140
 ? __bad_area_nosemaphore+0x1a5/0x2c0
 ? find_vma+0x34/0x60
 ? bad_area_nosemaphore+0x16/0x30
 ? do_user_addr_fault+0x2a2/0x6b0
 ? exc_page_fault+0x83/0x1b0
 ? asm_exc_page_fault+0x27/0x30
 ? aafs_create.constprop.0+0x7f/0x130
 ? aafs_create.constprop.0+0x51/0x130
 __aafs_profile_mkdir+0x3d6/0x480
 aa_replace_profiles+0x83f/0x1270
 policy_update+0xe3/0x180
 profile_load+0xbc/0x150
 ? rw_verify_area+0x47/0x140
 vfs_write+0x100/0x480
 ? __x64_sys_openat+0x55/0xa0
 ? syscall_exit_to_user_mode+0x86/0x260
 ksys_write+0x73/0x100
 __x64_sys_write+0x19/0x30
 x64_sys_call+0x7e/0x25c0
 do_syscall_64+0x7f/0x180
 entry_SYSCALL_64_after_hwframe+0x78/0x80
RIP: 0033:0x7be9f211c574
Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d d5 ea 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89
RSP: 002b:00007ffd26f2b8c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 00005d504415e200 RCX: 00007be9f211c574
RDX: 0000000000001fc1 RSI: 00005d504418bc80 RDI: 0000000000000004
RBP: 0000000000001fc1 R08: 0000000000001fc1 R09: 0000000080000000
R10: 0000000000000000 R11: 0000000000000202 R12: 00005d504418bc80
R13: 0000000000000004 R14: 00007ffd26f2b9b0 R15: 00007ffd26f2ba30
 </TASK>
Modules linked in: snd_seq_dummy snd_hrtimer qrtr snd_hda_codec_generic snd_hda_intel snd_intel_dspcfg snd_intel_sdw_acpi snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq snd_seq_device i2c_i801 snd_timer i2c_smbus qxl snd soundcore drm_ttm_helper lpc_ich ttm joydev input_leds serio_raw mac_hid binfmt_misc msr parport_pc ppdev lp parport efi_pstore nfnetlink dmi_sysfs qemu_fw_cfg ip_tables x_tables autofs4 hid_generic usbhid hid ahci libahci psmouse virtio_rng xhci_pci xhci_pci_renesas
CR2: 0000000000000030
---[ end trace 0000000000000000 ]---
RIP: 0010:aafs_create.constprop.0+0x7f/0x130
Code: 4c 63 e0 48 83 c4 18 4c 89 e0 5b 41 5c 41 5d 41 5e 41 5f 5d 31 d2 31 c9 31 f6 31 ff 45 31 c0 45 31 c9 45 31 d2 c3 cc cc cc cc <4d> 8b 55 30 4d 8d ba a0 00 00 00 4c 89 55 c0 4c 89 ff e8 7a 6a ae
RSP: 0018:ffffc9000b2c7c98 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 00000000000041ed RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: ffffc9000b2c7cd8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff82baac10
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS:  00007be9f22cf740(0000) GS:ffff88817bc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000030 CR3: 0000000134b08000 CR4: 00000000000006f0

Signed-off-by: Leesoo Ahn <lsahn@ooseel.net>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/apparmorfs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index bcfea073e3f2..01b923d97a44 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -1692,6 +1692,10 @@ int __aafs_profile_mkdir(struct aa_profile *profile, struct dentry *parent)
 		struct aa_profile *p;
 		p = aa_deref_parent(profile);
 		dent = prof_dir(p);
+		if (!dent) {
+			error = -ENOENT;
+			goto fail2;
+		}
 		/* adding to parent that previously didn't have children */
 		dent = aafs_create_dir("profiles", dent);
 		if (IS_ERR(dent))

From 1bbf5a2156fa246ab54a15412116458006462038 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 9 Apr 2024 19:03:14 +0300
Subject: [PATCH 006/346] auxdisplay: charlcd: Provide a forward declaration

While there is no compilation error, strictly speaking the compiler
should know about used types beforehand. Provide a forward declaration
for struct charlcd_ops before using it in struct charlcd.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/charlcd.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/auxdisplay/charlcd.h b/drivers/auxdisplay/charlcd.h
index eed80063a6d2..4d4287209d04 100644
--- a/drivers/auxdisplay/charlcd.h
+++ b/drivers/auxdisplay/charlcd.h
@@ -36,6 +36,8 @@ enum charlcd_lines {
 	CHARLCD_LINES_2,
 };
 
+struct charlcd_ops;
+
 struct charlcd {
 	const struct charlcd_ops *ops;
 	const unsigned char *char_conv;	/* Optional */

From b4cf5fc01ce83e5c0bcf3dbb9f929428646b9098 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2024 23:54:55 -0400
Subject: [PATCH 007/346] powerpc: fix a file leak in
 kvm_vcpu_ioctl_enable_cap()

missing fdput() on one of the failure exits

Fixes: eacc56bb9de3e # v5.2
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/kvm/powerpc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index d32abe7fe6ab..d11767208bfc 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1984,8 +1984,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 			break;
 
 		r = -ENXIO;
-		if (!xive_enabled())
+		if (!xive_enabled()) {
+			fdput(f);
 			break;
+		}
 
 		r = -EPERM;
 		dev = kvm_device_from_filp(f.file);

From bba1f6758a9ec90c1adac5dcf78f8a15f1bad65b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 30 May 2024 23:58:26 -0400
Subject: [PATCH 008/346] lirc: rc_dev_get_from_fd(): fix file leak

missing fdput() on a failure exit

Fixes: 6a9d552483d50 "media: rc: bpf attach/detach requires write permission" # v6.9
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/media/rc/lirc_dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 52aea4167718..717c441b4a86 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -828,8 +828,10 @@ struct rc_dev *rc_dev_get_from_fd(int fd, bool write)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (write && !(f.file->f_mode & FMODE_WRITE))
+	if (write && !(f.file->f_mode & FMODE_WRITE)) {
+		fdput(f);
 		return ERR_PTR(-EPERM);
+	}
 
 	fh = f.file->private_data;
 	dev = fh->rc;

From c8ffef985af564c1b4fc09fdb5a5b336bfc0c33c Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Fri, 31 May 2024 11:20:54 +1200
Subject: [PATCH 009/346] auxdisplay: linedisp: Support configuring the boot
 message

Like we do for charlcd, allow the configuration of the initial message
on line-display devices.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/Kconfig        |  2 +-
 drivers/auxdisplay/line-display.c | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/auxdisplay/Kconfig b/drivers/auxdisplay/Kconfig
index 69d2138d7efb..21545ffba065 100644
--- a/drivers/auxdisplay/Kconfig
+++ b/drivers/auxdisplay/Kconfig
@@ -316,7 +316,7 @@ endif # PARPORT_PANEL
 
 config PANEL_CHANGE_MESSAGE
 	bool "Change LCD initialization message ?"
-	depends on CHARLCD
+	depends on CHARLCD || LINEDISP
 	help
 	  This allows you to replace the boot message indicating the kernel version
 	  and the driver version with a custom message. This is useful on appliances
diff --git a/drivers/auxdisplay/line-display.c b/drivers/auxdisplay/line-display.c
index e2b546210f8d..978dc457ca5f 100644
--- a/drivers/auxdisplay/line-display.c
+++ b/drivers/auxdisplay/line-display.c
@@ -8,7 +8,9 @@
  * Copyright (C) 2021 Glider bv
  */
 
+#ifndef CONFIG_PANEL_BOOT_MESSAGE
 #include <generated/utsrelease.h>
+#endif
 
 #include <linux/container_of.h>
 #include <linux/device.h>
@@ -312,6 +314,12 @@ static int linedisp_init_map(struct linedisp *linedisp)
 	return 0;
 }
 
+#ifdef CONFIG_PANEL_BOOT_MESSAGE
+#define LINEDISP_INIT_TEXT CONFIG_PANEL_BOOT_MESSAGE
+#else
+#define LINEDISP_INIT_TEXT "Linux " UTS_RELEASE "       "
+#endif
+
 /**
  * linedisp_register - register a character line display
  * @linedisp: pointer to character line display structure
@@ -359,7 +367,7 @@ int linedisp_register(struct linedisp *linedisp, struct device *parent,
 		goto out_del_timer;
 
 	/* display a default message */
-	err = linedisp_display(linedisp, "Linux " UTS_RELEASE "       ", -1);
+	err = linedisp_display(linedisp, LINEDISP_INIT_TEXT, -1);
 	if (err)
 		goto out_del_dev;
 

From 4adbf7086dfdd6101508fd070cb1c0e0adc9513e Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Sun, 2 Jun 2024 20:02:01 -0700
Subject: [PATCH 010/346] auxdisplay: linedisp: add missing
 MODULE_DESCRIPTION() macro

make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/auxdisplay/line-display.o

Add the missing invocation of the MODULE_DESCRIPTION() macro.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/line-display.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/auxdisplay/line-display.c b/drivers/auxdisplay/line-display.c
index 978dc457ca5f..731ffdfafc4e 100644
--- a/drivers/auxdisplay/line-display.c
+++ b/drivers/auxdisplay/line-display.c
@@ -396,4 +396,5 @@ void linedisp_unregister(struct linedisp *linedisp)
 }
 EXPORT_SYMBOL_NS_GPL(linedisp_unregister, LINEDISP);
 
+MODULE_DESCRIPTION("Character line display core support");
 MODULE_LICENSE("GPL");

From c56a45064e989998a616aadf88023cc9611416b7 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Sun, 2 Jun 2024 19:50:15 -0700
Subject: [PATCH 011/346] auxdisplay: hd44780: add missing MODULE_DESCRIPTION()
 macro

make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/auxdisplay/hd44780_common.o

Add the missing invocation of the MODULE_DESCRIPTION() macro.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/hd44780_common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/auxdisplay/hd44780_common.c b/drivers/auxdisplay/hd44780_common.c
index 7cbf375b0fa5..4ef87c3118c0 100644
--- a/drivers/auxdisplay/hd44780_common.c
+++ b/drivers/auxdisplay/hd44780_common.c
@@ -366,4 +366,5 @@ struct hd44780_common *hd44780_common_alloc(void)
 }
 EXPORT_SYMBOL_GPL(hd44780_common_alloc);
 
+MODULE_DESCRIPTION("Common functions for HD44780 (and compatibles) LCD displays");
 MODULE_LICENSE("GPL");

From 632691ad83197227adc902793a2a3e10c04fe9b8 Mon Sep 17 00:00:00 2001
From: Erick Archer <erick.archer@outlook.com>
Date: Sun, 2 Jun 2024 10:49:22 +0200
Subject: [PATCH 012/346] auxdisplay: Use sizeof(*pointer) instead of
 sizeof(type)

It is preferred to use sizeof(*pointer) instead of sizeof(type)
due to the type of the variable can change and one needs not
change the former (unlike the latter). This patch has no effect
on runtime behavior.

Signed-off-by: Erick Archer <erick.archer@outlook.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/arm-charlcd.c | 2 +-
 drivers/auxdisplay/hd44780.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/auxdisplay/arm-charlcd.c b/drivers/auxdisplay/arm-charlcd.c
index 0b1c99cca733..a7eae99a48f7 100644
--- a/drivers/auxdisplay/arm-charlcd.c
+++ b/drivers/auxdisplay/arm-charlcd.c
@@ -270,7 +270,7 @@ static int __init charlcd_probe(struct platform_device *pdev)
 	struct charlcd *lcd;
 	struct resource *res;
 
-	lcd = kzalloc(sizeof(struct charlcd), GFP_KERNEL);
+	lcd = kzalloc(sizeof(*lcd), GFP_KERNEL);
 	if (!lcd)
 		return -ENOMEM;
 
diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c
index 7ac0b1b1d548..025dc6855cb2 100644
--- a/drivers/auxdisplay/hd44780.c
+++ b/drivers/auxdisplay/hd44780.c
@@ -230,7 +230,7 @@ static int hd44780_probe(struct platform_device *pdev)
 	if (!lcd)
 		goto fail1;
 
-	hd = kzalloc(sizeof(struct hd44780), GFP_KERNEL);
+	hd = kzalloc(sizeof(*hd), GFP_KERNEL);
 	if (!hd)
 		goto fail2;
 

From 2ccfe94bc3ac980d2d1df9f7a0b2c6d2137abe55 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 4 Jun 2024 17:02:15 +0200
Subject: [PATCH 013/346] auxdisplay: ht16k33: Drop reference after LED
 registration

The reference count is bumped by device_get_named_child_node()
and never dropped. Since LED APIs do not require it to be
bumped by the user, drop the reference after LED registration.

[andy: rewritten the commit message and amended the change]

Fixes: c223d9c636ed ("auxdisplay: ht16k33: Add LED support")
Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/auxdisplay/ht16k33.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/auxdisplay/ht16k33.c b/drivers/auxdisplay/ht16k33.c
index ce987944662c..8a7034b41d50 100644
--- a/drivers/auxdisplay/ht16k33.c
+++ b/drivers/auxdisplay/ht16k33.c
@@ -483,6 +483,7 @@ static int ht16k33_led_probe(struct device *dev, struct led_classdev *led,
 	led->max_brightness = MAX_BRIGHTNESS;
 
 	err = devm_led_classdev_register_ext(dev, led, &init_data);
+	fwnode_handle_put(init_data.fwnode);
 	if (err)
 		dev_err(dev, "Failed to register LED\n");
 

From 5c563ee90a22d3295bcd6217e3ecd7bf9f4d9d48 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 24 May 2024 11:58:28 -0700
Subject: [PATCH 014/346] cpumask: introduce assign_cpu() macro

Now that assign_bit() is a thin macro wrapper around set_bit() and
clear_bit(), we can use it in cpumask API and drop duplicating
implementations of set_cpu_xxx() helpers with no additional overhead.

Bloat-o-meter reports almost 2k less of generated code for allyesconfig,
mostly in kernel/cpu.c:
	add/remove: 2/4 grow/shrink: 3/4 up/down: 498/-2228 (-1730)

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 23686bed441d..18410acdbc9e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1083,44 +1083,16 @@ void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
 
-static inline void
-set_cpu_possible(unsigned int cpu, bool possible)
-{
-	if (possible)
-		cpumask_set_cpu(cpu, &__cpu_possible_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_possible_mask);
-}
+#define assign_cpu(cpu, mask, val)	\
+	assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))
 
-static inline void
-set_cpu_present(unsigned int cpu, bool present)
-{
-	if (present)
-		cpumask_set_cpu(cpu, &__cpu_present_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_present_mask);
-}
+#define set_cpu_possible(cpu, possible)	assign_cpu((cpu), &__cpu_possible_mask, (possible))
+#define set_cpu_present(cpu, present)	assign_cpu((cpu), &__cpu_present_mask, (present))
+#define set_cpu_active(cpu, active)	assign_cpu((cpu), &__cpu_active_mask, (active))
+#define set_cpu_dying(cpu, dying)	assign_cpu((cpu), &__cpu_dying_mask, (dying))
 
 void set_cpu_online(unsigned int cpu, bool online);
 
-static inline void
-set_cpu_active(unsigned int cpu, bool active)
-{
-	if (active)
-		cpumask_set_cpu(cpu, &__cpu_active_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_active_mask);
-}
-
-static inline void
-set_cpu_dying(unsigned int cpu, bool dying)
-{
-	if (dying)
-		cpumask_set_cpu(cpu, &__cpu_dying_mask);
-	else
-		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
-}
-
 /**
  * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap

From e334771d83ec14f755a554394162198a955e3faa Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Fri, 31 May 2024 09:03:11 -0700
Subject: [PATCH 015/346] lib: bitmap: add missing MODULE_DESCRIPTION() macros

make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in lib/find_bit_benchmark.o
WARNING: modpost: missing MODULE_DESCRIPTION() in lib/cpumask_kunit.o
WARNING: modpost: missing MODULE_DESCRIPTION() in lib/test_bitmap.o

Add the missing invocations of the MODULE_DESCRIPTION() macro.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 lib/cpumask_kunit.c      | 1 +
 lib/find_bit_benchmark.c | 1 +
 lib/test_bitmap.c        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/lib/cpumask_kunit.c b/lib/cpumask_kunit.c
index a105e6369efc..6b62a6bdd50e 100644
--- a/lib/cpumask_kunit.c
+++ b/lib/cpumask_kunit.c
@@ -152,4 +152,5 @@ static struct kunit_suite test_cpumask_suite = {
 };
 kunit_test_suite(test_cpumask_suite);
 
+MODULE_DESCRIPTION("KUnit tests for cpumask");
 MODULE_LICENSE("GPL");
diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c
index d3fb09e6eff1..402e160e7186 100644
--- a/lib/find_bit_benchmark.c
+++ b/lib/find_bit_benchmark.c
@@ -194,4 +194,5 @@ static int __init find_bit_test(void)
 }
 module_init(find_bit_test);
 
+MODULE_DESCRIPTION("Test for find_*_bit functions");
 MODULE_LICENSE("GPL");
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 6dfb8d46a4ff..65a75d58ed9e 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -1486,4 +1486,5 @@ static void __init selftest(void)
 
 KSTM_MODULE_LOADERS(test_bitmap);
 MODULE_AUTHOR("david decotigny <david.decotigny@googlers.com>");
+MODULE_DESCRIPTION("Test cases for bitmap API");
 MODULE_LICENSE("GPL");

From e0eeb938adb0367de4b0946125a06142d8de7d37 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Tue, 11 Jun 2024 15:38:12 +0300
Subject: [PATCH 016/346] bitops: Add a comment explaining the double
 underscore macros

Linus Walleij pointed out that a new comer might be confused about the
difference between set_bit() and __set_bit().  Add a comment explaining
the difference.

Link: https://lore.kernel.org/all/CACRpkdZFPG_YLici-BmYfk9HZ36f4WavCN3JNotkk8cPgCODCg@mail.gmail.com/
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 46d4bdc634c0..ba35bbf07798 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -47,12 +47,17 @@ extern unsigned long __sw_hweight64(__u64 w);
 	  __builtin_constant_p(*(const unsigned long *)(addr))) ?	\
 	 const##op(nr, addr) : op(nr, addr))
 
+/*
+ * The following macros are non-atomic versions of their non-underscored
+ * counterparts.
+ */
 #define __set_bit(nr, addr)		bitop(___set_bit, nr, addr)
 #define __clear_bit(nr, addr)		bitop(___clear_bit, nr, addr)
 #define __change_bit(nr, addr)		bitop(___change_bit, nr, addr)
 #define __test_and_set_bit(nr, addr)	bitop(___test_and_set_bit, nr, addr)
 #define __test_and_clear_bit(nr, addr)	bitop(___test_and_clear_bit, nr, addr)
 #define __test_and_change_bit(nr, addr)	bitop(___test_and_change_bit, nr, addr)
+
 #define test_bit(nr, addr)		bitop(_test_bit, nr, addr)
 #define test_bit_acquire(nr, addr)	bitop(_test_bit_acquire, nr, addr)
 

From 7bd3d76a1f9fb38c8234cfadb7131e0b26deb919 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@codethink.co.uk>
Date: Fri, 8 Mar 2024 18:32:15 +0000
Subject: [PATCH 017/346] unicode: make utf8 test count static

The variables failed_tests and total_tests are not used outside of the
utf8-selftest.c file so make them static to avoid the following warnings:

fs/unicode/utf8-selftest.c:17:14: warning: symbol 'failed_tests' was not declared. Should it be static?
fs/unicode/utf8-selftest.c:18:14: warning: symbol 'total_tests' was not declared. Should it be static?

Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Link: https://lore.kernel.org/r/20240308183215.1924331-1-ben.dooks@codethink.co.uk
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
---
 fs/unicode/utf8-selftest.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index eb2bbdd688d7..c928e6007356 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -14,8 +14,8 @@
 
 #include "utf8n.h"
 
-unsigned int failed_tests;
-unsigned int total_tests;
+static unsigned int failed_tests;
+static unsigned int total_tests;
 
 /* Tests will be based on this version. */
 #define UTF8_LATEST	UNICODE_AGE(12, 1, 0)

From 68318904a7758e11f16fa9d202a6df60f896e71a Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Fri, 24 May 2024 11:48:09 -0700
Subject: [PATCH 018/346] unicode: add MODULE_DESCRIPTION() macros

Currently 'make W=1' reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8data.o
WARNING: modpost: missing MODULE_DESCRIPTION() in fs/unicode/utf8-selftest.o

Add a MODULE_DESCRIPTION() to utf8-selftest.c and utf8data.c_shipped,
and update mkutf8data.c to add a MODULE_DESCRIPTION() to any future
generated utf8data file.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20240524-md-unicode-v1-1-e2727ce8574d@quicinc.com
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
---
 fs/unicode/mkutf8data.c       | 1 +
 fs/unicode/utf8-selftest.c    | 1 +
 fs/unicode/utf8data.c_shipped | 1 +
 3 files changed, 3 insertions(+)

diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c
index bc1a7c8b5c8d..77b685db8275 100644
--- a/fs/unicode/mkutf8data.c
+++ b/fs/unicode/mkutf8data.c
@@ -3352,6 +3352,7 @@ static void write_file(void)
 	fprintf(file, "};\n");
 	fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
 	fprintf(file, "\n");
+	fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n");
 	fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
 	fclose(file);
 }
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index c928e6007356..600e15efe9ed 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -307,4 +307,5 @@ module_init(init_test_ucd);
 module_exit(exit_test_ucd);
 
 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
+MODULE_DESCRIPTION("Kernel module for testing utf-8 support");
 MODULE_LICENSE("GPL");
diff --git a/fs/unicode/utf8data.c_shipped b/fs/unicode/utf8data.c_shipped
index d9b62901aa96..dafa5fed761d 100644
--- a/fs/unicode/utf8data.c_shipped
+++ b/fs/unicode/utf8data.c_shipped
@@ -4120,4 +4120,5 @@ struct utf8data_table utf8_data_table = {
 	.utf8data = utf8data,
 };
 EXPORT_SYMBOL_GPL(utf8_data_table);
+MODULE_DESCRIPTION("UTF8 data table");
 MODULE_LICENSE("GPL v2");

From 1b3bf0747d4f1a963e59c26e602868bdce195318 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 11 Jun 2024 10:19:50 -0700
Subject: [PATCH 019/346] tools/power/turbostat: Switch to new Intel CPU model
 defines

New CPU #defines encode vendor and family as well as model.

N.B. Copied VFM_*() defines here from <asm/cpu_device_id.h> to avoid
an application picking a second internal kernel header file.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 165 +++++++++++++++-----------
 1 file changed, 95 insertions(+), 70 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 9f5d053d4bc6..02312e62c857 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -9,6 +9,30 @@
 
 #define _GNU_SOURCE
 #include MSRHEADER
+
+// copied from arch/x86/include/asm/cpu_device_id.h
+#define VFM_MODEL_BIT	0
+#define VFM_FAMILY_BIT	8
+#define VFM_VENDOR_BIT	16
+#define VFM_RSVD_BIT	24
+
+#define	VFM_MODEL_MASK	GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define	VFM_FAMILY_MASK	GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define	VFM_VENDOR_MASK	GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm)	(((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm)	(((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm)	(((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define	VFM_MAKE(_vendor, _family, _model) (	\
+	((_model) << VFM_MODEL_BIT) |		\
+	((_family) << VFM_FAMILY_BIT) |		\
+	((_vendor) << VFM_VENDOR_BIT)		\
+)
+// end copied section
+
+#define X86_VENDOR_INTEL	0
+
 #include INTEL_FAMILY_HEADER
 #include BUILD_BUG_HEADER
 #include <stdarg.h>
@@ -367,7 +391,7 @@ struct platform_features {
 };
 
 struct platform_data {
-	unsigned int model;
+	unsigned int vfm;
 	const struct platform_features *features;
 };
 
@@ -910,75 +934,75 @@ static const struct platform_features amd_features_with_rapl = {
 };
 
 static const struct platform_data turbostat_pdata[] = {
-	{ INTEL_FAM6_NEHALEM, &nhm_features },
-	{ INTEL_FAM6_NEHALEM_G, &nhm_features },
-	{ INTEL_FAM6_NEHALEM_EP, &nhm_features },
-	{ INTEL_FAM6_NEHALEM_EX, &nhx_features },
-	{ INTEL_FAM6_WESTMERE, &nhm_features },
-	{ INTEL_FAM6_WESTMERE_EP, &nhm_features },
-	{ INTEL_FAM6_WESTMERE_EX, &nhx_features },
-	{ INTEL_FAM6_SANDYBRIDGE, &snb_features },
-	{ INTEL_FAM6_SANDYBRIDGE_X, &snx_features },
-	{ INTEL_FAM6_IVYBRIDGE, &ivb_features },
-	{ INTEL_FAM6_IVYBRIDGE_X, &ivx_features },
-	{ INTEL_FAM6_HASWELL, &hsw_features },
-	{ INTEL_FAM6_HASWELL_X, &hsx_features },
-	{ INTEL_FAM6_HASWELL_L, &hswl_features },
-	{ INTEL_FAM6_HASWELL_G, &hswg_features },
-	{ INTEL_FAM6_BROADWELL, &bdw_features },
-	{ INTEL_FAM6_BROADWELL_G, &bdwg_features },
-	{ INTEL_FAM6_BROADWELL_X, &bdx_features },
-	{ INTEL_FAM6_BROADWELL_D, &bdx_features },
-	{ INTEL_FAM6_SKYLAKE_L, &skl_features },
-	{ INTEL_FAM6_SKYLAKE, &skl_features },
-	{ INTEL_FAM6_SKYLAKE_X, &skx_features },
-	{ INTEL_FAM6_KABYLAKE_L, &skl_features },
-	{ INTEL_FAM6_KABYLAKE, &skl_features },
-	{ INTEL_FAM6_COMETLAKE, &skl_features },
-	{ INTEL_FAM6_COMETLAKE_L, &skl_features },
-	{ INTEL_FAM6_CANNONLAKE_L, &cnl_features },
-	{ INTEL_FAM6_ICELAKE_X, &icx_features },
-	{ INTEL_FAM6_ICELAKE_D, &icx_features },
-	{ INTEL_FAM6_ICELAKE_L, &cnl_features },
-	{ INTEL_FAM6_ICELAKE_NNPI, &cnl_features },
-	{ INTEL_FAM6_ROCKETLAKE, &cnl_features },
-	{ INTEL_FAM6_TIGERLAKE_L, &cnl_features },
-	{ INTEL_FAM6_TIGERLAKE, &cnl_features },
-	{ INTEL_FAM6_SAPPHIRERAPIDS_X, &spr_features },
-	{ INTEL_FAM6_EMERALDRAPIDS_X, &spr_features },
-	{ INTEL_FAM6_GRANITERAPIDS_X, &spr_features },
-	{ INTEL_FAM6_LAKEFIELD, &cnl_features },
-	{ INTEL_FAM6_ALDERLAKE, &adl_features },
-	{ INTEL_FAM6_ALDERLAKE_L, &adl_features },
-	{ INTEL_FAM6_RAPTORLAKE, &adl_features },
-	{ INTEL_FAM6_RAPTORLAKE_P, &adl_features },
-	{ INTEL_FAM6_RAPTORLAKE_S, &adl_features },
-	{ INTEL_FAM6_METEORLAKE, &cnl_features },
-	{ INTEL_FAM6_METEORLAKE_L, &cnl_features },
-	{ INTEL_FAM6_ARROWLAKE_H, &arl_features },
-	{ INTEL_FAM6_ARROWLAKE_U, &arl_features },
-	{ INTEL_FAM6_ARROWLAKE, &arl_features },
-	{ INTEL_FAM6_LUNARLAKE_M, &arl_features },
-	{ INTEL_FAM6_ATOM_SILVERMONT, &slv_features },
-	{ INTEL_FAM6_ATOM_SILVERMONT_D, &slvd_features },
-	{ INTEL_FAM6_ATOM_AIRMONT, &amt_features },
-	{ INTEL_FAM6_ATOM_GOLDMONT, &gmt_features },
-	{ INTEL_FAM6_ATOM_GOLDMONT_D, &gmtd_features },
-	{ INTEL_FAM6_ATOM_GOLDMONT_PLUS, &gmtp_features },
-	{ INTEL_FAM6_ATOM_TREMONT_D, &tmtd_features },
-	{ INTEL_FAM6_ATOM_TREMONT, &tmt_features },
-	{ INTEL_FAM6_ATOM_TREMONT_L, &tmt_features },
-	{ INTEL_FAM6_ATOM_GRACEMONT, &adl_features },
-	{ INTEL_FAM6_ATOM_CRESTMONT_X, &srf_features },
-	{ INTEL_FAM6_ATOM_CRESTMONT, &grr_features },
-	{ INTEL_FAM6_XEON_PHI_KNL, &knl_features },
-	{ INTEL_FAM6_XEON_PHI_KNM, &knl_features },
+	{ INTEL_NEHALEM, &nhm_features },
+	{ INTEL_NEHALEM_G, &nhm_features },
+	{ INTEL_NEHALEM_EP, &nhm_features },
+	{ INTEL_NEHALEM_EX, &nhx_features },
+	{ INTEL_WESTMERE, &nhm_features },
+	{ INTEL_WESTMERE_EP, &nhm_features },
+	{ INTEL_WESTMERE_EX, &nhx_features },
+	{ INTEL_SANDYBRIDGE, &snb_features },
+	{ INTEL_SANDYBRIDGE_X, &snx_features },
+	{ INTEL_IVYBRIDGE, &ivb_features },
+	{ INTEL_IVYBRIDGE_X, &ivx_features },
+	{ INTEL_HASWELL, &hsw_features },
+	{ INTEL_HASWELL_X, &hsx_features },
+	{ INTEL_HASWELL_L, &hswl_features },
+	{ INTEL_HASWELL_G, &hswg_features },
+	{ INTEL_BROADWELL, &bdw_features },
+	{ INTEL_BROADWELL_G, &bdwg_features },
+	{ INTEL_BROADWELL_X, &bdx_features },
+	{ INTEL_BROADWELL_D, &bdx_features },
+	{ INTEL_SKYLAKE_L, &skl_features },
+	{ INTEL_SKYLAKE, &skl_features },
+	{ INTEL_SKYLAKE_X, &skx_features },
+	{ INTEL_KABYLAKE_L, &skl_features },
+	{ INTEL_KABYLAKE, &skl_features },
+	{ INTEL_COMETLAKE, &skl_features },
+	{ INTEL_COMETLAKE_L, &skl_features },
+	{ INTEL_CANNONLAKE_L, &cnl_features },
+	{ INTEL_ICELAKE_X, &icx_features },
+	{ INTEL_ICELAKE_D, &icx_features },
+	{ INTEL_ICELAKE_L, &cnl_features },
+	{ INTEL_ICELAKE_NNPI, &cnl_features },
+	{ INTEL_ROCKETLAKE, &cnl_features },
+	{ INTEL_TIGERLAKE_L, &cnl_features },
+	{ INTEL_TIGERLAKE, &cnl_features },
+	{ INTEL_SAPPHIRERAPIDS_X, &spr_features },
+	{ INTEL_EMERALDRAPIDS_X, &spr_features },
+	{ INTEL_GRANITERAPIDS_X, &spr_features },
+	{ INTEL_LAKEFIELD, &cnl_features },
+	{ INTEL_ALDERLAKE, &adl_features },
+	{ INTEL_ALDERLAKE_L, &adl_features },
+	{ INTEL_RAPTORLAKE, &adl_features },
+	{ INTEL_RAPTORLAKE_P, &adl_features },
+	{ INTEL_RAPTORLAKE_S, &adl_features },
+	{ INTEL_METEORLAKE, &cnl_features },
+	{ INTEL_METEORLAKE_L, &cnl_features },
+	{ INTEL_ARROWLAKE_H, &arl_features },
+	{ INTEL_ARROWLAKE_U, &arl_features },
+	{ INTEL_ARROWLAKE, &arl_features },
+	{ INTEL_LUNARLAKE_M, &arl_features },
+	{ INTEL_ATOM_SILVERMONT, &slv_features },
+	{ INTEL_ATOM_SILVERMONT_D, &slvd_features },
+	{ INTEL_ATOM_AIRMONT, &amt_features },
+	{ INTEL_ATOM_GOLDMONT, &gmt_features },
+	{ INTEL_ATOM_GOLDMONT_D, &gmtd_features },
+	{ INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features },
+	{ INTEL_ATOM_TREMONT_D, &tmtd_features },
+	{ INTEL_ATOM_TREMONT, &tmt_features },
+	{ INTEL_ATOM_TREMONT_L, &tmt_features },
+	{ INTEL_ATOM_GRACEMONT, &adl_features },
+	{ INTEL_ATOM_CRESTMONT_X, &srf_features },
+	{ INTEL_ATOM_CRESTMONT, &grr_features },
+	{ INTEL_XEON_PHI_KNL, &knl_features },
+	{ INTEL_XEON_PHI_KNM, &knl_features },
 	/*
 	 * Missing support for
-	 * INTEL_FAM6_ICELAKE
-	 * INTEL_FAM6_ATOM_SILVERMONT_MID
-	 * INTEL_FAM6_ATOM_AIRMONT_MID
-	 * INTEL_FAM6_ATOM_AIRMONT_NP
+	 * INTEL_ICELAKE
+	 * INTEL_ATOM_SILVERMONT_MID
+	 * INTEL_ATOM_AIRMONT_MID
+	 * INTEL_ATOM_AIRMONT_NP
 	 */
 	{ 0, NULL },
 };
@@ -1003,11 +1027,12 @@ void probe_platform_features(unsigned int family, unsigned int model)
 		return;
 	}
 
-	if (!genuine_intel || family != 6)
+	if (!genuine_intel)
 		return;
 
 	for (i = 0; turbostat_pdata[i].features; i++) {
-		if (turbostat_pdata[i].model == model) {
+		if (VFM_FAMILY(turbostat_pdata[i].vfm) == family &&
+		    VFM_MODEL(turbostat_pdata[i].vfm) == model) {
 			platform = turbostat_pdata[i].features;
 			return;
 		}

From c81c8ee445e84703cd90171365ed596ef94399cc Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Fri, 24 May 2024 12:06:39 +0200
Subject: [PATCH 020/346] tools/power turbostat: Remove anonymous union from
 rapl_counter_info_t

fd_perf field used to be part of the union, but later moved out of it,
because we test it with fd_perf != -1 to determine if any perf counter
is opened, making the union unused.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 02312e62c857..1ac74cd49cc9 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1085,15 +1085,9 @@ struct rapl_counter_info_t {
 	unsigned long long flags[NUM_RAPL_COUNTERS];
 	double scale[NUM_RAPL_COUNTERS];
 	enum rapl_unit unit[NUM_RAPL_COUNTERS];
-
-	union {
-		/* Active when source == RAPL_SOURCE_MSR */
-		struct {
-			unsigned long long msr[NUM_RAPL_COUNTERS];
-			unsigned long long msr_mask[NUM_RAPL_COUNTERS];
-			int msr_shift[NUM_RAPL_COUNTERS];
-		};
-	};
+	unsigned long long msr[NUM_RAPL_COUNTERS];
+	unsigned long long msr_mask[NUM_RAPL_COUNTERS];
+	int msr_shift[NUM_RAPL_COUNTERS];
 
 	int fd_perf;
 };

From 73ed3c941a76ff3348022477975c84b8d183599a Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Fri, 24 May 2024 12:22:57 +0200
Subject: [PATCH 021/346] tools/power turbostat: Replace enum rapl_source and
 cstate_source with counter_source

Reuse the enum. It means the same thing in both cases.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 39 +++++++++++++--------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 1ac74cd49cc9..0fc6c107e371 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -86,8 +86,7 @@ enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
 enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR };
-enum rapl_source { RAPL_SOURCE_NONE, RAPL_SOURCE_PERF, RAPL_SOURCE_MSR };
-enum cstate_source { CSTATE_SOURCE_NONE, CSTATE_SOURCE_PERF, CSTATE_SOURCE_MSR };
+enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };
 
 struct sysfs_path {
 	char path[PATH_BYTES];
@@ -1081,7 +1080,7 @@ enum rapl_unit {
 
 struct rapl_counter_info_t {
 	unsigned long long data[NUM_RAPL_COUNTERS];
-	enum rapl_source source[NUM_RAPL_COUNTERS];
+	enum counter_source source[NUM_RAPL_COUNTERS];
 	unsigned long long flags[NUM_RAPL_COUNTERS];
 	double scale[NUM_RAPL_COUNTERS];
 	enum rapl_unit unit[NUM_RAPL_COUNTERS];
@@ -1243,7 +1242,7 @@ enum ccstate_rci_index {
 
 struct cstate_counter_info_t {
 	unsigned long long data[NUM_CSTATE_COUNTERS];
-	enum cstate_source source[NUM_CSTATE_COUNTERS];
+	enum counter_source source[NUM_CSTATE_COUNTERS];
 	unsigned long long msr[NUM_CSTATE_COUNTERS];
 	int fd_perf_core;
 	int fd_perf_pkg;
@@ -3601,7 +3600,7 @@ size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
 	size_t ret = 0;
 
 	for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
-		if (rci->source[i] == RAPL_SOURCE_PERF)
+		if (rci->source[i] == COUNTER_SOURCE_PERF)
 			++ret;
 
 	return ret;
@@ -3612,7 +3611,7 @@ static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t
 	size_t ret = 0;
 
 	for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i)
-		if (cci->source[i] == CSTATE_SOURCE_PERF)
+		if (cci->source[i] == COUNTER_SOURCE_PERF)
 			++ret;
 
 	return ret;
@@ -3653,10 +3652,10 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
 
 	for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
 		switch (rci->source[i]) {
-		case RAPL_SOURCE_NONE:
+		case COUNTER_SOURCE_NONE:
 			break;
 
-		case RAPL_SOURCE_PERF:
+		case COUNTER_SOURCE_PERF:
 			assert(pi < ARRAY_SIZE(perf_data));
 			assert(rci->fd_perf != -1);
 
@@ -3669,7 +3668,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
 			++pi;
 			break;
 
-		case RAPL_SOURCE_MSR:
+		case COUNTER_SOURCE_MSR:
 			if (debug)
 				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
 
@@ -3791,10 +3790,10 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 
 	for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) {
 		switch (cci->source[i]) {
-		case CSTATE_SOURCE_NONE:
+		case COUNTER_SOURCE_NONE:
 			break;
 
-		case CSTATE_SOURCE_PERF:
+		case COUNTER_SOURCE_PERF:
 			assert(pi < ARRAY_SIZE(perf_data));
 			assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);
 
@@ -3807,7 +3806,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 			++pi;
 			break;
 
-		case CSTATE_SOURCE_MSR:
+		case COUNTER_SOURCE_MSR:
 			assert(!no_msr);
 			if (get_msr(cpu, cci->msr[i], &cci->data[i]))
 				return -13 - i;
@@ -3828,7 +3827,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 	 * when invoked for the thread sibling.
 	 */
 #define PERF_COUNTER_WRITE_DATA(out_counter, index) do {	\
-	if (cci->source[index] != CSTATE_SOURCE_NONE)		\
+	if (cci->source[index] != COUNTER_SOURCE_NONE)		\
 		out_counter = cci->data[index];			\
 } while (0)
 
@@ -6894,7 +6893,7 @@ void rapl_perf_init(void)
 		rci->fd_perf = -1;
 		for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
 			rci->data[i] = 0;
-			rci->source[i] = RAPL_SOURCE_NONE;
+			rci->source[i] = COUNTER_SOURCE_NONE;
 		}
 	}
 
@@ -6936,14 +6935,14 @@ void rapl_perf_init(void)
 				/* Use perf API for this counter */
 				if (!no_perf && cai->perf_name
 				    && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
-					rci->source[cai->rci_index] = RAPL_SOURCE_PERF;
+					rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
 					rci->scale[cai->rci_index] = scale * cai->compat_scale;
 					rci->unit[cai->rci_index] = unit;
 					rci->flags[cai->rci_index] = cai->flags;
 
 					/* Use MSR for this counter */
 				} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
-					rci->source[cai->rci_index] = RAPL_SOURCE_MSR;
+					rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
 					rci->msr[cai->rci_index] = cai->msr;
 					rci->msr_mask[cai->rci_index] = cai->msr_mask;
 					rci->msr_shift[cai->rci_index] = cai->msr_shift;
@@ -6953,7 +6952,7 @@ void rapl_perf_init(void)
 				}
 			}
 
-			if (rci->source[cai->rci_index] != RAPL_SOURCE_NONE)
+			if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
 				has_counter = 1;
 		}
 
@@ -7146,17 +7145,17 @@ void cstate_perf_init_(bool soft_c1)
 				/* Use perf API for this counter */
 				if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {
 
-					cci->source[cai->rci_index] = CSTATE_SOURCE_PERF;
+					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
 
 					/* User MSR for this counter */
 				} else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
 					   && probe_msr(cpu, cai->msr) == 0) {
-					cci->source[cai->rci_index] = CSTATE_SOURCE_MSR;
+					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
 					cci->msr[cai->rci_index] = cai->msr;
 				}
 			}
 
-			if (cci->source[cai->rci_index] != CSTATE_SOURCE_NONE) {
+			if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) {
 				has_counter = true;
 				cores_visited[core_id] = true;
 				pkg_visited[pkg_id] = true;

From 25e713c6b54a899b5a8b5e09b58f4b87a5b3664b Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Fri, 24 May 2024 13:04:33 +0200
Subject: [PATCH 022/346] tools/power turbostat: Add ZERO_ARRAY for zero
 initializing builtin array

It makes it harder to shoot yourself in the foot, by using
additional __must_be_array() check.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 0fc6c107e371..29751b41ea0d 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1060,6 +1060,8 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi
 #define MAX_ADDED_PACKAGE_COUNTERS 16
 #define BITMASK_SIZE 32
 
+#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr))
+
 /* Indexes used to map data read from perf and MSRs into global variables */
 enum rapl_rci_index {
 	RAPL_RCI_INDEX_ENERGY_PKG = 0,
@@ -3733,9 +3735,9 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 	assert(ccstate_counter_info);
 	assert(cpu <= ccstate_counter_info_size);
 
-	memset(perf_data, 0, sizeof(perf_data));
-	memset(perf_data_core, 0, sizeof(perf_data_core));
-	memset(perf_data_pkg, 0, sizeof(perf_data_pkg));
+	ZERO_ARRAY(perf_data);
+	ZERO_ARRAY(perf_data_core);
+	ZERO_ARRAY(perf_data_pkg);
 
 	cci = &ccstate_counter_info[cpu];
 

From 675e979db473d08be346a3190c5f0db095a57153 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fabio.m.de.francesco@linux.intel.com>
Date: Fri, 7 Jun 2024 16:43:58 +0200
Subject: [PATCH 023/346] cxl/events: Use a common struct for DRAM and General
 Media events

cxl_event_common was an unfortunate naming choice and caused confusion with
the existing Common Event Record. Furthermore, its fields didn't map all
the common information between DRAM and General Media Events.

Remove cxl_event_common and introduce cxl_event_media_hdr to record common
information between DRAM and General Media events.

cxl_event_media_hdr, which is embedded in both cxl_event_gen_media and
cxl_event_dram, leverages the commonalities between the two events to
simplify their respective handling.

Suggested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20240607144423.48681-1-fabio.m.de.francesco@linux.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/mbox.c      |  2 +-
 drivers/cxl/core/trace.h     | 32 ++++++++++-----------
 include/linux/cxl-event.h    | 45 +++++++++++++-----------------
 tools/testing/cxl/test/mem.c | 54 +++++++++++++++++++-----------------
 4 files changed, 65 insertions(+), 68 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 2626f3fff201..a08f050cc1ca 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -875,7 +875,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		guard(rwsem_read)(&cxl_region_rwsem);
 		guard(rwsem_read)(&cxl_dpa_rwsem);
 
-		dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK;
+		dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
 		cxlr = cxl_dpa_to_region(cxlmd, dpa);
 		if (cxlr)
 			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index ee5cd4eb2f16..6d8b71d8f6c4 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -340,23 +340,23 @@ TRACE_EVENT(cxl_general_media,
 	),
 
 	TP_fast_assign(
-		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr);
 		__entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID;
 
 		/* General Media */
-		__entry->dpa = le64_to_cpu(rec->phys_addr);
+		__entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr);
 		__entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
 		/* Mask after flags have been parsed */
 		__entry->dpa &= CXL_DPA_MASK;
-		__entry->descriptor = rec->descriptor;
-		__entry->type = rec->type;
-		__entry->transaction_type = rec->transaction_type;
-		__entry->channel = rec->channel;
-		__entry->rank = rec->rank;
+		__entry->descriptor = rec->media_hdr.descriptor;
+		__entry->type = rec->media_hdr.type;
+		__entry->transaction_type = rec->media_hdr.transaction_type;
+		__entry->channel = rec->media_hdr.channel;
+		__entry->rank = rec->media_hdr.rank;
 		__entry->device = get_unaligned_le24(rec->device);
 		memcpy(__entry->comp_id, &rec->component_id,
 			CXL_EVENT_GEN_MED_COMP_ID_SIZE);
-		__entry->validity_flags = get_unaligned_le16(&rec->validity_flags);
+		__entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags);
 		__entry->hpa = hpa;
 		if (cxlr) {
 			__assign_str(region_name);
@@ -440,19 +440,19 @@ TRACE_EVENT(cxl_dram,
 	),
 
 	TP_fast_assign(
-		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr);
 		__entry->hdr_uuid = CXL_EVENT_DRAM_UUID;
 
 		/* DRAM */
-		__entry->dpa = le64_to_cpu(rec->phys_addr);
+		__entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr);
 		__entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
 		__entry->dpa &= CXL_DPA_MASK;
-		__entry->descriptor = rec->descriptor;
-		__entry->type = rec->type;
-		__entry->transaction_type = rec->transaction_type;
-		__entry->validity_flags = get_unaligned_le16(rec->validity_flags);
-		__entry->channel = rec->channel;
-		__entry->rank = rec->rank;
+		__entry->descriptor = rec->media_hdr.descriptor;
+		__entry->type = rec->media_hdr.type;
+		__entry->transaction_type = rec->media_hdr.transaction_type;
+		__entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags);
+		__entry->channel = rec->media_hdr.channel;
+		__entry->rank = rec->media_hdr.rank;
 		__entry->nibble_mask = get_unaligned_le24(rec->nibble_mask);
 		__entry->bank_group = rec->bank_group;
 		__entry->bank = rec->bank;
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 60b25020281f..0bea1afbd747 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -21,6 +21,21 @@ struct cxl_event_record_hdr {
 	u8 reserved[15];
 } __packed;
 
+struct cxl_event_media_hdr {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+	u8 descriptor;
+	u8 type;
+	u8 transaction_type;
+	/*
+	 * The meaning of Validity Flags from bit 2 is
+	 * different across DRAM and General Media records
+	 */
+	u8 validity_flags[2];
+	u8 channel;
+	u8 rank;
+} __packed;
+
 #define CXL_EVENT_RECORD_DATA_LENGTH 0x50
 struct cxl_event_generic {
 	struct cxl_event_record_hdr hdr;
@@ -33,14 +48,7 @@ struct cxl_event_generic {
  */
 #define CXL_EVENT_GEN_MED_COMP_ID_SIZE	0x10
 struct cxl_event_gen_media {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
+	struct cxl_event_media_hdr media_hdr;
 	u8 device[3];
 	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
 	u8 reserved[46];
@@ -52,14 +60,7 @@ struct cxl_event_gen_media {
  */
 #define CXL_EVENT_DER_CORRECTION_MASK_SIZE	0x20
 struct cxl_event_dram {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-	u8 descriptor;
-	u8 type;
-	u8 transaction_type;
-	u8 validity_flags[2];
-	u8 channel;
-	u8 rank;
+	struct cxl_event_media_hdr media_hdr;
 	u8 nibble_mask[3];
 	u8 bank_group;
 	u8 bank;
@@ -95,21 +96,13 @@ struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
-/*
- * General Media or DRAM Event Common Fields
- * - provides common access to phys_addr
- */
-struct cxl_event_common {
-	struct cxl_event_record_hdr hdr;
-	__le64 phys_addr;
-} __packed;
-
 union cxl_event {
 	struct cxl_event_generic generic;
 	struct cxl_event_gen_media gen_media;
 	struct cxl_event_dram dram;
 	struct cxl_event_mem_module mem_module;
-	struct cxl_event_common common;
+	/* dram & gen_media event header */
+	struct cxl_event_media_hdr media_hdr;
 } __packed;
 
 /*
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index eaf091a3d331..94de2cf60f4f 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -385,19 +385,21 @@ struct cxl_test_gen_media {
 struct cxl_test_gen_media gen_media = {
 	.id = CXL_EVENT_GEN_MEDIA_UUID,
 	.rec = {
-		.hdr = {
-			.length = sizeof(struct cxl_test_gen_media),
-			.flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
-			/* .handle = Set dynamically */
-			.related_handle = cpu_to_le16(0),
+		.media_hdr = {
+			.hdr = {
+				.length = sizeof(struct cxl_test_gen_media),
+				.flags[0] = CXL_EVENT_RECORD_FLAG_PERMANENT,
+				/* .handle = Set dynamically */
+				.related_handle = cpu_to_le16(0),
+			},
+			.phys_addr = cpu_to_le64(0x2000),
+			.descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
+			.type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
+			.transaction_type = CXL_GMER_TRANS_HOST_WRITE,
+			/* .validity_flags = <set below> */
+			.channel = 1,
+			.rank = 30,
 		},
-		.phys_addr = cpu_to_le64(0x2000),
-		.descriptor = CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT,
-		.type = CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR,
-		.transaction_type = CXL_GMER_TRANS_HOST_WRITE,
-		/* .validity_flags = <set below> */
-		.channel = 1,
-		.rank = 30
 	},
 };
 
@@ -409,18 +411,20 @@ struct cxl_test_dram {
 struct cxl_test_dram dram = {
 	.id = CXL_EVENT_DRAM_UUID,
 	.rec = {
-		.hdr = {
-			.length = sizeof(struct cxl_test_dram),
-			.flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
-			/* .handle = Set dynamically */
-			.related_handle = cpu_to_le16(0),
+		.media_hdr = {
+			.hdr = {
+				.length = sizeof(struct cxl_test_dram),
+				.flags[0] = CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,
+				/* .handle = Set dynamically */
+				.related_handle = cpu_to_le16(0),
+			},
+			.phys_addr = cpu_to_le64(0x8000),
+			.descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
+			.type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
+			.transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
+			/* .validity_flags = <set below> */
+			.channel = 1,
 		},
-		.phys_addr = cpu_to_le64(0x8000),
-		.descriptor = CXL_GMER_EVT_DESC_THRESHOLD_EVENT,
-		.type = CXL_GMER_MEM_EVT_TYPE_INV_ADDR,
-		.transaction_type = CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB,
-		/* .validity_flags = <set below> */
-		.channel = 1,
 		.bank_group = 5,
 		.bank = 2,
 		.column = {0xDE, 0xAD},
@@ -474,11 +478,11 @@ static int mock_set_timestamp(struct cxl_dev_state *cxlds,
 static void cxl_mock_add_event_logs(struct mock_event_store *mes)
 {
 	put_unaligned_le16(CXL_GMER_VALID_CHANNEL | CXL_GMER_VALID_RANK,
-			   &gen_media.rec.validity_flags);
+			   &gen_media.rec.media_hdr.validity_flags);
 
 	put_unaligned_le16(CXL_DER_VALID_CHANNEL | CXL_DER_VALID_BANK_GROUP |
 			   CXL_DER_VALID_BANK | CXL_DER_VALID_COLUMN,
-			   &dram.rec.validity_flags);
+			   &dram.rec.media_hdr.validity_flags);
 
 	mes_add_event(mes, CXL_EVENT_TYPE_INFO, &maint_needed);
 	mes_add_event(mes, CXL_EVENT_TYPE_INFO,

From a0caa19711ceb54c34368f66a746844fb03fde6c Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Fri, 7 Jun 2024 06:57:15 -0700
Subject: [PATCH 024/346] cxl: add missing MODULE_DESCRIPTION() macros

make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/core/cxl_core.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pci.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_mem.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_acpi.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_pmem.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cxl/cxl_port.o

Add the missing invocations of the MODULE_DESCRIPTION() macro.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20240607-md-drivers-cxl-v2-1-0c61d95ee7a7@quicinc.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/acpi.c      | 1 +
 drivers/cxl/core/port.c | 1 +
 drivers/cxl/mem.c       | 1 +
 drivers/cxl/pci.c       | 1 +
 drivers/cxl/pmem.c      | 1 +
 drivers/cxl/port.c      | 1 +
 6 files changed, 6 insertions(+)

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 571069863c62..e51315ea4a6a 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -921,6 +921,7 @@ static void __exit cxl_acpi_exit(void)
 /* load before dax_hmem sees 'Soft Reserved' CXL ranges */
 subsys_initcall(cxl_acpi_init);
 module_exit(cxl_acpi_exit);
+MODULE_DESCRIPTION("CXL ACPI: Platform Support");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS(CXL);
 MODULE_IMPORT_NS(ACPI);
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 887ed6e358fb..e31c5fcd9bf8 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -2356,5 +2356,6 @@ static void cxl_core_exit(void)
 
 subsys_initcall(cxl_core_init);
 module_exit(cxl_core_exit);
+MODULE_DESCRIPTION("CXL: Core Compute Express Link support");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS(CXL);
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index 0c79d9ce877c..1afb0e78082b 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -252,6 +252,7 @@ static struct cxl_driver cxl_mem_driver = {
 
 module_cxl_driver(cxl_mem_driver);
 
+MODULE_DESCRIPTION("CXL: Memory Expansion");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS(CXL);
 MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index e53646e9f2fb..4be35dc22202 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1066,5 +1066,6 @@ static void __exit cxl_pci_driver_exit(void)
 
 module_init(cxl_pci_driver_init);
 module_exit(cxl_pci_driver_exit);
+MODULE_DESCRIPTION("CXL: PCI manageability");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS(CXL);
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 2ecdaee63021..4ef93da22335 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -453,6 +453,7 @@ static __exit void cxl_pmem_exit(void)
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 }
 
+MODULE_DESCRIPTION("CXL PMEM: Persistent Memory Support");
 MODULE_LICENSE("GPL v2");
 module_init(cxl_pmem_init);
 module_exit(cxl_pmem_exit);
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 97c21566677a..d7d5d982ce69 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -209,6 +209,7 @@ static struct cxl_driver cxl_port_driver = {
 };
 
 module_cxl_driver(cxl_port_driver);
+MODULE_DESCRIPTION("CXL: Port enumeration and services");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS(CXL);
 MODULE_ALIAS_CXL(CXL_DEVICE_PORT);

From a3483ee7e6a7f2d12b5950246f4e0ef94f4a5df0 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 18 Jun 2024 16:46:37 +0800
Subject: [PATCH 025/346] cxl/region: Fix a race condition in memory hotplug
 notifier

In the memory hotplug notifier function of the CXL region,
cxl_region_perf_attrs_callback(), the node ID is obtained by checking
the host address range of the region. However, the address range
information is not available when the region is registered in
devm_cxl_add_region(). Additionally, this information may be removed
or added under the protection of cxl_region_rwsem during runtime. If
the memory notifier is called for nodes other than that backed by the
region, a race condition may occur, potentially leading to a NULL
dereference or an invalid address range.

The race condition is addressed by checking the availability of the
address range information under the protection of cxl_region_rwsem. To
enhance code readability and use guard(), the relevant code has been
moved into a newly added function: cxl_region_nid().

Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region")
Signed-off-by: Huang, Ying <ying.huang@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20240618084639.1419629-2-ying.huang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/region.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 3c2b6144be23..51aeef2c012c 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2304,14 +2304,25 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
 	return true;
 }
 
+static int cxl_region_nid(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_decoder *cxld;
+
+	guard(rwsem_read)(&cxl_region_rwsem);
+	cxled = p->targets[0];
+	if (!cxled)
+		return NUMA_NO_NODE;
+	cxld = &cxled->cxld;
+	return phys_to_target_node(cxld->hpa_range.start);
+}
+
 static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
 					  unsigned long action, void *arg)
 {
 	struct cxl_region *cxlr = container_of(nb, struct cxl_region,
 					       memory_notifier);
-	struct cxl_region_params *p = &cxlr->params;
-	struct cxl_endpoint_decoder *cxled = p->targets[0];
-	struct cxl_decoder *cxld = &cxled->cxld;
 	struct memory_notify *mnb = arg;
 	int nid = mnb->status_change_nid;
 	int region_nid;
@@ -2319,7 +2330,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
 	if (nid == NUMA_NO_NODE || action != MEM_ONLINE)
 		return NOTIFY_DONE;
 
-	region_nid = phys_to_target_node(cxld->hpa_range.start);
+	region_nid = cxl_region_nid(cxlr);
 	if (nid != region_nid)
 		return NOTIFY_DONE;
 

From 643e8e3e65290fdfe507bb23fa524c77e1345af3 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 18 Jun 2024 16:46:38 +0800
Subject: [PATCH 026/346] cxl/region: Support to calculate memory tier abstract
 distance

An abstract distance value must be assigned by the driver that makes
the memory available to the system. It reflects relative performance
and is used to place memory nodes backed by CXL regions in the appropriate
memory tiers allowing promotion/demotion within the existing memory tiering
mechanism.

The abstract distance is calculated based on the memory access latency
and bandwidth of CXL regions.

Signed-off-by: Huang, Ying <ying.huang@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20240618084639.1419629-3-ying.huang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/region.c | 27 +++++++++++++++++++++++++++
 drivers/cxl/cxl.h         |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 51aeef2c012c..dc15ceba7ab7 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -9,6 +9,7 @@
 #include <linux/uuid.h>
 #include <linux/sort.h>
 #include <linux/idr.h>
+#include <linux/memory-tiers.h>
 #include <cxlmem.h>
 #include <cxl.h>
 #include "core.h"
@@ -2228,6 +2229,7 @@ static void unregister_region(void *_cxlr)
 	int i;
 
 	unregister_memory_notifier(&cxlr->memory_notifier);
+	unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
 	device_del(&cxlr->dev);
 
 	/*
@@ -2340,6 +2342,27 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+static int cxl_region_calculate_adistance(struct notifier_block *nb,
+					  unsigned long nid, void *data)
+{
+	struct cxl_region *cxlr = container_of(nb, struct cxl_region,
+					       adist_notifier);
+	struct access_coordinate *perf;
+	int *adist = data;
+	int region_nid;
+
+	region_nid = cxl_region_nid(cxlr);
+	if (nid != region_nid)
+		return NOTIFY_OK;
+
+	perf = &cxlr->coord[ACCESS_COORDINATE_CPU];
+
+	if (mt_perf_to_adistance(perf, adist))
+		return NOTIFY_OK;
+
+	return NOTIFY_STOP;
+}
+
 /**
  * devm_cxl_add_region - Adds a region to a decoder
  * @cxlrd: root decoder
@@ -2382,6 +2405,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
 	cxlr->memory_notifier.priority = CXL_CALLBACK_PRI;
 	register_memory_notifier(&cxlr->memory_notifier);
 
+	cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
+	cxlr->adist_notifier.priority = 100;
+	register_mt_adistance_algorithm(&cxlr->adist_notifier);
+
 	rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr);
 	if (rc)
 		return ERR_PTR(rc);
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 603c0120cff8..f46252373159 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -522,6 +522,7 @@ struct cxl_region_params {
  * @params: active + config params for the region
  * @coord: QoS access coordinates for the region
  * @memory_notifier: notifier for setting the access coordinates to node
+ * @adist_notifier: notifier for calculating the abstract distance of node
  */
 struct cxl_region {
 	struct device dev;
@@ -534,6 +535,7 @@ struct cxl_region {
 	struct cxl_region_params params;
 	struct access_coordinate coord[ACCESS_COORDINATE_MAX];
 	struct notifier_block memory_notifier;
+	struct notifier_block adist_notifier;
 };
 
 struct cxl_nvdimm_bridge {

From f3d70720e92c24c22e0e63c2588636edba33eb1a Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 18 Jun 2024 16:46:39 +0800
Subject: [PATCH 027/346] cxl/region: Simplify cxl_region_nid()

The node ID of the region can be gotten via resource start address
directly.  This simplifies the implementation of cxl_region_nid().

Signed-off-by: Huang Ying <ying.huang@intel.com>
Suggested-by: Alison Schofield <alison.schofield@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/20240618084639.1419629-4-ying.huang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/region.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index dc15ceba7ab7..32473fddce03 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2309,15 +2309,13 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
 static int cxl_region_nid(struct cxl_region *cxlr)
 {
 	struct cxl_region_params *p = &cxlr->params;
-	struct cxl_endpoint_decoder *cxled;
-	struct cxl_decoder *cxld;
+	struct resource *res;
 
 	guard(rwsem_read)(&cxl_region_rwsem);
-	cxled = p->targets[0];
-	if (!cxled)
+	res = p->res;
+	if (!res)
 		return NUMA_NO_NODE;
-	cxld = &cxled->cxld;
-	return phys_to_target_node(cxld->hpa_range.start);
+	return phys_to_target_node(res->start);
 }
 
 static int cxl_region_perf_attrs_callback(struct notifier_block *nb,

From 67f9c312b0a7f4bc869376d2a68308e673235954 Mon Sep 17 00:00:00 2001
From: Aswin Unnikrishnan <aswinunni01@gmail.com>
Date: Sun, 12 May 2024 11:23:20 +0000
Subject: [PATCH 028/346] rust: add example for `alias` argument in `module`
 macro documentation

Add example for `alias` argument supported by `module` macro.
`alias` accepts an array of alternate names for the module as string.

Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Aswin Unnikrishnan <aswinunni01@gmail.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Link: https://lore.kernel.org/r/20240512112324.8514-1-aswinunni01@gmail.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/macros/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs
index 520eae5fd792..aa89b41fa10e 100644
--- a/rust/macros/lib.rs
+++ b/rust/macros/lib.rs
@@ -35,6 +35,7 @@
 ///     author: "Rust for Linux Contributors",
 ///     description: "My very own kernel module!",
 ///     license: "GPL",
+///     alias: ["alternate_module_name"],
 /// }
 ///
 /// struct MyModule;

From 63249a070eb5187d5caec995d171b53e374a0741 Mon Sep 17 00:00:00 2001
From: Aswin Unnikrishnan <aswinunni01@gmail.com>
Date: Sun, 12 May 2024 11:23:21 +0000
Subject: [PATCH 029/346] rust: fix datatype in docs for `module` macro
 arguments

Remove the mention of byte array as datatype for `module` macro arguments
since the arguments are defined as string, and `alias` is a string array.

Signed-off-by: Aswin Unnikrishnan <aswinunni01@gmail.com>
Reviewed-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Link: https://lore.kernel.org/r/20240512112324.8514-2-aswinunni01@gmail.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/macros/lib.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs
index aa89b41fa10e..5214e07367c5 100644
--- a/rust/macros/lib.rs
+++ b/rust/macros/lib.rs
@@ -58,11 +58,11 @@
 ///
 /// # Supported argument types
 ///   - `type`: type which implements the [`Module`] trait (required).
-///   - `name`: byte array of the name of the kernel module (required).
-///   - `author`: byte array of the author of the kernel module.
-///   - `description`: byte array of the description of the kernel module.
-///   - `license`: byte array of the license of the kernel module (required).
-///   - `alias`: byte array of alias name of the kernel module.
+///   - `name`: ASCII string literal of the name of the kernel module (required).
+///   - `author`: string literal of the author of the kernel module.
+///   - `description`: string literal of the description of the kernel module.
+///   - `license`: ASCII string literal of the license of the kernel module (required).
+///   - `alias`: array of ASCII string literals of the alias names of the kernel module.
 #[proc_macro]
 pub fn module(ts: TokenStream) -> TokenStream {
     module::module(ts)

From 549d3c2ffbea44fe123a67983fd8b15ab6989d8d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@gmail.com>
Date: Wed, 1 May 2024 21:35:48 +0900
Subject: [PATCH 030/346] rust: add 'firmware' field support to module! macro

This adds 'firmware' field support to module! macro, corresponds to
MODULE_FIRMWARE macro. You can specify the file names of binary
firmware that the kernel module requires. The information is embedded
in the modinfo section of the kernel module. For example, a tool to
build an initramfs uses this information to put the firmware files
into the initramfs image.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@gmail.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Link: https://lore.kernel.org/r/20240501123548.51769-1-fujita.tomonori@gmail.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/macros/lib.rs    | 32 ++++++++++++++++++++++++++++++++
 rust/macros/module.rs | 18 ++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs
index 5214e07367c5..6dcc72aff111 100644
--- a/rust/macros/lib.rs
+++ b/rust/macros/lib.rs
@@ -56,6 +56,36 @@
 /// }
 /// ```
 ///
+/// ## Firmware
+///
+/// The following example shows how to declare a kernel module that needs
+/// to load binary firmware files. You need to specify the file names of
+/// the firmware in the `firmware` field. The information is embedded
+/// in the `modinfo` section of the kernel module. For example, a tool to
+/// build an initramfs uses this information to put the firmware files into
+/// the initramfs image.
+///
+/// ```ignore
+/// use kernel::prelude::*;
+///
+/// module!{
+///     type: MyDeviceDriverModule,
+///     name: "my_device_driver_module",
+///     author: "Rust for Linux Contributors",
+///     description: "My device driver requires firmware",
+///     license: "GPL",
+///     firmware: ["my_device_firmware1.bin", "my_device_firmware2.bin"],
+/// }
+///
+/// struct MyDeviceDriverModule;
+///
+/// impl kernel::Module for MyDeviceDriverModule {
+///     fn init() -> Result<Self> {
+///         Ok(Self)
+///     }
+/// }
+/// ```
+///
 /// # Supported argument types
 ///   - `type`: type which implements the [`Module`] trait (required).
 ///   - `name`: ASCII string literal of the name of the kernel module (required).
@@ -63,6 +93,8 @@
 ///   - `description`: string literal of the description of the kernel module.
 ///   - `license`: ASCII string literal of the license of the kernel module (required).
 ///   - `alias`: array of ASCII string literals of the alias names of the kernel module.
+///   - `firmware`: array of ASCII string literals of the firmware files of
+/// the kernel module.
 #[proc_macro]
 pub fn module(ts: TokenStream) -> TokenStream {
     module::module(ts)
diff --git a/rust/macros/module.rs b/rust/macros/module.rs
index acd0393b5095..411dc103d82e 100644
--- a/rust/macros/module.rs
+++ b/rust/macros/module.rs
@@ -97,14 +97,22 @@ struct ModuleInfo {
     author: Option<String>,
     description: Option<String>,
     alias: Option<Vec<String>>,
+    firmware: Option<Vec<String>>,
 }
 
 impl ModuleInfo {
     fn parse(it: &mut token_stream::IntoIter) -> Self {
         let mut info = ModuleInfo::default();
 
-        const EXPECTED_KEYS: &[&str] =
-            &["type", "name", "author", "description", "license", "alias"];
+        const EXPECTED_KEYS: &[&str] = &[
+            "type",
+            "name",
+            "author",
+            "description",
+            "license",
+            "alias",
+            "firmware",
+        ];
         const REQUIRED_KEYS: &[&str] = &["type", "name", "license"];
         let mut seen_keys = Vec::new();
 
@@ -131,6 +139,7 @@ fn parse(it: &mut token_stream::IntoIter) -> Self {
                 "description" => info.description = Some(expect_string(it)),
                 "license" => info.license = expect_string_ascii(it),
                 "alias" => info.alias = Some(expect_string_array(it)),
+                "firmware" => info.firmware = Some(expect_string_array(it)),
                 _ => panic!(
                     "Unknown key \"{}\". Valid keys are: {:?}.",
                     key, EXPECTED_KEYS
@@ -186,6 +195,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream {
             modinfo.emit("alias", &alias);
         }
     }
+    if let Some(firmware) = info.firmware {
+        for fw in firmware {
+            modinfo.emit("firmware", &fw);
+        }
+    }
 
     // Built-in modules also export the `file` modinfo string.
     let file =

From 526c539452cec6e7e65776d5807e6c66dd65d636 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 17 May 2024 19:06:15 +0200
Subject: [PATCH 031/346] docs: rust: introduce the new kernel.org LLVM+Rust
 toolchains

These combined LLVM+Rust toolchains are now available, thanks to Nathan
Chancellor (ClangBuiltLinux).

Thus introduce them in the Rust Quick Start guide.

Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20240517170615.377786-1-ojeda@kernel.org
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/quick-start.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index cc3f11e0d441..6fe69a601134 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -16,6 +16,13 @@ under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However,
 at the time of writing, they are likely not to be recent enough unless
 the distribution tracks the latest releases.
 
+Prebuilt stable versions of LLVM+Rust are provided on `kernel.org
+<https://kernel.org/pub/tools/llvm/rust/>`_. These are the same slim and fast
+LLVM toolchains from :ref:`Getting LLVM <getting_llvm>` with versions of Rust
+added to them that Rust for Linux supports, depending on the Linux version. Two
+sets are provided: the "latest LLVM" and "matching LLVM" (please see the link
+for more information).
+
 To easily check whether the requirements are met, the following target
 can be used::
 

From fe7d9d804337180d2377f0654537970c6cd863f7 Mon Sep 17 00:00:00 2001
From: Roland Xu <mu001999@outlook.com>
Date: Thu, 23 May 2024 00:08:46 +0800
Subject: [PATCH 032/346] rust: kernel: make impl_has_work compatible with more
 generics

Make the impl_has_work macro compatible with more complex generics such
as lifetimes and const generic arguments.

Signed-off-by: Roland Xu <mu001999@outlook.com>
Link: https://lore.kernel.org/r/ME0P282MB4890A180B99490CC65EF64FDCCEB2@ME0P282MB4890.AUSP282.PROD.OUTLOOK.COM
Suggested-by: Benno Lossin <benno.lossin@proton.me>
Link: https://github.com/Rust-for-Linux/linux/issues/1077
[ Wrapped message to 72 columns. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/kernel/workqueue.rs | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/rust/kernel/workqueue.rs b/rust/kernel/workqueue.rs
index 1cec63a2aea8..553a5cba2adc 100644
--- a/rust/kernel/workqueue.rs
+++ b/rust/kernel/workqueue.rs
@@ -482,24 +482,26 @@ unsafe fn work_container_of(ptr: *mut Work<T, ID>) -> *mut Self
 /// use kernel::sync::Arc;
 /// use kernel::workqueue::{self, impl_has_work, Work};
 ///
-/// struct MyStruct {
-///     work_field: Work<MyStruct, 17>,
+/// struct MyStruct<'a, T, const N: usize> {
+///     work_field: Work<MyStruct<'a, T, N>, 17>,
+///     f: fn(&'a [T; N]),
 /// }
 ///
 /// impl_has_work! {
-///     impl HasWork<MyStruct, 17> for MyStruct { self.work_field }
+///     impl{'a, T, const N: usize} HasWork<MyStruct<'a, T, N>, 17>
+///     for MyStruct<'a, T, N> { self.work_field }
 /// }
 /// ```
 #[macro_export]
 macro_rules! impl_has_work {
-    ($(impl$(<$($implarg:ident),*>)?
+    ($(impl$({$($generics:tt)*})?
        HasWork<$work_type:ty $(, $id:tt)?>
-       for $self:ident $(<$($selfarg:ident),*>)?
+       for $self:ty
        { self.$field:ident }
     )*) => {$(
         // SAFETY: The implementation of `raw_get_work` only compiles if the field has the right
         // type.
-        unsafe impl$(<$($implarg),*>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self $(<$($selfarg),*>)? {
+        unsafe impl$(<$($generics)+>)? $crate::workqueue::HasWork<$work_type $(, $id)?> for $self {
             const OFFSET: usize = ::core::mem::offset_of!(Self, $field) as usize;
 
             #[inline]
@@ -515,7 +517,7 @@ unsafe fn raw_get_work(ptr: *mut Self) -> *mut $crate::workqueue::Work<$work_typ
 pub use impl_has_work;
 
 impl_has_work! {
-    impl<T> HasWork<Self> for ClosureWork<T> { self.work }
+    impl{T} HasWork<Self> for ClosureWork<T> { self.work }
 }
 
 unsafe impl<T, const ID: u64> WorkItemPointer<ID> for Arc<T>

From 9ffc80c819739ab60c42223c46b7351cec6a0e97 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 28 May 2024 18:35:02 +0200
Subject: [PATCH 033/346] kbuild: rust: remove now-unneeded `rusttest` custom
 sysroot handling

Since we dropped our custom `alloc` in commit 9d0441bab775 ("rust: alloc:
remove our fork of the `alloc` crate"), there is no need anymore to keep
the custom sysroot hack.

Thus delete it, which makes the target way simpler and faster too.

This also means we are not using Cargo for anything at the moment,
and that no download is required anymore, so update the main `Makefile`
and the documentation accordingly.

Link: https://lore.kernel.org/r/20240528163502.411600-1-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/quick-start.rst | 14 ------
 Documentation/rust/testing.rst     |  5 +--
 Makefile                           |  3 +-
 rust/Makefile                      | 70 ++++--------------------------
 4 files changed, 11 insertions(+), 81 deletions(-)

diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index 6fe69a601134..ac2f16288458 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -171,20 +171,6 @@ can be installed manually::
 The standalone installers also come with ``clippy``.
 
 
-cargo
-*****
-
-``cargo`` is the Rust native build system. It is currently required to run
-the tests since it is used to build a custom standard library that contains
-the facilities provided by the custom ``alloc`` in the kernel. The tests can
-be run using the ``rusttest`` Make target.
-
-If ``rustup`` is being used, all the profiles already install the tool,
-thus nothing needs to be done.
-
-The standalone installers also come with ``cargo``.
-
-
 rustdoc
 *******
 
diff --git a/Documentation/rust/testing.rst b/Documentation/rust/testing.rst
index acfd0c2be48d..568b71b415a4 100644
--- a/Documentation/rust/testing.rst
+++ b/Documentation/rust/testing.rst
@@ -131,9 +131,8 @@ Additionally, there are the ``#[test]`` tests. These can be run using the
 
 	make LLVM=1 rusttest
 
-This requires the kernel ``.config`` and downloads external repositories. It
-runs the ``#[test]`` tests on the host (currently) and thus is fairly limited in
-what these tests can test.
+This requires the kernel ``.config``. It runs the ``#[test]`` tests on the host
+(currently) and thus is fairly limited in what these tests can test.
 
 The Kselftests
 --------------
diff --git a/Makefile b/Makefile
index b25b5b44af10..d4134a9f32b6 100644
--- a/Makefile
+++ b/Makefile
@@ -507,7 +507,6 @@ RUSTDOC		= rustdoc
 RUSTFMT		= rustfmt
 CLIPPY_DRIVER	= clippy-driver
 BINDGEN		= bindgen
-CARGO		= cargo
 PAHOLE		= pahole
 RESOLVE_BTFIDS	= $(objtree)/tools/bpf/resolve_btfids/resolve_btfids
 LEX		= flex
@@ -601,7 +600,7 @@ endif
 export RUSTC_BOOTSTRAP := 1
 
 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG
-export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO
+export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN
 export HOSTRUSTC KBUILD_HOSTRUSTFLAGS
 export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
 export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
diff --git a/rust/Makefile b/rust/Makefile
index f70d5e244fee..385378311322 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -44,17 +44,10 @@ rustc_sysroot := $(shell MAKEFLAGS= $(RUSTC) $(rust_flags) --print sysroot)
 rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2)
 RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library
 
-ifeq ($(quiet),silent_)
-cargo_quiet=-q
+ifneq ($(quiet),)
 rust_test_quiet=-q
 rustdoc_test_quiet=--test-args -q
 rustdoc_test_kernel_quiet=>/dev/null
-else ifeq ($(quiet),quiet_)
-rust_test_quiet=-q
-rustdoc_test_quiet=--test-args -q
-rustdoc_test_kernel_quiet=>/dev/null
-else
-cargo_quiet=--verbose
 endif
 
 core-cfgs = \
@@ -135,22 +128,21 @@ quiet_cmd_rustc_test_library = RUSTC TL $<
 		@$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \
 		--crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \
 		--out-dir $(objtree)/$(obj)/test --cfg testlib \
-		--sysroot $(objtree)/$(obj)/test/sysroot \
 		-L$(objtree)/$(obj)/test \
 		--crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $<
 
-rusttestlib-build_error: $(src)/build_error.rs rusttest-prepare FORCE
+rusttestlib-build_error: $(src)/build_error.rs FORCE
 	+$(call if_changed,rustc_test_library)
 
 rusttestlib-macros: private rustc_target_flags = --extern proc_macro
 rusttestlib-macros: private rustc_test_library_proc = yes
-rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE
+rusttestlib-macros: $(src)/macros/lib.rs FORCE
 	+$(call if_changed,rustc_test_library)
 
-rusttestlib-bindings: $(src)/bindings/lib.rs rusttest-prepare FORCE
+rusttestlib-bindings: $(src)/bindings/lib.rs FORCE
 	+$(call if_changed,rustc_test_library)
 
-rusttestlib-uapi: $(src)/uapi/lib.rs rusttest-prepare FORCE
+rusttestlib-uapi: $(src)/uapi/lib.rs FORCE
 	+$(call if_changed,rustc_test_library)
 
 quiet_cmd_rustdoc_test = RUSTDOC T $<
@@ -159,7 +151,7 @@ quiet_cmd_rustdoc_test = RUSTDOC T $<
 	$(RUSTDOC) --test $(rust_common_flags) \
 		@$(objtree)/include/generated/rustc_cfg \
 		$(rustc_target_flags) $(rustdoc_test_target_flags) \
-		--sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \
+		$(rustdoc_test_quiet) \
 		-L$(objtree)/$(obj)/test --output $(rustdoc_output) \
 		--crate-name $(subst rusttest-,,$@) $<
 
@@ -192,7 +184,6 @@ quiet_cmd_rustc_test = RUSTC T  $<
 	$(RUSTC) --test $(rust_common_flags) \
 		@$(objtree)/include/generated/rustc_cfg \
 		$(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \
-		--sysroot $(objtree)/$(obj)/test/sysroot \
 		-L$(objtree)/$(obj)/test \
 		--crate-name $(subst rusttest-,,$@) $<; \
 	$(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \
@@ -200,60 +191,15 @@ quiet_cmd_rustc_test = RUSTC T  $<
 
 rusttest: rusttest-macros rusttest-kernel
 
-# This prepares a custom sysroot with our custom `alloc` instead of
-# the standard one.
-#
-# This requires several hacks:
-#   - Unlike `core` and `alloc`, `std` depends on more than a dozen crates,
-#     including third-party crates that need to be downloaded, plus custom
-#     `build.rs` steps. Thus hardcoding things here is not maintainable.
-#   - `cargo` knows how to build the standard library, but it is an unstable
-#     feature so far (`-Zbuild-std`).
-#   - `cargo` only considers the use case of building the standard library
-#     to use it in a given package. Thus we need to create a dummy package
-#     and pick the generated libraries from there.
-#   - The usual ways of modifying the dependency graph in `cargo` do not seem
-#     to apply for the `-Zbuild-std` steps, thus we have to mislead it
-#     by modifying the sources in the sysroot.
-#   - To avoid messing with the user's Rust installation, we create a clone
-#     of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std`
-#     steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag.
-#
-# In the future, we hope to avoid the whole ordeal by either:
-#   - Making the `test` crate not depend on `std` (either improving upstream
-#     or having our own custom crate).
-#   - Making the tests run in kernel space (requires the previous point).
-#   - Making `std` and friends be more like a "normal" crate, so that
-#     `-Zbuild-std` and related hacks are not needed.
-quiet_cmd_rustsysroot = RUSTSYSROOT
-      cmd_rustsysroot = \
-	rm -rf $(objtree)/$(obj)/test; \
-	mkdir -p $(objtree)/$(obj)/test; \
-	cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \
-	echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \
-	echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \
-		>> $(objtree)/$(obj)/test/rustc_sysroot; \
-	chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \
-	$(CARGO) -q new $(objtree)/$(obj)/test/dummy; \
-	RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \
-		test -Zbuild-std --target $(rustc_host_target) \
-		--manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \
-	rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \
-	cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \
-		$(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib
-
-rusttest-prepare: FORCE
-	+$(call if_changed,rustsysroot)
-
 rusttest-macros: private rustc_target_flags = --extern proc_macro
 rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro
-rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE
+rusttest-macros: $(src)/macros/lib.rs FORCE
 	+$(call if_changed,rustc_test)
 	+$(call if_changed,rustdoc_test)
 
 rusttest-kernel: private rustc_target_flags = --extern alloc \
     --extern build_error --extern macros --extern bindings --extern uapi
-rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \
+rusttest-kernel: $(src)/kernel/lib.rs \
     rusttestlib-build_error rusttestlib-macros rusttestlib-bindings \
     rusttestlib-uapi FORCE
 	+$(call if_changed,rustc_test)

From d3ee24cce4e558041cbf21a5bc9cd9a4b1b3c7e7 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Wed, 29 May 2024 08:34:52 +0000
Subject: [PATCH 034/346] rust: alloc: fix typo in docs for GFP_NOWAIT

Fix a typo in alloc.rs by replacing Ror with For.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Vincenzo Palazzo <vincenzopalazzodev@gmail.com>
Link: https://lore.kernel.org/r/20240529083452.779865-1-aliceryhl@google.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/kernel/alloc.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs
index 531b5e471cb1..c44995914f75 100644
--- a/rust/kernel/alloc.rs
+++ b/rust/kernel/alloc.rs
@@ -66,7 +66,7 @@ pub mod flags {
     /// The same as [`GFP_KERNEL`], except the allocation is accounted to kmemcg.
     pub const GFP_KERNEL_ACCOUNT: Flags = Flags(bindings::GFP_KERNEL_ACCOUNT);
 
-    /// Ror kernel allocations that should not stall for direct reclaim, start physical IO or
+    /// For kernel allocations that should not stall for direct reclaim, start physical IO or
     /// use any filesystem callback.  It is very likely to fail to allocate memory, even for very
     /// small allocations.
     pub const GFP_NOWAIT: Flags = Flags(bindings::GFP_NOWAIT);

From ab44079e2869c9f1743d167d5f9b5befe375b6d9 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Fri, 7 Jun 2024 08:23:41 +0000
Subject: [PATCH 035/346] rust: alloc: add __GFP_HIGHMEM flag

Make it possible to allocate memory that doesn't need to mapped into the
kernel's address space. This flag is useful together with
Page::alloc_page [1].

Rust Binder needs this for the memory that holds incoming transactions
for each process. Each process will have a few megabytes of memory
allocated with this flag, which is mapped into the process using
vm_insert_page. When the kernel copies data for an incoming transaction
into a process's memory region, it will use kmap_local_page to
temporarily map pages that are being modified. There is no need for them
to take up address space in the kernel when the kernel is not writing an
incoming transaction into the page.

Link: https://lore.kernel.org/all/20240528-alice-mm-v7-4-78222c31b8f4@google.com/ [1]
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/r/20240607-highmem-v1-1-d18c5ca4072f@google.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/bindings/bindings_helper.h | 1 +
 rust/kernel/alloc.rs            | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index ddb5644d4fd9..52a1412338ef 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -25,3 +25,4 @@ const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
 const gfp_t RUST_CONST_HELPER_GFP_NOWAIT = GFP_NOWAIT;
 const gfp_t RUST_CONST_HELPER___GFP_ZERO = __GFP_ZERO;
+const gfp_t RUST_CONST_HELPER___GFP_HIGHMEM = ___GFP_HIGHMEM;
diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs
index c44995914f75..396fe5a85a8f 100644
--- a/rust/kernel/alloc.rs
+++ b/rust/kernel/alloc.rs
@@ -52,6 +52,14 @@ pub mod flags {
     /// This is normally or'd with other flags.
     pub const __GFP_ZERO: Flags = Flags(bindings::__GFP_ZERO);
 
+    /// Allow the allocation to be in high memory.
+    ///
+    /// Allocations in high memory may not be mapped into the kernel's address space, so this can't
+    /// be used with `kmalloc` and other similar methods.
+    ///
+    /// This is normally or'd with other flags.
+    pub const __GFP_HIGHMEM: Flags = Flags(bindings::__GFP_HIGHMEM);
+
     /// Users can not sleep and need the allocation to succeed.
     ///
     /// A lower watermark is applied to allow access to "atomic reserves". The current

From b63c455d38be5f62a0665f3080c67334db5b4c41 Mon Sep 17 00:00:00 2001
From: Dirk Behme <dirk.behme@de.bosch.com>
Date: Mon, 10 Jun 2024 14:23:32 +0200
Subject: [PATCH 036/346] docs: rust: no_std is used

Using the #![no_std] attribute in the Rust kernel support is different
to the default Rust usage. Mention this in the Documentation.

Signed-off-by: Dirk Behme <dirk.behme@de.bosch.com>
Reviewed-by: Trevor Gross <tmgross@umich.edu>
Link: https://lore.kernel.org/r/20240610122332.3858571-1-dirk.behme@de.bosch.com
[ Avoided breaking links in two lines. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/general-information.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst
index 4bb6ac12d482..e3f388ef4ee4 100644
--- a/Documentation/rust/general-information.rst
+++ b/Documentation/rust/general-information.rst
@@ -7,6 +7,14 @@ This document contains useful information to know when working with
 the Rust support in the kernel.
 
 
+``no_std``
+----------
+
+The Rust support in the kernel can link only `core <https://doc.rust-lang.org/core/>`_,
+but not `std <https://doc.rust-lang.org/std/>`_. Crates for use in the
+kernel must opt into this behavior using the ``#![no_std]`` attribute.
+
+
 Code documentation
 ------------------
 

From 6dc9d9ca9a728c8b30976d3ca353563ba0bfb949 Mon Sep 17 00:00:00 2001
From: John Hubbard <jhubbard@nvidia.com>
Date: Thu, 27 Jun 2024 17:43:55 -0700
Subject: [PATCH 037/346] kbuild: rust-analyzer: better error handling

1) Provide a better error message for the "Rust not available" case.
Without this patch, one gets various misleading messages, such as:

    "No rule to make target 'rust-analyzer'"

Instead, run scripts/rust_is_available.sh directly, as a prerequisite,
and let that script report the cause of any problems, as well as
providing a link to the documentation. Thanks to Miguel Ojeda for the
idea of just letting rust_is_available.sh report its results directly.

The new output in the failure case looks like this:

$ make rust-analyzer
***
*** Rust compiler 'rustc' could not be found.
***
***
*** Please see Documentation/rust/quick-start.rst for details
*** on how to set up the Rust support.
***
make[1]: *** [/kernel_work/linux-github/Makefile:1975: rust-analyzer] Error 1
make: *** [Makefile:240: __sub-make] Error 2

Reviewed-by: Finn Behrens <me@kloenk.dev>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Link: https://lore.kernel.org/r/20240628004356.1384486-2-jhubbard@nvidia.com
[ Reworded title. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index d4134a9f32b6..de5be3845c70 100644
--- a/Makefile
+++ b/Makefile
@@ -1969,6 +1969,7 @@ tags TAGS cscope gtags: FORCE
 # IDE support targets
 PHONY += rust-analyzer
 rust-analyzer:
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh
 	$(Q)$(MAKE) $(build)=rust $@
 
 # Script to generate missing namespace dependencies

From 5045b460843adf5bb31774b9464df5ae8e4da0d4 Mon Sep 17 00:00:00 2001
From: John Hubbard <jhubbard@nvidia.com>
Date: Thu, 27 Jun 2024 17:43:56 -0700
Subject: [PATCH 038/346] kbuild: rust-analyzer: improve comment documentation

Replace the cryptic phrase ("IDE support targets") that initially
appears to be about how to support old hard drives, with a few sentences
that explain what "make rust-analyzer" provides.

Signed-off-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Finn Behrens <me@kloenk.dev>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/r/20240628004356.1384486-3-jhubbard@nvidia.com
[ Reworded title. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index de5be3845c70..fea263aaa492 100644
--- a/Makefile
+++ b/Makefile
@@ -1966,7 +1966,9 @@ quiet_cmd_tags = GEN     $@
 tags TAGS cscope gtags: FORCE
 	$(call cmd,tags)
 
-# IDE support targets
+# Generate rust-project.json (a file that describes the structure of non-Cargo
+# Rust projects) for rust-analyzer (an implementation of the Language Server
+# Protocol).
 PHONY += rust-analyzer
 rust-analyzer:
 	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh

From 1b580e7b9ba2e5939c4b94da2cb4888605b39955 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <wedsonaf@gmail.com>
Date: Tue, 28 May 2024 14:58:02 +0000
Subject: [PATCH 039/346] rust: uaccess: add userspace pointers

A pointer to an area in userspace memory, which can be either read-only
or read-write.

All methods on this struct are safe: attempting to read or write on bad
addresses (either out of the bound of the slice or unmapped addresses)
will return `EFAULT`. Concurrent access, *including data races to/from
userspace memory*, is permitted, because fundamentally another userspace
thread/process could always be modifying memory at the same time (in the
same way that userspace Rust's `std::io` permits data races with the
contents of files on disk). In the presence of a race, the exact byte
values read/written are unspecified but the operation is well-defined.
Kernelspace code should validate its copy of data after completing a
read, and not expect that multiple reads of the same address will return
the same value.

These APIs are designed to make it difficult to accidentally write
TOCTOU bugs. Every time you read from a memory location, the pointer is
advanced by the length so that you cannot use that reader to read the
same memory location twice. Preventing double-fetches avoids TOCTOU
bugs. This is accomplished by taking `self` by value to prevent
obtaining multiple readers on a given `UserSlice`, and the readers only
permitting forward reads. If double-fetching a memory location is
necessary for some reason, then that is done by creating multiple
readers to the same memory location.

Constructing a `UserSlice` performs no checks on the provided address
and length, it can safely be constructed inside a kernel thread with no
current userspace process. Reads and writes wrap the kernel APIs
`copy_from_user` and `copy_to_user`, which check the memory map of the
current process and enforce that the address range is within the user
range (no additional calls to `access_ok` are needed).

This code is based on something that was originally written by Wedson on
the old rust branch. It was modified by Alice by removing the
`IoBufferReader` and `IoBufferWriter` traits, and various other changes.

Signed-off-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Reviewed-by: Trevor Gross <tmgross@umich.edu>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/r/20240528-alice-mm-v7-1-78222c31b8f4@google.com
[ Wrapped docs to 100 and added a few intra-doc links. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/helpers.c         |  14 ++
 rust/kernel/lib.rs     |   1 +
 rust/kernel/uaccess.rs | 313 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 328 insertions(+)
 create mode 100644 rust/kernel/uaccess.rs

diff --git a/rust/helpers.c b/rust/helpers.c
index 2c37a0f5d7a8..d6abe4bd45d9 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -39,6 +39,20 @@ __noreturn void rust_helper_BUG(void)
 }
 EXPORT_SYMBOL_GPL(rust_helper_BUG);
 
+unsigned long rust_helper_copy_from_user(void *to, const void __user *from,
+					 unsigned long n)
+{
+	return copy_from_user(to, from, n);
+}
+EXPORT_SYMBOL_GPL(rust_helper_copy_from_user);
+
+unsigned long rust_helper_copy_to_user(void __user *to, const void *from,
+				       unsigned long n)
+{
+	return copy_to_user(to, from, n);
+}
+EXPORT_SYMBOL_GPL(rust_helper_copy_to_user);
+
 void rust_helper_mutex_lock(struct mutex *lock)
 {
 	mutex_lock(lock);
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index fbd91a48ff8b..767026db068e 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -45,6 +45,7 @@
 pub mod task;
 pub mod time;
 pub mod types;
+pub mod uaccess;
 pub mod workqueue;
 
 #[doc(hidden)]
diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs
new file mode 100644
index 000000000000..55f0d7ecfa3f
--- /dev/null
+++ b/rust/kernel/uaccess.rs
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Slices to user space memory regions.
+//!
+//! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h)
+
+use crate::{alloc::Flags, bindings, error::Result, prelude::*};
+use alloc::vec::Vec;
+use core::ffi::{c_ulong, c_void};
+use core::mem::MaybeUninit;
+
+/// The type used for userspace addresses.
+pub type UserPtr = usize;
+
+/// A pointer to an area in userspace memory, which can be either read-only or read-write.
+///
+/// All methods on this struct are safe: attempting to read or write on bad addresses (either out of
+/// the bound of the slice or unmapped addresses) will return [`EFAULT`]. Concurrent access,
+/// *including data races to/from userspace memory*, is permitted, because fundamentally another
+/// userspace thread/process could always be modifying memory at the same time (in the same way that
+/// userspace Rust's [`std::io`] permits data races with the contents of files on disk). In the
+/// presence of a race, the exact byte values read/written are unspecified but the operation is
+/// well-defined. Kernelspace code should validate its copy of data after completing a read, and not
+/// expect that multiple reads of the same address will return the same value.
+///
+/// These APIs are designed to make it difficult to accidentally write TOCTOU (time-of-check to
+/// time-of-use) bugs. Every time a memory location is read, the reader's position is advanced by
+/// the read length and the next read will start from there. This helps prevent accidentally reading
+/// the same location twice and causing a TOCTOU bug.
+///
+/// Creating a [`UserSliceReader`] and/or [`UserSliceWriter`] consumes the `UserSlice`, helping
+/// ensure that there aren't multiple readers or writers to the same location.
+///
+/// If double-fetching a memory location is necessary for some reason, then that is done by creating
+/// multiple readers to the same memory location, e.g. using [`clone_reader`].
+///
+/// # Examples
+///
+/// Takes a region of userspace memory from the current process, and modify it by adding one to
+/// every byte in the region.
+///
+/// ```no_run
+/// use alloc::vec::Vec;
+/// use core::ffi::c_void;
+/// use kernel::error::Result;
+/// use kernel::uaccess::{UserPtr, UserSlice};
+///
+/// fn bytes_add_one(uptr: UserPtr, len: usize) -> Result<()> {
+///     let (read, mut write) = UserSlice::new(uptr, len).reader_writer();
+///
+///     let mut buf = Vec::new();
+///     read.read_all(&mut buf, GFP_KERNEL)?;
+///
+///     for b in &mut buf {
+///         *b = b.wrapping_add(1);
+///     }
+///
+///     write.write_slice(&buf)?;
+///     Ok(())
+/// }
+/// ```
+///
+/// Example illustrating a TOCTOU (time-of-check to time-of-use) bug.
+///
+/// ```no_run
+/// use alloc::vec::Vec;
+/// use core::ffi::c_void;
+/// use kernel::error::{code::EINVAL, Result};
+/// use kernel::uaccess::{UserPtr, UserSlice};
+///
+/// /// Returns whether the data in this region is valid.
+/// fn is_valid(uptr: UserPtr, len: usize) -> Result<bool> {
+///     let read = UserSlice::new(uptr, len).reader();
+///
+///     let mut buf = Vec::new();
+///     read.read_all(&mut buf, GFP_KERNEL)?;
+///
+///     todo!()
+/// }
+///
+/// /// Returns the bytes behind this user pointer if they are valid.
+/// fn get_bytes_if_valid(uptr: UserPtr, len: usize) -> Result<Vec<u8>> {
+///     if !is_valid(uptr, len)? {
+///         return Err(EINVAL);
+///     }
+///
+///     let read = UserSlice::new(uptr, len).reader();
+///
+///     let mut buf = Vec::new();
+///     read.read_all(&mut buf, GFP_KERNEL)?;
+///
+///     // THIS IS A BUG! The bytes could have changed since we checked them.
+///     //
+///     // To avoid this kind of bug, don't call `UserSlice::new` multiple
+///     // times with the same address.
+///     Ok(buf)
+/// }
+/// ```
+///
+/// [`std::io`]: https://doc.rust-lang.org/std/io/index.html
+/// [`clone_reader`]: UserSliceReader::clone_reader
+pub struct UserSlice {
+    ptr: UserPtr,
+    length: usize,
+}
+
+impl UserSlice {
+    /// Constructs a user slice from a raw pointer and a length in bytes.
+    ///
+    /// Constructing a [`UserSlice`] performs no checks on the provided address and length, it can
+    /// safely be constructed inside a kernel thread with no current userspace process. Reads and
+    /// writes wrap the kernel APIs `copy_from_user` and `copy_to_user`, which check the memory map
+    /// of the current process and enforce that the address range is within the user range (no
+    /// additional calls to `access_ok` are needed). Validity of the pointer is checked when you
+    /// attempt to read or write, not in the call to `UserSlice::new`.
+    ///
+    /// Callers must be careful to avoid time-of-check-time-of-use (TOCTOU) issues. The simplest way
+    /// is to create a single instance of [`UserSlice`] per user memory block as it reads each byte
+    /// at most once.
+    pub fn new(ptr: UserPtr, length: usize) -> Self {
+        UserSlice { ptr, length }
+    }
+
+    /// Reads the entirety of the user slice, appending it to the end of the provided buffer.
+    ///
+    /// Fails with [`EFAULT`] if the read happens on a bad address.
+    pub fn read_all(self, buf: &mut Vec<u8>, flags: Flags) -> Result {
+        self.reader().read_all(buf, flags)
+    }
+
+    /// Constructs a [`UserSliceReader`].
+    pub fn reader(self) -> UserSliceReader {
+        UserSliceReader {
+            ptr: self.ptr,
+            length: self.length,
+        }
+    }
+
+    /// Constructs a [`UserSliceWriter`].
+    pub fn writer(self) -> UserSliceWriter {
+        UserSliceWriter {
+            ptr: self.ptr,
+            length: self.length,
+        }
+    }
+
+    /// Constructs both a [`UserSliceReader`] and a [`UserSliceWriter`].
+    ///
+    /// Usually when this is used, you will first read the data, and then overwrite it afterwards.
+    pub fn reader_writer(self) -> (UserSliceReader, UserSliceWriter) {
+        (
+            UserSliceReader {
+                ptr: self.ptr,
+                length: self.length,
+            },
+            UserSliceWriter {
+                ptr: self.ptr,
+                length: self.length,
+            },
+        )
+    }
+}
+
+/// A reader for [`UserSlice`].
+///
+/// Used to incrementally read from the user slice.
+pub struct UserSliceReader {
+    ptr: UserPtr,
+    length: usize,
+}
+
+impl UserSliceReader {
+    /// Skip the provided number of bytes.
+    ///
+    /// Returns an error if skipping more than the length of the buffer.
+    pub fn skip(&mut self, num_skip: usize) -> Result {
+        // Update `self.length` first since that's the fallible part of this operation.
+        self.length = self.length.checked_sub(num_skip).ok_or(EFAULT)?;
+        self.ptr = self.ptr.wrapping_add(num_skip);
+        Ok(())
+    }
+
+    /// Create a reader that can access the same range of data.
+    ///
+    /// Reading from the clone does not advance the current reader.
+    ///
+    /// The caller should take care to not introduce TOCTOU issues, as described in the
+    /// documentation for [`UserSlice`].
+    pub fn clone_reader(&self) -> UserSliceReader {
+        UserSliceReader {
+            ptr: self.ptr,
+            length: self.length,
+        }
+    }
+
+    /// Returns the number of bytes left to be read from this reader.
+    ///
+    /// Note that even reading less than this number of bytes may fail.
+    pub fn len(&self) -> usize {
+        self.length
+    }
+
+    /// Returns `true` if no data is available in the io buffer.
+    pub fn is_empty(&self) -> bool {
+        self.length == 0
+    }
+
+    /// Reads raw data from the user slice into a kernel buffer.
+    ///
+    /// For a version that uses `&mut [u8]`, please see [`UserSliceReader::read_slice`].
+    ///
+    /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of
+    /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error.
+    ///
+    /// # Guarantees
+    ///
+    /// After a successful call to this method, all bytes in `out` are initialized.
+    pub fn read_raw(&mut self, out: &mut [MaybeUninit<u8>]) -> Result {
+        let len = out.len();
+        let out_ptr = out.as_mut_ptr().cast::<c_void>();
+        if len > self.length {
+            return Err(EFAULT);
+        }
+        let Ok(len_ulong) = c_ulong::try_from(len) else {
+            return Err(EFAULT);
+        };
+        // SAFETY: `out_ptr` points into a mutable slice of length `len_ulong`, so we may write
+        // that many bytes to it.
+        let res =
+            unsafe { bindings::copy_from_user(out_ptr, self.ptr as *const c_void, len_ulong) };
+        if res != 0 {
+            return Err(EFAULT);
+        }
+        self.ptr = self.ptr.wrapping_add(len);
+        self.length -= len;
+        Ok(())
+    }
+
+    /// Reads raw data from the user slice into a kernel buffer.
+    ///
+    /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of
+    /// bounds of this [`UserSliceReader`]. This call may modify `out` even if it returns an error.
+    pub fn read_slice(&mut self, out: &mut [u8]) -> Result {
+        // SAFETY: The types are compatible and `read_raw` doesn't write uninitialized bytes to
+        // `out`.
+        let out = unsafe { &mut *(out as *mut [u8] as *mut [MaybeUninit<u8>]) };
+        self.read_raw(out)
+    }
+
+    /// Reads the entirety of the user slice, appending it to the end of the provided buffer.
+    ///
+    /// Fails with [`EFAULT`] if the read happens on a bad address.
+    pub fn read_all(mut self, buf: &mut Vec<u8>, flags: Flags) -> Result {
+        let len = self.length;
+        VecExt::<u8>::reserve(buf, len, flags)?;
+
+        // The call to `try_reserve` was successful, so the spare capacity is at least `len` bytes
+        // long.
+        self.read_raw(&mut buf.spare_capacity_mut()[..len])?;
+
+        // SAFETY: Since the call to `read_raw` was successful, so the next `len` bytes of the
+        // vector have been initialized.
+        unsafe { buf.set_len(buf.len() + len) };
+        Ok(())
+    }
+}
+
+/// A writer for [`UserSlice`].
+///
+/// Used to incrementally write into the user slice.
+pub struct UserSliceWriter {
+    ptr: UserPtr,
+    length: usize,
+}
+
+impl UserSliceWriter {
+    /// Returns the amount of space remaining in this buffer.
+    ///
+    /// Note that even writing less than this number of bytes may fail.
+    pub fn len(&self) -> usize {
+        self.length
+    }
+
+    /// Returns `true` if no more data can be written to this buffer.
+    pub fn is_empty(&self) -> bool {
+        self.length == 0
+    }
+
+    /// Writes raw data to this user pointer from a kernel buffer.
+    ///
+    /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of
+    /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even
+    /// if it returns an error.
+    pub fn write_slice(&mut self, data: &[u8]) -> Result {
+        let len = data.len();
+        let data_ptr = data.as_ptr().cast::<c_void>();
+        if len > self.length {
+            return Err(EFAULT);
+        }
+        let Ok(len_ulong) = c_ulong::try_from(len) else {
+            return Err(EFAULT);
+        };
+        // SAFETY: `data_ptr` points into an immutable slice of length `len_ulong`, so we may read
+        // that many bytes from it.
+        let res = unsafe { bindings::copy_to_user(self.ptr as *mut c_void, data_ptr, len_ulong) };
+        if res != 0 {
+            return Err(EFAULT);
+        }
+        self.ptr = self.ptr.wrapping_add(len);
+        self.length -= len;
+        Ok(())
+    }
+}

From 1f9a8286bc0c3df7d789ea625d9d9db3d7779f2d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 28 May 2024 14:58:03 +0000
Subject: [PATCH 040/346] uaccess: always export _copy_[from|to]_user with
 CONFIG_RUST

Rust code needs to be able to access _copy_from_user and _copy_to_user
so that it can skip the check_copy_size check in cases where the length
is known at compile-time, mirroring the logic for when C code will skip
check_copy_size. To do this, we ensure that exported versions of these
methods are available when CONFIG_RUST is enabled.

Alice has verified that this patch passes the CONFIG_TEST_USER_COPY test
on x86 using the Android cuttlefish emulator.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240528-alice-mm-v7-2-78222c31b8f4@google.com
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/uaccess.h | 46 ++++++++++++++++++++++++++++-------------
 lib/usercopy.c          | 30 ++++-----------------------
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 3064314f4832..d8e4105a2f21 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -5,6 +5,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/instrumented.h>
 #include <linux/minmax.h>
+#include <linux/nospec.h>
 #include <linux/sched.h>
 #include <linux/thread_info.h>
 
@@ -138,13 +139,26 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
 	return raw_copy_to_user(to, from, n);
 }
 
-#ifdef INLINE_COPY_FROM_USER
+/*
+ * Architectures that #define INLINE_COPY_TO_USER use this function
+ * directly in the normal copy_to/from_user(), the other ones go
+ * through an extern _copy_to/from_user(), which expands the same code
+ * here.
+ *
+ * Rust code always uses the extern definition.
+ */
 static inline __must_check unsigned long
-_copy_from_user(void *to, const void __user *from, unsigned long n)
+_inline_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	unsigned long res = n;
 	might_fault();
 	if (!should_fail_usercopy() && likely(access_ok(from, n))) {
+		/*
+		 * Ensure that bad access_ok() speculation will not
+		 * lead to nasty side effects *after* the copy is
+		 * finished:
+		 */
+		barrier_nospec();
 		instrument_copy_from_user_before(to, from, n);
 		res = raw_copy_from_user(to, from, n);
 		instrument_copy_from_user_after(to, from, n, res);
@@ -153,14 +167,11 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
 		memset(to + (n - res), 0, res);
 	return res;
 }
-#else
 extern __must_check unsigned long
 _copy_from_user(void *, const void __user *, unsigned long);
-#endif
 
-#ifdef INLINE_COPY_TO_USER
 static inline __must_check unsigned long
-_copy_to_user(void __user *to, const void *from, unsigned long n)
+_inline_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 	might_fault();
 	if (should_fail_usercopy())
@@ -171,25 +182,32 @@ _copy_to_user(void __user *to, const void *from, unsigned long n)
 	}
 	return n;
 }
-#else
 extern __must_check unsigned long
 _copy_to_user(void __user *, const void *, unsigned long);
-#endif
 
 static __always_inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (check_copy_size(to, n, false))
-		n = _copy_from_user(to, from, n);
-	return n;
+	if (!check_copy_size(to, n, false))
+		return n;
+#ifdef INLINE_COPY_FROM_USER
+	return _inline_copy_from_user(to, from, n);
+#else
+	return _copy_from_user(to, from, n);
+#endif
 }
 
 static __always_inline unsigned long __must_check
 copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (check_copy_size(from, n, true))
-		n = _copy_to_user(to, from, n);
-	return n;
+	if (!check_copy_size(from, n, true))
+		return n;
+
+#ifdef INLINE_COPY_TO_USER
+	return _inline_copy_to_user(to, from, n);
+#else
+	return _copy_to_user(to, from, n);
+#endif
 }
 
 #ifndef copy_mc_to_kernel
diff --git a/lib/usercopy.c b/lib/usercopy.c
index 499a7a7d54db..7b17b83c8042 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -12,40 +12,18 @@
 
 /* out-of-line parts */
 
-#ifndef INLINE_COPY_FROM_USER
+#if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST)
 unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	unsigned long res = n;
-	might_fault();
-	if (!should_fail_usercopy() && likely(access_ok(from, n))) {
-		/*
-		 * Ensure that bad access_ok() speculation will not
-		 * lead to nasty side effects *after* the copy is
-		 * finished:
-		 */
-		barrier_nospec();
-		instrument_copy_from_user_before(to, from, n);
-		res = raw_copy_from_user(to, from, n);
-		instrument_copy_from_user_after(to, from, n, res);
-	}
-	if (unlikely(res))
-		memset(to + (n - res), 0, res);
-	return res;
+	return _inline_copy_from_user(to, from, n);
 }
 EXPORT_SYMBOL(_copy_from_user);
 #endif
 
-#ifndef INLINE_COPY_TO_USER
+#if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST)
 unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	might_fault();
-	if (should_fail_usercopy())
-		return n;
-	if (likely(access_ok(to, n))) {
-		instrument_copy_to_user(to, from, n);
-		n = raw_copy_to_user(to, from, n);
-	}
-	return n;
+	return _inline_copy_to_user(to, from, n);
 }
 EXPORT_SYMBOL(_copy_to_user);
 #endif

From b33bf37adbb2ae35881e7fdd997ce3334d71b129 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Tue, 28 May 2024 14:58:04 +0000
Subject: [PATCH 041/346] rust: uaccess: add typed accessors for userspace
 pointers

Add safe methods for reading and writing Rust values to and from
userspace pointers.

The C methods for copying to/from userspace use a function called
`check_object_size` to verify that the kernel pointer is not dangling.
However, this check is skipped when the length is a compile-time
constant, with the assumption that such cases trivially have a correct
kernel pointer.

In this patch, we apply the same optimization to the typed accessors.
For both methods, the size of the operation is known at compile time to
be size_of of the type being read or written. Since the C side doesn't
provide a variant that skips only this check, we create custom helpers
for this purpose.

The majority of reads and writes to userspace pointers in the Rust
Binder driver uses these accessor methods. Benchmarking has found that
skipping the `check_object_size` check makes a big difference for the
cases being skipped here. (And that the check doesn't make a difference
for the cases that use the raw read/write methods.)

This code is based on something that was originally written by Wedson on
the old rust branch. It was modified by Alice to skip the
`check_object_size` check, and to update various comments, including the
notes about kernel pointers in `WritableToBytes`.

Co-developed-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Trevor Gross <tmgross@umich.edu>
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/r/20240528-alice-mm-v7-3-78222c31b8f4@google.com
[ Wrapped docs to 100 and added a few intra-doc links. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/kernel/types.rs   | 64 ++++++++++++++++++++++++++++++++++
 rust/kernel/uaccess.rs | 79 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs
index 2e7c9008621f..bd189d646adb 100644
--- a/rust/kernel/types.rs
+++ b/rust/kernel/types.rs
@@ -409,3 +409,67 @@ pub enum Either<L, R> {
     /// Constructs an instance of [`Either`] containing a value of type `R`.
     Right(R),
 }
+
+/// Types for which any bit pattern is valid.
+///
+/// Not all types are valid for all values. For example, a `bool` must be either zero or one, so
+/// reading arbitrary bytes into something that contains a `bool` is not okay.
+///
+/// It's okay for the type to have padding, as initializing those bytes has no effect.
+///
+/// # Safety
+///
+/// All bit-patterns must be valid for this type. This type must not have interior mutability.
+pub unsafe trait FromBytes {}
+
+// SAFETY: All bit patterns are acceptable values of the types below.
+unsafe impl FromBytes for u8 {}
+unsafe impl FromBytes for u16 {}
+unsafe impl FromBytes for u32 {}
+unsafe impl FromBytes for u64 {}
+unsafe impl FromBytes for usize {}
+unsafe impl FromBytes for i8 {}
+unsafe impl FromBytes for i16 {}
+unsafe impl FromBytes for i32 {}
+unsafe impl FromBytes for i64 {}
+unsafe impl FromBytes for isize {}
+// SAFETY: If all bit patterns are acceptable for individual values in an array, then all bit
+// patterns are also acceptable for arrays of that type.
+unsafe impl<T: FromBytes> FromBytes for [T] {}
+unsafe impl<T: FromBytes, const N: usize> FromBytes for [T; N] {}
+
+/// Types that can be viewed as an immutable slice of initialized bytes.
+///
+/// If a struct implements this trait, then it is okay to copy it byte-for-byte to userspace. This
+/// means that it should not have any padding, as padding bytes are uninitialized. Reading
+/// uninitialized memory is not just undefined behavior, it may even lead to leaking sensitive
+/// information on the stack to userspace.
+///
+/// The struct should also not hold kernel pointers, as kernel pointer addresses are also considered
+/// sensitive. However, leaking kernel pointers is not considered undefined behavior by Rust, so
+/// this is a correctness requirement, but not a safety requirement.
+///
+/// # Safety
+///
+/// Values of this type may not contain any uninitialized bytes. This type must not have interior
+/// mutability.
+pub unsafe trait AsBytes {}
+
+// SAFETY: Instances of the following types have no uninitialized portions.
+unsafe impl AsBytes for u8 {}
+unsafe impl AsBytes for u16 {}
+unsafe impl AsBytes for u32 {}
+unsafe impl AsBytes for u64 {}
+unsafe impl AsBytes for usize {}
+unsafe impl AsBytes for i8 {}
+unsafe impl AsBytes for i16 {}
+unsafe impl AsBytes for i32 {}
+unsafe impl AsBytes for i64 {}
+unsafe impl AsBytes for isize {}
+unsafe impl AsBytes for bool {}
+unsafe impl AsBytes for char {}
+unsafe impl AsBytes for str {}
+// SAFETY: If individual values in an array have no uninitialized portions, then the array itself
+// does not have any uninitialized portions either.
+unsafe impl<T: AsBytes> AsBytes for [T] {}
+unsafe impl<T: AsBytes, const N: usize> AsBytes for [T; N] {}
diff --git a/rust/kernel/uaccess.rs b/rust/kernel/uaccess.rs
index 55f0d7ecfa3f..e9347cff99ab 100644
--- a/rust/kernel/uaccess.rs
+++ b/rust/kernel/uaccess.rs
@@ -4,10 +4,16 @@
 //!
 //! C header: [`include/linux/uaccess.h`](srctree/include/linux/uaccess.h)
 
-use crate::{alloc::Flags, bindings, error::Result, prelude::*};
+use crate::{
+    alloc::Flags,
+    bindings,
+    error::Result,
+    prelude::*,
+    types::{AsBytes, FromBytes},
+};
 use alloc::vec::Vec;
 use core::ffi::{c_ulong, c_void};
-use core::mem::MaybeUninit;
+use core::mem::{size_of, MaybeUninit};
 
 /// The type used for userspace addresses.
 pub type UserPtr = usize;
@@ -247,6 +253,41 @@ pub fn read_slice(&mut self, out: &mut [u8]) -> Result {
         self.read_raw(out)
     }
 
+    /// Reads a value of the specified type.
+    ///
+    /// Fails with [`EFAULT`] if the read happens on a bad address, or if the read goes out of
+    /// bounds of this [`UserSliceReader`].
+    pub fn read<T: FromBytes>(&mut self) -> Result<T> {
+        let len = size_of::<T>();
+        if len > self.length {
+            return Err(EFAULT);
+        }
+        let Ok(len_ulong) = c_ulong::try_from(len) else {
+            return Err(EFAULT);
+        };
+        let mut out: MaybeUninit<T> = MaybeUninit::uninit();
+        // SAFETY: The local variable `out` is valid for writing `size_of::<T>()` bytes.
+        //
+        // By using the _copy_from_user variant, we skip the check_object_size check that verifies
+        // the kernel pointer. This mirrors the logic on the C side that skips the check when the
+        // length is a compile-time constant.
+        let res = unsafe {
+            bindings::_copy_from_user(
+                out.as_mut_ptr().cast::<c_void>(),
+                self.ptr as *const c_void,
+                len_ulong,
+            )
+        };
+        if res != 0 {
+            return Err(EFAULT);
+        }
+        self.ptr = self.ptr.wrapping_add(len);
+        self.length -= len;
+        // SAFETY: The read above has initialized all bytes in `out`, and since `T` implements
+        // `FromBytes`, any bit-pattern is a valid value for this type.
+        Ok(unsafe { out.assume_init() })
+    }
+
     /// Reads the entirety of the user slice, appending it to the end of the provided buffer.
     ///
     /// Fails with [`EFAULT`] if the read happens on a bad address.
@@ -310,4 +351,38 @@ pub fn write_slice(&mut self, data: &[u8]) -> Result {
         self.length -= len;
         Ok(())
     }
+
+    /// Writes the provided Rust value to this userspace pointer.
+    ///
+    /// Fails with [`EFAULT`] if the write happens on a bad address, or if the write goes out of
+    /// bounds of this [`UserSliceWriter`]. This call may modify the associated userspace slice even
+    /// if it returns an error.
+    pub fn write<T: AsBytes>(&mut self, value: &T) -> Result {
+        let len = size_of::<T>();
+        if len > self.length {
+            return Err(EFAULT);
+        }
+        let Ok(len_ulong) = c_ulong::try_from(len) else {
+            return Err(EFAULT);
+        };
+        // SAFETY: The reference points to a value of type `T`, so it is valid for reading
+        // `size_of::<T>()` bytes.
+        //
+        // By using the _copy_to_user variant, we skip the check_object_size check that verifies the
+        // kernel pointer. This mirrors the logic on the C side that skips the check when the length
+        // is a compile-time constant.
+        let res = unsafe {
+            bindings::_copy_to_user(
+                self.ptr as *mut c_void,
+                (value as *const T).cast::<c_void>(),
+                len_ulong,
+            )
+        };
+        if res != 0 {
+            return Err(EFAULT);
+        }
+        self.ptr = self.ptr.wrapping_add(len);
+        self.length -= len;
+        Ok(())
+    }
 }

From fc6e66f4696b63b8a2645a2bcea407cb04bd0666 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <aliceryhl@google.com>
Date: Tue, 28 May 2024 14:58:05 +0000
Subject: [PATCH 042/346] rust: add abstraction for `struct page`

Adds a new struct called `Page` that wraps a pointer to `struct page`.
This struct is assumed to hold ownership over the page, so that Rust
code can allocate and manage pages directly.

The page type has various methods for reading and writing into the page.
These methods will temporarily map the page to allow the operation. All
of these methods use a helper that takes an offset and length, performs
bounds checks, and returns a pointer to the given offset in the page.

This patch only adds support for pages of order zero, as that is all
Rust Binder needs. However, it is written to make it easy to add support
for higher-order pages in the future. To do that, you would add a const
generic parameter to `Page` that specifies the order. Most of the
methods do not need to be adjusted, as the logic for dealing with
mapping multiple pages at once can be isolated to just the
`with_pointer_into_page` method.

Rust Binder needs to manage pages directly as that is how transactions
are delivered: Each process has an mmap'd region for incoming
transactions. When an incoming transaction arrives, the Binder driver
will choose a region in the mmap, allocate and map the relevant pages
manually, and copy the incoming transaction directly into the page. This
architecture allows the driver to copy transactions directly from the
address space of one process to another, without an intermediate copy
to a kernel buffer.

This code is based on Wedson's page abstractions from the old rust
branch, but it has been modified by Alice by removing the incomplete
support for higher-order pages, by introducing the `with_*` helpers
to consolidate the bounds checking logic into a single place, and
various other changes.

Co-developed-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@gmail.com>
Reviewed-by: Andreas Hindborg <a.hindborg@samsung.com>
Reviewed-by: Trevor Gross <tmgross@umich.edu>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Link: https://lore.kernel.org/r/20240528-alice-mm-v7-4-78222c31b8f4@google.com
[ Fixed typos and added a few intra-doc links. - Miguel ]
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/bindings/bindings_helper.h |   1 +
 rust/helpers.c                  |  20 +++
 rust/kernel/alloc.rs            |   7 +
 rust/kernel/lib.rs              |   1 +
 rust/kernel/page.rs             | 250 ++++++++++++++++++++++++++++++++
 5 files changed, 279 insertions(+)
 create mode 100644 rust/kernel/page.rs

diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 52a1412338ef..f2bafb10f181 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -20,6 +20,7 @@
 
 /* `bindgen` gets confused at certain things. */
 const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
+const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
 const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
diff --git a/rust/helpers.c b/rust/helpers.c
index d6abe4bd45d9..305f0577fae9 100644
--- a/rust/helpers.c
+++ b/rust/helpers.c
@@ -25,6 +25,8 @@
 #include <linux/build_bug.h>
 #include <linux/err.h>
 #include <linux/errname.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
 #include <linux/mutex.h>
 #include <linux/refcount.h>
 #include <linux/sched/signal.h>
@@ -94,6 +96,24 @@ int rust_helper_signal_pending(struct task_struct *t)
 }
 EXPORT_SYMBOL_GPL(rust_helper_signal_pending);
 
+struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+	return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL_GPL(rust_helper_alloc_pages);
+
+void *rust_helper_kmap_local_page(struct page *page)
+{
+	return kmap_local_page(page);
+}
+EXPORT_SYMBOL_GPL(rust_helper_kmap_local_page);
+
+void rust_helper_kunmap_local(const void *addr)
+{
+	kunmap_local(addr);
+}
+EXPORT_SYMBOL_GPL(rust_helper_kunmap_local);
+
 refcount_t rust_helper_REFCOUNT_INIT(int n)
 {
 	return (refcount_t)REFCOUNT_INIT(n);
diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs
index 396fe5a85a8f..1966bd407017 100644
--- a/rust/kernel/alloc.rs
+++ b/rust/kernel/alloc.rs
@@ -20,6 +20,13 @@
 #[derive(Clone, Copy)]
 pub struct Flags(u32);
 
+impl Flags {
+    /// Get the raw representation of this flag.
+    pub(crate) fn as_raw(self) -> u32 {
+        self.0
+    }
+}
+
 impl core::ops::BitOr for Flags {
     type Output = Self;
     fn bitor(self, rhs: Self) -> Self::Output {
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index 767026db068e..5d310e79485f 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -35,6 +35,7 @@
 pub mod kunit;
 #[cfg(CONFIG_NET)]
 pub mod net;
+pub mod page;
 pub mod prelude;
 pub mod print;
 mod static_assert;
diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs
new file mode 100644
index 000000000000..208a006d587c
--- /dev/null
+++ b/rust/kernel/page.rs
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Kernel page allocation and management.
+
+use crate::{
+    alloc::{AllocError, Flags},
+    bindings,
+    error::code::*,
+    error::Result,
+    uaccess::UserSliceReader,
+};
+use core::ptr::{self, NonNull};
+
+/// A bitwise shift for the page size.
+pub const PAGE_SHIFT: usize = bindings::PAGE_SHIFT as usize;
+
+/// The number of bytes in a page.
+pub const PAGE_SIZE: usize = bindings::PAGE_SIZE;
+
+/// A bitmask that gives the page containing a given address.
+pub const PAGE_MASK: usize = !(PAGE_SIZE - 1);
+
+/// A pointer to a page that owns the page allocation.
+///
+/// # Invariants
+///
+/// The pointer is valid, and has ownership over the page.
+pub struct Page {
+    page: NonNull<bindings::page>,
+}
+
+// SAFETY: Pages have no logic that relies on them staying on a given thread, so moving them across
+// threads is safe.
+unsafe impl Send for Page {}
+
+// SAFETY: Pages have no logic that relies on them not being accessed concurrently, so accessing
+// them concurrently is safe.
+unsafe impl Sync for Page {}
+
+impl Page {
+    /// Allocates a new page.
+    ///
+    /// # Examples
+    ///
+    /// Allocate memory for a page.
+    ///
+    /// ```
+    /// use kernel::page::Page;
+    ///
+    /// # fn dox() -> Result<(), kernel::alloc::AllocError> {
+    /// let page = Page::alloc_page(GFP_KERNEL)?;
+    /// # Ok(()) }
+    /// ```
+    ///
+    /// Allocate memory for a page and zero its contents.
+    ///
+    /// ```
+    /// use kernel::page::Page;
+    ///
+    /// # fn dox() -> Result<(), kernel::alloc::AllocError> {
+    /// let page = Page::alloc_page(GFP_KERNEL | __GFP_ZERO)?;
+    /// # Ok(()) }
+    /// ```
+    pub fn alloc_page(flags: Flags) -> Result<Self, AllocError> {
+        // SAFETY: Depending on the value of `gfp_flags`, this call may sleep. Other than that, it
+        // is always safe to call this method.
+        let page = unsafe { bindings::alloc_pages(flags.as_raw(), 0) };
+        let page = NonNull::new(page).ok_or(AllocError)?;
+        // INVARIANT: We just successfully allocated a page, so we now have ownership of the newly
+        // allocated page. We transfer that ownership to the new `Page` object.
+        Ok(Self { page })
+    }
+
+    /// Returns a raw pointer to the page.
+    pub fn as_ptr(&self) -> *mut bindings::page {
+        self.page.as_ptr()
+    }
+
+    /// Runs a piece of code with this page mapped to an address.
+    ///
+    /// The page is unmapped when this call returns.
+    ///
+    /// # Using the raw pointer
+    ///
+    /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for
+    /// `PAGE_SIZE` bytes and for the duration in which the closure is called. The pointer might
+    /// only be mapped on the current thread, and when that is the case, dereferencing it on other
+    /// threads is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't
+    /// cause data races, the memory may be uninitialized, and so on.
+    ///
+    /// If multiple threads map the same page at the same time, then they may reference with
+    /// different addresses. However, even if the addresses are different, the underlying memory is
+    /// still the same for these purposes (e.g., it's still a data race if they both write to the
+    /// same underlying byte at the same time).
+    fn with_page_mapped<T>(&self, f: impl FnOnce(*mut u8) -> T) -> T {
+        // SAFETY: `page` is valid due to the type invariants on `Page`.
+        let mapped_addr = unsafe { bindings::kmap_local_page(self.as_ptr()) };
+
+        let res = f(mapped_addr.cast());
+
+        // This unmaps the page mapped above.
+        //
+        // SAFETY: Since this API takes the user code as a closure, it can only be used in a manner
+        // where the pages are unmapped in reverse order. This is as required by `kunmap_local`.
+        //
+        // In other words, if this call to `kunmap_local` happens when a different page should be
+        // unmapped first, then there must necessarily be a call to `kmap_local_page` other than the
+        // call just above in `with_page_mapped` that made that possible. In this case, it is the
+        // unsafe block that wraps that other call that is incorrect.
+        unsafe { bindings::kunmap_local(mapped_addr) };
+
+        res
+    }
+
+    /// Runs a piece of code with a raw pointer to a slice of this page, with bounds checking.
+    ///
+    /// If `f` is called, then it will be called with a pointer that points at `off` bytes into the
+    /// page, and the pointer will be valid for at least `len` bytes. The pointer is only valid on
+    /// this task, as this method uses a local mapping.
+    ///
+    /// If `off` and `len` refers to a region outside of this page, then this method returns
+    /// [`EINVAL`] and does not call `f`.
+    ///
+    /// # Using the raw pointer
+    ///
+    /// It is up to the caller to use the provided raw pointer correctly. The pointer is valid for
+    /// `len` bytes and for the duration in which the closure is called. The pointer might only be
+    /// mapped on the current thread, and when that is the case, dereferencing it on other threads
+    /// is UB. Other than that, the usual rules for dereferencing a raw pointer apply: don't cause
+    /// data races, the memory may be uninitialized, and so on.
+    ///
+    /// If multiple threads map the same page at the same time, then they may reference with
+    /// different addresses. However, even if the addresses are different, the underlying memory is
+    /// still the same for these purposes (e.g., it's still a data race if they both write to the
+    /// same underlying byte at the same time).
+    fn with_pointer_into_page<T>(
+        &self,
+        off: usize,
+        len: usize,
+        f: impl FnOnce(*mut u8) -> Result<T>,
+    ) -> Result<T> {
+        let bounds_ok = off <= PAGE_SIZE && len <= PAGE_SIZE && (off + len) <= PAGE_SIZE;
+
+        if bounds_ok {
+            self.with_page_mapped(move |page_addr| {
+                // SAFETY: The `off` integer is at most `PAGE_SIZE`, so this pointer offset will
+                // result in a pointer that is in bounds or one off the end of the page.
+                f(unsafe { page_addr.add(off) })
+            })
+        } else {
+            Err(EINVAL)
+        }
+    }
+
+    /// Maps the page and reads from it into the given buffer.
+    ///
+    /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes
+    /// outside of the page, then this call returns [`EINVAL`].
+    ///
+    /// # Safety
+    ///
+    /// * Callers must ensure that `dst` is valid for writing `len` bytes.
+    /// * Callers must ensure that this call does not race with a write to the same page that
+    ///   overlaps with this read.
+    pub unsafe fn read_raw(&self, dst: *mut u8, offset: usize, len: usize) -> Result {
+        self.with_pointer_into_page(offset, len, move |src| {
+            // SAFETY: If `with_pointer_into_page` calls into this closure, then
+            // it has performed a bounds check and guarantees that `src` is
+            // valid for `len` bytes.
+            //
+            // There caller guarantees that there is no data race.
+            unsafe { ptr::copy_nonoverlapping(src, dst, len) };
+            Ok(())
+        })
+    }
+
+    /// Maps the page and writes into it from the given buffer.
+    ///
+    /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes
+    /// outside of the page, then this call returns [`EINVAL`].
+    ///
+    /// # Safety
+    ///
+    /// * Callers must ensure that `src` is valid for reading `len` bytes.
+    /// * Callers must ensure that this call does not race with a read or write to the same page
+    ///   that overlaps with this write.
+    pub unsafe fn write_raw(&self, src: *const u8, offset: usize, len: usize) -> Result {
+        self.with_pointer_into_page(offset, len, move |dst| {
+            // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a
+            // bounds check and guarantees that `dst` is valid for `len` bytes.
+            //
+            // There caller guarantees that there is no data race.
+            unsafe { ptr::copy_nonoverlapping(src, dst, len) };
+            Ok(())
+        })
+    }
+
+    /// Maps the page and zeroes the given slice.
+    ///
+    /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes
+    /// outside of the page, then this call returns [`EINVAL`].
+    ///
+    /// # Safety
+    ///
+    /// Callers must ensure that this call does not race with a read or write to the same page that
+    /// overlaps with this write.
+    pub unsafe fn fill_zero_raw(&self, offset: usize, len: usize) -> Result {
+        self.with_pointer_into_page(offset, len, move |dst| {
+            // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a
+            // bounds check and guarantees that `dst` is valid for `len` bytes.
+            //
+            // There caller guarantees that there is no data race.
+            unsafe { ptr::write_bytes(dst, 0u8, len) };
+            Ok(())
+        })
+    }
+
+    /// Copies data from userspace into this page.
+    ///
+    /// This method will perform bounds checks on the page offset. If `offset .. offset+len` goes
+    /// outside of the page, then this call returns [`EINVAL`].
+    ///
+    /// Like the other `UserSliceReader` methods, data races are allowed on the userspace address.
+    /// However, they are not allowed on the page you are copying into.
+    ///
+    /// # Safety
+    ///
+    /// Callers must ensure that this call does not race with a read or write to the same page that
+    /// overlaps with this write.
+    pub unsafe fn copy_from_user_slice_raw(
+        &self,
+        reader: &mut UserSliceReader,
+        offset: usize,
+        len: usize,
+    ) -> Result {
+        self.with_pointer_into_page(offset, len, move |dst| {
+            // SAFETY: If `with_pointer_into_page` calls into this closure, then it has performed a
+            // bounds check and guarantees that `dst` is valid for `len` bytes. Furthermore, we have
+            // exclusive access to the slice since the caller guarantees that there are no races.
+            reader.read_raw(unsafe { core::slice::from_raw_parts_mut(dst.cast(), len) })
+        })
+    }
+}
+
+impl Drop for Page {
+    fn drop(&mut self) {
+        // SAFETY: By the type invariants, we have ownership of the page and can free it.
+        unsafe { bindings::__free_pages(self.page.as_ptr(), 0) };
+    }
+}

From 86588139b84386a47108b024d703c3c24ddbbec8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 3 Jul 2024 18:01:35 -0700
Subject: [PATCH 043/346] Documentation: CXL Maturity Map

Provide a survey of the work-in-progress maturity (implementation
status) of various aspects of the CXL subsystem.

Clarify that in addition to ongoing upkeep relative to specification
updates, there are some long running themes in the driver that respond
to the discovery of new corner cases (bugs) and new use cases (feature
extensions).

The primary audience is distribution maintainers, but it also serves as
a guide for kernel developers to understand what aspects of the CXL
subsystem need more help. It is a landing page to document ongoing
progress, and a guide to discern exposure to work-in-progress features.

Reviewed-by: Adam Manzanares <a.manzanares@samsung.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/172005486862.2048248.6668794717827294862.stgit@dwillia2-xfh.jf.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 Documentation/driver-api/cxl/index.rst        |   2 +
 Documentation/driver-api/cxl/maturity-map.rst | 202 ++++++++++++++++++
 MAINTAINERS                                   |   1 +
 3 files changed, 205 insertions(+)
 create mode 100644 Documentation/driver-api/cxl/maturity-map.rst

diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst
index 036e49553542..12b82725d322 100644
--- a/Documentation/driver-api/cxl/index.rst
+++ b/Documentation/driver-api/cxl/index.rst
@@ -9,4 +9,6 @@ Compute Express Link
 
    memory-devices
 
+   maturity-map
+
 .. only::  subproject and html
diff --git a/Documentation/driver-api/cxl/maturity-map.rst b/Documentation/driver-api/cxl/maturity-map.rst
new file mode 100644
index 000000000000..df8e2ac2a320
--- /dev/null
+++ b/Documentation/driver-api/cxl/maturity-map.rst
@@ -0,0 +1,202 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===========================================
+Compute Express Link Subsystem Maturity Map
+===========================================
+
+The Linux CXL subsystem tracks the dynamic `CXL specification
+<https://computeexpresslink.org/cxl-specification-landing-page>`_ that
+continues to respond to new use cases with new features, capability
+updates and fixes. At any given point some aspects of the subsystem are
+more mature than others. While the periodic pull requests summarize the
+`work being incorporated each merge window
+<https://lore.kernel.org/linux-cxl/?q=s%3APULL+s%3ACXL+tc%3Atorvalds+NOT+s%3ARe>`_,
+those do not always convey progress relative to a starting point and a
+future end goal.
+
+What follows is a coarse breakdown of the subsystem's major
+responsibilities along with a maturity score. The expectation is that
+the change-history of this document provides an overview summary of the
+subsystem maturation over time.
+
+The maturity scores are:
+
+- [3] Mature: Work in this area is complete and no changes on the horizon.
+  Note that this score can regress from one kernel release to the next
+  based on new test results or end user reports.
+
+- [2] Stabilizing: Major functionality operational, common cases are
+  mature, but known corner cases are still a work in progress.
+
+- [1] Initial: Capability that has exited the Proof of Concept phase, but
+  may still have significant gaps to close and fixes to apply as real
+  world testing occurs.
+
+- [0] Known gap: Feature is on a medium to long term horizon to
+  implement.  If the specification has a feature that does not even have
+  a '0' score in this document, there is a good chance that no one in
+  the linux-cxl@vger.kernel.org community has started to look at it.
+
+- X: Out of scope for kernel enabling, or kernel enabling not required
+
+Feature and Capabilities
+========================
+
+Enumeration / Provisioning
+--------------------------
+All of the fundamental enumeration an object model of the subsystem is
+in place, but there are several corner cases that are pending closure.
+
+
+* [2] CXL Window Enumeration
+
+  * [0] :ref:`Extended-linear memory-side cache <extended-linear>`
+  * [0] Low Memory-hole
+  * [0] Hetero-interleave
+
+* [2] Switch Enumeration
+
+  * [0] CXL register enumeration link-up dependency
+
+* [2] HDM Decoder Configuration
+
+  * [0] Decoder target and granularity constraints
+
+* [2] Performance enumeration
+
+  * [3] Endpoint CDAT
+  * [3] Switch CDAT
+  * [1] CDAT to Core-mm integration
+
+    * [1] x86
+    * [0] Arm64
+    * [0] All other arch.
+
+  * [0] Shared link
+
+* [2] Hotplug
+  (see CXL Window Enumeration)
+
+  * [0] Handle Soft Reserved conflicts
+
+* [0] :ref:`RCH link status <rch-link-status>`
+* [0] Fabrics / G-FAM (chapter 7)
+* [0] Global Access Endpoint
+
+
+RAS
+---
+In many ways CXL can be seen as a standardization of what would normally
+be handled by custom EDAC drivers. The open development here is
+mainly caused by the enumeration corner cases above.
+
+* [3] Component events (OS)
+* [2] Component events (FFM)
+* [1] Endpoint protocol errors (OS)
+* [1] Endpoint protocol errors (FFM)
+* [0] Switch protocol errors (OS)
+* [1] Switch protocol errors (FFM)
+* [2] DPA->HPA Address translation
+
+    * [1] XOR Interleave translation
+      (see CXL Window Enumeration)
+
+* [1] Memory Failure coordination
+* [0] Scrub control
+* [2] ACPI error injection EINJ
+
+  * [0] EINJ v2
+  * [X] Compliance DOE
+
+* [2] Native error injection
+* [3] RCH error handling
+* [1] VH error handling
+* [0] PPR
+* [0] Sparing
+* [0] Device built in test
+
+
+Mailbox commands
+----------------
+
+* [3] Firmware update
+* [3] Health / Alerts
+* [1] :ref:`Background commands <background-commands>`
+* [3] Sanitization
+* [3] Security commands
+* [3] RAW Command Debug Passthrough
+* [0] CEL-only-validation Passthrough
+* [0] Switch CCI
+* [3] Timestamp
+* [1] PMEM labels
+* [0] PMEM GPF / Dirty Shutdown
+* [0] Scan Media
+
+PMU
+---
+* [1] Type 3 PMU
+* [0] Switch USP/ DSP, Root Port
+
+Security
+--------
+
+* [X] CXL Trusted Execution Environment Security Protocol (TSP)
+* [X] CXL IDE (subsumed by TSP)
+
+Memory-pooling
+--------------
+
+* [1] Hotplug of LDs (via PCI hotplug)
+* [0] Dynamic Capacity Device (DCD) Support
+
+Multi-host sharing
+------------------
+
+* [0] Hardware coherent shared memory
+* [0] Software managed coherency shared memory
+
+Multi-host memory
+-----------------
+
+* [0] Dynamic Capacity Device Support
+* [0] Sharing
+
+Accelerator
+-----------
+
+* [0] Accelerator memory enumeration HDM-D (CXL 1.1/2.0 Type-2)
+* [0] Accelerator memory enumeration HDM-DB (CXL 3.0 Type-2)
+* [0] CXL.cache 68b (CXL 2.0)
+* [0] CXL.cache 256b Cache IDs (CXL 3.0)
+
+User Flow Support
+-----------------
+
+* [0] HPA->DPA Address translation (need xormaps export solution)
+
+Details
+=======
+
+.. _extended-linear:
+
+* **Extended-linear memory-side cache**: An HMAT proposal to enumerate the presence of a
+  memory-side cache where the cache capacity extends the SRAT address
+  range capacity. `See the ECN
+  <https://lore.kernel.org/linux-cxl/6650e4f835a0e_195e294a8@dwillia2-mobl3.amr.corp.intel.com.notmuch/>`_
+  for more details:
+
+.. _rch-link-status:
+
+* **RCH Link Status**: RCH (Restricted CXL Host) topologies, end up
+  hiding some standard registers like PCIe Link Status / Capabilities in
+  the CXL RCRB (Root Complex Register Block).
+
+.. _background-commands:
+
+* **Background commands**: The CXL background command mechanism is
+  awkward as the single slot is monopolized potentially indefinitely by
+  various commands. A `cancel on conflict
+  <http://lore.kernel.org/r/66035c2e8ba17_770232948b@dwillia2-xfh.jf.intel.com.notmuch>`_
+  facility is needed to make sure the kernel can ensure forward progress
+  of priority commands.
diff --git a/MAINTAINERS b/MAINTAINERS
index 3c4fdf74a3f9..ce047ab7e977 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5485,6 +5485,7 @@ M:	Ira Weiny <ira.weiny@intel.com>
 M:	Dan Williams <dan.j.williams@intel.com>
 L:	linux-cxl@vger.kernel.org
 S:	Maintained
+F:	Documentation/driver-api/cxl
 F:	drivers/cxl/
 F:	include/linux/einj-cxl.h
 F:	include/linux/cxl-event.h

From 67bab430f4e71332aced7562a67d73444f5b4b77 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Fri, 24 May 2024 14:31:52 +0200
Subject: [PATCH 044/346] tools/power turbostat: Group SMI counter with APERF
 and MPERF

These three counters now are treated similar to other perf counters
groups. This simplifies and gets rid of a lot of special cases for APERF
and MPERF.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 635 ++++++++++++--------------
 1 file changed, 280 insertions(+), 355 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 29751b41ea0d..495235055fa2 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -85,7 +85,6 @@
 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
-enum amperf_source { AMPERF_SOURCE_PERF, AMPERF_SOURCE_MSR };
 enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };
 
 struct sysfs_path {
@@ -275,7 +274,6 @@ char *proc_stat = "/proc/stat";
 FILE *outf;
 int *fd_percpu;
 int *fd_instr_count_percpu;
-struct amperf_group_fd *fd_amperf_percpu;	/* File descriptors for perf group with APERF and MPERF counters. */
 struct timeval interval_tv = { 5, 0 };
 struct timespec interval_ts = { 5, 0 };
 
@@ -290,6 +288,7 @@ unsigned int summary_only;
 unsigned int list_header_only;
 unsigned int dump_only;
 unsigned int has_aperf;
+unsigned int has_aperf_access;
 unsigned int has_epb;
 unsigned int has_turbo;
 unsigned int is_hybrid;
@@ -330,7 +329,6 @@ unsigned int first_counter_read = 1;
 int ignore_stdin;
 bool no_msr;
 bool no_perf;
-enum amperf_source amperf_source;
 
 enum gfx_sysfs_idx {
 	GFX_rc6,
@@ -1381,6 +1379,67 @@ static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
 	  },
 };
 
+/* Indexes used to map data read from perf and MSRs into global variables */
+enum msr_rci_index {
+	MSR_RCI_INDEX_APERF = 0,
+	MSR_RCI_INDEX_MPERF = 1,
+	MSR_RCI_INDEX_SMI = 2,
+	NUM_MSR_COUNTERS,
+};
+
+struct msr_counter_info_t {
+	unsigned long long data[NUM_MSR_COUNTERS];
+	enum counter_source source[NUM_MSR_COUNTERS];
+	unsigned long long msr[NUM_MSR_COUNTERS];
+	unsigned long long msr_mask[NUM_MSR_COUNTERS];
+	int fd_perf;
+};
+
+struct msr_counter_info_t *msr_counter_info;
+unsigned int msr_counter_info_size;
+
+struct msr_counter_arch_info {
+	const char *perf_subsys;
+	const char *perf_name;
+	unsigned long long msr;
+	unsigned long long msr_mask;
+	unsigned int rci_index;	/* Maps data from perf counters to global variables */
+	bool needed;
+	bool present;
+};
+
+enum msr_arch_info_index {
+	MSR_ARCH_INFO_APERF_INDEX = 0,
+	MSR_ARCH_INFO_MPERF_INDEX = 1,
+	MSR_ARCH_INFO_SMI_INDEX = 2,
+};
+
+static struct msr_counter_arch_info msr_counter_arch_infos[] = {
+	[MSR_ARCH_INFO_APERF_INDEX] = {
+		.perf_subsys = "msr",
+		.perf_name = "aperf",
+		.msr = MSR_IA32_APERF,
+		.msr_mask = 0xFFFFFFFFFFFFFFFF,
+		.rci_index = MSR_RCI_INDEX_APERF,
+	},
+
+	[MSR_ARCH_INFO_MPERF_INDEX] = {
+		.perf_subsys = "msr",
+		.perf_name = "mperf",
+		.msr = MSR_IA32_MPERF,
+		.msr_mask = 0xFFFFFFFFFFFFFFFF,
+		.rci_index = MSR_RCI_INDEX_MPERF,
+	},
+
+	[MSR_ARCH_INFO_SMI_INDEX] = {
+		.perf_subsys = "msr",
+		.perf_name = "smi",
+		.msr = MSR_SMI_COUNT,
+		.msr_mask = 0xFFFFFFFF,
+		.rci_index = MSR_RCI_INDEX_SMI,
+	},
+};
+
 struct thread_data {
 	struct timeval tv_begin;
 	struct timeval tv_end;
@@ -1767,7 +1826,7 @@ int get_msr_fd(int cpu)
 
 static void bic_disable_msr_access(void)
 {
-	const unsigned long bic_msrs = BIC_SMI | BIC_Mod_c6 | BIC_CoreTmp |
+	const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp |
 	    BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp;
 
 	bic_enabled &= ~bic_msrs;
@@ -3402,30 +3461,6 @@ static unsigned int read_perf_counter_info_n(const char *const path, const char
 	return v;
 }
 
-static unsigned int read_msr_type(void)
-{
-	const char *const path = "/sys/bus/event_source/devices/msr/type";
-	const char *const format = "%u";
-
-	return read_perf_counter_info_n(path, format);
-}
-
-static unsigned int read_aperf_config(void)
-{
-	const char *const path = "/sys/bus/event_source/devices/msr/events/aperf";
-	const char *const format = "event=%x";
-
-	return read_perf_counter_info_n(path, format);
-}
-
-static unsigned int read_mperf_config(void)
-{
-	const char *const path = "/sys/bus/event_source/devices/msr/events/mperf";
-	const char *const format = "event=%x";
-
-	return read_perf_counter_info_n(path, format);
-}
-
 static unsigned int read_perf_type(const char *subsys)
 {
 	const char *const path_format = "/sys/bus/event_source/devices/%s/type";
@@ -3479,124 +3514,6 @@ static double read_perf_rapl_scale(const char *subsys, const char *event_name)
 	return scale;
 }
 
-static struct amperf_group_fd open_amperf_fd(int cpu)
-{
-	const unsigned int msr_type = read_msr_type();
-	const unsigned int aperf_config = read_aperf_config();
-	const unsigned int mperf_config = read_mperf_config();
-	struct amperf_group_fd fds = {.aperf = -1, .mperf = -1 };
-
-	fds.aperf = open_perf_counter(cpu, msr_type, aperf_config, -1, PERF_FORMAT_GROUP);
-	fds.mperf = open_perf_counter(cpu, msr_type, mperf_config, fds.aperf, PERF_FORMAT_GROUP);
-
-	return fds;
-}
-
-static int get_amperf_fd(int cpu)
-{
-	assert(fd_amperf_percpu);
-
-	if (fd_amperf_percpu[cpu].aperf)
-		return fd_amperf_percpu[cpu].aperf;
-
-	fd_amperf_percpu[cpu] = open_amperf_fd(cpu);
-
-	return fd_amperf_percpu[cpu].aperf;
-}
-
-/* Read APERF, MPERF and TSC using the perf API. */
-static int read_aperf_mperf_tsc_perf(struct thread_data *t, int cpu)
-{
-	union {
-		struct {
-			unsigned long nr_entries;
-			unsigned long aperf;
-			unsigned long mperf;
-		};
-
-		unsigned long as_array[3];
-	} cnt;
-
-	const int fd_amperf = get_amperf_fd(cpu);
-
-	/*
-	 * Read the TSC with rdtsc, because we want the absolute value and not
-	 * the offset from the start of the counter.
-	 */
-	t->tsc = rdtsc();
-
-	const int n = read(fd_amperf, &cnt.as_array[0], sizeof(cnt.as_array));
-
-	if (n != sizeof(cnt.as_array))
-		return -2;
-
-	t->aperf = cnt.aperf * aperf_mperf_multiplier;
-	t->mperf = cnt.mperf * aperf_mperf_multiplier;
-
-	return 0;
-}
-
-/* Read APERF, MPERF and TSC using the MSR driver and rdtsc instruction. */
-static int read_aperf_mperf_tsc_msr(struct thread_data *t, int cpu)
-{
-	unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time;
-	int aperf_mperf_retry_count = 0;
-
-	/*
-	 * The TSC, APERF and MPERF must be read together for
-	 * APERF/MPERF and MPERF/TSC to give accurate results.
-	 *
-	 * Unfortunately, APERF and MPERF are read by
-	 * individual system call, so delays may occur
-	 * between them.  If the time to read them
-	 * varies by a large amount, we re-read them.
-	 */
-
-	/*
-	 * This initial dummy APERF read has been seen to
-	 * reduce jitter in the subsequent reads.
-	 */
-
-	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
-		return -3;
-
-retry:
-	t->tsc = rdtsc();	/* re-read close to APERF */
-
-	tsc_before = t->tsc;
-
-	if (get_msr(cpu, MSR_IA32_APERF, &t->aperf))
-		return -3;
-
-	tsc_between = rdtsc();
-
-	if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
-		return -4;
-
-	tsc_after = rdtsc();
-
-	aperf_time = tsc_between - tsc_before;
-	mperf_time = tsc_after - tsc_between;
-
-	/*
-	 * If the system call latency to read APERF and MPERF
-	 * differ by more than 2x, then try again.
-	 */
-	if ((aperf_time > (2 * mperf_time)) || (mperf_time > (2 * aperf_time))) {
-		aperf_mperf_retry_count++;
-		if (aperf_mperf_retry_count < 5)
-			goto retry;
-		else
-			warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time);
-	}
-	aperf_mperf_retry_count = 0;
-
-	t->aperf = t->aperf * aperf_mperf_multiplier;
-	t->mperf = t->mperf * aperf_mperf_multiplier;
-
-	return 0;
-}
-
 size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
 {
 	size_t ret = 0;
@@ -3853,6 +3770,84 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 	return 0;
 }
 
+size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci)
+{
+	size_t ret = 0;
+
+	for (int i = 0; i < NUM_MSR_COUNTERS; ++i)
+		if (mci->source[i] == COUNTER_SOURCE_PERF)
+			++ret;
+
+	return ret;
+}
+
+int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
+{
+	unsigned long long perf_data[NUM_MSR_COUNTERS + 1];
+
+	struct msr_counter_info_t *mci;
+
+	if (debug)
+		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
+
+	assert(msr_counter_info);
+	assert(cpu <= msr_counter_info_size);
+
+	mci = &msr_counter_info[cpu];
+
+	ZERO_ARRAY(perf_data);
+	ZERO_ARRAY(mci->data);
+
+	if (mci->fd_perf != -1) {
+		const size_t num_perf_counters = msr_counter_info_count_perf(mci);
+		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
+		const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data));
+
+		if (actual_read_size != expected_read_size)
+			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
+			    actual_read_size);
+	}
+
+	for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) {
+		switch (mci->source[i]) {
+		case COUNTER_SOURCE_NONE:
+			break;
+
+		case COUNTER_SOURCE_PERF:
+			assert(pi < ARRAY_SIZE(perf_data));
+			assert(mci->fd_perf != -1);
+
+			if (debug)
+				fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]);
+
+			mci->data[i] = perf_data[pi];
+
+			++pi;
+			break;
+
+		case COUNTER_SOURCE_MSR:
+			assert(!no_msr);
+
+			if (get_msr(cpu, mci->msr[i], &mci->data[i]))
+				return -2 - i;
+
+			mci->data[i] &= mci->msr_mask[i];
+
+			if (debug)
+				fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]);
+
+			break;
+		}
+	}
+
+	BUILD_BUG_ON(NUM_MSR_COUNTERS != 3);
+	t->aperf = mci->data[MSR_RCI_INDEX_APERF];
+	t->mperf = mci->data[MSR_RCI_INDEX_MPERF];
+	t->smi_count = mci->data[MSR_RCI_INDEX_SMI];
+
+	return 0;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
@@ -3878,24 +3873,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
 	t->tsc = rdtsc();	/* we are running on local CPU of interest */
 
-	if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
-	    || soft_c1_residency_display(BIC_Avg_MHz)) {
-		int status = -1;
-
-		assert(!no_perf || !no_msr);
-
-		switch (amperf_source) {
-		case AMPERF_SOURCE_PERF:
-			status = read_aperf_mperf_tsc_perf(t, cpu);
-			break;
-		case AMPERF_SOURCE_MSR:
-			status = read_aperf_mperf_tsc_msr(t, cpu);
-			break;
-		}
-
-		if (status != 0)
-			return status;
-	}
+	get_smi_aperf_mperf(cpu, t);
 
 	if (DO_BIC(BIC_IPC))
 		if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
@@ -3903,11 +3881,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 
 	if (DO_BIC(BIC_IRQ))
 		t->irq_count = irqs_per_cpu[cpu];
-	if (DO_BIC(BIC_SMI)) {
-		if (get_msr(cpu, MSR_SMI_COUNT, &msr))
-			return -5;
-		t->smi_count = msr & 0xFFFFFFFF;
-	}
 
 	get_cstate_counters(cpu, t, c, p);
 
@@ -4489,25 +4462,6 @@ void free_fd_percpu(void)
 	fd_percpu = NULL;
 }
 
-void free_fd_amperf_percpu(void)
-{
-	int i;
-
-	if (!fd_amperf_percpu)
-		return;
-
-	for (i = 0; i < topo.max_cpu_num + 1; ++i) {
-		if (fd_amperf_percpu[i].mperf != 0)
-			close(fd_amperf_percpu[i].mperf);
-
-		if (fd_amperf_percpu[i].aperf != 0)
-			close(fd_amperf_percpu[i].aperf);
-	}
-
-	free(fd_amperf_percpu);
-	fd_amperf_percpu = NULL;
-}
-
 void free_fd_instr_count_percpu(void)
 {
 	if (!fd_instr_count_percpu)
@@ -4542,6 +4496,21 @@ void free_fd_cstate(void)
 	ccstate_counter_info_size = 0;
 }
 
+void free_fd_msr(void)
+{
+	if (!msr_counter_info)
+		return;
+
+	for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) {
+		if (msr_counter_info[cpu].fd_perf != -1)
+			close(msr_counter_info[cpu].fd_perf);
+	}
+
+	free(msr_counter_info);
+	msr_counter_info = NULL;
+	msr_counter_info_size = 0;
+}
+
 void free_fd_rapl_percpu(void)
 {
 	if (!rapl_counter_info_perdomain)
@@ -4601,7 +4570,7 @@ void free_all_buffers(void)
 
 	free_fd_percpu();
 	free_fd_instr_count_percpu();
-	free_fd_amperf_percpu();
+	free_fd_msr();
 	free_fd_rapl_percpu();
 	free_fd_cstate();
 
@@ -4938,6 +4907,7 @@ static void update_effective_set(bool startup)
 }
 
 void linux_perf_init(void);
+void msr_perf_init(void);
 void rapl_perf_init(void);
 void cstate_perf_init(void);
 
@@ -4946,6 +4916,7 @@ void re_initialize(void)
 	free_all_buffers();
 	setup_all_buffers(false);
 	linux_perf_init();
+	msr_perf_init();
 	rapl_perf_init();
 	cstate_perf_init();
 	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
@@ -6799,15 +6770,6 @@ static int has_instr_count_access(void)
 	return has_access;
 }
 
-bool is_aperf_access_required(void)
-{
-	return BIC_IS_ENABLED(BIC_Avg_MHz)
-	    || BIC_IS_ENABLED(BIC_Busy)
-	    || BIC_IS_ENABLED(BIC_Bzy_MHz)
-	    || BIC_IS_ENABLED(BIC_IPC)
-	    || BIC_IS_ENABLED(BIC_CPU_c1);
-}
-
 int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
 			   double *scale_, enum rapl_unit *unit_)
 {
@@ -6866,14 +6828,6 @@ void linux_perf_init(void)
 		if (fd_instr_count_percpu == NULL)
 			err(-1, "calloc fd_instr_count_percpu");
 	}
-
-	const bool aperf_required = is_aperf_access_required();
-
-	if (aperf_required && has_aperf && amperf_source == AMPERF_SOURCE_PERF) {
-		fd_amperf_percpu = calloc(topo.max_cpu_num + 1, sizeof(*fd_amperf_percpu));
-		if (fd_amperf_percpu == NULL)
-			err(-1, "calloc fd_amperf_percpu");
-	}
 }
 
 void rapl_perf_init(void)
@@ -6966,75 +6920,11 @@ void rapl_perf_init(void)
 	free(domain_visited);
 }
 
-static int has_amperf_access_via_msr(void)
-{
-	if (no_msr)
-		return 0;
-
-	if (probe_msr(base_cpu, MSR_IA32_APERF))
-		return 0;
-
-	if (probe_msr(base_cpu, MSR_IA32_MPERF))
-		return 0;
-
-	return 1;
-}
-
-static int has_amperf_access_via_perf(void)
-{
-	struct amperf_group_fd fds;
-
-	/*
-	 * Cache the last result, so we don't warn the user multiple times
-	 *
-	 * Negative means cached, no access
-	 * Zero means not cached
-	 * Positive means cached, has access
-	 */
-	static int has_access_cached;
-
-	if (no_perf)
-		return 0;
-
-	if (has_access_cached != 0)
-		return has_access_cached > 0;
-
-	fds = open_amperf_fd(base_cpu);
-	has_access_cached = (fds.aperf != -1) && (fds.mperf != -1);
-
-	if (fds.aperf == -1)
-		warnx("Failed to access %s. Some of the counters may not be available\n"
-		      "\tRun as root to enable them or use %s to disable the access explicitly",
-		      "APERF perf counter", "--no-perf");
-	else
-		close(fds.aperf);
-
-	if (fds.mperf == -1)
-		warnx("Failed to access %s. Some of the counters may not be available\n"
-		      "\tRun as root to enable them or use %s to disable the access explicitly",
-		      "MPERF perf counter", "--no-perf");
-	else
-		close(fds.mperf);
-
-	if (has_access_cached == 0)
-		has_access_cached = -1;
-
-	return has_access_cached > 0;
-}
-
-/* Check if we can access APERF and MPERF */
+/* Assumes msr_counter_info is populated */
 static int has_amperf_access(void)
 {
-	if (!is_aperf_access_required())
-		return 0;
-
-	if (!no_msr && has_amperf_access_via_msr())
-		return 1;
-
-	if (!no_perf && has_amperf_access_via_perf())
-		return 1;
-
-	return 0;
+	return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present &&
+	    msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
 }
 
 int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
@@ -7083,6 +6973,114 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st
 	return ret;
 }
 
+int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+{
+	if (no_perf)
+		return -1;
+
+	const unsigned int type = read_perf_type(cai->perf_subsys);
+	const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+
+	const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
+
+	if (fd_counter == -1)
+		return -1;
+
+	/* If it's the first counter opened, make it a group descriptor */
+	if (cci->fd_perf == -1)
+		cci->fd_perf = fd_counter;
+
+	return fd_counter;
+}
+
+int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
+{
+	int ret = add_msr_perf_counter_(cpu, cci, cai);
+
+	if (debug)
+		fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu);
+
+	return ret;
+}
+
+void msr_perf_init_(void)
+{
+	const int mci_num = topo.max_cpu_num + 1;
+
+	msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info));
+	if (!msr_counter_info)
+		err(1, "calloc msr_counter_info");
+	msr_counter_info_size = mci_num;
+
+	for (int cpu = 0; cpu < mci_num; ++cpu)
+		msr_counter_info[cpu].fd_perf = -1;
+
+	for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) {
+
+		struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx];
+
+		cai->present = false;
+
+		for (int cpu = 0; cpu < mci_num; ++cpu) {
+
+			struct msr_counter_info_t *const cci = &msr_counter_info[cpu];
+
+			if (cpu_is_not_allowed(cpu))
+				continue;
+
+			if (cai->needed) {
+				/* Use perf API for this counter */
+				if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) {
+					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
+					cai->present = true;
+
+					/* User MSR for this counter */
+				} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
+					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
+					cci->msr[cai->rci_index] = cai->msr;
+					cci->msr_mask[cai->rci_index] = cai->msr_mask;
+					cai->present = true;
+				}
+			}
+		}
+	}
+}
+
+/* Initialize data for reading perf counters from the MSR group. */
+void msr_perf_init(void)
+{
+	bool need_amperf = false, need_smi = false;
+	const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1);
+
+	need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz)
+	    || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1;
+
+	if (BIC_IS_ENABLED(BIC_SMI))
+		need_smi = true;
+
+	/* Enable needed counters */
+	msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf;
+	msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf;
+	msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi;
+
+	msr_perf_init_();
+
+	const bool has_amperf = has_amperf_access();
+	const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present;
+
+	has_aperf_access = has_amperf;
+
+	if (has_amperf) {
+		BIC_PRESENT(BIC_Avg_MHz);
+		BIC_PRESENT(BIC_Busy);
+		BIC_PRESENT(BIC_Bzy_MHz);
+		BIC_PRESENT(BIC_SMI);
+	}
+
+	if (has_smi)
+		BIC_PRESENT(BIC_SMI);
+}
+
 void cstate_perf_init_(bool soft_c1)
 {
 	bool has_counter;
@@ -7340,12 +7338,6 @@ void process_cpuid()
 
 	__cpuid(0x6, eax, ebx, ecx, edx);
 	has_aperf = ecx & (1 << 0);
-	if (has_aperf && has_amperf_access()) {
-		BIC_PRESENT(BIC_Avg_MHz);
-		BIC_PRESENT(BIC_Busy);
-		BIC_PRESENT(BIC_Bzy_MHz);
-		BIC_PRESENT(BIC_IPC);
-	}
 	do_dts = eax & (1 << 0);
 	if (do_dts)
 		BIC_PRESENT(BIC_CoreTmp);
@@ -7462,6 +7454,11 @@ static void counter_info_init(void)
 		if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY)
 			cai->msr = MSR_ATOM_PKG_C6_RESIDENCY;
 	}
+
+	for (int i = 0; i < NUM_MSR_COUNTERS; ++i) {
+		msr_counter_arch_infos[i].present = false;
+		msr_counter_arch_infos[i].needed = false;
+	}
 }
 
 void probe_pm_features(void)
@@ -7837,21 +7834,6 @@ void set_base_cpu(void)
 	err(-ENODEV, "No valid cpus found");
 }
 
-static void set_amperf_source(void)
-{
-	amperf_source = AMPERF_SOURCE_PERF;
-
-	const bool aperf_required = is_aperf_access_required();
-
-	if (no_perf || !aperf_required || !has_amperf_access_via_perf())
-		amperf_source = AMPERF_SOURCE_MSR;
-
-	if (quiet || !debug)
-		return;
-
-	fprintf(outf, "aperf/mperf source preference: %s\n", amperf_source == AMPERF_SOURCE_MSR ? "msr" : "perf");
-}
-
 bool has_added_counters(void)
 {
 	/*
@@ -7862,54 +7844,8 @@ bool has_added_counters(void)
 	return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
 }
 
-bool is_msr_access_required(void)
-{
-	if (no_msr)
-		return false;
-
-	if (has_added_counters())
-		return true;
-
-	return BIC_IS_ENABLED(BIC_SMI)
-	    || BIC_IS_ENABLED(BIC_CPU_c1)
-	    || BIC_IS_ENABLED(BIC_CPU_c3)
-	    || BIC_IS_ENABLED(BIC_CPU_c6)
-	    || BIC_IS_ENABLED(BIC_CPU_c7)
-	    || BIC_IS_ENABLED(BIC_Mod_c6)
-	    || BIC_IS_ENABLED(BIC_CoreTmp)
-	    || BIC_IS_ENABLED(BIC_Totl_c0)
-	    || BIC_IS_ENABLED(BIC_Any_c0)
-	    || BIC_IS_ENABLED(BIC_GFX_c0)
-	    || BIC_IS_ENABLED(BIC_CPUGFX)
-	    || BIC_IS_ENABLED(BIC_Pkgpc3)
-	    || BIC_IS_ENABLED(BIC_Pkgpc6)
-	    || BIC_IS_ENABLED(BIC_Pkgpc2)
-	    || BIC_IS_ENABLED(BIC_Pkgpc7)
-	    || BIC_IS_ENABLED(BIC_Pkgpc8)
-	    || BIC_IS_ENABLED(BIC_Pkgpc9)
-	    || BIC_IS_ENABLED(BIC_Pkgpc10)
-	    /* TODO: Multiplex access with perf */
-	    || BIC_IS_ENABLED(BIC_CorWatt)
-	    || BIC_IS_ENABLED(BIC_Cor_J)
-	    || BIC_IS_ENABLED(BIC_PkgWatt)
-	    || BIC_IS_ENABLED(BIC_CorWatt)
-	    || BIC_IS_ENABLED(BIC_GFXWatt)
-	    || BIC_IS_ENABLED(BIC_RAMWatt)
-	    || BIC_IS_ENABLED(BIC_Pkg_J)
-	    || BIC_IS_ENABLED(BIC_Cor_J)
-	    || BIC_IS_ENABLED(BIC_GFX_J)
-	    || BIC_IS_ENABLED(BIC_RAM_J)
-	    || BIC_IS_ENABLED(BIC_PKG__)
-	    || BIC_IS_ENABLED(BIC_RAM__)
-	    || BIC_IS_ENABLED(BIC_PkgTmp)
-	    || (is_aperf_access_required() && !has_amperf_access_via_perf());
-}
-
 void check_msr_access(void)
 {
-	if (!is_msr_access_required())
-		no_msr = 1;
-
 	check_dev_msr();
 	check_msr_permission();
 
@@ -7919,19 +7855,8 @@ void check_msr_access(void)
 
 void check_perf_access(void)
 {
-	const bool intrcount_required = BIC_IS_ENABLED(BIC_IPC);
-
-	if (no_perf || !intrcount_required || !has_instr_count_access())
+	if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access())
 		bic_enabled &= ~BIC_IPC;
-
-	const bool aperf_required = is_aperf_access_required();
-
-	if (!aperf_required || !has_amperf_access()) {
-		bic_enabled &= ~BIC_Avg_MHz;
-		bic_enabled &= ~BIC_Busy;
-		bic_enabled &= ~BIC_Bzy_MHz;
-		bic_enabled &= ~BIC_IPC;
-	}
 }
 
 void turbostat_init()
@@ -7943,7 +7868,7 @@ void turbostat_init()
 	process_cpuid();
 	counter_info_init();
 	probe_pm_features();
-	set_amperf_source();
+	msr_perf_init();
 	linux_perf_init();
 	rapl_perf_init();
 	cstate_perf_init();
@@ -7951,8 +7876,8 @@ void turbostat_init()
 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
 
-	if (DO_BIC(BIC_IPC))
-		(void)get_instr_count_fd(base_cpu);
+	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1)
+		BIC_PRESENT(BIC_IPC);
 
 	/*
 	 * If TSC tweak is needed, but couldn't get it,

From 361b8fc73cf63dc0c3be3778720631f1a33ba9db Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Tue, 28 May 2024 15:46:10 +0200
Subject: [PATCH 045/346] tools/power turbostat: Extend --add option with perf
 counters

User can now read perf counters using "--add perf/<device>/<event>".
Other details work similarly to how --add works with MSRs.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/Makefile    |   1 +
 tools/power/x86/turbostat/turbostat.8 |   5 +-
 tools/power/x86/turbostat/turbostat.c | 562 +++++++++++++++++++++++++-
 3 files changed, 557 insertions(+), 11 deletions(-)

diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
index b1e6817f1e54..3946d5254a1f 100644
--- a/tools/power/x86/turbostat/Makefile
+++ b/tools/power/x86/turbostat/Makefile
@@ -46,6 +46,7 @@ snapshot: turbostat
 	@echo "#define GENMASK_ULL(h, l) (((~0ULL) << (l)) & (~0ULL >> (sizeof(long long) * 8 - 1 - (h))))" >> $(SNAPSHOT)/bits.h
 
 	@echo '#define BUILD_BUG_ON(cond) do { enum { compile_time_check ## __COUNTER__ = 1/(!(cond)) }; } while (0)' > $(SNAPSHOT)/build_bug.h
+	@echo '#define __must_be_array(arr) 0' >> $(SNAPSHOT)/build_bug.h
 
 	@echo PWD=. > $(SNAPSHOT)/Makefile
 	@echo "CFLAGS +=	-DMSRHEADER='\"msr-index.h\"'" >> $(SNAPSHOT)/Makefile
diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 8d37acd39201..5537fc6b5bc3 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -28,10 +28,13 @@ name as necessary to disambiguate it from others is necessary.  Note that option
 .PP
 \fB--add attributes\fP add column with counter having specified 'attributes'.  The 'location' attribute is required, all others are optional.
 .nf
-	location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP}
+	location: {\fBmsrDDD\fP | \fBmsr0xXXX\fP | \fB/sys/path...\fP | \fBperf/<device>/<event>\fP}
 		msrDDD is a decimal offset, eg. msr16
 		msr0xXXX is a hex offset, eg. msr0x10
 		/sys/path... is an absolute path to a sysfs attribute
+		<device> is a perf device from /sys/bus/event_source/devices/<device> eg. cstate_core
+		<event> is a perf event for given device from /sys/bus/event_source/devices/<device>/events/<event> eg. c1-residency
+			perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency
 
 	scope: {\fBcpu\fP | \fBcore\fP | \fBpackage\fP}
 		sample and print the counter for every cpu, core, or package.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 495235055fa2..be345a4bbe96 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -79,14 +79,40 @@
  */
 #define	NAME_BYTES 20
 #define PATH_BYTES 128
+#define PERF_NAME_BYTES 128
 
 #define MAX_NOFILE 0x8000
 
+#define COUNTER_KIND_PERF_PREFIX "perf/"
+#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX)
+#define PERF_DEV_NAME_BYTES 32
+#define PERF_EVT_NAME_BYTES 32
+
 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
 enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };
 
+struct perf_counter_info {
+	struct perf_counter_info *next;
+
+	/* How to open the counter / What counter it is. */
+	char device[PERF_DEV_NAME_BYTES];
+	char event[PERF_EVT_NAME_BYTES];
+
+	/* How to show/format the counter. */
+	char name[PERF_NAME_BYTES];
+	unsigned int width;
+	enum counter_scope scope;
+	enum counter_type type;
+	enum counter_format format;
+	double scale;
+
+	/* For reading the counter. */
+	int *fd_perf_per_domain;
+	size_t num_domains;
+};
+
 struct sysfs_path {
 	char path[PATH_BYTES];
 	int id;
@@ -1457,6 +1483,7 @@ struct thread_data {
 	unsigned int flags;
 	bool is_atom;
 	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
+	unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];
 } *thread_even, *thread_odd;
 
 struct core_data {
@@ -1470,6 +1497,7 @@ struct core_data {
 	unsigned int core_id;
 	unsigned long long core_throt_cnt;
 	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
+	unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
 } *core_even, *core_odd;
 
 struct pkg_data {
@@ -1503,6 +1531,7 @@ struct pkg_data {
 	unsigned int pkg_temp_c;
 	unsigned int uncore_mhz;
 	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
+	unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
 } *package_even, *package_odd;
 
 #define ODD_COUNTERS thread_odd, core_odd, package_odd
@@ -1637,12 +1666,21 @@ int idx_valid(int idx)
 }
 
 struct sys_counters {
+	/* MSR added counters */
 	unsigned int added_thread_counters;
 	unsigned int added_core_counters;
 	unsigned int added_package_counters;
 	struct msr_counter *tp;
 	struct msr_counter *cp;
 	struct msr_counter *pp;
+
+	/* perf added counters */
+	unsigned int added_thread_perf_counters;
+	unsigned int added_core_perf_counters;
+	unsigned int added_package_perf_counters;
+	struct perf_counter_info *perf_tp;
+	struct perf_counter_info *perf_cp;
+	struct perf_counter_info *perf_pp;
 } sys;
 
 static size_t free_msr_counters_(struct msr_counter **pp)
@@ -1902,6 +1940,23 @@ int probe_msr(int cpu, off_t offset)
 	return 0;
 }
 
+/* Convert CPU ID to domain ID for given added perf counter. */
+unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
+{
+	switch (pc->scope) {
+	case SCOPE_CPU:
+		return cpu;
+
+	case SCOPE_CORE:
+		return cpus[cpu].physical_core_id;
+
+	case SCOPE_PACKAGE:
+		return cpus[cpu].physical_package_id;
+	}
+
+	__builtin_unreachable();
+}
+
 #define MAX_DEFERRED 16
 char *deferred_add_names[MAX_DEFERRED];
 char *deferred_skip_names[MAX_DEFERRED];
@@ -1925,6 +1980,7 @@ void help(void)
 		"to print statistics, until interrupted.\n"
 		"  -a, --add	add a counter\n"
 		"		  eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
+		"		  eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n"
 		"  -c, --cpu	cpu-set	limit output to summary plus cpu-set:\n"
 		"		  {core | package | j,k,l..m,n-p }\n"
 		"  -d, --debug	displays usec, Time_Of_Day_Seconds and more debugging\n"
@@ -2034,6 +2090,7 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
 void print_header(char *delim)
 {
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 	int printed = 0;
 
 	if (DO_BIC(BIC_USEC))
@@ -2091,6 +2148,21 @@ void print_header(char *delim)
 		}
 	}
 
+	for (pp = sys.perf_tp; pp; pp = pp->next) {
+
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 64)
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+		} else {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+		}
+	}
+
 	if (DO_BIC(BIC_CPU_c1))
 		outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_CPU_c3))
@@ -2131,6 +2203,21 @@ void print_header(char *delim)
 		}
 	}
 
+	for (pp = sys.perf_cp; pp; pp = pp->next) {
+
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 64)
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+		} else {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+		}
+	}
+
 	if (DO_BIC(BIC_PkgTmp))
 		outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
 
@@ -2226,6 +2313,21 @@ void print_header(char *delim)
 		}
 	}
 
+	for (pp = sys.perf_pp; pp; pp = pp->next) {
+
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 64)
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
+		} else {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
+			else
+				outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
+		}
+	}
+
 	outp += sprintf(outp, "\n");
 }
 
@@ -2346,6 +2448,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	char *fmt8;
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 	char *delim = "\t";
 	int printed = 0;
 
@@ -2483,6 +2586,31 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 	}
 
+	/* Added perf counters */
+	for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) {
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 32)
+				outp +=
+				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+					    (unsigned int)t->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]);
+		} else if (pp->format == FORMAT_DELTA) {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]);
+		} else if (pp->format == FORMAT_PERCENT) {
+			if (pp->type == COUNTER_USEC)
+				outp +=
+				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
+					    t->perf_counter[i] / interval_float / 10000);
+			else
+				outp +=
+				    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc);
+		}
+	}
+
 	/* C1 */
 	if (DO_BIC(BIC_CPU_c1))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
@@ -2526,6 +2654,24 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 	}
 
+	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 32)
+				outp +=
+				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+					    (unsigned int)c->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]);
+		} else if (pp->format == FORMAT_DELTA) {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]);
+		} else if (pp->format == FORMAT_PERCENT) {
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc);
+		}
+	}
+
 	fmt8 = "%s%.2f";
 
 	if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
@@ -2680,6 +2826,26 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 			outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000);
 	}
 
+	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW) {
+			if (pp->width == 32)
+				outp +=
+				    sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+					    (unsigned int)p->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]);
+		} else if (pp->format == FORMAT_DELTA) {
+			if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
+				outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]);
+			else
+				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]);
+		} else if (pp->format == FORMAT_PERCENT) {
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc);
+		} else if (pp->type == COUNTER_K2M)
+			outp +=
+			    sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000);
+	}
+
 done:
 	if (*(outp - 1) != '\n')
 		outp += sprintf(outp, "\n");
@@ -2733,6 +2899,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 {
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 
 	if (DO_BIC(BIC_Totl_c0))
 		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
@@ -2793,6 +2960,15 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 			old->counter[i] = new->counter[i] - old->counter[i];
 	}
 
+	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			old->perf_counter[i] = new->perf_counter[i];
+		else if (pp->format == FORMAT_AVERAGE)
+			old->perf_counter[i] = new->perf_counter[i];
+		else
+			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+	}
+
 	return 0;
 }
 
@@ -2800,6 +2976,7 @@ void delta_core(struct core_data *new, struct core_data *old)
 {
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 
 	old->c3 = new->c3 - old->c3;
 	old->c6 = new->c6 - old->c6;
@@ -2816,6 +2993,13 @@ void delta_core(struct core_data *new, struct core_data *old)
 		else
 			old->counter[i] = new->counter[i] - old->counter[i];
 	}
+
+	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			old->perf_counter[i] = new->perf_counter[i];
+		else
+			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+	}
 }
 
 int soft_c1_residency_display(int bic)
@@ -2833,6 +3017,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 {
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 
 	/* we run cpuid just the 1st time, copy the results */
 	if (DO_BIC(BIC_APIC))
@@ -2911,6 +3096,14 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 		else
 			old->counter[i] = new->counter[i] - old->counter[i];
 	}
+
+	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			old->perf_counter[i] = new->perf_counter[i];
+		else
+			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
+	}
+
 	return 0;
 }
 
@@ -3013,6 +3206,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 
 	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
 		p->counter[i] = 0;
+
+	memset(&t->perf_counter[0], 0, sizeof(t->perf_counter));
+	memset(&c->perf_counter[0], 0, sizeof(c->perf_counter));
+	memset(&p->perf_counter[0], 0, sizeof(p->perf_counter));
 }
 
 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
@@ -3033,6 +3230,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 {
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 
 	/* copy un-changing apic_id's */
 	if (DO_BIC(BIC_APIC))
@@ -3063,6 +3261,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		average.threads.counter[i] += t->counter[i];
 	}
 
+	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			continue;
+		average.threads.perf_counter[i] += t->perf_counter[i];
+	}
+
 	/* sum per-core values only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c, p))
 		return 0;
@@ -3083,6 +3287,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		average.cores.counter[i] += c->counter[i];
 	}
 
+	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			continue;
+		average.cores.perf_counter[i] += c->perf_counter[i];
+	}
+
 	/* sum per-pkg values only for 1st core in pkg */
 	if (!is_cpu_first_core_in_package(t, c, p))
 		return 0;
@@ -3134,6 +3344,14 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		else
 			average.packages.counter[i] += p->counter[i];
 	}
+
+	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+		if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0))
+			average.packages.perf_counter[i] = p->perf_counter[i];
+		else
+			average.packages.perf_counter[i] += p->perf_counter[i];
+	}
+
 	return 0;
 }
 
@@ -3145,6 +3363,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 {
 	int i;
 	struct msr_counter *mp;
+	struct perf_counter_info *pp;
 
 	clear_counters(&average.threads, &average.cores, &average.packages);
 
@@ -3216,6 +3435,35 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 		average.packages.counter[i] /= topo.allowed_packages;
 	}
+
+	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			continue;
+		if (pp->type == COUNTER_ITEMS) {
+			if (average.threads.perf_counter[i] > 9999999)
+				sums_need_wide_columns = 1;
+			continue;
+		}
+		average.threads.perf_counter[i] /= topo.allowed_cpus;
+	}
+	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			continue;
+		if (pp->type == COUNTER_ITEMS) {
+			if (average.cores.perf_counter[i] > 9999999)
+				sums_need_wide_columns = 1;
+		}
+		average.cores.perf_counter[i] /= topo.allowed_cores;
+	}
+	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
+		if (pp->format == FORMAT_RAW)
+			continue;
+		if (pp->type == COUNTER_ITEMS) {
+			if (average.packages.perf_counter[i] > 9999999)
+				sums_need_wide_columns = 1;
+		}
+		average.packages.perf_counter[i] /= topo.allowed_packages;
+	}
 }
 
 static unsigned long long rdtsc(void)
@@ -3848,6 +4096,31 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
 	return 0;
 }
 
+int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size)
+{
+	unsigned int domain;
+	unsigned long long value;
+	int fd_counter;
+
+	for (size_t i = 0; pp; ++i, pp = pp->next) {
+		domain = cpu_to_domain(pp, cpu);
+		assert(domain < pp->num_domains);
+
+		fd_counter = pp->fd_perf_per_domain[domain];
+
+		if (fd_counter == -1)
+			continue;
+
+		if (read(fd_counter, &value, sizeof(value)) != sizeof(value))
+			return 1;
+
+		assert(i < out_size);
+		out[i] = value * pp->scale;
+	}
+
+	return 0;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
@@ -3889,6 +4162,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 			return -10;
 	}
 
+	if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS))
+		return -10;
+
 	/* collect core counters only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c, p))
 		goto done;
@@ -3927,6 +4203,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 			return -10;
 	}
 
+	if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS))
+		return -10;
+
 	/* collect package counters only for 1st core in package */
 	if (!is_cpu_first_core_in_package(t, c, p))
 		goto done;
@@ -3999,6 +4278,10 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		if (get_mp(cpu, mp, &p->counter[i], path))
 			return -10;
 	}
+
+	if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS))
+		return -10;
+
 done:
 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
 
@@ -4528,6 +4811,36 @@ void free_fd_rapl_percpu(void)
 	rapl_counter_info_perdomain_size = 0;
 }
 
+void free_fd_added_perf_counters_(struct perf_counter_info *pp)
+{
+	if (!pp)
+		return;
+
+	if (!pp->fd_perf_per_domain)
+		return;
+
+	while (pp) {
+		for (size_t domain = 0; domain < pp->num_domains; ++domain) {
+			if (pp->fd_perf_per_domain[domain] != -1) {
+				close(pp->fd_perf_per_domain[domain]);
+				pp->fd_perf_per_domain[domain] = -1;
+			}
+		}
+
+		free(pp->fd_perf_per_domain);
+		pp->fd_perf_per_domain = NULL;
+
+		pp = pp->next;
+	}
+}
+
+void free_fd_added_perf_counters(void)
+{
+	free_fd_added_perf_counters_(sys.perf_tp);
+	free_fd_added_perf_counters_(sys.perf_cp);
+	free_fd_added_perf_counters_(sys.perf_pp);
+}
+
 void free_all_buffers(void)
 {
 	int i;
@@ -4573,6 +4886,7 @@ void free_all_buffers(void)
 	free_fd_msr();
 	free_fd_rapl_percpu();
 	free_fd_cstate();
+	free_fd_added_perf_counters();
 
 	free(irq_column_2_cpu);
 	free(irqs_per_cpu);
@@ -4910,6 +5224,7 @@ void linux_perf_init(void);
 void msr_perf_init(void);
 void rapl_perf_init(void);
 void cstate_perf_init(void);
+void added_perf_counters_init(void);
 
 void re_initialize(void)
 {
@@ -4919,6 +5234,7 @@ void re_initialize(void)
 	msr_perf_init();
 	rapl_perf_init();
 	cstate_perf_init();
+	added_perf_counters_init();
 	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
 		topo.allowed_cpus);
 }
@@ -7859,6 +8175,117 @@ void check_perf_access(void)
 		bic_enabled &= ~BIC_IPC;
 }
 
+int added_perf_counters_init_(struct perf_counter_info *pinfo)
+{
+	size_t num_domains = 0;
+	unsigned int next_domain;
+	bool *domain_visited;
+	unsigned int perf_type, perf_config;
+	double perf_scale;
+	int fd_perf;
+
+	if (!pinfo)
+		return 0;
+
+	const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1));
+
+	domain_visited = calloc(max_num_domains, sizeof(*domain_visited));
+
+	while (pinfo) {
+		switch (pinfo->scope) {
+		case SCOPE_CPU:
+			num_domains = topo.max_cpu_num + 1;
+			break;
+
+		case SCOPE_CORE:
+			num_domains = topo.max_core_id + 1;
+			break;
+
+		case SCOPE_PACKAGE:
+			num_domains = topo.max_package_id + 1;
+			break;
+		}
+
+		/* Allocate buffer for file descriptor for each domain. */
+		pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain));
+		if (!pinfo->fd_perf_per_domain)
+			errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain");
+
+		for (size_t i = 0; i < num_domains; ++i)
+			pinfo->fd_perf_per_domain[i] = -1;
+
+		pinfo->num_domains = num_domains;
+		pinfo->scale = 1.0;
+
+		memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited));
+
+		for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
+
+			next_domain = cpu_to_domain(pinfo, cpu);
+
+			assert(next_domain < num_domains);
+
+			if (cpu_is_not_allowed(cpu))
+				continue;
+
+			if (domain_visited[next_domain])
+				continue;
+
+			perf_type = read_perf_type(pinfo->device);
+			if (perf_type == (unsigned int)-1) {
+				warnx("%s: perf/%s/%s: failed to read %s",
+				      __func__, pinfo->device, pinfo->event, "type");
+				continue;
+			}
+
+			perf_config = read_rapl_config(pinfo->device, pinfo->event);
+			if (perf_config == (unsigned int)-1) {
+				warnx("%s: perf/%s/%s: failed to read %s",
+				      __func__, pinfo->device, pinfo->event, "config");
+				continue;
+			}
+
+			/* Scale is not required, some counters just don't have it. */
+			perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event);
+			if (perf_scale == 0.0)
+				perf_scale = 1.0;
+
+			fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0);
+			if (fd_perf == -1) {
+				warnx("%s: perf/%s/%s: failed to open counter on cpu%d",
+				      __func__, pinfo->device, pinfo->event, cpu);
+				continue;
+			}
+
+			domain_visited[next_domain] = 1;
+			pinfo->fd_perf_per_domain[next_domain] = fd_perf;
+			pinfo->scale = perf_scale;
+
+			if (debug)
+				printf("Add perf/%s/%s cpu%d: %d\n",
+				       pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
+		}
+
+		pinfo = pinfo->next;
+	}
+
+	free(domain_visited);
+
+	return 0;
+}
+
+void added_perf_counters_init(void)
+{
+	if (added_perf_counters_init_(sys.perf_tp))
+		errx(1, "%s: %s", __func__, "thread");
+
+	if (added_perf_counters_init_(sys.perf_cp))
+		errx(1, "%s: %s", __func__, "core");
+
+	if (added_perf_counters_init_(sys.perf_pp))
+		errx(1, "%s: %s", __func__, "package");
+}
+
 void turbostat_init()
 {
 	setup_all_buffers(true);
@@ -7872,6 +8299,7 @@ void turbostat_init()
 	linux_perf_init();
 	rapl_perf_init();
 	cstate_perf_init();
+	added_perf_counters_init();
 
 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
@@ -8061,6 +8489,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 		msrp = calloc(1, sizeof(struct msr_counter));
 		if (msrp == NULL)
 			err(-1, "calloc msr_counter");
+
 		msrp->msr_num = msr_num;
 		strncpy(msrp->name, name, NAME_BYTES - 1);
 		msrp->width = width;
@@ -8101,11 +8530,106 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 	return 0;
 }
 
+/*
+ * Initialize the fields used for identifying and opening the counter.
+ *
+ * Defer the initialization of any runtime buffers for actually reading
+ * the counters for when we initialize all perf counters, so we can later
+ * easily call re_initialize().
+ */
+struct perf_counter_info *make_perf_counter_info(const char *perf_device,
+						 const char *perf_event,
+						 const char *name,
+						 unsigned int width,
+						 enum counter_scope scope,
+						 enum counter_type type, enum counter_format format)
+{
+	struct perf_counter_info *pinfo;
+
+	pinfo = calloc(1, sizeof(*pinfo));
+	if (!pinfo)
+		errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event);
+
+	strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1);
+	strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1);
+
+	strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1);
+	pinfo->width = width;
+	pinfo->scope = scope;
+	pinfo->type = type;
+	pinfo->format = format;
+
+	return pinfo;
+}
+
+int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width,
+		     enum counter_scope scope, enum counter_type type, enum counter_format format)
+{
+	struct perf_counter_info *pinfo;
+
+	switch (scope) {
+	case SCOPE_CPU:
+		if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) {
+			warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event);
+			return -1;
+		}
+		break;
+
+	case SCOPE_CORE:
+		if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) {
+			warnx("ignoring core counter perf/%s/%s", perf_device, perf_event);
+			return -1;
+		}
+		break;
+
+	case SCOPE_PACKAGE:
+		if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) {
+			warnx("ignoring package counter perf/%s/%s", perf_device, perf_event);
+			return -1;
+		}
+		break;
+	}
+
+	pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format);
+
+	if (!pinfo)
+		return -1;
+
+	switch (scope) {
+	case SCOPE_CPU:
+		pinfo->next = sys.perf_tp;
+		sys.perf_tp = pinfo;
+		++sys.added_thread_perf_counters;
+		break;
+
+	case SCOPE_CORE:
+		pinfo->next = sys.perf_cp;
+		sys.perf_cp = pinfo;
+		++sys.added_core_perf_counters;
+		break;
+
+	case SCOPE_PACKAGE:
+		pinfo->next = sys.perf_pp;
+		sys.perf_pp = pinfo;
+		++sys.added_package_perf_counters;
+		break;
+	}
+
+	// FIXME: we might not have debug here yet
+	if (debug)
+		printf("%s: %s/%s, name: %s, scope%d\n",
+		       __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
+
+	return 0;
+}
+
 void parse_add_command(char *add_command)
 {
 	int msr_num = 0;
 	char *path = NULL;
-	char name_buffer[NAME_BYTES] = "";
+	char perf_device[PERF_DEV_NAME_BYTES] = "";
+	char perf_event[PERF_EVT_NAME_BYTES] = "";
+	char name_buffer[PERF_NAME_BYTES] = "";
 	int width = 64;
 	int fail = 0;
 	enum counter_scope scope = SCOPE_CPU;
@@ -8120,6 +8644,11 @@ void parse_add_command(char *add_command)
 		if (sscanf(add_command, "msr%d", &msr_num) == 1)
 			goto next;
 
+		BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31);
+		BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31);
+		if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2)
+			goto next;
+
 		if (*add_command == '/') {
 			path = add_command;
 			goto next;
@@ -8167,7 +8696,8 @@ void parse_add_command(char *add_command)
 			goto next;
 		}
 
-		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {	/* 18 < NAME_BYTES */
+		BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18);
+		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {
 			char *eos;
 
 			eos = strchr(name_buffer, ',');
@@ -8184,21 +8714,33 @@ void parse_add_command(char *add_command)
 		}
 
 	}
-	if ((msr_num == 0) && (path == NULL)) {
-		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter ) required\n");
+	if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) {
+		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n");
 		fail++;
 	}
 
+	/* Test for non-empty perf_device and perf_event */
+	const bool is_perf_counter = perf_device[0] && perf_event[0];
+
 	/* generate default column header */
 	if (*name_buffer == '\0') {
-		if (width == 32)
-			sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
-		else
-			sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+		if (is_perf_counter) {
+			snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event);
+		} else {
+			if (width == 32)
+				sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+			else
+				sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
+		}
 	}
 
-	if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
-		fail++;
+	if (is_perf_counter) {
+		if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format))
+			fail++;
+	} else {
+		if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
+			fail++;
+	}
 
 	if (fail) {
 		help();

From 25826c20da55a114c2cba8c4f237dfe7a7b4f8f6 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Tue, 9 Jul 2024 14:22:30 +0200
Subject: [PATCH 046/346] tools/power turbostat: Fix formatting in turbostat.8

We had an extra "+" at the beginning of some lines that look like a
poorly formated patch.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 5537fc6b5bc3..8a21e9b56071 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -70,10 +70,10 @@ The column name "all" can be used to enable all disabled-by-default built-in cou
 .PP
 \fB--quiet\fP Do not decode and print the system configuration header information.
 .PP
-+\fB--no-msr\fP Disable all the uses of the MSR driver.
-+.PP
-+\fB--no-perf\fP Disable all the uses of the perf API.
-+.PP
+\fB--no-msr\fP Disable all the uses of the MSR driver.
+.PP
+\fB--no-perf\fP Disable all the uses of the perf API.
+.PP
 \fB--interval seconds\fP overrides the default 5.0 second measurement interval.
 .PP
 \fB--num_iterations num\fP number of the measurement iterations.

From 9f50066b0dd47cff045ba6da37e6def52783ba55 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Tue, 9 Jul 2024 14:46:20 +0200
Subject: [PATCH 047/346] tools/power turbostat: Add perf added counter example
 to turbostat.8

We had few lines about the feature, but without any complete examples.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 8a21e9b56071..71ae2d5159ad 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -336,6 +336,24 @@ CPU	  PRF_CTRL
 
 .fi
 
+.SH ADD PERF COUNTER EXAMPLE
+Here we limit turbostat to showing just the CPU number for cpu0 - cpu3.
+We add a counter showing time spent in C1 core cstate,
+labeling it with the column header, "pCPU%c1", and display it only once,
+after the conclusion of 0.1 second sleep.
+We also show CPU%c1 built-in counter that should show similar values.
+.nf
+sudo ./turbostat --quiet --cpu 0-3 --show CPU,CPU%c1 --add perf/cstate_core/c1-residency,cpu,delta,percent,pCPU%c1 sleep .1
+0.102448 sec
+CPU     pCPU%c1 CPU%c1
+-       34.89   34.89
+0       45.99   45.99
+1       45.94   45.94
+2       23.83   23.83
+3       23.84   23.84
+
+.fi
+
 .SH INPUT
 
 For interval-mode, turbostat will immediately end the current interval

From 478a01016c08d33635359254fc1ab61a8c83dd1a Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Tue, 9 Jul 2024 14:53:14 +0200
Subject: [PATCH 048/346] tools/power turbostat: Fix typo in turbostat.8

"After" was missing an "r", nothing to see here.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 71ae2d5159ad..44cbc7cb382d 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -323,7 +323,7 @@ available on all processors.
 Here we limit turbostat to showing just the CPU number for cpu0 - cpu3.
 We add a counter showing the 32-bit raw value of MSR 0x199 (MSR_IA32_PERF_CTL),
 labeling it with the column header, "PRF_CTRL", and display it only once,
-afte the conclusion of a 0.1 second sleep.
+after the conclusion of a 0.1 second sleep.
 .nf
 sudo ./turbostat --quiet --cpu 0-3 --show CPU --add msr0x199,u32,raw,PRF_CTRL sleep .1
 0.101604 sec

From e516211f615fb54ce3429870eefc17469ae289b8 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:05:56 +0200
Subject: [PATCH 049/346] rust: macros: indent list item in `paste!`'s docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A new style lint, `doc_lazy_continuation` [1], has been introduced in the
upcoming Rust 1.80 (currently in beta), which detects missing indentation
in code documentation.

We have one such case:

    error: doc list item missing indentation
    --> rust/macros/lib.rs:315:5
        |
    315 | /// default the span of the `[< >]` group is used.
        |     ^
        |
        = help: if this is supposed to be its own paragraph, add a blank line
        = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#doc_lazy_continuation
        = note: `-D clippy::doc-lazy-continuation` implied by `-D clippy::style`
        = help: to override `-D clippy::style` add `#[allow(clippy::doc_lazy_continuation)]`
    help: indent this line
        |
    315 | ///   default the span of the `[< >]` group is used.
        |     ++

While the rendering of the docs by `rustdoc` is not affected, we apply
this kind of indentation elsewhere since it looks better.

Thus clean it up.

Link: https://rust-lang.github.io/rust-clippy/master/index.html#/doc_lazy_continuation [1]
Reviewed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-2-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/macros/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs
index 6dcc72aff111..159e75292970 100644
--- a/rust/macros/lib.rs
+++ b/rust/macros/lib.rs
@@ -345,7 +345,7 @@ pub fn pinned_drop(args: TokenStream, input: TokenStream) -> TokenStream {
 ///
 /// Currently supported modifiers are:
 /// * `span`: change the span of concatenated identifier to the span of the specified token. By
-/// default the span of the `[< >]` group is used.
+///   default the span of the `[< >]` group is used.
 /// * `lower`: change the identifier to lower case.
 /// * `upper`: change the identifier to upper case.
 ///

From dee1396a486cf2b6e7840322f6d104680649f2ff Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:05:57 +0200
Subject: [PATCH 050/346] rust: init: simplify from `map_err` to `inspect_err`

A new complexity lint, `manual_inspect` [1], has been introduced in
the upcoming Rust 1.81 (currently in nightly), which checks for uses of
`map*` which return the original item:

    error:
    --> rust/kernel/init.rs:846:23
        |
    846 |         (self.1)(val).map_err(|e| {
        |                       ^^^^^^^
        |
        = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#manual_inspect
        = note: `-D clippy::manual-inspect` implied by `-D warnings`
        = help: to override `-D warnings` add `#[allow(clippy::manual_inspect)]`
    help: try
        |
    846 ~         (self.1)(val).inspect_err(|e| {
    847 |             // SAFETY: `slot` was initialized above.
    848 ~             unsafe { core::ptr::drop_in_place(slot) };
        |

Thus clean them up.

Link: https://rust-lang.github.io/rust-clippy/master/index.html#/manual_inspect [1]
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-3-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/kernel/init.rs | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/rust/kernel/init.rs b/rust/kernel/init.rs
index 68605b633e73..495c09ebe3a3 100644
--- a/rust/kernel/init.rs
+++ b/rust/kernel/init.rs
@@ -843,11 +843,8 @@ unsafe fn __pinned_init(self, slot: *mut T) -> Result<(), E> {
         let val = unsafe { &mut *slot };
         // SAFETY: `slot` is considered pinned.
         let val = unsafe { Pin::new_unchecked(val) };
-        (self.1)(val).map_err(|e| {
-            // SAFETY: `slot` was initialized above.
-            unsafe { core::ptr::drop_in_place(slot) };
-            e
-        })
+        // SAFETY: `slot` was initialized above.
+        (self.1)(val).inspect_err(|_| unsafe { core::ptr::drop_in_place(slot) })
     }
 }
 
@@ -941,11 +938,9 @@ unsafe fn __init(self, slot: *mut T) -> Result<(), E> {
         // SAFETY: All requirements fulfilled since this function is `__init`.
         unsafe { self.0.__pinned_init(slot)? };
         // SAFETY: The above call initialized `slot` and we still have unique access.
-        (self.1)(unsafe { &mut *slot }).map_err(|e| {
+        (self.1)(unsafe { &mut *slot }).inspect_err(|_|
             // SAFETY: `slot` was initialized above.
-            unsafe { core::ptr::drop_in_place(slot) };
-            e
-        })
+            unsafe { core::ptr::drop_in_place(slot) })
     }
 }
 

From f85bea18f71b2817ea45d63c6d1b91f9bc4a811f Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:05:58 +0200
Subject: [PATCH 051/346] rust: allow `dead_code` for never constructed
 bindings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Starting with the upcoming Rust 1.80.0 (since upstream commit 35130d7233e9
("Detect pub structs never constructed and unused associated constants
in traits")), the `dead_code` pass detects more cases, which triggers
in the `bindings` crate:

    warning: struct `boot_params` is never constructed
        --> rust/bindings/bindings_generated.rs:10684:12
        |
    10684 | pub struct boot_params {
        |            ^^^^^^^^^^^
        |
        = note: `#[warn(dead_code)]` on by default

As well as in the `uapi` one:

    warning: struct `boot_params` is never constructed
        --> rust/uapi/uapi_generated.rs:10392:12
        |
    10392 | pub struct boot_params {
        |            ^^^^^^^^^^^
        |
        = note: `#[warn(dead_code)]` on by default

These are all expected, since we do not use all the structs in the
bindings that `bindgen` generates from the C headers.

Therefore, allow them.

Reviewed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-4-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/bindings/lib.rs | 1 +
 rust/uapi/lib.rs     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs
index 40ddaee50d8b..93a1a3fc97bc 100644
--- a/rust/bindings/lib.rs
+++ b/rust/bindings/lib.rs
@@ -24,6 +24,7 @@
     unsafe_op_in_unsafe_fn
 )]
 
+#[allow(dead_code)]
 mod bindings_raw {
     // Use glob import here to expose all helpers.
     // Symbols defined within the module will take precedence to the glob import.
diff --git a/rust/uapi/lib.rs b/rust/uapi/lib.rs
index 0caad902ba40..80a00260e3e7 100644
--- a/rust/uapi/lib.rs
+++ b/rust/uapi/lib.rs
@@ -14,6 +14,7 @@
 #![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))]
 #![allow(
     clippy::all,
+    dead_code,
     missing_docs,
     non_camel_case_types,
     non_upper_case_globals,

From f8f88aa25a03ce1e0fc8a9842840988b870f0c37 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:05:59 +0200
Subject: [PATCH 052/346] rust: relax most deny-level lints to warnings

Since we are starting to support several Rust toolchains, lints (including
Clippy ones) now may behave differently and lint groups may include
new lints.

Therefore, to maximize the chances a given version works, relax some
deny-level lints to warnings. It may also make our lives a bit easier
while developing new code or refactoring.

To be clear, the requirements for in-tree code are still the same, since
Rust code still needs to be warning-free (patches should be clean under
`WERROR=y`) and the set of lints is not changed.

`unsafe_op_in_unsafe_fn` is left unmodified, i.e. as an error, since it is
becoming the default in the language (warn-by-default in Rust 2024 [1] and
ideally an error later on) and thus it should also be very well tested. In
addition, it is simple enough that it should not have false positives
(unlike e.g. `rust_2018_idioms`'s `explicit_outlives_requirements`).

`non_ascii_idents` is left unmodified as well, i.e. as an error, since
it is unlikely one gains any productivity during development if it
were a warning (in fact, it may be worse, since it is likely one made
a typo). In addition, it should not have false positives.

Finally, put the two `-D` ones at the top and take the chance to do one
per line.

Link: https://github.com/rust-lang/rust/pull/112038 [1]
Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-5-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Makefile      | 24 +++++++++++++-----------
 rust/Makefile |  4 ++--
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/Makefile b/Makefile
index fea263aaa492..7ea526814fdb 100644
--- a/Makefile
+++ b/Makefile
@@ -461,17 +461,19 @@ KBUILD_USERLDFLAGS := $(USERLDFLAGS)
 # host programs.
 export rust_common_flags := --edition=2021 \
 			    -Zbinary_dep_depinfo=y \
-			    -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \
-			    -Dunreachable_pub -Dnon_ascii_idents \
+			    -Dunsafe_op_in_unsafe_fn \
+			    -Dnon_ascii_idents \
+			    -Wrust_2018_idioms \
+			    -Wunreachable_pub \
 			    -Wmissing_docs \
-			    -Drustdoc::missing_crate_level_docs \
-			    -Dclippy::correctness -Dclippy::style \
-			    -Dclippy::suspicious -Dclippy::complexity \
-			    -Dclippy::perf \
-			    -Dclippy::let_unit_value -Dclippy::mut_mut \
-			    -Dclippy::needless_bitwise_bool \
-			    -Dclippy::needless_continue \
-			    -Dclippy::no_mangle_with_rust_abi \
+			    -Wrustdoc::missing_crate_level_docs \
+			    -Wclippy::correctness -Wclippy::style \
+			    -Wclippy::suspicious -Wclippy::complexity \
+			    -Wclippy::perf \
+			    -Wclippy::let_unit_value -Wclippy::mut_mut \
+			    -Wclippy::needless_bitwise_bool \
+			    -Wclippy::needless_continue \
+			    -Wclippy::no_mangle_with_rust_abi \
 			    -Wclippy::dbg_macro
 
 KBUILD_HOSTCFLAGS   := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
@@ -572,7 +574,7 @@ KBUILD_RUSTFLAGS := $(rust_common_flags) \
 		    -Csymbol-mangling-version=v0 \
 		    -Crelocation-model=static \
 		    -Zfunction-sections=n \
-		    -Dclippy::float_arithmetic
+		    -Wclippy::float_arithmetic
 
 KBUILD_AFLAGS_KERNEL :=
 KBUILD_CFLAGS_KERNEL :=
diff --git a/rust/Makefile b/rust/Makefile
index 385378311322..bf05e65365da 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -367,7 +367,7 @@ ifneq ($(or $(CONFIG_ARM64),$(and $(CONFIG_RISCV),$(CONFIG_64BIT))),)
 endif
 
 $(obj)/core.o: private skip_clippy = 1
-$(obj)/core.o: private skip_flags = -Dunreachable_pub
+$(obj)/core.o: private skip_flags = -Wunreachable_pub
 $(obj)/core.o: private rustc_objcopy = $(foreach sym,$(redirect-intrinsics),--redefine-sym $(sym)=__rust$(sym))
 $(obj)/core.o: private rustc_target_flags = $(core-cfgs)
 $(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs FORCE
@@ -381,7 +381,7 @@ $(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE
 	+$(call if_changed_dep,rustc_library)
 
 $(obj)/alloc.o: private skip_clippy = 1
-$(obj)/alloc.o: private skip_flags = -Dunreachable_pub
+$(obj)/alloc.o: private skip_flags = -Wunreachable_pub
 $(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs)
 $(obj)/alloc.o: $(RUST_LIB_SRC)/alloc/src/lib.rs $(obj)/compiler_builtins.o FORCE
 	+$(call if_changed_dep,rustc_library)

From bb421b517e4b05d1e971c1fbf6af2f241accbf52 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:00 +0200
Subject: [PATCH 053/346] rust: simplify Clippy warning flags set

All Clippy lint groups that we enable, except `correctness`, have a
default `warn` level, thus they may be removed now that we relaxed all
lints to `warn`.

Moreover, Clippy provides an `all` lint group that covers the groups
we enable by default. Thus just use `all` instead -- the only change is
that, if Clippy introduces a new lint group or splits an existing one,
we will cover that one automatically.

In addition, `let_unit_value` is in `style` since Rust 1.62.0, thus it
does not need to be enabled manually.

Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-6-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Makefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 7ea526814fdb..9044fdb9adb1 100644
--- a/Makefile
+++ b/Makefile
@@ -467,10 +467,8 @@ export rust_common_flags := --edition=2021 \
 			    -Wunreachable_pub \
 			    -Wmissing_docs \
 			    -Wrustdoc::missing_crate_level_docs \
-			    -Wclippy::correctness -Wclippy::style \
-			    -Wclippy::suspicious -Wclippy::complexity \
-			    -Wclippy::perf \
-			    -Wclippy::let_unit_value -Wclippy::mut_mut \
+			    -Wclippy::all \
+			    -Wclippy::mut_mut \
 			    -Wclippy::needless_bitwise_bool \
 			    -Wclippy::needless_continue \
 			    -Wclippy::no_mangle_with_rust_abi \

From 63b27f4a0074bc6ef987a44ee9ad8bf960b568c2 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:01 +0200
Subject: [PATCH 054/346] rust: start supporting several compiler versions

It is time to start supporting several Rust compiler versions and thus
establish a minimum Rust version.

We may still want to upgrade the minimum sometimes in the beginning since
there may be important features coming into the language that improve
how we write code (e.g. field projections), which may or may not make
sense to support conditionally.

We will start with a window of two stable releases, and widen it over
time. Thus this patch does not move the current minimum (1.78.0), but
instead adds support for the recently released 1.79.0.

This should already be enough for kernel developers in distributions that
provide recent Rust compiler versions routinely, such as Arch Linux,
Debian Unstable (outside the freeze period), Fedora Linux, Gentoo
Linux (especially the testing channel), Nix (unstable) and openSUSE
Tumbleweed. See the documentation patch about it later in this series.

In addition, Rust for Linux is now being built-tested in Rust's pre-merge
CI [1]. That is, every change that is attempting to land into the Rust
compiler is tested against the kernel, and it is merged only if it passes
-- thanks to the Rust project for that!

Thus, with the pre-merge CI in place, both projects hope to avoid
unintentional changes to Rust that break the kernel. This means that,
in general, apart from intentional changes on their side (that we will
need to workaround conditionally on our side), the upcoming Rust compiler
versions should generally work.

For instance, currently, the beta (1.80.0) and nightly (1.81.0) branches
work as well.

Of course, the Rust for Linux CI job in the Rust toolchain may still need
to be temporarily disabled for different reasons, but the intention is
to help bring Rust for Linux into stable Rust.

Link: https://github.com/rust-lang/rust/pull/125209 [1]
Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-7-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/process/changes.rst  |  4 +---
 Documentation/rust/quick-start.rst | 15 +++++++--------
 scripts/rust_is_available.sh       |  8 --------
 scripts/rust_is_available_test.py  |  5 -----
 4 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 5685d7bfe4d0..0d0b7120792b 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -88,9 +88,7 @@ docs on :ref:`Building Linux with Clang/LLVM <kbuild_llvm>`.
 Rust (optional)
 ---------------
 
-A particular version of the Rust toolchain is required. Newer versions may or
-may not work because the kernel depends on some unstable Rust features, for
-the moment.
+A recent version of the Rust compiler is required.
 
 Each Rust toolchain comes with several "components", some of which are required
 (like ``rustc``) and some that are optional. The ``rust-src`` component (which
diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index ac2f16288458..89bbfde8c96c 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -36,16 +36,15 @@ if that is the case.
 rustc
 *****
 
-A particular version of the Rust compiler is required. Newer versions may or
-may not work because, for the moment, the kernel depends on some unstable
-Rust features.
+A recent version of the Rust compiler is required.
 
 If ``rustup`` is being used, enter the kernel build directory (or use
-``--path=<build-dir>`` argument to the ``set`` sub-command) and run::
+``--path=<build-dir>`` argument to the ``set`` sub-command) and run,
+for instance::
 
-	rustup override set $(scripts/min-tool-version.sh rustc)
+	rustup override set stable
 
-This will configure your working directory to use the correct version of
+This will configure your working directory to use the given version of
 ``rustc`` without affecting your default toolchain.
 
 Note that the override applies to the current working directory (and its
@@ -72,9 +71,9 @@ version later on requires re-adding the component.
 Otherwise, if a standalone installer is used, the Rust source tree may be
 downloaded into the toolchain's installation folder::
 
-	curl -L "https://static.rust-lang.org/dist/rust-src-$(scripts/min-tool-version.sh rustc).tar.gz" |
+	curl -L "https://static.rust-lang.org/dist/rust-src-$(rustc --version | cut -d' ' -f2).tar.gz" |
 		tar -xzf - -C "$(rustc --print sysroot)/lib" \
-		"rust-src-$(scripts/min-tool-version.sh rustc)/rust-src/lib/" \
+		"rust-src-$(rustc --version | cut -d' ' -f2)/rust-src/lib/" \
 		--strip-components=3
 
 In this case, upgrading the Rust compiler version later on requires manually
diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh
index 117018946b57..67cb900124cc 100755
--- a/scripts/rust_is_available.sh
+++ b/scripts/rust_is_available.sh
@@ -117,14 +117,6 @@ if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then
 	echo >&2 "***"
 	exit 1
 fi
-if [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then
-	echo >&2 "***"
-	echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work."
-	echo >&2 "***   Your version:     $rust_compiler_version"
-	echo >&2 "***   Expected version: $rust_compiler_min_version"
-	echo >&2 "***"
-	warning=1
-fi
 
 # Check that the Rust bindings generator is suitable.
 #
diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py
index 57613fe5ed75..a255f79aafc2 100755
--- a/scripts/rust_is_available_test.py
+++ b/scripts/rust_is_available_test.py
@@ -193,11 +193,6 @@ else:
         result = self.run_script(self.Expected.FAILURE, { "RUSTC": rustc })
         self.assertIn(f"Rust compiler '{rustc}' is too old.", result.stderr)
 
-    def test_rustc_new_version(self):
-        rustc = self.generate_rustc("rustc 1.999.0 (a8314ef7d 2099-06-27)")
-        result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "RUSTC": rustc })
-        self.assertIn(f"Rust compiler '{rustc}' is too new. This may or may not work.", result.stderr)
-
     def test_bindgen_nonexecutable(self):
         result = self.run_script(self.Expected.FAILURE, { "BINDGEN": self.nonexecutable })
         self.assertIn(f"Running '{self.nonexecutable}' to check the Rust bindings generator version failed with", result.stderr)

From d49082faf6a001019693a837dea7b958048c731c Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:02 +0200
Subject: [PATCH 055/346] rust: avoid assuming a particular `bindgen` build

`bindgen`'s logic to find `libclang` (via `clang-sys`) may change over
time, and depends on how it was built (e.g. Linux distributions may decide
to build it differently, and we are going to provide documentation on
installing it via distributions later in this series).

Therefore, clarify that `bindgen` may be built in several ways and
simplify the documentation by only mentioning the most prominent
environment variable (`LIBCLANG_PATH`) as an example on how to tweak the
search of the library at runtime (i.e. when `bindgen` is built as our
documentation explains). This also avoids duplicating the documentation,
like `bindgen` itself does (i.e. it refers to `clang-sys`).

Similarly, replace the test we had for this (which used the real program)
with a mocked one, to avoid depending on the particular build as well.

Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-8-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/quick-start.rst | 21 ++++++++-------------
 scripts/rust_is_available_test.py  | 25 +++++++++++++++----------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index 89bbfde8c96c..5ea8c8914942 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -113,20 +113,15 @@ Install it via (note that this will download and build the tool from source)::
 
 	cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli
 
-``bindgen`` needs to find a suitable ``libclang`` in order to work. If it is
-not found (or a different ``libclang`` than the one found should be used),
-the process can be tweaked using the environment variables understood by
-``clang-sys`` (the Rust bindings crate that ``bindgen`` uses to access
-``libclang``):
+``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which
+may be linked statically, dynamically or loaded at runtime). By default, the
+``cargo`` command above will produce a ``bindgen`` binary that will load
+``libclang`` at runtime. If it is not found (or a different ``libclang`` than
+the one found should be used), the process can be tweaked, e.g. by using the
+``LIBCLANG_PATH`` environment variable. For details, please see ``clang-sys``'s
+documentation at:
 
-* ``LLVM_CONFIG_PATH`` can be pointed to an ``llvm-config`` executable.
-
-* Or ``LIBCLANG_PATH`` can be pointed to a ``libclang`` shared library
-  or to the directory containing it.
-
-* Or ``CLANG_PATH`` can be pointed to a ``clang`` executable.
-
-For details, please see ``clang-sys``'s documentation at:
+	https://github.com/KyleMayes/clang-sys#linking
 
 	https://github.com/KyleMayes/clang-sys#environment-variables
 
diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py
index a255f79aafc2..0481aab862ec 100755
--- a/scripts/rust_is_available_test.py
+++ b/scripts/rust_is_available_test.py
@@ -55,10 +55,15 @@ else:
 
     @classmethod
     def generate_bindgen(cls, version_stdout, libclang_stderr):
+        if libclang_stderr is None:
+            libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})"
+        else:
+            libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)"
+
         return cls.generate_executable(f"""#!/usr/bin/env python3
 import sys
 if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv):
-    print({repr(libclang_stderr)}, file=sys.stderr)
+    {libclang_case}
 else:
     print({repr(version_stdout)})
 """)
@@ -67,6 +72,10 @@ else:
     def generate_bindgen_version(cls, stdout):
         return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr)
 
+    @classmethod
+    def generate_bindgen_libclang_failure(cls):
+        return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, None)
+
     @classmethod
     def generate_bindgen_libclang(cls, stderr):
         return cls.generate_bindgen(cls.bindgen_default_bindgen_version_stdout, stderr)
@@ -89,6 +98,7 @@ else:
         cls.rust_default_sysroot = subprocess.check_output(("rustc", "--print", "sysroot")).decode().strip()
 
         cls.bindgen_default_bindgen_version_stdout = f"bindgen {cls.bindgen_default_version}"
+        cls.bindgen_default_bindgen_libclang_failure_exit_code = 42
         cls.bindgen_default_bindgen_libclang_stderr = f"scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version {cls.llvm_default_version} [-W#pragma-messages], err: false"
 
         cls.default_rustc = cls.generate_rustc(f"rustc {cls.rustc_default_version}")
@@ -227,15 +237,10 @@ else:
         self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr)
 
     def test_bindgen_libclang_failure(self):
-        for env in (
-            { "LLVM_CONFIG_PATH": self.missing },
-            { "LIBCLANG_PATH": self.missing },
-            { "CLANG_PATH": self.missing },
-        ):
-            with self.subTest(env=env):
-                result = self.run_script(self.Expected.FAILURE, env | { "PATH": os.environ["PATH"], "BINDGEN": "bindgen" })
-                self.assertIn("Running 'bindgen' to check the libclang version (used by the Rust", result.stderr)
-                self.assertIn("bindings generator) failed with code ", result.stderr)
+        bindgen = self.generate_bindgen_libclang_failure()
+        result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen })
+        self.assertIn(f"Running '{bindgen}' to check the libclang version (used by the Rust", result.stderr)
+        self.assertIn(f"bindings generator) failed with code {self.bindgen_default_bindgen_libclang_failure_exit_code}. This may be caused by", result.stderr)
 
     def test_bindgen_libclang_unexpected_version(self):
         bindgen = self.generate_bindgen_libclang("scripts/rust_is_available_bindgen_libclang.h:2:9: warning: clang version unexpected [-W#pragma-messages], err: false")

From 9e98db17837093cb0f4dcfcc3524739d93249c45 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:03 +0200
Subject: [PATCH 056/346] rust: work around `bindgen` 0.69.0 issue

`bindgen` 0.69.0 contains a bug: `--version` does not work without
providing a header [1]:

    error: the following required arguments were not provided:
      <HEADER>

    Usage: bindgen <FLAGS> <OPTIONS> <HEADER> -- <CLANG_ARGS>...

Thus, in preparation for supporting several `bindgen` versions, work
around the issue by passing a dummy argument.

Include a comment so that we can remove the workaround in the future.

Link: https://github.com/rust-lang/rust-bindgen/pull/2678 [1]
Reviewed-by: Finn Behrens <me@kloenk.dev>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-9-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 init/Kconfig                 | 5 ++++-
 scripts/rust_is_available.sh | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index febdea2afc3b..94e20d3b99d4 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1928,7 +1928,10 @@ config RUSTC_VERSION_TEXT
 config BINDGEN_VERSION_TEXT
 	string
 	depends on RUST
-	default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n)
+	# The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0
+	# (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when
+	# the minimum version is upgraded past that (0.69.1 already fixed the issue).
+	default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version workaround-for-0.69.0 || echo n)
 
 #
 # Place an empty function call at each tracepoint site. Can be
diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh
index 67cb900124cc..1881e8f2a2b9 100755
--- a/scripts/rust_is_available.sh
+++ b/scripts/rust_is_available.sh
@@ -121,8 +121,12 @@ fi
 # Check that the Rust bindings generator is suitable.
 #
 # Non-stable and distributions' versions may have a version suffix, e.g. `-dev`.
+#
+# The dummy parameter `workaround-for-0.69.0` is required to support 0.69.0
+# (https://github.com/rust-lang/rust-bindgen/pull/2678). It can be removed when
+# the minimum version is upgraded past that (0.69.1 already fixed the issue).
 rust_bindings_generator_output=$( \
-	LC_ALL=C "$BINDGEN" --version 2>/dev/null
+	LC_ALL=C "$BINDGEN" --version workaround-for-0.69.0 2>/dev/null
 ) || rust_bindings_generator_code=$?
 if [ -n "$rust_bindings_generator_code" ]; then
 	echo >&2 "***"

From c844fa64a2d46982fe75e834f4a46c46d2b3b2e5 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:04 +0200
Subject: [PATCH 057/346] rust: start supporting several `bindgen` versions

With both the workaround for `bindgen` 0.69.0 and the warning about
0.66.0 and 0.66.1 in place, start supporting several `bindgen` versions,
like it was done for the Rust compiler in a previous patch.

All other versions, including the latest 0.69.4, build without errors.

The `bindgen` project, like Rust, has also agreed to have the kernel
in their CI [1] -- thanks! This should help both projects: `bindgen`
will be able to detect early issues like those mentioned above, and the
kernel will be very likely build with new releases (at least for the
basic configuration being tested).

Link: https://github.com/rust-lang/rust-bindgen/pull/2851 [1]
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-10-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/quick-start.rst | 7 ++++---
 scripts/rust_is_available.sh       | 8 --------
 scripts/rust_is_available_test.py  | 5 -----
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index 5ea8c8914942..66cefbab8f9a 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -107,11 +107,12 @@ bindgen
 *******
 
 The bindings to the C side of the kernel are generated at build time using
-the ``bindgen`` tool. A particular version is required.
+the ``bindgen`` tool.
 
-Install it via (note that this will download and build the tool from source)::
+Install it, for instance, via (note that this will download and build the tool
+from source)::
 
-	cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen-cli
+	cargo install --locked bindgen-cli
 
 ``bindgen`` uses the ``clang-sys`` crate to find a suitable ``libclang`` (which
 may be linked statically, dynamically or loaded at runtime). By default, the
diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh
index 1881e8f2a2b9..4531f9dd19d3 100755
--- a/scripts/rust_is_available.sh
+++ b/scripts/rust_is_available.sh
@@ -161,14 +161,6 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers
 	echo >&2 "***"
 	exit 1
 fi
-if [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then
-	echo >&2 "***"
-	echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work."
-	echo >&2 "***   Your version:     $rust_bindings_generator_version"
-	echo >&2 "***   Expected version: $rust_bindings_generator_min_version"
-	echo >&2 "***"
-	warning=1
-fi
 
 # Check that the `libclang` used by the Rust bindings generator is suitable.
 #
diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py
index 0481aab862ec..d6d54b7ea42a 100755
--- a/scripts/rust_is_available_test.py
+++ b/scripts/rust_is_available_test.py
@@ -231,11 +231,6 @@ else:
         result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen })
         self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr)
 
-    def test_bindgen_new_version(self):
-        bindgen = self.generate_bindgen_version("bindgen 0.999.0")
-        result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen })
-        self.assertIn(f"Rust bindings generator '{bindgen}' is too new. This may or may not work.", result.stderr)
-
     def test_bindgen_libclang_failure(self):
         bindgen = self.generate_bindgen_libclang_failure()
         result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen })

From 981ad93c89a3c600dee9795d3ead105acc805483 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:05 +0200
Subject: [PATCH 058/346] rust: warn about `bindgen` versions 0.66.0 and 0.66.1

`bindgen` versions 0.66.0 and 0.66.1 panic due to C string literals with
NUL characters [1]:

    panicked at .cargo/registry/src/index.crates.io-6f17d22bba15001f/bindgen-0.66.0/codegen/mod.rs:717:71:
    called `Result::unwrap()` on an `Err` value: FromBytesWithNulError { kind: InteriorNul(4) }

Thus, in preparation for supporting several `bindgen` versions, add a
version check to warn the user about it.

Since some distributions may have patched it (e.g. Debian did [2]),
check if that seems to be the case (after the version check matches),
in order to avoid printing a warning in that case.

We could make it an error, but 1) it is going to fail anyway later
in the build, 2) we would disable `RUST`, which is also painful, 3)
someone could have patched it in a way that still makes our extra check
fail (however unlikely), 4) the interior NUL may go away in the headers
(however unlikely). Thus just warn about it so that users know why it
is failing.

In addition, add a couple tests for the new cases.

Link: https://github.com/rust-lang/rust-bindgen/pull/2567 [1]
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1069047 [2]
Link: https://lore.kernel.org/r/20240709160615.998336-11-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/rust_is_available.sh             | 13 ++++++++++++
 scripts/rust_is_available_bindgen_0_66.h |  2 ++
 scripts/rust_is_available_test.py        | 26 +++++++++++++++++++++---
 3 files changed, 38 insertions(+), 3 deletions(-)
 create mode 100644 scripts/rust_is_available_bindgen_0_66.h

diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh
index 4531f9dd19d3..5262c56dd674 100755
--- a/scripts/rust_is_available.sh
+++ b/scripts/rust_is_available.sh
@@ -161,6 +161,19 @@ if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cvers
 	echo >&2 "***"
 	exit 1
 fi
+if [ "$rust_bindings_generator_cversion" -eq 6600 ] ||
+	[ "$rust_bindings_generator_cversion" -eq 6601 ]; then
+	# Distributions may have patched the issue (e.g. Debian did).
+	if ! "$BINDGEN" $(dirname $0)/rust_is_available_bindgen_0_66.h >/dev/null; then
+		echo >&2 "***"
+		echo >&2 "*** Rust bindings generator '$BINDGEN' versions 0.66.0 and 0.66.1 may not"
+		echo >&2 "*** work due to a bug (https://github.com/rust-lang/rust-bindgen/pull/2567),"
+		echo >&2 "*** unless patched (like Debian's)."
+		echo >&2 "***   Your version:     $rust_bindings_generator_version"
+		echo >&2 "***"
+		warning=1
+	fi
+fi
 
 # Check that the `libclang` used by the Rust bindings generator is suitable.
 #
diff --git a/scripts/rust_is_available_bindgen_0_66.h b/scripts/rust_is_available_bindgen_0_66.h
new file mode 100644
index 000000000000..c0431293421c
--- /dev/null
+++ b/scripts/rust_is_available_bindgen_0_66.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define A "\0"
diff --git a/scripts/rust_is_available_test.py b/scripts/rust_is_available_test.py
index d6d54b7ea42a..413741037fb3 100755
--- a/scripts/rust_is_available_test.py
+++ b/scripts/rust_is_available_test.py
@@ -54,23 +54,30 @@ else:
 """)
 
     @classmethod
-    def generate_bindgen(cls, version_stdout, libclang_stderr):
+    def generate_bindgen(cls, version_stdout, libclang_stderr, version_0_66_patched=False):
         if libclang_stderr is None:
             libclang_case = f"raise SystemExit({cls.bindgen_default_bindgen_libclang_failure_exit_code})"
         else:
             libclang_case = f"print({repr(libclang_stderr)}, file=sys.stderr)"
 
+        if version_0_66_patched:
+            version_0_66_case = "pass"
+        else:
+            version_0_66_case = "raise SystemExit(1)"
+
         return cls.generate_executable(f"""#!/usr/bin/env python3
 import sys
 if "rust_is_available_bindgen_libclang.h" in " ".join(sys.argv):
     {libclang_case}
+elif "rust_is_available_bindgen_0_66.h" in " ".join(sys.argv):
+    {version_0_66_case}
 else:
     print({repr(version_stdout)})
 """)
 
     @classmethod
-    def generate_bindgen_version(cls, stdout):
-        return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr)
+    def generate_bindgen_version(cls, stdout, version_0_66_patched=False):
+        return cls.generate_bindgen(stdout, cls.bindgen_default_bindgen_libclang_stderr, version_0_66_patched)
 
     @classmethod
     def generate_bindgen_libclang_failure(cls):
@@ -231,6 +238,19 @@ else:
         result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen })
         self.assertIn(f"Rust bindings generator '{bindgen}' is too old.", result.stderr)
 
+    def test_bindgen_bad_version_0_66_0_and_0_66_1(self):
+        for version in ("0.66.0", "0.66.1"):
+            with self.subTest(version=version):
+                bindgen = self.generate_bindgen_version(f"bindgen {version}")
+                result = self.run_script(self.Expected.SUCCESS_WITH_WARNINGS, { "BINDGEN": bindgen })
+                self.assertIn(f"Rust bindings generator '{bindgen}' versions 0.66.0 and 0.66.1 may not", result.stderr)
+
+    def test_bindgen_bad_version_0_66_0_and_0_66_1_patched(self):
+        for version in ("0.66.0", "0.66.1"):
+            with self.subTest(version=version):
+                bindgen = self.generate_bindgen_version(f"bindgen {version}", True)
+                result = self.run_script(self.Expected.SUCCESS, { "BINDGEN": bindgen })
+
     def test_bindgen_libclang_failure(self):
         bindgen = self.generate_bindgen_libclang_failure()
         result = self.run_script(self.Expected.FAILURE, { "BINDGEN": bindgen })

From b1263411112305acf2af728728591465becb45b0 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Tue, 9 Jul 2024 18:06:08 +0200
Subject: [PATCH 059/346] docs: rust: quick-start: add section on Linux
 distributions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we are starting to support several Rust compiler and `bindgen`
versions, there is a good chance some Linux distributions work out of
the box.

Thus, provide some instructions on how to set the toolchain up for a
few major Linux distributions. This simplifies the setup users need to
build the kernel.

In addition, add an introduction to the document so that it is easier
to understand its structure and move the LLVM+Rust kernel.org toolchains
paragraph there (removing "depending on the Linux version"). We may want
to reorganize the document or split it in the future, but I wanted to
focus this commit on the new information added about each particular
distribution.

Finally, remove the `rustup`'s components mention in `changes.rst` since
users do not need it if they install the toolchain via the distributions
(and anyway it was too detailed for that main document).

Cc: Jan Alexander Steffens <heftig@archlinux.org>
Cc: Johannes Löthberg <johannes@kyriasis.com>
Cc: Fabian Grünbichler <debian@fabian.gruenbichler.email>
Cc: Josh Stone <jistone@redhat.com>
Cc: Randy Barlow <randy@electronsweatshop.com>
Cc: Anna (navi) Figueiredo Gomes <navi@vlhl.dev>
Cc: Matoro Mahri <matoro_gentoo@matoro.tk>
Cc: Ryan Scheel <ryan.havvy@gmail.com>
Cc: figsoda <figsoda@pm.me>
Cc: Jörg Thalheim <joerg@thalheim.io>
Cc: Theodore Ni <43ngvg@masqt.com>
Cc: Winter <nixos@winter.cafe>
Cc: William Brown <wbrown@suse.de>
Cc: Xiaoguang Wang <xiaoguang.wang@suse.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Zixing Liu <zixing.liu@canonical.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Tested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Andreas Hindborg <a.hindborg@samsung.com>
Link: https://lore.kernel.org/r/20240709160615.998336-14-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/process/changes.rst  |  5 --
 Documentation/rust/quick-start.rst | 93 ++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 0d0b7120792b..0ce96ae2588c 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -90,11 +90,6 @@ Rust (optional)
 
 A recent version of the Rust compiler is required.
 
-Each Rust toolchain comes with several "components", some of which are required
-(like ``rustc``) and some that are optional. The ``rust-src`` component (which
-is optional) needs to be installed to build the kernel. Other components are
-useful for developing.
-
 Please see Documentation/rust/quick-start.rst for instructions on how to
 satisfy the build requirements of Rust support. In particular, the ``Makefile``
 target ``rustavailable`` is useful to check why the Rust toolchain may not
diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
index 66cefbab8f9a..d06a36106cd4 100644
--- a/Documentation/rust/quick-start.rst
+++ b/Documentation/rust/quick-start.rst
@@ -5,24 +5,93 @@ Quick Start
 
 This document describes how to get started with kernel development in Rust.
 
+There are a few ways to install a Rust toolchain needed for kernel development.
+A simple way is to use the packages from your Linux distribution if they are
+suitable -- the first section below explains this approach. An advantage of this
+approach is that, typically, the distribution will match the LLVM used by Rust
+and Clang.
+
+Another way is using the prebuilt stable versions of LLVM+Rust provided on
+`kernel.org <https://kernel.org/pub/tools/llvm/rust/>`_. These are the same slim
+and fast LLVM toolchains from :ref:`Getting LLVM <getting_llvm>` with versions
+of Rust added to them that Rust for Linux supports. Two sets are provided: the
+"latest LLVM" and "matching LLVM" (please see the link for more information).
+
+Alternatively, the next two "Requirements" sections explain each component and
+how to install them through ``rustup``, the standalone installers from Rust
+and/or building them.
+
+The rest of the document explains other aspects on how to get started.
+
+
+Distributions
+-------------
+
+Arch Linux
+**********
+
+Arch Linux provides recent Rust releases and thus it should generally work out
+of the box, e.g.::
+
+	pacman -S rust rust-src rust-bindgen
+
+
+Debian
+******
+
+Debian Unstable (Sid), outside of the freeze period, provides recent Rust
+releases and thus it should generally work out of the box, e.g.::
+
+	apt install rustc rust-src bindgen rustfmt rust-clippy
+
+
+Fedora Linux
+************
+
+Fedora Linux provides recent Rust releases and thus it should generally work out
+of the box, e.g.::
+
+	dnf install rust rust-src bindgen-cli rustfmt clippy
+
+
+Gentoo Linux
+************
+
+Gentoo Linux (and especially the testing branch) provides recent Rust releases
+and thus it should generally work out of the box, e.g.::
+
+	USE='rust-src rustfmt clippy' emerge dev-lang/rust dev-util/bindgen
+
+``LIBCLANG_PATH`` may need to be set.
+
+
+Nix
+***
+
+Nix (unstable channel) provides recent Rust releases and thus it should
+generally work out of the box, e.g.::
+
+	{ pkgs ? import <nixpkgs> {} }:
+	pkgs.mkShell {
+	  nativeBuildInputs = with pkgs; [ rustc rust-bindgen rustfmt clippy ];
+	  RUST_LIB_SRC = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}";
+	}
+
+
+openSUSE
+********
+
+openSUSE Slowroll and openSUSE Tumbleweed provide recent Rust releases and thus
+they should generally work out of the box, e.g.::
+
+	zypper install rust rust1.79-src rust-bindgen clang
+
 
 Requirements: Building
 ----------------------
 
 This section explains how to fetch the tools needed for building.
 
-Some of these requirements might be available from Linux distributions
-under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However,
-at the time of writing, they are likely not to be recent enough unless
-the distribution tracks the latest releases.
-
-Prebuilt stable versions of LLVM+Rust are provided on `kernel.org
-<https://kernel.org/pub/tools/llvm/rust/>`_. These are the same slim and fast
-LLVM toolchains from :ref:`Getting LLVM <getting_llvm>` with versions of Rust
-added to them that Rust for Linux supports, depending on the Linux version. Two
-sets are provided: the "latest LLVM" and "matching LLVM" (please see the link
-for more information).
-
 To easily check whether the requirements are met, the following target
 can be used::
 

From 692a68ee9c3c4ab984ae45a74a7569f14222d5aa Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 10 Jul 2024 17:27:01 +0000
Subject: [PATCH 060/346] radix tree test suite: put definition of
 bitmap_clear() into lib/bitmap.c

In tools/ directory, function bitmap_clear() is currently only used in
object file tools/testing/radix-tree/xarray.o.

But instead of keeping a bitmap.c with only bitmap_clear() definition in
radix-tree's own directory, it would be more proper to put it in common
directory lib/.

Sync the kernel definition and link some related libs, no functional
change is expected.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 tools/include/linux/bitmap.h      | 17 ++++++++++++++++-
 tools/lib/bitmap.c                | 20 ++++++++++++++++++++
 tools/testing/radix-tree/Makefile |  4 ++--
 tools/testing/radix-tree/bitmap.c | 23 -----------------------
 4 files changed, 38 insertions(+), 26 deletions(-)
 delete mode 100644 tools/testing/radix-tree/bitmap.c

diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 210c13b1b857..2a7f260ef9dc 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -19,7 +19,7 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
 		 const unsigned long *bitmap2, unsigned int bits);
 bool __bitmap_equal(const unsigned long *bitmap1,
 		    const unsigned long *bitmap2, unsigned int bits);
-void bitmap_clear(unsigned long *map, unsigned int start, int len);
+void __bitmap_clear(unsigned long *map, unsigned int start, int len);
 bool __bitmap_intersects(const unsigned long *bitmap1,
 			 const unsigned long *bitmap2, unsigned int bits);
 
@@ -150,4 +150,19 @@ static inline bool bitmap_intersects(const unsigned long *src1,
 		return __bitmap_intersects(src1, src2, nbits);
 }
 
+static inline void bitmap_clear(unsigned long *map, unsigned int start,
+			       unsigned int nbits)
+{
+	if (__builtin_constant_p(nbits) && nbits == 1)
+		__clear_bit(start, map);
+	else if (small_const_nbits(start + nbits))
+		*map &= ~GENMASK(start + nbits - 1, start);
+	else if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
+		 IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
+		 __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
+		 IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
+		memset((char *)map + start / 8, 0, nbits / 8);
+	else
+		__bitmap_clear(map, start, nbits);
+}
 #endif /* _TOOLS_LINUX_BITMAP_H */
diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c
index c3e4871967bc..2178862bb114 100644
--- a/tools/lib/bitmap.c
+++ b/tools/lib/bitmap.c
@@ -100,3 +100,23 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
 			return true;
 	return false;
 }
+
+void __bitmap_clear(unsigned long *map, unsigned int start, int len)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const unsigned int size = start + len;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	while (len - bits_to_clear >= 0) {
+		*p &= ~mask_to_clear;
+		len -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		p++;
+	}
+	if (len) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		*p &= ~mask_to_clear;
+	}
+}
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 7527f738b4a1..d1acd7d58850 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -5,8 +5,8 @@ CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \
 LDFLAGS += -fsanitize=address -fsanitize=undefined
 LDLIBS+= -lpthread -lurcu
 TARGETS = main idr-test multiorder xarray maple
-CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \
-			 slab.o maple.o
+LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o
+CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o maple.o $(LIBS)
 OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
 	 regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \
 	 iteration_check_2.o benchmark.o
diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c
deleted file mode 100644
index 66ec4a24a203..000000000000
--- a/tools/testing/radix-tree/bitmap.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/* lib/bitmap.c pulls in at least two other files. */
-
-#include <linux/bitmap.h>
-
-void bitmap_clear(unsigned long *map, unsigned int start, int len)
-{
-	unsigned long *p = map + BIT_WORD(start);
-	const unsigned int size = start + len;
-	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
-	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-
-	while (len - bits_to_clear >= 0) {
-		*p &= ~mask_to_clear;
-		len -= bits_to_clear;
-		bits_to_clear = BITS_PER_LONG;
-		mask_to_clear = ~0UL;
-		p++;
-	}
-	if (len) {
-		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
-		*p &= ~mask_to_clear;
-	}
-}

From fb9086e95ad84f14e4f4db97ed96422c74407830 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@toblux.com>
Date: Wed, 10 Jul 2024 22:24:18 +0200
Subject: [PATCH 061/346] riscv: Remove unnecessary int cast in variable_fls()

__builtin_clz() returns an int and casting the whole expression to int
is unnecessary. Remove it.

Signed-off-by: Thorsten Blum <thorsten.blum@toblux.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 arch/riscv/include/asm/bitops.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 880606b0469a..71af9ecfcfcb 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -170,7 +170,7 @@ static __always_inline int variable_fls(unsigned int x)
 ({								\
 	typeof(x) x_ = (x);					\
 	__builtin_constant_p(x_) ?				\
-	 (int)((x_ != 0) ? (32 - __builtin_clz(x_)) : 0)	\
+	 ((x_ != 0) ? (32 - __builtin_clz(x_)) : 0)		\
 	 :							\
 	 variable_fls(x_);					\
 })

From 8ecef8e01a08c7e3e4ffc8f08d9f9663984f334b Mon Sep 17 00:00:00 2001
From: peng guo <engguopeng@buaa.edu.cn>
Date: Wed, 10 Jul 2024 10:31:12 +0800
Subject: [PATCH 062/346] cxl/core: Fix incorrect vendor debug UUID define

When user send a mbox command whose opcode is CXL_MBOX_OP_CLEAR_LOG and
the in_payload is normal vendor debug log UUID according to
the CXL specification cxl_payload_from_user_allowed() will return
false unexpectedly, Sending mbox cmd operation fails and the kernel
log will print:
Clear Log: input payload not allowed.

All CXL devices that support a debug log shall support the Vendor Debug
Log to allow the log to be accessed through a common host driver, for any
device, all versions of the CXL specification define the same value with
Log Identifier of: 5e1819d9-11a9-400c-811f-d60719403d86

Refer to CXL spec r3.1 Table 8-71

Fix the definition value of DEFINE_CXL_VENDOR_DEBUG_UUID to match the
CXL specification.

Fixes: 472b1ce6e9d6 ("cxl/mem: Enable commands via CEL")
Signed-off-by: peng guo <engguopeng@buaa.edu.cn>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/20240710023112.8063-1-engguopeng@buaa.edu.cn
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/cxlmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 19aba81cdf13..f55141c6d64e 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -562,7 +562,7 @@ enum cxl_opcode {
 		  0x3b, 0x3f, 0x17)
 
 #define DEFINE_CXL_VENDOR_DEBUG_UUID                                           \
-	UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19,     \
+	UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19,     \
 		  0x40, 0x3d, 0x86)
 
 struct cxl_mbox_get_supported_logs {

From bebfbbaffccfb69126c5a6a1d41cd868b6419370 Mon Sep 17 00:00:00 2001
From: "Fabio M. De Francesco" <fabio.m.de.francesco@linux.intel.com>
Date: Fri, 28 Jun 2024 19:48:07 +0200
Subject: [PATCH 063/346] cxl/acpi: Warn on mixed CXL VH and RCH/RCD Hierarchy

Each Host Bridge instance has a corresponding CXL Host Bridge Structure
(CHBS) ACPI table that identifies its capabilities. CHBS tables can be
two types (CXL 3.1 Table 9-21): The PCIe Root Complex Register Block
(RCRB) and CXL Host Bridge Component Registers (CHBCR).

If a Host Bridge is attached to a device that is operating in Restricted
CXL Device Mode (RCD), BIOS publishes an RCRB with the base address of
registers that describe its capabilities (CXL 3.1 sec. 9.11).

Instead, the new (CXL 2.0+) Component registers can only be accessed
by means of a base address published with a CHBCR (CXL 3.1 sec. 9.12).

If an eRCD (a device that forces the host-bridge into CXL 1.1 Restricted
CXL Host mode) is attached to a CXL 2.0+ Host-Bridge, the current CXL
specification does not define a mechanism for finding CXL-2.0-only
root-port component registers like HDM decoders and Extended Security
capability.

An algorithm to locate a CHBCR associated with an RCRB, would be too
invasive to land without some concrete motivation.

Therefore, just print a message to inform of unsupported config.

Count how many different CHBS "Version" types are detected by
cxl_get_chbs_iter(). Then make cxl_get_chbs() print a warning if that sum
is greater than 1.

Tested-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/20240628175535.272472-1-fabio.m.de.francesco@linux.intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/acpi.c | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index e51315ea4a6a..574918a9ae3a 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -482,6 +482,8 @@ struct cxl_chbs_context {
 	unsigned long long uid;
 	resource_size_t base;
 	u32 cxl_version;
+	int nr_versions;
+	u32 saved_version;
 };
 
 static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg,
@@ -490,22 +492,31 @@ static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg,
 	struct cxl_chbs_context *ctx = arg;
 	struct acpi_cedt_chbs *chbs;
 
-	if (ctx->base != CXL_RESOURCE_NONE)
-		return 0;
-
 	chbs = (struct acpi_cedt_chbs *) header;
 
-	if (ctx->uid != chbs->uid)
-		return 0;
-
-	ctx->cxl_version = chbs->cxl_version;
-	if (!chbs->base)
-		return 0;
-
 	if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 &&
 	    chbs->length != CXL_RCRB_SIZE)
 		return 0;
 
+	if (!chbs->base)
+		return 0;
+
+	if (ctx->saved_version != chbs->cxl_version) {
+		/*
+		 * cxl_version cannot be overwritten before the next two
+		 * checks, then use saved_version
+		 */
+		ctx->saved_version = chbs->cxl_version;
+		ctx->nr_versions++;
+	}
+
+	if (ctx->base != CXL_RESOURCE_NONE)
+		return 0;
+
+	if (ctx->uid != chbs->uid)
+		return 0;
+
+	ctx->cxl_version = chbs->cxl_version;
 	ctx->base = chbs->base;
 
 	return 0;
@@ -529,10 +540,19 @@ static int cxl_get_chbs(struct device *dev, struct acpi_device *hb,
 		.uid = uid,
 		.base = CXL_RESOURCE_NONE,
 		.cxl_version = UINT_MAX,
+		.saved_version = UINT_MAX,
 	};
 
 	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx);
 
+	if (ctx->nr_versions > 1) {
+		/*
+		 * Disclaim eRCD support given some component register may
+		 * only be found via CHBCR
+		 */
+		dev_info(dev, "Unsupported platform config, mixed Virtual Host and Restricted CXL Host hierarchy.");
+	}
+
 	return 0;
 }
 

From 591209c79844c1ecbee79e6b5a019e5b61eab8d3 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Sat, 6 Jul 2024 18:53:43 -0700
Subject: [PATCH 064/346] cxl/memdev: Replace ENXIO with EBUSY for inject
 poison limit reached

The CXL driver provides a debugfs interface offering users the
ability to inject and clear poison to a memdev. Once a user has
injected up to the devices limit further injection requests fail
with ENXIO until a clear poison is issued.

Users may not have device specs in hand or may want to intentionally
hit the limit and then clear. Replace the usual ENXIO return status
with EBUSY so users can recognize this failure.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Tested-by: Xingtao Yao <yaoxt.fnst@fujitsu.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Link: https://patch.msgid.link/825bd4c67fb55a4373c4182d999ad49d4e6b4fe7.1720316188.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 Documentation/ABI/testing/debugfs-cxl | 7 ++++---
 drivers/cxl/cxlmem.h                  | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/ABI/testing/debugfs-cxl b/Documentation/ABI/testing/debugfs-cxl
index c61f9b813973..12488c14be64 100644
--- a/Documentation/ABI/testing/debugfs-cxl
+++ b/Documentation/ABI/testing/debugfs-cxl
@@ -14,9 +14,10 @@ Description:
 		event to its internal Informational Event log, updates the
 		Event Status register, and if configured, interrupts the host.
 		It is not an error to inject poison into an address that
-		already has poison present and no error is returned. The
-		inject_poison attribute is only visible for devices supporting
-		the capability.
+		already has poison present and no error is returned. If the
+		device returns 'Inject Poison Limit Reached' an -EBUSY error
+		is returned to the user. The inject_poison attribute is only
+		visible for devices supporting the capability.
 
 
 What:		/sys/kernel/debug/memX/clear_poison
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index f55141c6d64e..40a465c37e03 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -161,7 +161,7 @@ struct cxl_mbox_cmd {
 	C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"),		\
 	C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"),     \
 	C(PADDR, -EFAULT, "physical address specified is invalid"),		\
-	C(POISONLMT, -ENXIO, "poison injection limit has been reached"),        \
+	C(POISONLMT, -EBUSY, "poison injection limit has been reached"),        \
 	C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"),		\
 	C(ABORT, -ENXIO, "background cmd was aborted by device"),               \
 	C(SECURITY, -ENXIO, "not valid in the current security state"),         \

From 3a8617c7df6eb351227aad9b0df647f34a7ef423 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Sat, 6 Jul 2024 18:53:44 -0700
Subject: [PATCH 065/346] cxl/test: Replace ENXIO with EBUSY for inject poison
 limit reached

The CXL driver was recently updated to return EBUSY rather than
ENXIO when the device reports that an injection request exceeds
the device's limit. That change to EBUSY allows debug users to
differentiate between limit reached and inject failures for any
other reason.

Change cxl-test to also return EBUSY and tidy up the dev_dbg()
messaging to emit the correct limit.

Reminder: the cxl-test per device injection limit is a configurable
attribute: /sys/bus/platform/drivers/cxl_mock_mem/poison_inject_max

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Tested-by: Xingtao Yao <yaoxt.fnst@fujitsu.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Link: https://patch.msgid.link/ba1b80e1658b644d85d0d5e2287112d00a48b9cf.1720316188.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 tools/testing/cxl/test/mem.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 94de2cf60f4f..129f179b0ac5 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -1135,27 +1135,28 @@ static bool mock_poison_dev_max_injected(struct cxl_dev_state *cxlds)
 	return (count >= poison_inject_dev_max);
 }
 
-static bool mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa)
+static int mock_poison_add(struct cxl_dev_state *cxlds, u64 dpa)
 {
+	/* Return EBUSY to match the CXL driver handling */
 	if (mock_poison_dev_max_injected(cxlds)) {
 		dev_dbg(cxlds->dev,
 			"Device poison injection limit has been reached: %d\n",
-			MOCK_INJECT_DEV_MAX);
-		return false;
+			poison_inject_dev_max);
+		return -EBUSY;
 	}
 
 	for (int i = 0; i < MOCK_INJECT_TEST_MAX; i++) {
 		if (!mock_poison_list[i].cxlds) {
 			mock_poison_list[i].cxlds = cxlds;
 			mock_poison_list[i].dpa = dpa;
-			return true;
+			return 0;
 		}
 	}
 	dev_dbg(cxlds->dev,
 		"Mock test poison injection limit has been reached: %d\n",
 		MOCK_INJECT_TEST_MAX);
 
-	return false;
+	return -ENXIO;
 }
 
 static bool mock_poison_found(struct cxl_dev_state *cxlds, u64 dpa)
@@ -1179,10 +1180,8 @@ static int mock_inject_poison(struct cxl_dev_state *cxlds,
 		dev_dbg(cxlds->dev, "DPA: 0x%llx already poisoned\n", dpa);
 		return 0;
 	}
-	if (!mock_poison_add(cxlds, dpa))
-		return -ENXIO;
 
-	return 0;
+	return mock_poison_add(cxlds, dpa);
 }
 
 static bool mock_poison_del(struct cxl_dev_state *cxlds, u64 dpa)

From 9aa5f6235e16ac6fceed789a30e72addf1abd7d8 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Tue, 2 Jul 2024 22:29:49 -0700
Subject: [PATCH 066/346] cxl/core: Fold cxl_trace_hpa() into cxl_dpa_to_hpa()

Although cxl_trace_hpa() is used to populate TRACE EVENTs with HPA
addresses the work it performs is a DPA to HPA translation not a
trace. Tidy up this naming by moving the minimal work done in
cxl_trace_hpa() into cxl_dpa_to_hpa() and use cxl_dpa_to_hpa()
for trace event callbacks.

Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Robert Richter <rrichter@amd.com>
Link: https://patch.msgid.link/452a9b0c525b774c72d9d5851515ffa928750132.1719980933.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/core.h   |  8 ++++----
 drivers/cxl/core/mbox.c   |  2 +-
 drivers/cxl/core/region.c | 33 +++++++++++++--------------------
 drivers/cxl/core/trace.h  |  4 ++--
 4 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 625394486459..72a506c9dbd0 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -28,12 +28,12 @@ int cxl_region_init(void);
 void cxl_region_exit(void);
 int cxl_get_poison_by_endpoint(struct cxl_port *port);
 struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa);
-u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
-		  u64 dpa);
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+		   u64 dpa);
 
 #else
-static inline u64
-cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa)
+static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr,
+				 const struct cxl_memdev *cxlmd, u64 dpa)
 {
 	return ULLONG_MAX;
 }
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 2626f3fff201..eb0b08e5136f 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -878,7 +878,7 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK;
 		cxlr = cxl_dpa_to_region(cxlmd, dpa);
 		if (cxlr)
-			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
+			hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa);
 
 		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
 			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 3c2b6144be23..237c28d5f2cc 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2749,15 +2749,25 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos)
 	return false;
 }
 
-static u64 cxl_dpa_to_hpa(u64 dpa,  struct cxl_region *cxlr,
-			  struct cxl_endpoint_decoder *cxled)
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+		   u64 dpa)
 {
 	u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa;
 	struct cxl_region_params *p = &cxlr->params;
-	int pos = cxled->pos;
+	struct cxl_endpoint_decoder *cxled = NULL;
 	u16 eig = 0;
 	u8 eiw = 0;
+	int pos;
 
+	for (int i = 0; i < p->nr_targets; i++) {
+		cxled = p->targets[i];
+		if (cxlmd == cxled_to_memdev(cxled))
+			break;
+	}
+	if (!cxled || cxlmd != cxled_to_memdev(cxled))
+		return ULLONG_MAX;
+
+	pos = cxled->pos;
 	ways_to_eiw(p->interleave_ways, &eiw);
 	granularity_to_eig(p->interleave_granularity, &eig);
 
@@ -2797,23 +2807,6 @@ static u64 cxl_dpa_to_hpa(u64 dpa,  struct cxl_region *cxlr,
 	return hpa;
 }
 
-u64 cxl_trace_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
-		  u64 dpa)
-{
-	struct cxl_region_params *p = &cxlr->params;
-	struct cxl_endpoint_decoder *cxled = NULL;
-
-	for (int i = 0; i <  p->nr_targets; i++) {
-		cxled = p->targets[i];
-		if (cxlmd == cxled_to_memdev(cxled))
-			break;
-	}
-	if (!cxled || cxlmd != cxled_to_memdev(cxled))
-		return ULLONG_MAX;
-
-	return cxl_dpa_to_hpa(dpa, cxlr, cxled);
-}
-
 static struct lock_class_key cxl_pmem_region_key;
 
 static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index ee5cd4eb2f16..21b76e9c5c60 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -704,8 +704,8 @@ TRACE_EVENT(cxl_poison,
 		if (cxlr) {
 			__assign_str(region);
 			memcpy(__entry->uuid, &cxlr->params.uuid, 16);
-			__entry->hpa = cxl_trace_hpa(cxlr, cxlmd,
-						     __entry->dpa);
+			__entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd,
+						      __entry->dpa);
 		} else {
 			__assign_str(region);
 			memset(__entry->uuid, 0, 16);

From 3b2fedcd75e3991e77c2a8c3ebcab0ea68b2d69d Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Tue, 2 Jul 2024 22:29:50 -0700
Subject: [PATCH 067/346] cxl: Restore XOR'd position bits during address
 translation

When a device reports a DPA in events like poison, general_media,
and dram, the driver translates that DPA back to an HPA. Presently,
the CXL driver translation only considers the Modulo position and
will report the wrong HPA for XOR configured root decoders.

Add a helper function that restores the XOR'd bits during DPA->HPA
address translation. Plumb a root decoder callback to the new helper
when XOR interleave arithmetic is in use. For Modulo arithmetic, just
let the callback be NULL - as in no extra work required.

Upon completion of a DPA->HPA translation a couple of checks are
performed on the result. One simply confirms that the calculated
HPA is within the address range of the region. That test is useful
for both Modulo and XOR interleave arithmetic decodes.

A second check confirms that the HPA is within an expected chunk
based on the endpoints position in the region and the region
granularity. An XOR decode disrupts the Modulo pattern making the
chunk check useless.

To align the checks with the proper decode, pull the region range
check inline and use the helper to do the chunk check for Modulo
decodes only.

A cxl-test unit test is posted for upstream review here:
https://lore.kernel.org/20240624210644.495563-1-alison.schofield@intel.com/

Fixes: 28a3ae4ff66c ("cxl/trace: Add an HPA to cxl_poison trace events")
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Tested-by: Diego Garcia Rodriguez <diego.garcia.rodriguez@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/1a1ac880d9f889bd6384e657e810431b9a0a72e5.1719980933.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/acpi.c        | 40 +++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/region.c | 23 +++++++++++++---------
 drivers/cxl/cxl.h         |  3 +++
 3 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 571069863c62..6b6ae9c81368 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -74,6 +74,43 @@ static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
 	return cxlrd->cxlsd.target[n];
 }
 
+static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa)
+{
+	struct cxl_cxims_data *cximsd = cxlrd->platform_data;
+	int hbiw = cxlrd->cxlsd.nr_targets;
+	u64 val;
+	int pos;
+
+	/* No xormaps for host bridge interleave ways of 1 or 3 */
+	if (hbiw == 1 || hbiw == 3)
+		return hpa;
+
+	/*
+	 * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) restore
+	 * the position bit to its value before the xormap was applied at
+	 * HPA->DPA translation.
+	 *
+	 * pos is the lowest set bit in an XORMAP
+	 * val is the XORALLBITS(HPA & XORMAP)
+	 *
+	 * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS
+	 * as an operation that outputs a single bit by XORing all the
+	 * bits in the input (hpa & xormap). Implement XORALLBITS using
+	 * hweight64(). If the hamming weight is even the XOR of those
+	 * bits results in val==0, if odd the XOR result is val==1.
+	 */
+
+	for (int i = 0; i < cximsd->nr_maps; i++) {
+		if (!cximsd->xormaps[i])
+			continue;
+		pos = __ffs(cximsd->xormaps[i]);
+		val = (hweight64(hpa & cximsd->xormaps[i]) & 1);
+		hpa = (hpa & ~(1ULL << pos)) | (val << pos);
+	}
+
+	return hpa;
+}
+
 struct cxl_cxims_context {
 	struct device *dev;
 	struct cxl_root_decoder *cxlrd;
@@ -434,6 +471,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
 
 	cxlrd->qos_class = cfmws->qtg_id;
 
+	if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR)
+		cxlrd->hpa_to_spa = cxl_xor_hpa_to_spa;
+
 	rc = cxl_decoder_add(cxld, target_map);
 	if (rc)
 		return rc;
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 237c28d5f2cc..23abd0f7b856 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2723,20 +2723,13 @@ struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
 	return ctx.cxlr;
 }
 
-static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos)
+static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
 {
 	struct cxl_region_params *p = &cxlr->params;
 	int gran = p->interleave_granularity;
 	int ways = p->interleave_ways;
 	u64 offset;
 
-	/* Is the hpa within this region at all */
-	if (hpa < p->res->start || hpa > p->res->end) {
-		dev_dbg(&cxlr->dev,
-			"Addr trans fail: hpa 0x%llx not in region\n", hpa);
-		return false;
-	}
-
 	/* Is the hpa in an expected chunk for its pos(-ition) */
 	offset = hpa - p->res->start;
 	offset = do_div(offset, gran * ways);
@@ -2752,6 +2745,7 @@ static bool cxl_is_hpa_in_range(u64 hpa, struct cxl_region *cxlr, int pos)
 u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
 		   u64 dpa)
 {
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
 	u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa;
 	struct cxl_region_params *p = &cxlr->params;
 	struct cxl_endpoint_decoder *cxled = NULL;
@@ -2801,7 +2795,18 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
 	/* Apply the hpa_offset to the region base address */
 	hpa = hpa_offset + p->res->start;
 
-	if (!cxl_is_hpa_in_range(hpa, cxlr, cxled->pos))
+	/* Root decoder translation overrides typical modulo decode */
+	if (cxlrd->hpa_to_spa)
+		hpa = cxlrd->hpa_to_spa(cxlrd, hpa);
+
+	if (hpa < p->res->start || hpa > p->res->end) {
+		dev_dbg(&cxlr->dev,
+			"Addr trans fail: hpa 0x%llx not in region\n", hpa);
+		return ULLONG_MAX;
+	}
+
+	/* Simple chunk check, by pos & gran, only applies to modulo decodes */
+	if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos)))
 		return ULLONG_MAX;
 
 	return hpa;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 603c0120cff8..dfc4f27d195c 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -434,12 +434,14 @@ struct cxl_switch_decoder {
 struct cxl_root_decoder;
 typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd,
 					    int pos);
+typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa);
 
 /**
  * struct cxl_root_decoder - Static platform CXL address decoder
  * @res: host / parent resource for region allocations
  * @region_id: region id for next region provisioning event
  * @calc_hb: which host bridge covers the n'th position by granularity
+ * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address
  * @platform_data: platform specific configuration data
  * @range_lock: sync region autodiscovery by address range
  * @qos_class: QoS performance class cookie
@@ -449,6 +451,7 @@ struct cxl_root_decoder {
 	struct resource *res;
 	atomic_t region_id;
 	cxl_calc_hb_fn calc_hb;
+	cxl_hpa_to_spa_fn hpa_to_spa;
 	void *platform_data;
 	struct mutex range_lock;
 	int qos_class;

From 82a3e3a235633aa0575fac9507d648dd80f3437f Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Tue, 2 Jul 2024 22:29:51 -0700
Subject: [PATCH 068/346] cxl/region: Verify target positions using the ordered
 target list

When a root decoder is configured the interleave target list is read
from the BIOS populated CFMWS structure. Per the CXL spec 3.1 Table
9-22 the target list is in interleave order. The CXL driver populates
its decoder target list in the same order and stores it in 'struct
cxl_switch_decoder' field "@target: active ordered target list in
current decoder configuration"

Given the promise of an ordered list, the driver can stop duplicating
the work of BIOS and simply check target positions against the ordered
list during region configuration.

The simplified check against the ordered list is presented here.
A follow-on patch will remove the unused code.

For Modulo arithmetic this is not a fix, only a simplification.
For XOR arithmetic this is a fix for HB IW of 3,6,12.

Fixes: f9db85bfec0d ("cxl/acpi: Support CXL XOR Interleave Math (CXIMS)")
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/35d08d3aba08fee0f9b86ab1cef0c25116ca8a55.1719980933.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/region.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 23abd0f7b856..2772828ca6ca 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -1559,10 +1559,13 @@ static int cxl_region_attach_position(struct cxl_region *cxlr,
 				      const struct cxl_dport *dport, int pos)
 {
 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+	struct cxl_decoder *cxld = &cxlsd->cxld;
+	int iw = cxld->interleave_ways;
 	struct cxl_port *iter;
 	int rc;
 
-	if (cxlrd->calc_hb(cxlrd, pos) != dport) {
+	if (dport != cxlrd->cxlsd.target[pos % iw]) {
 		dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
 			dev_name(&cxlrd->cxlsd.cxld.dev));

From 8f55ada796565ce801418bf579f31a6a522d0337 Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Tue, 2 Jul 2024 22:29:52 -0700
Subject: [PATCH 069/346] cxl: Remove defunct code calculating host bridge
 target positions

The CXL Spec 3.1 Table 9-22 requires that the BIOS populate the CFMWS
target list in interleave target order. This means the calculations
the CXL driver added to determine positions when XOR math is in use,
along with the entire XOR vs Modulo call back setup is not needed.

A prior patch added a common method to verify positions.

Remove the now unused code related to the cxl_calc_hb_fn.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://patch.msgid.link/2e2c32a2d0f1007e920b58712d15edad2e48d857.1719980933.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/acpi.c      | 60 ++---------------------------------------
 drivers/cxl/core/port.c | 20 +-------------
 drivers/cxl/cxl.h       |  8 +-----
 3 files changed, 4 insertions(+), 84 deletions(-)

diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 6b6ae9c81368..921cee3bb980 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -22,57 +22,6 @@ static const guid_t acpi_cxl_qtg_id_guid =
 	GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
 		  0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
 
-/*
- * Find a targets entry (n) in the host bridge interleave list.
- * CXL Specification 3.0 Table 9-22
- */
-static int cxl_xor_calc_n(u64 hpa, struct cxl_cxims_data *cximsd, int iw,
-			  int ig)
-{
-	int i = 0, n = 0;
-	u8 eiw;
-
-	/* IW: 2,4,6,8,12,16 begin building 'n' using xormaps */
-	if (iw != 3) {
-		for (i = 0; i < cximsd->nr_maps; i++)
-			n |= (hweight64(hpa & cximsd->xormaps[i]) & 1) << i;
-	}
-	/* IW: 3,6,12 add a modulo calculation to 'n' */
-	if (!is_power_of_2(iw)) {
-		if (ways_to_eiw(iw, &eiw))
-			return -1;
-		hpa &= GENMASK_ULL(51, eiw + ig);
-		n |= do_div(hpa, 3) << i;
-	}
-	return n;
-}
-
-static struct cxl_dport *cxl_hb_xor(struct cxl_root_decoder *cxlrd, int pos)
-{
-	struct cxl_cxims_data *cximsd = cxlrd->platform_data;
-	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
-	struct cxl_decoder *cxld = &cxlsd->cxld;
-	int ig = cxld->interleave_granularity;
-	int iw = cxld->interleave_ways;
-	int n = 0;
-	u64 hpa;
-
-	if (dev_WARN_ONCE(&cxld->dev,
-			  cxld->interleave_ways != cxlsd->nr_targets,
-			  "misconfigured root decoder\n"))
-		return NULL;
-
-	hpa = cxlrd->res->start + pos * ig;
-
-	/* Entry (n) is 0 for no interleave (iw == 1) */
-	if (iw != 1)
-		n = cxl_xor_calc_n(hpa, cximsd, iw, ig);
-
-	if (n < 0)
-		return NULL;
-
-	return cxlrd->cxlsd.target[n];
-}
 
 static u64 cxl_xor_hpa_to_spa(struct cxl_root_decoder *cxlrd, u64 hpa)
 {
@@ -398,7 +347,6 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
 	struct cxl_port *root_port = ctx->root_port;
 	struct cxl_cxims_context cxims_ctx;
 	struct device *dev = ctx->dev;
-	cxl_calc_hb_fn cxl_calc_hb;
 	struct cxl_decoder *cxld;
 	unsigned int ways, i, ig;
 	int rc;
@@ -426,13 +374,9 @@ static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
 	if (rc)
 		return rc;
 
-	if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_MODULO)
-		cxl_calc_hb = cxl_hb_modulo;
-	else
-		cxl_calc_hb = cxl_hb_xor;
-
 	struct cxl_root_decoder *cxlrd __free(put_cxlrd) =
-		cxl_root_decoder_alloc(root_port, ways, cxl_calc_hb);
+		cxl_root_decoder_alloc(root_port, ways);
+
 	if (IS_ERR(cxlrd))
 		return PTR_ERR(cxlrd);
 
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 887ed6e358fb..bc9e18d02598 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1733,21 +1733,6 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd,
 	return 0;
 }
 
-struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos)
-{
-	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
-	struct cxl_decoder *cxld = &cxlsd->cxld;
-	int iw;
-
-	iw = cxld->interleave_ways;
-	if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets,
-			  "misconfigured root decoder\n"))
-		return NULL;
-
-	return cxlrd->cxlsd.target[pos % iw];
-}
-EXPORT_SYMBOL_NS_GPL(cxl_hb_modulo, CXL);
-
 static struct lock_class_key cxl_decoder_key;
 
 /**
@@ -1807,7 +1792,6 @@ static int cxl_switch_decoder_init(struct cxl_port *port,
  * cxl_root_decoder_alloc - Allocate a root level decoder
  * @port: owning CXL root of this decoder
  * @nr_targets: static number of downstream targets
- * @calc_hb: which host bridge covers the n'th position by granularity
  *
  * Return: A new cxl decoder to be registered by cxl_decoder_add(). A
  * 'CXL root' decoder is one that decodes from a top-level / static platform
@@ -1815,8 +1799,7 @@ static int cxl_switch_decoder_init(struct cxl_port *port,
  * topology.
  */
 struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
-						unsigned int nr_targets,
-						cxl_calc_hb_fn calc_hb)
+						unsigned int nr_targets)
 {
 	struct cxl_root_decoder *cxlrd;
 	struct cxl_switch_decoder *cxlsd;
@@ -1838,7 +1821,6 @@ struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
 		return ERR_PTR(rc);
 	}
 
-	cxlrd->calc_hb = calc_hb;
 	mutex_init(&cxlrd->range_lock);
 
 	cxld = &cxlsd->cxld;
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index dfc4f27d195c..e27a0cd4f107 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -432,15 +432,12 @@ struct cxl_switch_decoder {
 };
 
 struct cxl_root_decoder;
-typedef struct cxl_dport *(*cxl_calc_hb_fn)(struct cxl_root_decoder *cxlrd,
-					    int pos);
 typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa);
 
 /**
  * struct cxl_root_decoder - Static platform CXL address decoder
  * @res: host / parent resource for region allocations
  * @region_id: region id for next region provisioning event
- * @calc_hb: which host bridge covers the n'th position by granularity
  * @hpa_to_spa: translate CXL host-physical-address to Platform system-physical-address
  * @platform_data: platform specific configuration data
  * @range_lock: sync region autodiscovery by address range
@@ -450,7 +447,6 @@ typedef u64 (*cxl_hpa_to_spa_fn)(struct cxl_root_decoder *cxlrd, u64 hpa);
 struct cxl_root_decoder {
 	struct resource *res;
 	atomic_t region_id;
-	cxl_calc_hb_fn calc_hb;
 	cxl_hpa_to_spa_fn hpa_to_spa;
 	void *platform_data;
 	struct mutex range_lock;
@@ -775,9 +771,7 @@ bool is_root_decoder(struct device *dev);
 bool is_switch_decoder(struct device *dev);
 bool is_endpoint_decoder(struct device *dev);
 struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
-						unsigned int nr_targets,
-						cxl_calc_hb_fn calc_hb);
-struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos);
+						unsigned int nr_targets);
 struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
 						    unsigned int nr_targets);
 int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map);

From 745d9f4a31defec731119ee8aad8ba9f2536dd9a Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin <pchelkin@ispras.ru>
Date: Thu, 29 Feb 2024 23:42:36 +0300
Subject: [PATCH 070/346] ubi: eba: properly rollback inside self_check_eba

In case of a memory allocation failure in the volumes loop we can only
process the already allocated scan_eba and fm_eba array elements on the
error path - others are still uninitialized.

Found by Linux Verification Center (linuxtesting.org).

Fixes: 00abf3041590 ("UBI: Add self_check_eba()")
Cc: stable@vger.kernel.org
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/eba.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
index e5ac3cd0bbae..c7ba7a15c9f7 100644
--- a/drivers/mtd/ubi/eba.c
+++ b/drivers/mtd/ubi/eba.c
@@ -1564,6 +1564,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap,
 					  GFP_KERNEL);
 		if (!fm_eba[i]) {
 			ret = -ENOMEM;
+			kfree(scan_eba[i]);
 			goto out_free;
 		}
 
@@ -1599,7 +1600,7 @@ int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap,
 	}
 
 out_free:
-	for (i = 0; i < num_volumes; i++) {
+	while (--i >= 0) {
 		if (!ubi->volumes[i])
 			continue;
 

From 299af26eb46374295a3289be990e536b75c9376a Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marliere" <ricardo@marliere.net>
Date: Tue, 5 Mar 2024 16:35:38 -0300
Subject: [PATCH 071/346] mtd: ubi: make ubi_class constant

Since commit 43a7206b0963 ("driver core: class: make class_register() take
a const *"), the driver core allows for struct class to be in read-only
memory, so move the ubi_class structure to be declared at build time
placing it into read-only memory, instead of having to be dynamically
allocated at boot time.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ricardo B. Marliere <ricardo@marliere.net>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/build.c | 2 +-
 drivers/mtd/ubi/ubi.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index a7e3a6246c0e..952c80269f57 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -112,7 +112,7 @@ static struct attribute *ubi_class_attrs[] = {
 ATTRIBUTE_GROUPS(ubi_class);
 
 /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */
-struct class ubi_class = {
+const struct class ubi_class = {
 	.name		= UBI_NAME_STR,
 	.class_groups	= ubi_class_groups,
 };
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index 32009a24869e..1aead55090d0 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -814,7 +814,7 @@ extern struct kmem_cache *ubi_wl_entry_slab;
 extern const struct file_operations ubi_ctrl_cdev_operations;
 extern const struct file_operations ubi_cdev_operations;
 extern const struct file_operations ubi_vol_cdev_operations;
-extern struct class ubi_class;
+extern const struct class ubi_class;
 extern struct mutex ubi_devices_mutex;
 extern struct blocking_notifier_head ubi_notifiers;
 

From 02096a0cf150fc8dc1cfe39afcd768d06b70e722 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 13 Mar 2024 09:46:52 +0100
Subject: [PATCH 072/346] mtd: ubi: avoid expensive do_div() on 32-bit machines

The use of do_div() in ubi_nvmem_reg_read() makes calling it on
32-bit machines rather expensive. Since the 'from' variable is
known to be a 32-bit quantity, it is clearly never needed and
can be optimized into a regular division operation.

Fixes: b8a77b9a5f9c ("mtd: ubi: fix NVMEM over UBI volumes on 32-bit systems")
Fixes: 3ce485803da1 ("mtd: ubi: provide NVMEM layer over UBI volumes")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/nvmem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c
index 8aeb9c428e51..a94a1a9aaec1 100644
--- a/drivers/mtd/ubi/nvmem.c
+++ b/drivers/mtd/ubi/nvmem.c
@@ -6,7 +6,6 @@
 /* UBI NVMEM provider */
 #include "ubi.h"
 #include <linux/nvmem-provider.h>
-#include <asm/div64.h>
 
 /* List of all NVMEM devices */
 static LIST_HEAD(nvmem_devices);
@@ -27,14 +26,15 @@ static int ubi_nvmem_reg_read(void *priv, unsigned int from,
 	struct ubi_nvmem *unv = priv;
 	struct ubi_volume_desc *desc;
 	uint32_t offs;
-	uint64_t lnum = from;
+	uint32_t lnum;
 	int err = 0;
 
 	desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY);
 	if (IS_ERR(desc))
 		return PTR_ERR(desc);
 
-	offs = do_div(lnum, unv->usable_leb_size);
+	offs = from % unv->usable_leb_size;
+	lnum = from / unv->usable_leb_size;
 	while (bytes_left) {
 		to_read = unv->usable_leb_size - offs;
 

From 02eb1846b77d2896903e94b966ba632cf48d39e0 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:43 +0800
Subject: [PATCH 073/346] ubifs: Fix unattached xattr inode if powercut happens
 after deleting

When powercut happens after deleting file, the xattr inode could be
alone existing in TNC but its' xattr entry cannot be found in TNC.
File inode and xattr inode are added into orphan list after deleting
file, file inode's nlink is 0 but xattr inode's nlink is not 0 (PS:
zero nlink xattr inode is written on disk in evicting process by
ubifs_jnl_write_inode). So, following process could happen:
 1. touch file
 2. setxattr(file)
 3. unlink file
    // inode(nlink=0), xattr inode(nlink=1) are added into orphan list
 4. commit
    // write inode inum and xattr inum into orphan area
 5. powercut
 6. mount
    do_kill_orphans
     // inode(nlink=0) is deleted from TNC by ubifs_tnc_remove_range,
     // xattr entry is deleted too.
     // xattr inode(nlink=1) is not deleted from TNC
Finally we could see following error while debugging UBIFS:
 UBIFS error (ubi0:0 pid 1093): dbg_check_filesystem [ubifs]: inode 66
 nlink is 1, but calculated nlink is 0
 UBIFS (ubi0:0): dump of the inode 66 sitting in LEB 12:2128
   node_type      0 (inode node)
   group_type     1 (in node group)
   len            197
   key            (66, inode)
   size           37
   nlink          1
   flags          0x20
   xattr_cnt      0
   xattr_size     0
   xattr_names    0
   data len       37

Fix it by removing entire inode with it's xattrs while replaying orphan,
just replace function ubifs_tnc_remove_range by ubifs_tnc_remove_ino.

Fixes: ee1438ce5dc4 ("ubifs: Check link count of inodes when killing orphans.")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218661
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/orphan.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 4909321d84cf..ddeb125e6930 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -691,12 +691,12 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 
 		n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
 		for (i = 0; i < n; i++) {
-			union ubifs_key key1, key2;
+			union ubifs_key key;
 
 			inum = le64_to_cpu(orph->inos[i]);
 
-			ino_key_init(c, &key1, inum);
-			err = ubifs_tnc_lookup(c, &key1, ino);
+			ino_key_init(c, &key, inum);
+			err = ubifs_tnc_lookup(c, &key, ino);
 			if (err && err != -ENOENT)
 				goto out_free;
 
@@ -708,10 +708,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 				dbg_rcvry("deleting orphaned inode %lu",
 					  (unsigned long)inum);
 
-				lowest_ino_key(c, &key1, inum);
-				highest_ino_key(c, &key2, inum);
-
-				err = ubifs_tnc_remove_range(c, &key1, &key2);
+				err = ubifs_tnc_remove_ino(c, inum);
 				if (err)
 					goto out_ro;
 			}

From 354c1796639a743b49bd13549ec0ec6ca121b85e Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:44 +0800
Subject: [PATCH 074/346] ubifs: Don't add xattr inode into orphan area

Now, the entire inode with its' xattrs are removed while replaying
orphan nodes. There is no need to add xattr inodes into orphan area,
which is based on the fact that xattr entries won't be cleared from
disk before deleting xattr inodes, in another words, current logic
can make sure that xattr inode be deleted in any cases even UBIFS not
record xattr inode into orphan area.
Let's looking for possible paths that could clear xattr entries from
disk but leave the xattr inode on TNC:
 1. unlink/tmpfile -> ubifs_jnl_update: inode(nlink=0) is written
    into bud LEB and added into orphan list, then:
    a. powercut: ubifs_tnc_remove_ino(xattr entry/inode can be found
       from TNC and being deleted) is invoked in replaying journal.
    b. commit + powercut: inode is written into orphan area, and
       ubifs_tnc_remove_ino is invoked in replaying orphan nodes.
    c. evicting + powercut: xattr inode(nlink=0) is written on disk,
       xattr is removed from TNC, gc could clear xattr entries from
       disk. ubifs_tnc_remove_ino will apply on inode and xattr inode
       in replaying journal, so lost xattr entries will make no
       influence.
    d. evicting + commit + powercut: xattr inode/entry are removed from
       index tree(on disk) by ubifs_jnl_write_inode, xattr inode is
       cleared from orphan area by ubifs_jnl_write_inode + commit.
    e. commit + evicting + powercut: inode is written into orphan area,
       then equivalent to c.
 2. remove xattr -> ubifs_jnl_delete_xattr: xattr entry(inum=0) and
    xattr inode(nlink=0) is written into bud LEB, xattr entry/inode are
    removed from TNC, then:
    a. powercut: gc could clear xattr entries from disk, which won't
       affect deleting xattr entry from TNC. ubifs_tnc_remove_ino will
       apply on xattr inode in replaying journal, ubifs_tnc_remove_nm
       will apply on xattr entry in replaying journal.
    b. commit + powercut: xattr entry/inode are removed from index tree
       (on disk).
Tracking xattr inode in orphan list is imported by commit 988bec41318f3f
("ubifs: orphan: Handle xattrs like files"), it aims to fix the similar
problem described in commit 7959cf3a7506d4a ("ubifs: journal: Handle
xattrs like files"). Actually, the problem only exist in journal case
but not the orphan case. So, we can remove the orphan tracking for xattr
inodes.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/orphan.c | 85 ++++++++---------------------------------------
 fs/ubifs/ubifs.h  |  3 --
 2 files changed, 14 insertions(+), 74 deletions(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ddeb125e6930..88fbf331ad8c 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -42,24 +42,30 @@
 
 static int dbg_check_orphans(struct ubifs_info *c);
 
-static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum,
-				       struct ubifs_orphan *parent_orphan)
+/**
+ * ubifs_add_orphan - add an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Add an orphan. This function is called when an inodes link count drops to
+ * zero.
+ */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
 {
 	struct ubifs_orphan *orphan, *o;
 	struct rb_node **p, *parent = NULL;
 
 	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
 	if (!orphan)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	orphan->inum = inum;
 	orphan->new = 1;
-	INIT_LIST_HEAD(&orphan->child_list);
 
 	spin_lock(&c->orphan_lock);
 	if (c->tot_orphans >= c->max_orphans) {
 		spin_unlock(&c->orphan_lock);
 		kfree(orphan);
-		return ERR_PTR(-ENFILE);
+		return -ENFILE;
 	}
 	p = &c->orph_tree.rb_node;
 	while (*p) {
@@ -73,7 +79,7 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum,
 			ubifs_err(c, "orphaned twice");
 			spin_unlock(&c->orphan_lock);
 			kfree(orphan);
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		}
 	}
 	c->tot_orphans += 1;
@@ -83,14 +89,9 @@ static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum,
 	list_add_tail(&orphan->list, &c->orph_list);
 	list_add_tail(&orphan->new_list, &c->orph_new);
 
-	if (parent_orphan) {
-		list_add_tail(&orphan->child_list,
-			      &parent_orphan->child_list);
-	}
-
 	spin_unlock(&c->orphan_lock);
 	dbg_gen("ino %lu", (unsigned long)inum);
-	return orphan;
+	return 0;
 }
 
 static struct ubifs_orphan *lookup_orphan(struct ubifs_info *c, ino_t inum)
@@ -144,59 +145,6 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
 	__orphan_drop(c, orph);
 }
 
-/**
- * ubifs_add_orphan - add an orphan.
- * @c: UBIFS file-system description object
- * @inum: orphan inode number
- *
- * Add an orphan. This function is called when an inodes link count drops to
- * zero.
- */
-int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
-{
-	int err = 0;
-	ino_t xattr_inum;
-	union ubifs_key key;
-	struct ubifs_dent_node *xent, *pxent = NULL;
-	struct fscrypt_name nm = {0};
-	struct ubifs_orphan *xattr_orphan;
-	struct ubifs_orphan *orphan;
-
-	orphan = orphan_add(c, inum, NULL);
-	if (IS_ERR(orphan))
-		return PTR_ERR(orphan);
-
-	lowest_xent_key(c, &key, inum);
-	while (1) {
-		xent = ubifs_tnc_next_ent(c, &key, &nm);
-		if (IS_ERR(xent)) {
-			err = PTR_ERR(xent);
-			if (err == -ENOENT)
-				break;
-			kfree(pxent);
-			return err;
-		}
-
-		fname_name(&nm) = xent->name;
-		fname_len(&nm) = le16_to_cpu(xent->nlen);
-		xattr_inum = le64_to_cpu(xent->inum);
-
-		xattr_orphan = orphan_add(c, xattr_inum, orphan);
-		if (IS_ERR(xattr_orphan)) {
-			kfree(pxent);
-			kfree(xent);
-			return PTR_ERR(xattr_orphan);
-		}
-
-		kfree(pxent);
-		pxent = xent;
-		key_read(c, &xent->key, &key);
-	}
-	kfree(pxent);
-
-	return 0;
-}
-
 /**
  * ubifs_delete_orphan - delete an orphan.
  * @c: UBIFS file-system description object
@@ -206,7 +154,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
  */
 void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 {
-	struct ubifs_orphan *orph, *child_orph, *tmp_o;
+	struct ubifs_orphan *orph;
 
 	spin_lock(&c->orphan_lock);
 
@@ -219,11 +167,6 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
 		return;
 	}
 
-	list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) {
-		list_del(&child_orph->child_list);
-		orphan_delete(c, child_orph);
-	}
-	
 	orphan_delete(c, orph);
 
 	spin_unlock(&c->orphan_lock);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 1f3ea879d93a..821c6ae99e62 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -923,8 +923,6 @@ struct ubifs_budget_req {
  * @rb: rb-tree node of rb-tree of orphans sorted by inode number
  * @list: list head of list of orphans in order added
  * @new_list: list head of list of orphans added since the last commit
- * @child_list: list of xattr children if this orphan hosts xattrs, list head
- * if this orphan is a xattr, not used otherwise.
  * @cnext: next orphan to commit
  * @dnext: next orphan to delete
  * @inum: inode number
@@ -936,7 +934,6 @@ struct ubifs_orphan {
 	struct rb_node rb;
 	struct list_head list;
 	struct list_head new_list;
-	struct list_head child_list;
 	struct ubifs_orphan *cnext;
 	struct ubifs_orphan *dnext;
 	ino_t inum;

From 7bed61a1cf166b5c113047fc8f60ff22dcb04893 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:45 +0800
Subject: [PATCH 075/346] Revert "ubifs: ubifs_symlink: Fix memleak of
 inode->i_link in error path"

This reverts commit 6379b44cdcd67f5f5d986b73953e99700591edfa. Commit
1e022216dcd2 ("ubifs: ubifs_symlink: Fix memleak of inode->i_link in
error path") is applied again in commit 6379b44cdcd6 ("ubifs:
ubifs_symlink: Fix memleak of inode->i_link in error path"), which
changed ubifs_mknod (It won't become a real problem). Just revert it.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/dir.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index eac0fef801f1..551148de66cd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1133,8 +1133,6 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	mutex_unlock(&dir_ui->ui_mutex);
 out_inode:
-	/* Free inode->i_link before inode is marked as bad. */
-	fscrypt_free_inode(inode);
 	make_bad_inode(inode);
 	iput(inode);
 out_fname:

From 6376d7503b02796cc6109198d555ddebfb4f5f81 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:46 +0800
Subject: [PATCH 076/346] ubifs: Remove insert_dead_orphan from replaying
 orphan process

UBIFS will do commit at the end of mounting process(rw mode), dead
orphans(added by insert_dead_orphan in replaying orphan) are deleted
by ubifs_orphan_end_commit(). The only reason why dead orphans are
added into orphan list is that old orpans may be lost when powercut
happens in ubifs_orphan_end_commit():
ubifs_orphan_end_commit  // TNC(updated by orphans) is not written yet
 if (c->cmt_orphans != 0)
  commit_orphans
   consolidate // traverse orphan list
  write_orph_nodes // rewrite all orphans by ubifs_leb_change
  // If dead orphans are not in list, they will be lost when powercut
  // happens, then TNC won't be updated by old orphans in next mounting.
Luckily, the condition 'c->cmt_orphans != 0' will never be true in
mounting process, there can't be new orphans added into orphan list
before mounting returned, but commit will be done at the end of mounting.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/orphan.c | 49 -----------------------------------------------
 1 file changed, 49 deletions(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 88fbf331ad8c..6e843e8fc3db 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -513,51 +513,6 @@ int ubifs_clear_orphans(struct ubifs_info *c)
 	return 0;
 }
 
-/**
- * insert_dead_orphan - insert an orphan.
- * @c: UBIFS file-system description object
- * @inum: orphan inode number
- *
- * This function is a helper to the 'do_kill_orphans()' function. The orphan
- * must be kept until the next commit, so it is added to the rb-tree and the
- * deletion list.
- */
-static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
-{
-	struct ubifs_orphan *orphan, *o;
-	struct rb_node **p, *parent = NULL;
-
-	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
-	if (!orphan)
-		return -ENOMEM;
-	orphan->inum = inum;
-
-	p = &c->orph_tree.rb_node;
-	while (*p) {
-		parent = *p;
-		o = rb_entry(parent, struct ubifs_orphan, rb);
-		if (inum < o->inum)
-			p = &(*p)->rb_left;
-		else if (inum > o->inum)
-			p = &(*p)->rb_right;
-		else {
-			/* Already added - no problem */
-			kfree(orphan);
-			return 0;
-		}
-	}
-	c->tot_orphans += 1;
-	rb_link_node(&orphan->rb, parent, p);
-	rb_insert_color(&orphan->rb, &c->orph_tree);
-	list_add_tail(&orphan->list, &c->orph_list);
-	orphan->del = 1;
-	orphan->dnext = c->orph_dnext;
-	c->orph_dnext = orphan;
-	dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum,
-		c->new_orphans, c->tot_orphans);
-	return 0;
-}
-
 /**
  * do_kill_orphans - remove orphan inodes from the index.
  * @c: UBIFS file-system description object
@@ -655,10 +610,6 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 				if (err)
 					goto out_ro;
 			}
-
-			err = insert_dead_orphan(c, inum);
-			if (err)
-				goto out_free;
 		}
 
 		*last_cmt_no = cmt_no;

From 7efc34b53b3950ad4fc98b5d25210bae80ab0fde Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:47 +0800
Subject: [PATCH 077/346] ubifs: Fix adding orphan entry twice for the same
 inode

The tmpfile could be added into orphan list twice, first time is
creation, the second time is removing after it is linked. The orphan
entry could be added twice for tmpfile if following sequence is
satisfied:

ubifs_tmpfile
 ubifs_jnl_update
  ubifs_add_orphan // first time to add orphan entry

    P1                        P2
ubifs_link                 do_commit
                            ubifs_orphan_start_commit
			     orphan->cmt = 1
 ubifs_delete_orphan
  orphan_delete
   if (orph->cmt)
    orph->del = 1; // orphan entry is not deleted from tree
    return
ubifs_unlink
 ubifs_jnl_update
  ubifs_add_orphan
   orphan_add // found old orphan entry, second time to add orphan entry
    ubifs_err(c, "orphaned twice")
    return -EINVAL // unlink failed!
                            ubifs_orphan_end_commit
			     erase_deleted // delete old orphan entry
			      rb_erase(&orphan->rb, &c->orph_tree)

Fix it by removing orphan entry from orphan tree in advance, rather than
remove it from orphan tree in committing process.

Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218672
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/orphan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 6e843e8fc3db..37d206097112 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -136,6 +136,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
 
 	if (orph->cmt) {
 		orph->del = 1;
+		rb_erase(&orph->rb, &c->orph_tree);
 		orph->dnext = c->orph_dnext;
 		c->orph_dnext = orph;
 		dbg_gen("delete later ino %lu", (unsigned long)orph->inum);
@@ -461,7 +462,6 @@ static void erase_deleted(struct ubifs_info *c)
 		dnext = orphan->dnext;
 		ubifs_assert(c, !orphan->new);
 		ubifs_assert(c, orphan->del);
-		rb_erase(&orphan->rb, &c->orph_tree);
 		list_del(&orphan->list);
 		c->tot_orphans -= 1;
 		dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum);

From 9f5ecacfcee2bda21b100399a9130248d55c2a0c Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:48 +0800
Subject: [PATCH 078/346] ubifs: Move ui->data initialization after
 initializing security

Host inode and its' xattr will be written on disk after initializing
security when creating symlink or dev, then the host inode and its
dentry will be written again in ubifs_jnl_update.
There is no need to write inode data in the security initialization
pass, just move the ui->data initialization after initializing
security.

Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/dir.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 551148de66cd..848988f2b7dc 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1102,16 +1102,18 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 		goto out_fname;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err) {
+		kfree(dev);
+		goto out_inode;
+	}
+
 	init_special_inode(inode, inode->i_mode, rdev);
 	inode->i_size = ubifs_inode(inode)->ui_size = devlen;
 	ui = ubifs_inode(inode);
 	ui->data = dev;
 	ui->data_len = devlen;
 
-	err = ubifs_init_security(dir, inode, &dentry->d_name);
-	if (err)
-		goto out_inode;
-
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
@@ -1184,6 +1186,10 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		goto out_fname;
 	}
 
+	err = ubifs_init_security(dir, inode, &dentry->d_name);
+	if (err)
+		goto out_inode;
+
 	ui = ubifs_inode(inode);
 	ui->data = kmalloc(disk_link.len, GFP_NOFS);
 	if (!ui->data) {
@@ -1209,10 +1215,6 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	ui->data_len = disk_link.len - 1;
 	inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1;
 
-	err = ubifs_init_security(dir, inode, &dentry->d_name);
-	if (err)
-		goto out_inode;
-
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;

From b25e6a5f78e84598ec62552ef342a58a23b6f7c4 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:49 +0800
Subject: [PATCH 079/346] ubifs: Fix space leak when powercut happens in
 linking tmpfile

There is a potential space leak problem when powercut happens in linking
tmpfile, in which case, inode node (with nlink=0) and its' data nodes can
be found from tnc (on flash), but there are no dentries related to the
inode, so the file is invisible but takes free space. Detailed process is
shown as:
 ubifs_tmpfile
  ubifs_jnl_update // Add bud A into log area
   ubifs_add_orphan // Add inode into orphan list

     P1             P2
 ubifs_link
  ubifs_delete_orphan // Delete inode from orphan list, then inode won't
		      // be written into orphan area, there is no chance
		      // to delete inode by replaying orphan.
                commit // bud A won't be replayed in next mounting
   >> powercut <<
  ubifs_jnl_update // Link inode to dentry

The root cause is that orphan entry deletion and journal writing(for link)
are interrupted by commit, which makes the two operations are not atomic.
Fix it by doing ubifs_delete_orphan under the protection of c->commit_sem
within ubifs_jnl_update. This is also a preparation to support all creating
new files by orphan inode.

v1 is https://lore.kernel.org/linux-mtd/20200701093227.674945-1-chengzhihao1@huawei.com/

Fixes: 32fe905c17f0 ("ubifs: Fix O_TMPFILE corner case in ubifs_link()")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=208405
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/dir.c     | 22 ++++++++--------------
 fs/ubifs/journal.c |  6 +++++-
 fs/ubifs/ubifs.h   |  2 +-
 fs/ubifs/xattr.c   |  2 +-
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 848988f2b7dc..fe16443243ab 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -325,7 +325,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
@@ -479,7 +479,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	mutex_unlock(&ui->ui_mutex);
 
 	lock_2_inodes(dir, inode);
-	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0);
 	if (err)
 		goto out_cancel;
 	unlock_2_inodes(dir, inode);
@@ -760,10 +760,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 
 	lock_2_inodes(dir, inode);
 
-	/* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */
-	if (inode->i_nlink == 0)
-		ubifs_delete_orphan(c, inode->i_ino);
-
 	inc_nlink(inode);
 	ihold(inode);
 	inode_set_ctime_current(inode);
@@ -771,7 +767,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, inode->i_nlink == 1);
 	if (err)
 		goto out_cancel;
 	unlock_2_inodes(dir, inode);
@@ -785,8 +781,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
 	drop_nlink(inode);
-	if (inode->i_nlink == 0)
-		ubifs_add_orphan(c, inode->i_ino);
 	unlock_2_inodes(dir, inode);
 	ubifs_release_budget(c, &req);
 	iput(inode);
@@ -846,7 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0);
 	if (err)
 		goto out_cancel;
 	unlock_2_inodes(dir, inode);
@@ -950,7 +944,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0);
 	if (err)
 		goto out_cancel;
 	unlock_2_inodes(dir, inode);
@@ -1025,7 +1019,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
 	if (err) {
 		ubifs_err(c, "cannot create directory, error %d", err);
 		goto out_cancel;
@@ -1119,7 +1113,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
@@ -1220,7 +1214,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 74aee92433d7..2b4b05c2d9b2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -643,6 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
  * @inode: inode to update
  * @deletion: indicates a directory entry deletion i.e unlink or rmdir
  * @xent: non-zero if the directory entry is an extended attribute entry
+ * @delete_orphan: indicates an orphan entry deletion for @inode
  *
  * This function updates an inode by writing a directory entry (or extended
  * attribute entry), the inode itself, and the parent directory inode (or the
@@ -664,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
  */
 int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 		     const struct fscrypt_name *nm, const struct inode *inode,
-		     int deletion, int xent)
+		     int deletion, int xent, int delete_orphan)
 {
 	int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0;
 	int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
@@ -806,6 +807,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 	if (err)
 		goto out_ro;
 
+	if (delete_orphan)
+		ubifs_delete_orphan(c, inode->i_ino);
+
 	finish_reservation(c);
 	spin_lock(&ui->ui_lock);
 	ui->synced_i_size = ui->ui_size;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 821c6ae99e62..d0ed78345617 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c);
 /* journal.c */
 int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 		     const struct fscrypt_name *nm, const struct inode *inode,
-		     int deletion, int xent);
+		     int deletion, int xent, int delete_orphan);
 int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 			 const union ubifs_key *key, const void *buf, int len);
 int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 0847db521984..f734588b224a 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -149,7 +149,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
 	if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0)
 		host_ui->flags |= UBIFS_CRYPT_FL;
 
-	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
+	err = ubifs_jnl_update(c, host, nm, inode, 0, 1, 0);
 	if (err)
 		goto out_cancel;
 	ubifs_set_inode_flags(host);

From 3af2d3a8c56fe7dc24f60c4df0ab85b7ac941902 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:50 +0800
Subject: [PATCH 080/346] ubifs: Fix unattached inode when powercut happens in
 creating

For selinux or encryption scenarios, UBIFS could become inconsistent
while creating new files in powercut case. Encryption/selinux related
xattrs will be created before creating file dentry, which makes creation
process is not atomic, details are shown as:

Encryption case:
ubifs_create
 ubifs_new_inode
  fscrypt_set_context
   ubifs_xattr_set
    create_xattr
     ubifs_jnl_update  // Disk: xentry xinode inode(LAST_OF_NODE_GROUP)
 >> power cut <<
 ubifs_jnl_update  // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP)

Selinux case:
ubifs_create
 ubifs_new_inode
 ubifs_init_security
  security_inode_init_security
   ubifs_xattr_set
    create_xattr
     ubifs_jnl_update  // Disk: xentry xinode inode(LAST_OF_NODE_GROUP)
 >> power cut <<
 ubifs_jnl_update  // Disk: dentry inode parent_inode(LAST_OF_NODE_GROUP)

Above process will make chk_fs failed in next mounting:
 UBIFS error (ubi0:0 pid 7995): dbg_check_filesystem [ubifs]: inode 66
 nlink is 1, but calculated nlink is 0

Fix it by allocating orphan inode for each non-xattr file creation, then
removing orphan list in journal writing process, which ensures that both
xattr and dentry be effective in atomic when powercut happens.

Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS")
Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218309
Suggested-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/dir.c     | 59 +++++++++++++++++++++++++++++++---------------
 fs/ubifs/journal.c | 14 +++++++----
 fs/ubifs/ubifs.h   |  4 ++--
 3 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index fe16443243ab..c77ea57fe696 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -71,8 +71,13 @@ static int inherit_flags(const struct inode *dir, umode_t mode)
  * @is_xattr: whether the inode is xattr inode
  *
  * This function finds an unused inode number, allocates new inode and
- * initializes it. Returns new inode in case of success and an error code in
- * case of failure.
+ * initializes it. Non-xattr new inode may be written with xattrs(selinux/
+ * encryption) before writing dentry, which could cause inconsistent problem
+ * when powercut happens between two operations. To deal with it, non-xattr
+ * new inode is initialized with zero-nlink and added into orphan list, caller
+ * should make sure that inode is relinked later, and make sure that orphan
+ * removing and journal writing into an committing atomic operation. Returns
+ * new inode in case of success and an error code in case of failure.
  */
 struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 			      umode_t mode, bool is_xattr)
@@ -163,9 +168,25 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
 	ui->creat_sqnum = ++c->max_sqnum;
 	spin_unlock(&c->cnt_lock);
 
+	if (!is_xattr) {
+		set_nlink(inode, 0);
+		err = ubifs_add_orphan(c, inode->i_ino);
+		if (err) {
+			ubifs_err(c, "ubifs_add_orphan failed: %i", err);
+			goto out_iput;
+		}
+		down_read(&c->commit_sem);
+		ui->del_cmtno = c->cmt_no;
+		up_read(&c->commit_sem);
+	}
+
 	if (encrypted) {
 		err = fscrypt_set_context(inode, NULL);
 		if (err) {
+			if (!is_xattr) {
+				set_nlink(inode, 1);
+				ubifs_delete_orphan(c, inode->i_ino);
+			}
 			ubifs_err(c, "fscrypt_set_context failed: %i", err);
 			goto out_iput;
 		}
@@ -320,12 +341,13 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
 	if (err)
 		goto out_inode;
 
+	set_nlink(inode, 1);
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
@@ -340,8 +362,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
 	mutex_unlock(&dir_ui->ui_mutex);
+	set_nlink(inode, 0);
 out_inode:
-	make_bad_inode(inode);
 	iput(inode);
 out_fname:
 	fscrypt_free_filename(&nm);
@@ -386,7 +408,6 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
 	return inode;
 
 out_inode:
-	make_bad_inode(inode);
 	iput(inode);
 out_free:
 	ubifs_err(c, "cannot create whiteout file, error %d", err);
@@ -470,6 +491,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	if (err)
 		goto out_inode;
 
+	set_nlink(inode, 1);
 	mutex_lock(&ui->ui_mutex);
 	insert_inode_hash(inode);
 	d_tmpfile(file, inode);
@@ -479,7 +501,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	mutex_unlock(&ui->ui_mutex);
 
 	lock_2_inodes(dir, inode);
-	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0, 1);
 	if (err)
 		goto out_cancel;
 	unlock_2_inodes(dir, inode);
@@ -492,7 +514,6 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 out_cancel:
 	unlock_2_inodes(dir, inode);
 out_inode:
-	make_bad_inode(inode);
 	if (!instantiated)
 		iput(inode);
 out_budg:
@@ -1011,6 +1032,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (err)
 		goto out_inode;
 
+	set_nlink(inode, 1);
 	mutex_lock(&dir_ui->ui_mutex);
 	insert_inode_hash(inode);
 	inc_nlink(inode);
@@ -1019,7 +1041,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1);
 	if (err) {
 		ubifs_err(c, "cannot create directory, error %d", err);
 		goto out_cancel;
@@ -1036,8 +1058,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	dir_ui->ui_size = dir->i_size;
 	drop_nlink(dir);
 	mutex_unlock(&dir_ui->ui_mutex);
+	set_nlink(inode, 0);
 out_inode:
-	make_bad_inode(inode);
 	iput(inode);
 out_fname:
 	fscrypt_free_filename(&nm);
@@ -1107,13 +1129,14 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	ui = ubifs_inode(inode);
 	ui->data = dev;
 	ui->data_len = devlen;
+	set_nlink(inode, 1);
 
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
@@ -1128,8 +1151,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
 	mutex_unlock(&dir_ui->ui_mutex);
+	set_nlink(inode, 0);
 out_inode:
-	make_bad_inode(inode);
 	iput(inode);
 out_fname:
 	fscrypt_free_filename(&nm);
@@ -1208,13 +1231,14 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	 */
 	ui->data_len = disk_link.len - 1;
 	inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1;
+	set_nlink(inode, 1);
 
 	mutex_lock(&dir_ui->ui_mutex);
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
 	inode_set_mtime_to_ts(dir,
 			      inode_set_ctime_to_ts(dir, inode_get_ctime(inode)));
-	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 0);
+	err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0, 1);
 	if (err)
 		goto out_cancel;
 	mutex_unlock(&dir_ui->ui_mutex);
@@ -1228,10 +1252,10 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	dir->i_size -= sz_change;
 	dir_ui->ui_size = dir->i_size;
 	mutex_unlock(&dir_ui->ui_mutex);
+	set_nlink(inode, 0);
 out_inode:
 	/* Free inode->i_link before inode is marked as bad. */
 	fscrypt_free_inode(inode);
-	make_bad_inode(inode);
 	iput(inode);
 out_fname:
 	fscrypt_free_filename(&nm);
@@ -1399,14 +1423,10 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 */
 		err = ubifs_budget_space(c, &wht_req);
 		if (err) {
-			/*
-			 * Whiteout inode can not be written on flash by
-			 * ubifs_jnl_write_inode(), because it's neither
-			 * dirty nor zero-nlink.
-			 */
 			iput(whiteout);
 			goto out_release;
 		}
+		set_nlink(whiteout, 1);
 
 		/* Add the old_dentry size to the old_dir size. */
 		old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
@@ -1485,7 +1505,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 	}
 
 	err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
-			       new_inode, &new_nm, whiteout, sync);
+			       new_inode, &new_nm, whiteout, sync, !!whiteout);
 	if (err)
 		goto out_cancel;
 
@@ -1538,6 +1558,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
 	unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
 	if (whiteout) {
 		ubifs_release_budget(c, &wht_req);
+		set_nlink(whiteout, 0);
 		iput(whiteout);
 	}
 out_release:
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 2b4b05c2d9b2..517f5de76b49 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -643,7 +643,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
  * @inode: inode to update
  * @deletion: indicates a directory entry deletion i.e unlink or rmdir
  * @xent: non-zero if the directory entry is an extended attribute entry
- * @delete_orphan: indicates an orphan entry deletion for @inode
+ * @in_orphan: indicates whether the @inode is in orphan list
  *
  * This function updates an inode by writing a directory entry (or extended
  * attribute entry), the inode itself, and the parent directory inode (or the
@@ -665,7 +665,7 @@ static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent)
  */
 int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 		     const struct fscrypt_name *nm, const struct inode *inode,
-		     int deletion, int xent, int delete_orphan)
+		     int deletion, int xent, int in_orphan)
 {
 	int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0;
 	int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
@@ -751,7 +751,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 	if (err)
 		goto out_release;
 
-	if (last_reference) {
+	if (last_reference && !in_orphan) {
 		err = ubifs_add_orphan(c, inode->i_ino);
 		if (err) {
 			release_head(c, BASEHD);
@@ -807,7 +807,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 	if (err)
 		goto out_ro;
 
-	if (delete_orphan)
+	if (in_orphan && inode->i_nlink)
 		ubifs_delete_orphan(c, inode->i_ino);
 
 	finish_reservation(c);
@@ -1340,6 +1340,7 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
  * @new_nm: new name of the new directory entry
  * @whiteout: whiteout inode
  * @sync: non-zero if the write-buffer has to be synchronized
+ * @delete_orphan: indicates an orphan entry deletion for @whiteout
  *
  * This function implements the re-name operation which may involve writing up
  * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes)
@@ -1352,7 +1353,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		     const struct inode *new_dir,
 		     const struct inode *new_inode,
 		     const struct fscrypt_name *new_nm,
-		     const struct inode *whiteout, int sync)
+		     const struct inode *whiteout, int sync, int delete_orphan)
 {
 	void *p;
 	union ubifs_key key;
@@ -1569,6 +1570,9 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 			goto out_ro;
 	}
 
+	if (delete_orphan)
+		ubifs_delete_orphan(c, whiteout->i_ino);
+
 	finish_reservation(c);
 	if (new_inode) {
 		mark_inode_clean(c, new_ui);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d0ed78345617..b2c6554fa857 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1800,7 +1800,7 @@ int ubifs_consolidate_log(struct ubifs_info *c);
 /* journal.c */
 int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
 		     const struct fscrypt_name *nm, const struct inode *inode,
-		     int deletion, int xent, int delete_orphan);
+		     int deletion, int xent, int in_orphan);
 int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 			 const union ubifs_key *key, const void *buf, int len);
 int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
@@ -1817,7 +1817,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
 		     const struct inode *new_dir,
 		     const struct inode *new_inode,
 		     const struct fscrypt_name *new_nm,
-		     const struct inode *whiteout, int sync);
+		     const struct inode *whiteout, int sync, int delete_orphan);
 int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 		       loff_t old_size, loff_t new_size);
 int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,

From 06776df740660e27c351d89f314d05b2720ba41f Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Wed, 10 Apr 2024 15:37:51 +0800
Subject: [PATCH 081/346] ubifs: dbg_orphan_check: Fix missed key type checking

When selinux/encryption is enabled, xattr entry node is added into TNC
before host inode when creating new file. So it is possible to find
xattr entry without host inode from TNC. Orphan debug checking is called
by ubifs_orphan_end_commit(), at that time, the commit semaphore is
already unlock, so the new creation won't be blocked.

Fixes: d7f0b70d30ff ("UBIFS: Add security.* XATTR support for the UBIFS")
Fixes: d475a507457b ("ubifs: Add skeleton for fscrypto")
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/orphan.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 37d206097112..fb957d963ba6 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -816,8 +816,12 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 
 	inum = key_inum(c, &zbr->key);
 	if (inum != ci->last_ino) {
-		/* Lowest node type is the inode node, so it comes first */
-		if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
+		/*
+		 * Lowest node type is the inode node or xattr entry(when
+		 * selinux/encryption is enabled), so it comes first
+		 */
+		if (key_type(c, &zbr->key) != UBIFS_INO_KEY &&
+		    key_type(c, &zbr->key) != UBIFS_XENT_KEY)
 			ubifs_err(c, "found orphan node ino %lu, type %d",
 				  (unsigned long)inum, key_type(c, &zbr->key));
 		ci->last_ino = inum;

From 72f3d3daddd740f744a24cd7ef8c27bd0cd5489d Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings@mind.be>
Date: Thu, 11 Apr 2024 00:42:41 +0200
Subject: [PATCH 082/346] mtd: ubi: Restore missing cleanup on ubi_init()
 failure path

We need to clean-up debugfs and ubiblock if we fail after initialising
them.

Signed-off-by: Ben Hutchings <ben.hutchings@mind.be>
Fixes: 927c145208b0 ("mtd: ubi: attach from device tree")
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/build.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
index 952c80269f57..30be4ed68fad 100644
--- a/drivers/mtd/ubi/build.c
+++ b/drivers/mtd/ubi/build.c
@@ -1372,7 +1372,7 @@ static int __init ubi_init(void)
 
 		/* See comment above re-ubi_is_module(). */
 		if (ubi_is_module())
-			goto out_slab;
+			goto out_debugfs;
 	}
 
 	register_mtd_user(&ubi_mtd_notifier);
@@ -1387,6 +1387,9 @@ static int __init ubi_init(void)
 
 out_mtd_notifier:
 	unregister_mtd_user(&ubi_mtd_notifier);
+	ubiblock_exit();
+out_debugfs:
+	ubi_debugfs_exit();
 out_slab:
 	kmem_cache_destroy(ubi_wl_entry_slab);
 out_dev_unreg:

From 7037c96d8c42965eb93e6b96cf921399c283d431 Mon Sep 17 00:00:00 2001
From: ZhaoLong Wang <wangzhaolong1@huawei.com>
Date: Thu, 18 Apr 2024 15:07:04 +0800
Subject: [PATCH 083/346] ubifs: correct UBIFS_DFS_DIR_LEN macro definition and
 improve code clarity

The UBIFS_DFS_DIR_LEN macro, which defines the maximum length of the UBIFS
debugfs directory name, has an incorrect formula and misleading comments.
The current formula is (3 + 1 + 2*2 + 1), which assumes that both UBI device
number and volume ID are limited to 2 characters. However, UBI device number
ranges from 0 to 31 (2 characters), and volume ID ranges from 0 to 127 (up
to 3 characters).

Although the current code works due to the cancellation of mathematical
errors (9 + 1 = 10, which matches the correct UBIFS_DFS_DIR_LEN value), it
can lead to confusion and potential issues in the future.

This patch aims to improve the code clarity and maintainability by making
the following changes:

1. Corrects the UBIFS_DFS_DIR_LEN macro definition to (3 + 1 + 2 + 3 + 1),
   accommodating the maximum lengths of both UBI device number and volume ID,
   plus the separators and null terminator.
2. Updates the snprintf calls to use UBIFS_DFS_DIR_LEN instead of
   UBIFS_DFS_DIR_LEN + 1, removing the unnecessary +1.
3. Modifies the error checks to compare against UBIFS_DFS_DIR_LEN using >=
   instead of >, aligning with the corrected macro definition.
4. Removes the redundant +1 in the dfs_dir_name array definitions in ubi.h
   and debug.h.

While these changes do not affect the runtime behavior, they make the code
more readable, maintainable, and less prone to future errors.

v2->v3:

 - Removes the duplicated UBIFS_DFS_DIR_LEN and UBIFS_DFS_DIR_NAME macro
   definitions in ubifs.h, as they are already defined in debug.h.

Signed-off-by: ZhaoLong Wang <wangzhaolong1@huawei.com>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/debug.c | 4 ++--
 drivers/mtd/ubi/ubi.h   | 2 +-
 fs/ubifs/debug.c        | 4 ++--
 fs/ubifs/debug.h        | 7 ++++---
 fs/ubifs/sysfs.c        | 6 +++---
 fs/ubifs/ubifs.h        | 7 -------
 6 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c
index d57f52bd2ff3..9ec3b8b6a0aa 100644
--- a/drivers/mtd/ubi/debug.c
+++ b/drivers/mtd/ubi/debug.c
@@ -598,9 +598,9 @@ int ubi_debugfs_init_dev(struct ubi_device *ubi)
 	if (!IS_ENABLED(CONFIG_DEBUG_FS))
 		return 0;
 
-	n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME,
+	n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN, UBI_DFS_DIR_NAME,
 		     ubi->ubi_num);
-	if (n > UBI_DFS_DIR_LEN) {
+	if (n >= UBI_DFS_DIR_LEN) {
 		/* The array size is too small */
 		return -EINVAL;
 	}
diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
index 1aead55090d0..1c9e874e8ede 100644
--- a/drivers/mtd/ubi/ubi.h
+++ b/drivers/mtd/ubi/ubi.h
@@ -420,7 +420,7 @@ struct ubi_debug_info {
 	unsigned int power_cut_min;
 	unsigned int power_cut_max;
 	unsigned int emulate_failures;
-	char dfs_dir_name[UBI_DFS_DIR_LEN + 1];
+	char dfs_dir_name[UBI_DFS_DIR_LEN];
 	struct dentry *dfs_dir;
 	struct dentry *dfs_chk_gen;
 	struct dentry *dfs_chk_io;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ac77ac1fd73e..d91cec93d968 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2827,9 +2827,9 @@ void dbg_debugfs_init_fs(struct ubifs_info *c)
 	const char *fname;
 	struct ubifs_debug_info *d = c->dbg;
 
-	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
+	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME,
 		     c->vi.ubi_num, c->vi.vol_id);
-	if (n > UBIFS_DFS_DIR_LEN) {
+	if (n >= UBIFS_DFS_DIR_LEN) {
 		/* The array size is too small */
 		return;
 	}
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index ed966108da80..d425861e6b82 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -19,10 +19,11 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 
 /*
  * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi"
- * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte.
+ * + 1 for "_" and 2 for UBI device numbers and 3 for volume number and 1 for
+ * the trailing zero byte.
  */
 #define UBIFS_DFS_DIR_NAME "ubi%d_%d"
-#define UBIFS_DFS_DIR_LEN  (3 + 1 + 2*2 + 1)
+#define UBIFS_DFS_DIR_LEN  (3 + 1 + 2 + 3 + 1)
 
 /**
  * ubifs_debug_info - per-FS debugging information.
@@ -103,7 +104,7 @@ struct ubifs_debug_info {
 	unsigned int chk_fs:1;
 	unsigned int tst_rcvry:1;
 
-	char dfs_dir_name[UBIFS_DFS_DIR_LEN + 1];
+	char dfs_dir_name[UBIFS_DFS_DIR_LEN];
 	struct dentry *dfs_dir;
 	struct dentry *dfs_dump_lprops;
 	struct dentry *dfs_dump_budg;
diff --git a/fs/ubifs/sysfs.c b/fs/ubifs/sysfs.c
index 1c958148bb87..aae32222f11b 100644
--- a/fs/ubifs/sysfs.c
+++ b/fs/ubifs/sysfs.c
@@ -91,17 +91,17 @@ static struct kset ubifs_kset = {
 int ubifs_sysfs_register(struct ubifs_info *c)
 {
 	int ret, n;
-	char dfs_dir_name[UBIFS_DFS_DIR_LEN+1];
+	char dfs_dir_name[UBIFS_DFS_DIR_LEN];
 
 	c->stats = kzalloc(sizeof(struct ubifs_stats_info), GFP_KERNEL);
 	if (!c->stats) {
 		ret = -ENOMEM;
 		goto out_last;
 	}
-	n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
+	n = snprintf(dfs_dir_name, UBIFS_DFS_DIR_LEN, UBIFS_DFS_DIR_NAME,
 		     c->vi.ubi_num, c->vi.vol_id);
 
-	if (n > UBIFS_DFS_DIR_LEN) {
+	if (n >= UBIFS_DFS_DIR_LEN) {
 		/* The array size is too small */
 		ret = -EINVAL;
 		goto out_free;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2c6554fa857..d69a5a42d693 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -157,13 +157,6 @@
 #define UBIFS_HMAC_ARR_SZ 0
 #endif
 
-/*
- * The UBIFS sysfs directory name pattern and maximum name length (3 for "ubi"
- * + 1 for "_" and plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte.
- */
-#define UBIFS_DFS_DIR_NAME "ubi%d_%d"
-#define UBIFS_DFS_DIR_LEN  (3 + 1 + 2*2 + 1)
-
 /*
  * Lockdep classes for UBIFS inode @ui_mutex.
  */

From 39986148bc2ab4436ec169c8f4db76f6cc83705d Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Thu, 2 May 2024 13:16:55 -0700
Subject: [PATCH 084/346] ubifs: fix kernel-doc warnings

make C=1 reports the following kernel-doc warnings:

fs/ubifs/compress.c:103: warning: Function parameter or struct member 'c' not described in 'ubifs_compress'
fs/ubifs/compress.c:155: warning: Function parameter or struct member 'c' not described in 'ubifs_decompress'
fs/ubifs/find.c:353: warning: Excess function parameter 'data' description in 'scan_for_free_cb'
fs/ubifs/find.c:353: warning: Function parameter or struct member 'arg' not described in 'scan_for_free_cb'
fs/ubifs/find.c:594: warning: Excess function parameter 'data' description in 'scan_for_idx_cb'
fs/ubifs/find.c:594: warning: Function parameter or struct member 'arg' not described in 'scan_for_idx_cb'
fs/ubifs/find.c:786: warning: Excess function parameter 'data' description in 'scan_dirty_idx_cb'
fs/ubifs/find.c:786: warning: Function parameter or struct member 'arg' not described in 'scan_dirty_idx_cb'
fs/ubifs/find.c:86: warning: Excess function parameter 'data' description in 'scan_for_dirty_cb'
fs/ubifs/find.c:86: warning: Function parameter or struct member 'arg' not described in 'scan_for_dirty_cb'
fs/ubifs/journal.c:369: warning: expecting prototype for wake_up_reservation(). Prototype was for add_or_start_queue() instead
fs/ubifs/lprops.c:1018: warning: Excess function parameter 'lst' description in 'scan_check_cb'
fs/ubifs/lprops.c:1018: warning: Function parameter or struct member 'arg' not described in 'scan_check_cb'
fs/ubifs/lpt.c:1938: warning: Function parameter or struct member 'ptr' not described in 'lpt_scan_node'
fs/ubifs/replay.c:60: warning: Function parameter or struct member 'hash' not described in 'replay_entry'

Fix them.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/compress.c | 2 ++
 fs/ubifs/find.c     | 8 ++++----
 fs/ubifs/journal.c  | 2 +-
 fs/ubifs/lprops.c   | 2 +-
 fs/ubifs/lpt.c      | 1 +
 fs/ubifs/replay.c   | 1 +
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 75461777c466..0b48cbab8a3d 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -82,6 +82,7 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
 /**
  * ubifs_compress - compress data.
+ * @c: UBIFS file-system description object
  * @in_buf: data to compress
  * @in_len: length of the data to compress
  * @out_buf: output buffer where compressed data should be stored
@@ -140,6 +141,7 @@ void ubifs_compress(const struct ubifs_info *c, const void *in_buf,
 
 /**
  * ubifs_decompress - decompress data.
+ * @c: UBIFS file-system description object
  * @in_buf: data to decompress
  * @in_len: length of the data to decompress
  * @out_buf: output buffer where decompressed data should
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 6ebf3c04ac5f..643718906b9f 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -73,7 +73,7 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
  * @c: the UBIFS file-system description object
  * @lprops: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @arg: information passed to and from the caller of the scan
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -340,7 +340,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
  * @c: the UBIFS file-system description object
  * @lprops: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @arg: information passed to and from the caller of the scan
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -581,7 +581,7 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
  * @c: the UBIFS file-system description object
  * @lprops: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @arg: information passed to and from the caller of the scan
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -773,7 +773,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
  * @c: the UBIFS file-system description object
  * @lprops: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @arg: information passed to and from the caller of the scan
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 517f5de76b49..4a35f9e8f668 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -359,7 +359,7 @@ static void wake_up_reservation(struct ubifs_info *c)
 }
 
 /**
- * wake_up_reservation - add current task in queue or start queuing.
+ * add_or_start_queue - add current task in queue or start queuing.
  * @c: UBIFS file-system description object
  *
  * This function starts queuing if queuing is not started, otherwise adds
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index a11c3dab7e16..8788740ec57f 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1005,7 +1005,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
  * @c: the UBIFS file-system description object
  * @lp: LEB properties to scan
  * @in_tree: whether the LEB properties are in main memory
- * @lst: lprops statistics to update
+ * @arg: lprops statistics to update
  *
  * This function returns a code that indicates whether the scan should continue
  * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 778a22bf9a92..441d0beca4cf 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1918,6 +1918,7 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
  * @pnode: where to keep a pnode
  * @cnode: where to keep a cnode
  * @in_tree: is the node in the tree in memory
+ * @ptr: union of node pointers
  * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
  * the tree
  * @ptr.pnode: ditto for pnode
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 17da28d6247a..a950c5f2560e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -29,6 +29,7 @@
  * @lnum: logical eraseblock number of the node
  * @offs: node offset
  * @len: node length
+ * @hash: node hash
  * @deletion: non-zero if this entry corresponds to a node deletion
  * @sqnum: node sequence number
  * @list: links the replay list

From 4f9d406c8c90dc17470cf63342c16f66ec2d978e Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Thu, 23 May 2024 01:10:35 +0800
Subject: [PATCH 085/346] ubi: block: fix null-pointer-dereference in
 ubiblock_create()

Similar to commit adbf4c4954e3 ("ubi: block: fix memleak in
ubiblock_create()"), 'dev->gd' is not assigned but dereferenced if
blk_mq_alloc_tag_set() fails, and leading to a null-pointer-dereference.
Fix it by using pr_err() and variable 'dev' to print error log.

Additionally, the log in the error handle path of idr_alloc() has
been improved by using pr_err(), too. Before initializing device
name, using dev_err() will print error log with 'null' instead of
the actual device name, like this:
  block (null): ...
        ~~~~~~
It is unclear. Using pr_err() can print more details of the device.
The improved log is:
  ubiblock0_0: ...

Fixes: 77567b25ab9f ("ubi: use blk_mq_alloc_disk and blk_cleanup_disk")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/block.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index f82e3423acb9..bf7308e8ec2f 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -390,7 +390,8 @@ int ubiblock_create(struct ubi_volume_info *vi)
 
 	ret = blk_mq_alloc_tag_set(&dev->tag_set);
 	if (ret) {
-		dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed");
+		pr_err("ubiblock%d_%d: blk_mq_alloc_tag_set failed\n",
+			dev->ubi_num, dev->vol_id);
 		goto out_free_dev;
 	}
 
@@ -407,8 +408,8 @@ int ubiblock_create(struct ubi_volume_info *vi)
 	gd->minors = 1;
 	gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL);
 	if (gd->first_minor < 0) {
-		dev_err(disk_to_dev(gd),
-			"block: dynamic minor allocation failed");
+		pr_err("ubiblock%d_%d: block: dynamic minor allocation failed\n",
+			dev->ubi_num, dev->vol_id);
 		ret = -ENODEV;
 		goto out_cleanup_disk;
 	}

From 25e79a7f2c9e54872fb2c8932bb582a500284505 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Tue, 4 Jun 2024 19:32:07 +0800
Subject: [PATCH 086/346] ubifs: Fix inconsistent inode size when powercut
 happens during appendant writing

UBIFS always make sure that the data length won't beyond the inode size
by writing inode before writing page(See ubifs_writepage.). After commit
c35acef383f4a2f2cfc30("ubifs: Convert ubifs_writepage to use a folio"),
the rule is broken in one case: Given a file with size 3, then write 4096
from the offset 0, following process will make inode size be smaller than
file data length after powercut & recovery:
         P1             P2
ubifs_writepage
 len = folio_size(folio) // 4096
 if (folio_pos(folio) + len <= i_size) // condition 1: 0 + 4096 <= 4096
		          //(i_size is updated as 4096 in ubifs_write_end)
   if (folio_pos(folio) >= synced_i_size) // condition 2: 0 >= 3, false
      write_inode // Skipped, because condition 2 is false
   do_writepage(folio, len) // write one page

		do_commit // data node won't be replayed in next mounting
 >> Powercut <<

So, inode size(4096) is not updated into disk, we will get following
error messages in next mounting(chk_fs = 1):
 check_leaf [ubifs]: data node at LEB 14:2048 is not within inode size 3
 dbg_walk_index [ubifs]: leaf checking function returned error -22, for
 leaf at LEB 14:2048

Fix it by modifying condition 2 as original comparison(Compare the page
index of synced_i_size with current page index).

Fixes: c35acef383f4 ("ubifs: Convert ubifs_writepage to use a folio")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218934
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a1f46919934c..68e104423a48 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1027,7 +1027,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
 
 	/* Is the folio fully inside i_size? */
 	if (folio_pos(folio) + len <= i_size) {
-		if (folio_pos(folio) >= synced_i_size) {
+		if (folio_pos(folio) + len > synced_i_size) {
 			err = inode->i_sb->s_op->write_inode(inode, NULL);
 			if (err)
 				goto out_redirty;

From 054fd15984454f031611d6c63675fc578aad0cb1 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Thu, 20 Jun 2024 16:19:26 +0800
Subject: [PATCH 087/346] ubifs: add check for crypto_shash_tfm_digest

Add check for the return value of crypto_shash_tfm_digest() and return
the error if it fails in order to catch the error.

Fixes: 817aa094842d ("ubifs: support offline signed images")
Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 fs/ubifs/master.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 7adc37c10b6a..a148760fa49e 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -67,10 +67,13 @@ static int mst_node_check_hash(const struct ubifs_info *c,
 {
 	u8 calc[UBIFS_MAX_HASH_LEN];
 	const void *node = mst;
+	int ret;
 
-	crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch),
+	ret = crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch),
 				UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch),
 				calc);
+	if (ret)
+		return ret;
 
 	if (ubifs_check_hash(c, expected, calc))
 		return -EPERM;

From 7164122e25b18806f5dce68c8a0bdaa9e4f902a5 Mon Sep 17 00:00:00 2001
From: Biju Das <biju.das.jz@bp.renesas.com>
Date: Mon, 15 Jul 2024 14:41:12 +0100
Subject: [PATCH 088/346] regulator: renesas-usb-vbus-regulator: Update the
 default

As the "rzg2l-usb-vbus-regulator" platform device is only created by
drivers/reset/reset-rzg2l-usbphy-ctrl.c, update the default stricter by
replacing ARCH_RZG2L->RESET_RZG2L_USBPHY_CTRL.

Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Reported-by: Geert Uytterhoeven <geert+renesas@glider.be>
Closes: https://lore.kernel.org/all/CAMuHMdX5ayWbLEEa6nAipECVB6H9eCpRg21pu3zYrTdiER0F+Q@mail.gmail.com/
Link: https://patch.msgid.link/20240715134120.312610-1-biju.das.jz@bp.renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 0281a9a6f4ce..5be44606ac15 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -1639,7 +1639,7 @@ config REGULATOR_RZG2L_VBCTRL
 	depends on ARCH_RZG2L || COMPILE_TEST
 	depends on OF
 	select REGMAP_MMIO
-	default ARCH_RZG2L
+	default RESET_RZG2L_USBPHY_CTRL
 	help
 	  Support for VBUS regulators implemented on Renesas RZ/G2L SoCs.
 

From 7c78fdbace0ff9bd2a2f1206182c054462b006af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <mcanal@igalia.com>
Date: Sun, 14 Jul 2024 11:49:11 -0300
Subject: [PATCH 089/346] drm/v3d: Add V3D tech revision to the device
 information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The V3D tech revision can be a useful information when configuring
jobs. Therefore, expose it in the `struct v3d_dev` with the V3D tech
version.

Signed-off-by: Maíra Canal <mcanal@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240714145243.1223131-1-mcanal@igalia.com
---
 drivers/gpu/drm/v3d/v3d_drv.c | 5 ++++-
 drivers/gpu/drm/v3d/v3d_drv.h | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
index a47f00b443d3..5982941d933b 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.c
+++ b/drivers/gpu/drm/v3d/v3d_drv.c
@@ -265,7 +265,7 @@ static int v3d_platform_drm_probe(struct platform_device *pdev)
 	struct v3d_dev *v3d;
 	int ret;
 	u32 mmu_debug;
-	u32 ident1;
+	u32 ident1, ident3;
 	u64 mask;
 
 	v3d = devm_drm_dev_alloc(dev, &v3d_drm_driver, struct v3d_dev, drm);
@@ -298,6 +298,9 @@ static int v3d_platform_drm_probe(struct platform_device *pdev)
 	v3d->cores = V3D_GET_FIELD(ident1, V3D_HUB_IDENT1_NCORES);
 	WARN_ON(v3d->cores > 1); /* multicore not yet implemented */
 
+	ident3 = V3D_READ(V3D_HUB_IDENT3);
+	v3d->rev = V3D_GET_FIELD(ident3, V3D_HUB_IDENT3_IPREV);
+
 	if (v3d->ver >= 71)
 		v3d->max_counters = V3D_V71_NUM_PERFCOUNTERS;
 	else if (v3d->ver >= 42)
diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index 099b962bdfde..49089eefb7c7 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -98,10 +98,12 @@ struct v3d_perfmon {
 struct v3d_dev {
 	struct drm_device drm;
 
-	/* Short representation (e.g. 33, 41) of the V3D tech version
-	 * and revision.
-	 */
+	/* Short representation (e.g. 33, 41) of the V3D tech version */
 	int ver;
+
+	/* Short representation (e.g. 5, 6) of the V3D tech revision */
+	int rev;
+
 	bool single_irq_line;
 
 	/* Different revisions of V3D have different total number of performance

From 1fe1c66274fb80cc1ac42e0d38917d53adc7d7a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ADra=20Canal?= <mcanal@igalia.com>
Date: Sun, 14 Jul 2024 11:49:12 -0300
Subject: [PATCH 090/346] drm/v3d: Fix Indirect Dispatch configuration for V3D
 7.1.6 and later
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`args->cfg[4]` is configured in Indirect Dispatch using the number of
batches. Currently, for all V3D tech versions, `args->cfg[4]` equals the
number of batches subtracted by 1. But, for V3D 7.1.6 and later, we must not
subtract 1 from the number of batches.

Implement the fix by checking the V3D tech version and revision.

Fixes several `dEQP-VK.synchronization*` CTS tests related to Indirect Dispatch.

Fixes: 18b8413b25b7 ("drm/v3d: Create a CPU job extension for a indirect CSD job")
Signed-off-by: Maíra Canal <mcanal@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240714145243.1223131-2-mcanal@igalia.com
---
 drivers/gpu/drm/v3d/v3d_sched.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 03df37a3acf5..271a6d0f5aca 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -331,7 +331,8 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
 	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
 	struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
 	struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
-	u32 *wg_counts;
+	struct v3d_dev *v3d = job->base.v3d;
+	u32 num_batches, *wg_counts;
 
 	v3d_get_bo_vaddr(bo);
 	v3d_get_bo_vaddr(indirect);
@@ -344,8 +345,17 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
 	args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 	args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
 	args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
-	args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
-		       (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+
+	num_batches = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
+		      (wg_counts[0] * wg_counts[1] * wg_counts[2]);
+
+	/* V3D 7.1.6 and later don't subtract 1 from the number of batches */
+	if (v3d->ver < 71 || (v3d->ver == 71 && v3d->rev < 6))
+		args->cfg[4] = num_batches - 1;
+	else
+		args->cfg[4] = num_batches;
+
+	WARN_ON(args->cfg[4] == ~0);
 
 	for (int i = 0; i < 3; i++) {
 		/* 0xffffffff indicates that the uniform rewrite is not needed */

From 2634f745eac25a33f032df32cf98fca8538a534a Mon Sep 17 00:00:00 2001
From: Daniel Baluta <daniel.baluta@nxp.com>
Date: Mon, 15 Jul 2024 18:16:53 +0300
Subject: [PATCH 091/346] ASoC: SOF: imx8m: Fix DSP control regmap retrieval

According to Documentation/devicetree/bindings/dsp/fsl,dsp.yaml
fsl,dsp-ctrl is a phandle to syscon block so we need to use correct
function to retrieve it.

Currently there is no SOF DSP DTS merged into mainline so there is no
need to support the old way of retrieving the dsp control node.

Fixes: 9ba23717b292 ("ASoC: SOF: imx8m: Implement DSP start")
Signed-off-by: Daniel Baluta <daniel.baluta@nxp.com>
Link: https://patch.msgid.link/20240715151653.114751-1-daniel.baluta@oss.nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sof/imx/imx8m.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/sof/imx/imx8m.c b/sound/soc/sof/imx/imx8m.c
index 1c7019c3cbd3..cdd1e79ef9f6 100644
--- a/sound/soc/sof/imx/imx8m.c
+++ b/sound/soc/sof/imx/imx8m.c
@@ -234,7 +234,7 @@ static int imx8m_probe(struct snd_sof_dev *sdev)
 	/* set default mailbox offset for FW ready message */
 	sdev->dsp_box.offset = MBOX_OFFSET;
 
-	priv->regmap = syscon_regmap_lookup_by_compatible("fsl,dsp-ctrl");
+	priv->regmap = syscon_regmap_lookup_by_phandle(np, "fsl,dsp-ctrl");
 	if (IS_ERR(priv->regmap)) {
 		dev_err(sdev->dev, "cannot find dsp-ctrl registers");
 		ret = PTR_ERR(priv->regmap);

From 502a582b8dd897d9282db47c0911d5320ef2e6b9 Mon Sep 17 00:00:00 2001
From: Naga Sureshkumar Relli <nagasuresh.relli@microchip.com>
Date: Mon, 15 Jul 2024 12:13:52 +0100
Subject: [PATCH 092/346] spi: microchip-core: fix the issues in the isr

It is possible for the TXDONE interrupt be raised if the tx FIFO becomes
temporarily empty while transmitting, resulting in recursive calls to
mchp_corespi_write_fifo() and therefore a garbage message might be
transmitted depending on when the interrupt is triggered. Moving all of
the tx FIFO writes out of the TXDONE portion of the interrupt handler
avoids this problem.

Most of rest of the TXDONE portion of the handler is problematic too.
Only reading the rx FIFO (and finalising the transfer) when the TXDONE
interrupt is raised can cause the transfer to stall, if the final bytes
of rx data are not available in the rx FIFO when the final TXDONE
interrupt is raised. The transfer should be finalised regardless of
which interrupt is raised, provided that all tx data has been set and
all rx data received.

The first issue was encountered "in the wild", the second is
theoretical.

Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers")
Signed-off-by: Naga Sureshkumar Relli <nagasuresh.relli@microchip.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-candied-deforest-585685ef3c8a@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index 6246254e1dff..c8076333d179 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -383,21 +383,18 @@ static irqreturn_t mchp_corespi_interrupt(int irq, void *dev_id)
 	if (intfield == 0)
 		return IRQ_NONE;
 
-	if (intfield & INT_TXDONE) {
+	if (intfield & INT_TXDONE)
 		mchp_corespi_write(spi, REG_INT_CLEAR, INT_TXDONE);
 
+	if (intfield & INT_RXRDY) {
+		mchp_corespi_write(spi, REG_INT_CLEAR, INT_RXRDY);
+
 		if (spi->rx_len)
 			mchp_corespi_read_fifo(spi);
-
-		if (spi->tx_len)
-			mchp_corespi_write_fifo(spi);
-
-		if (!spi->rx_len)
-			finalise = true;
 	}
 
-	if (intfield & INT_RXRDY)
-		mchp_corespi_write(spi, REG_INT_CLEAR, INT_RXRDY);
+	if (!spi->rx_len && !spi->tx_len)
+		finalise = true;
 
 	if (intfield & INT_RX_CHANNEL_OVERFLOW) {
 		mchp_corespi_write(spi, REG_INT_CLEAR, INT_RX_CHANNEL_OVERFLOW);
@@ -482,8 +479,9 @@ static int mchp_corespi_transfer_one(struct spi_controller *host,
 	mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH)
 				   ? FIFO_DEPTH : spi->tx_len);
 
-	if (spi->tx_len)
+	while (spi->tx_len)
 		mchp_corespi_write_fifo(spi);
+
 	return 1;
 }
 

From 22fd98c107c792e35db7abe45298bc3a29bf4723 Mon Sep 17 00:00:00 2001
From: Steve Wilkins <steve.wilkins@raymarine.com>
Date: Mon, 15 Jul 2024 12:13:53 +0100
Subject: [PATCH 093/346] spi: microchip-core: defer asserting chip select
 until just before write to TX FIFO

Setting up many of the registers for a new SPI transfer requires the
SPI controller to be disabled after set_cs() has been called to assert
the chip select line. However, disabling the controller results in the
SCLK and MOSI output pins being tristate, which can cause clock
transitions to be seen by a slave device whilst SS is active. To fix
this, the CS is only set to inactive inline, whilst setting it active
is deferred until all registers are set up and the any controller
disables have been completed.

Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers")
Signed-off-by: Steve Wilkins <steve.wilkins@raymarine.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-sanitizer-recant-dd96b7a97048@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index c8076333d179..a78900db2397 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -103,6 +103,7 @@ struct mchp_corespi {
 	u8 *rx_buf;
 	u32 clk_gen; /* divider for spi output clock generated by the controller */
 	u32 clk_mode;
+	u32 pending_slave_select;
 	int irq;
 	int tx_len;
 	int rx_len;
@@ -249,8 +250,18 @@ static void mchp_corespi_set_cs(struct spi_device *spi, bool disable)
 	reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT);
 	reg &= ~BIT(spi_get_chipselect(spi, 0));
 	reg |= !disable << spi_get_chipselect(spi, 0);
+	corespi->pending_slave_select = reg;
 
-	mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg);
+	/*
+	 * Only deassert chip select immediately. Writing to some registers
+	 * requires the controller to be disabled, which results in the
+	 * output pins being tristated and can cause the SCLK and MOSI lines
+	 * to transition. Therefore asserting the chip select is deferred
+	 * until just before writing to the TX FIFO, to ensure the device
+	 * doesn't see any spurious clock transitions whilst CS is enabled.
+	 */
+	if (((spi->mode & SPI_CS_HIGH) == 0) == disable)
+		mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg);
 }
 
 static int mchp_corespi_setup(struct spi_device *spi)
@@ -269,6 +280,7 @@ static int mchp_corespi_setup(struct spi_device *spi)
 	if (spi->mode & SPI_CS_HIGH) {
 		reg = mchp_corespi_read(corespi, REG_SLAVE_SELECT);
 		reg |= BIT(spi_get_chipselect(spi, 0));
+		corespi->pending_slave_select = reg;
 		mchp_corespi_write(corespi, REG_SLAVE_SELECT, reg);
 	}
 	return 0;
@@ -310,7 +322,8 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi *
 	 * select is relinquished to the hardware. SSELOUT is enabled too so we
 	 * can deal with active high targets.
 	 */
-	mchp_corespi_write(spi, REG_SLAVE_SELECT, SSELOUT | SSEL_DIRECT);
+	spi->pending_slave_select = SSELOUT | SSEL_DIRECT;
+	mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select);
 
 	control = mchp_corespi_read(spi, REG_CONTROL);
 
@@ -479,6 +492,8 @@ static int mchp_corespi_transfer_one(struct spi_controller *host,
 	mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH)
 				   ? FIFO_DEPTH : spi->tx_len);
 
+	mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select);
+
 	while (spi->tx_len)
 		mchp_corespi_write_fifo(spi);
 

From de9850b5c606b754dd7861678d6e2874b96b04f8 Mon Sep 17 00:00:00 2001
From: Steve Wilkins <steve.wilkins@raymarine.com>
Date: Mon, 15 Jul 2024 12:13:54 +0100
Subject: [PATCH 094/346] spi: microchip-core: only disable SPI controller when
 register value change requires it

Setting up many of the registers for a new SPI transfer involves
unconditionally disabling the SPI controller, writing the register
value and re-enabling the controller. This is being done for registers
even when the value is unchanged and is also done for registers that
don't require the controller to be disabled for the change to take
effect. Make an effort to detect changes to the register values, and
only disables the controller if the new register value is different
and disabling the controller is required. This stops the controller
being repeated disabled and the bus going tristate before every
transfer.

Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers")
Signed-off-by: Steve Wilkins <steve.wilkins@raymarine.com>
Co-developed-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-depict-twirl-7e592eeabaad@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 79 +++++++++++++++++---------------
 1 file changed, 41 insertions(+), 38 deletions(-)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index a78900db2397..c69d45e5bff2 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -75,6 +75,7 @@
 
 #define REG_CONTROL		(0x00)
 #define REG_FRAME_SIZE		(0x04)
+#define  FRAME_SIZE_MASK	GENMASK(5, 0)
 #define REG_STATUS		(0x08)
 #define REG_INT_CLEAR		(0x0c)
 #define REG_RX_DATA		(0x10)
@@ -89,6 +90,7 @@
 #define REG_RIS			(0x24)
 #define REG_CONTROL2		(0x28)
 #define REG_COMMAND		(0x2c)
+#define  COMMAND_CLRFRAMECNT	BIT(4)
 #define REG_PKTSIZE		(0x30)
 #define REG_CMD_SIZE		(0x34)
 #define REG_HWSTATUS		(0x38)
@@ -149,62 +151,59 @@ static inline void mchp_corespi_read_fifo(struct mchp_corespi *spi)
 
 static void mchp_corespi_enable_ints(struct mchp_corespi *spi)
 {
-	u32 control, mask = INT_ENABLE_MASK;
+	u32 control = mchp_corespi_read(spi, REG_CONTROL);
 
-	mchp_corespi_disable(spi);
-
-	control = mchp_corespi_read(spi, REG_CONTROL);
-
-	control |= mask;
-	mchp_corespi_write(spi, REG_CONTROL, control);
-
-	control |= CONTROL_ENABLE;
+	control |= INT_ENABLE_MASK;
 	mchp_corespi_write(spi, REG_CONTROL, control);
 }
 
 static void mchp_corespi_disable_ints(struct mchp_corespi *spi)
 {
-	u32 control, mask = INT_ENABLE_MASK;
+	u32 control = mchp_corespi_read(spi, REG_CONTROL);
 
-	mchp_corespi_disable(spi);
-
-	control = mchp_corespi_read(spi, REG_CONTROL);
-	control &= ~mask;
-	mchp_corespi_write(spi, REG_CONTROL, control);
-
-	control |= CONTROL_ENABLE;
+	control &= ~INT_ENABLE_MASK;
 	mchp_corespi_write(spi, REG_CONTROL, control);
 }
 
 static inline void mchp_corespi_set_xfer_size(struct mchp_corespi *spi, int len)
 {
 	u32 control;
-	u16 lenpart;
+	u32 lenpart;
+	u32 frames = mchp_corespi_read(spi, REG_FRAMESUP);
 
 	/*
-	 * Disable the SPI controller. Writes to transfer length have
-	 * no effect when the controller is enabled.
+	 * Writing to FRAMECNT in REG_CONTROL will reset the frame count, taking
+	 * a shortcut requires an explicit clear.
 	 */
-	mchp_corespi_disable(spi);
+	if (frames == len) {
+		mchp_corespi_write(spi, REG_COMMAND, COMMAND_CLRFRAMECNT);
+		return;
+	}
 
 	/*
 	 * The lower 16 bits of the frame count are stored in the control reg
 	 * for legacy reasons, but the upper 16 written to a different register:
 	 * FRAMESUP. While both the upper and lower bits can be *READ* from the
-	 * FRAMESUP register, writing to the lower 16 bits is a NOP
+	 * FRAMESUP register, writing to the lower 16 bits is (supposedly) a NOP.
+	 *
+	 * The driver used to disable the controller while modifying the frame
+	 * count, and mask off the lower 16 bits of len while writing to
+	 * FRAMES_UP. When the driver was changed to disable the controller as
+	 * infrequently as possible, it was discovered that the logic of
+	 * lenpart = len & 0xffff_0000
+	 * write(REG_FRAMESUP, lenpart)
+	 * would actually write zeros into the lower 16 bits on an mpfs250t-es,
+	 * despite documentation stating these bits were read-only.
+	 * Writing len unmasked into FRAMES_UP ensures those bits aren't zeroed
+	 * on an mpfs250t-es and will be a NOP for the lower 16 bits on hardware
+	 * that matches the documentation.
 	 */
 	lenpart = len & 0xffff;
-
 	control = mchp_corespi_read(spi, REG_CONTROL);
 	control &= ~CONTROL_FRAMECNT_MASK;
 	control |= lenpart << CONTROL_FRAMECNT_SHIFT;
 	mchp_corespi_write(spi, REG_CONTROL, control);
-
-	lenpart = len & 0xffff0000;
-	mchp_corespi_write(spi, REG_FRAMESUP, lenpart);
-
-	control |= CONTROL_ENABLE;
-	mchp_corespi_write(spi, REG_CONTROL, control);
+	mchp_corespi_write(spi, REG_FRAMESUP, len);
 }
 
 static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi)
@@ -227,17 +226,22 @@ static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi)
 
 static inline void mchp_corespi_set_framesize(struct mchp_corespi *spi, int bt)
 {
+	u32 frame_size = mchp_corespi_read(spi, REG_FRAME_SIZE);
 	u32 control;
 
+	if ((frame_size & FRAME_SIZE_MASK) == bt)
+		return;
+
 	/*
 	 * Disable the SPI controller. Writes to the frame size have
 	 * no effect when the controller is enabled.
 	 */
-	mchp_corespi_disable(spi);
+	control = mchp_corespi_read(spi, REG_CONTROL);
+	control &= ~CONTROL_ENABLE;
+	mchp_corespi_write(spi, REG_CONTROL, control);
 
 	mchp_corespi_write(spi, REG_FRAME_SIZE, bt);
 
-	control = mchp_corespi_read(spi, REG_CONTROL);
 	control |= CONTROL_ENABLE;
 	mchp_corespi_write(spi, REG_CONTROL, control);
 }
@@ -337,8 +341,6 @@ static inline void mchp_corespi_set_clk_gen(struct mchp_corespi *spi)
 {
 	u32 control;
 
-	mchp_corespi_disable(spi);
-
 	control = mchp_corespi_read(spi, REG_CONTROL);
 	if (spi->clk_mode)
 		control |= CONTROL_CLKMODE;
@@ -347,12 +349,12 @@ static inline void mchp_corespi_set_clk_gen(struct mchp_corespi *spi)
 
 	mchp_corespi_write(spi, REG_CLK_GEN, spi->clk_gen);
 	mchp_corespi_write(spi, REG_CONTROL, control);
-	mchp_corespi_write(spi, REG_CONTROL, control | CONTROL_ENABLE);
 }
 
 static inline void mchp_corespi_set_mode(struct mchp_corespi *spi, unsigned int mode)
 {
-	u32 control, mode_val;
+	u32 mode_val;
+	u32 control = mchp_corespi_read(spi, REG_CONTROL);
 
 	switch (mode & SPI_MODE_X_MASK) {
 	case SPI_MODE_0:
@@ -370,12 +372,13 @@ static inline void mchp_corespi_set_mode(struct mchp_corespi *spi, unsigned int
 	}
 
 	/*
-	 * Disable the SPI controller. Writes to the frame size have
+	 * Disable the SPI controller. Writes to the frame protocol have
 	 * no effect when the controller is enabled.
 	 */
-	mchp_corespi_disable(spi);
 
-	control = mchp_corespi_read(spi, REG_CONTROL);
+	control &= ~CONTROL_ENABLE;
+	mchp_corespi_write(spi, REG_CONTROL, control);
+
 	control &= ~(SPI_MODE_X_MASK << MODE_X_MASK_SHIFT);
 	control |= mode_val;
 

From 3a5e76283672efddf47cea39ccfe9f5735cc91d5 Mon Sep 17 00:00:00 2001
From: Steve Wilkins <steve.wilkins@raymarine.com>
Date: Mon, 15 Jul 2024 12:13:55 +0100
Subject: [PATCH 095/346] spi: microchip-core: fix init function not setting
 the master and motorola modes

mchp_corespi_init() reads the CONTROL register, sets the master and
motorola bits, but doesn't write the value back to the register. The
function also doesn't ensure the controller is disabled at the start,
which may present a problem if the controller was used by an
earlier boot stage as some settings (including the mode) can only be
modified while the controller is disabled.

Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers")
Signed-off-by: Steve Wilkins <steve.wilkins@raymarine.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-designing-thus-05f7c26e1da7@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index c69d45e5bff2..aa72b9dd8956 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -295,17 +295,13 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi *
 	unsigned long clk_hz;
 	u32 control = mchp_corespi_read(spi, REG_CONTROL);
 
-	control |= CONTROL_MASTER;
+	control &= ~CONTROL_ENABLE;
+	mchp_corespi_write(spi, REG_CONTROL, control);
 
+	control |= CONTROL_MASTER;
 	control &= ~CONTROL_MODE_MASK;
 	control |= MOTOROLA_MODE;
 
-	mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE);
-
-	/* max. possible spi clock rate is the apb clock rate */
-	clk_hz = clk_get_rate(spi->clk);
-	host->max_speed_hz = clk_hz;
-
 	/*
 	 * The controller must be configured so that it doesn't remove Chip
 	 * Select until the entire message has been transferred, even if at
@@ -314,11 +310,16 @@ static void mchp_corespi_init(struct spi_controller *host, struct mchp_corespi *
 	 * BIGFIFO mode is also enabled, which sets the fifo depth to 32 frames
 	 * for the 8 bit transfers that this driver uses.
 	 */
-	control = mchp_corespi_read(spi, REG_CONTROL);
 	control |= CONTROL_SPS | CONTROL_BIGFIFO;
 
 	mchp_corespi_write(spi, REG_CONTROL, control);
 
+	mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE);
+
+	/* max. possible spi clock rate is the apb clock rate */
+	clk_hz = clk_get_rate(spi->clk);
+	host->max_speed_hz = clk_hz;
+
 	mchp_corespi_enable_ints(spi);
 
 	/*

From 9cf71eb0faef4bff01df4264841b8465382d7927 Mon Sep 17 00:00:00 2001
From: Steve Wilkins <steve.wilkins@raymarine.com>
Date: Mon, 15 Jul 2024 12:13:56 +0100
Subject: [PATCH 096/346] spi: microchip-core: ensure TX and RX FIFOs are empty
 at start of a transfer

While transmitting with rx_len == 0, the RX FIFO is not going to be
emptied in the interrupt handler. A subsequent transfer could then
read crap from the previous transfer out of the RX FIFO into the
start RX buffer. The core provides a register that will empty the RX and
TX FIFOs, so do that before each transfer.

Fixes: 9ac8d17694b6 ("spi: add support for microchip fpga spi controllers")
Signed-off-by: Steve Wilkins <steve.wilkins@raymarine.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-flammable-provoke-459226d08e70@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index aa72b9dd8956..057092506bd5 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -91,6 +91,8 @@
 #define REG_CONTROL2		(0x28)
 #define REG_COMMAND		(0x2c)
 #define  COMMAND_CLRFRAMECNT	BIT(4)
+#define  COMMAND_TXFIFORST		BIT(3)
+#define  COMMAND_RXFIFORST		BIT(2)
 #define REG_PKTSIZE		(0x30)
 #define REG_CMD_SIZE		(0x34)
 #define REG_HWSTATUS		(0x38)
@@ -496,6 +498,8 @@ static int mchp_corespi_transfer_one(struct spi_controller *host,
 	mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH)
 				   ? FIFO_DEPTH : spi->tx_len);
 
+	mchp_corespi_write(spi, REG_COMMAND, COMMAND_RXFIFORST | COMMAND_TXFIFORST);
+
 	mchp_corespi_write(spi, REG_SLAVE_SELECT, spi->pending_slave_select);
 
 	while (spi->tx_len)

From 87232ea8a5caf8d050f8ea7acd210a2cfcbe6309 Mon Sep 17 00:00:00 2001
From: Steve Wilkins <steve.wilkins@raymarine.com>
Date: Mon, 15 Jul 2024 12:13:57 +0100
Subject: [PATCH 097/346] spi: microchip-core: add support for word sizes of 1
 to 32 bits

The current implementation only supports a word size of 8 bits,
which limits the devices it can be used with. Add support for any
word size between 1 and 32 bits, as supported by the hardware.

Signed-off-by: Steve Wilkins <steve.wilkins@raymarine.com>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20240715-cogwheel-uniquely-0d4ef518b809@wendy
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-microchip-core.c | 53 +++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/drivers/spi/spi-microchip-core.c b/drivers/spi/spi-microchip-core.c
index 057092506bd5..7c1a9a985373 100644
--- a/drivers/spi/spi-microchip-core.c
+++ b/drivers/spi/spi-microchip-core.c
@@ -111,7 +111,7 @@ struct mchp_corespi {
 	int irq;
 	int tx_len;
 	int rx_len;
-	int pending;
+	int n_bytes;
 };
 
 static inline u32 mchp_corespi_read(struct mchp_corespi *spi, unsigned int reg)
@@ -135,20 +135,23 @@ static inline void mchp_corespi_disable(struct mchp_corespi *spi)
 
 static inline void mchp_corespi_read_fifo(struct mchp_corespi *spi)
 {
-	u8 data;
-	int fifo_max, i = 0;
+	while (spi->rx_len >= spi->n_bytes && !(mchp_corespi_read(spi, REG_STATUS) & STATUS_RXFIFO_EMPTY)) {
+		u32 data = mchp_corespi_read(spi, REG_RX_DATA);
 
-	fifo_max = min(spi->rx_len, FIFO_DEPTH);
+		spi->rx_len -= spi->n_bytes;
 
-	while ((i < fifo_max) && !(mchp_corespi_read(spi, REG_STATUS) & STATUS_RXFIFO_EMPTY)) {
-		data = mchp_corespi_read(spi, REG_RX_DATA);
+		if (!spi->rx_buf)
+			continue;
 
-		if (spi->rx_buf)
-			*spi->rx_buf++ = data;
-		i++;
+		if (spi->n_bytes == 4)
+			*((u32 *)spi->rx_buf) = data;
+		else if (spi->n_bytes == 2)
+			*((u16 *)spi->rx_buf) = data;
+		else
+			*spi->rx_buf = data;
+
+		spi->rx_buf += spi->n_bytes;
 	}
-	spi->rx_len -= i;
-	spi->pending -= i;
 }
 
 static void mchp_corespi_enable_ints(struct mchp_corespi *spi)
@@ -210,20 +213,28 @@ static inline void mchp_corespi_set_xfer_size(struct mchp_corespi *spi, int len)
 
 static inline void mchp_corespi_write_fifo(struct mchp_corespi *spi)
 {
-	u8 byte;
 	int fifo_max, i = 0;
 
-	fifo_max = min(spi->tx_len, FIFO_DEPTH);
+	fifo_max = DIV_ROUND_UP(min(spi->tx_len, FIFO_DEPTH), spi->n_bytes);
 	mchp_corespi_set_xfer_size(spi, fifo_max);
 
 	while ((i < fifo_max) && !(mchp_corespi_read(spi, REG_STATUS) & STATUS_TXFIFO_FULL)) {
-		byte = spi->tx_buf ? *spi->tx_buf++ : 0xaa;
-		mchp_corespi_write(spi, REG_TX_DATA, byte);
+		u32 word;
+
+		if (spi->n_bytes == 4)
+			word = spi->tx_buf ? *((u32 *)spi->tx_buf) : 0xaa;
+		else if (spi->n_bytes == 2)
+			word = spi->tx_buf ? *((u16 *)spi->tx_buf) : 0xaa;
+		else
+			word = spi->tx_buf ? *spi->tx_buf : 0xaa;
+
+		mchp_corespi_write(spi, REG_TX_DATA, word);
+		if (spi->tx_buf)
+			spi->tx_buf += spi->n_bytes;
 		i++;
 	}
 
-	spi->tx_len -= i;
-	spi->pending += i;
+	spi->tx_len -= i * spi->n_bytes;
 }
 
 static inline void mchp_corespi_set_framesize(struct mchp_corespi *spi, int bt)
@@ -493,10 +504,9 @@ static int mchp_corespi_transfer_one(struct spi_controller *host,
 	spi->rx_buf = xfer->rx_buf;
 	spi->tx_len = xfer->len;
 	spi->rx_len = xfer->len;
-	spi->pending = 0;
+	spi->n_bytes = roundup_pow_of_two(DIV_ROUND_UP(xfer->bits_per_word, BITS_PER_BYTE));
 
-	mchp_corespi_set_xfer_size(spi, (spi->tx_len > FIFO_DEPTH)
-				   ? FIFO_DEPTH : spi->tx_len);
+	mchp_corespi_set_framesize(spi, xfer->bits_per_word);
 
 	mchp_corespi_write(spi, REG_COMMAND, COMMAND_RXFIFORST | COMMAND_TXFIFORST);
 
@@ -514,7 +524,6 @@ static int mchp_corespi_prepare_message(struct spi_controller *host,
 	struct spi_device *spi_dev = msg->spi;
 	struct mchp_corespi *spi = spi_controller_get_devdata(host);
 
-	mchp_corespi_set_framesize(spi, DEFAULT_FRAMESIZE);
 	mchp_corespi_set_mode(spi, spi_dev->mode);
 
 	return 0;
@@ -542,7 +551,7 @@ static int mchp_corespi_probe(struct platform_device *pdev)
 	host->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH;
 	host->use_gpio_descriptors = true;
 	host->setup = mchp_corespi_setup;
-	host->bits_per_word_mask = SPI_BPW_MASK(8);
+	host->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
 	host->transfer_one = mchp_corespi_transfer_one;
 	host->prepare_message = mchp_corespi_prepare_message;
 	host->set_cs = mchp_corespi_set_cs;

From ab091ec536cb7b271983c0c063b17f62f3591583 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Mon, 15 Jul 2024 17:31:44 +0800
Subject: [PATCH 098/346] nvme/pci: Add APST quirk for Lenovo N60z laptop

There is a hardware power-saving problem with the Lenovo N60z
board. When turn it on and leave it for 10 hours, there is a
20% chance that a nvme disk will not wake up until reboot.

Link: https://lore.kernel.org/all/2B5581C46AC6E335+9c7a81f1-05fb-4fd0-9fbb-108757c21628@uniontech.com
Signed-off-by: hmy <huanglin@uniontech.com>
Signed-off-by: Wentao Guan <guanwentao@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 21f8e1b9801f..bc24a85e96ce 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2968,6 +2968,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 			return NVME_QUIRK_FORCE_NO_SIMPLE_SUSPEND;
 	}
 
+	/*
+	 * NVMe SSD drops off the PCIe bus after system idle
+	 * for 10 hours on a Lenovo N60z board.
+	 */
+	if (dmi_match(DMI_BOARD_NAME, "LXKT-ZXEG-N6"))
+		return NVME_QUIRK_NO_APST;
+
 	return 0;
 }
 

From 1a7812b25e69f6c63c52d3bc93c0970ab7ad9660 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sat, 13 Jul 2024 15:43:17 +0200
Subject: [PATCH 099/346] nvme-fabrics: Use seq_putc() in
 __nvmf_concat_opt_tokens()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Single characters should be put into a sequence.
Thus use the corresponding function “seq_putc”.

This issue was transformed by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/fabrics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 44e342a46f39..f5f545fa0103 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1403,10 +1403,10 @@ static void __nvmf_concat_opt_tokens(struct seq_file *seq_file)
 		tok = &opt_tokens[idx];
 		if (tok->token == NVMF_OPT_ERR)
 			continue;
-		seq_puts(seq_file, ",");
+		seq_putc(seq_file, ',');
 		seq_puts(seq_file, tok->pattern);
 	}
-	seq_puts(seq_file, "\n");
+	seq_putc(seq_file, '\n');
 }
 
 static int nvmf_dev_show(struct seq_file *seq_file, void *private)

From 88c918d1ee2c88cc5be3aa67ce048414fee471ff Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@nvidia.com>
Date: Thu, 11 Jul 2024 14:40:53 +0300
Subject: [PATCH 100/346] nvme: remove redundant bdev local variable

Use disk directly instead of getting it from bdev->bd_disk.

Signed-off-by: Israel Rukshin <israelr@nvidia.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/sysfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 3c55f7edd181..ba05faaac562 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -233,13 +233,12 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
 {
 	struct nvme_ns_head *head = dev_to_ns_head(dev);
 	struct gendisk *disk = dev_to_disk(dev);
-	struct block_device *bdev = disk->part0;
 	int ret;
 
-	if (nvme_disk_is_ns_head(bdev->bd_disk))
+	if (nvme_disk_is_ns_head(disk))
 		ret = ns_head_update_nuse(head);
 	else
-		ret = ns_update_nuse(bdev->bd_disk->private_data);
+		ret = ns_update_nuse(disk->private_data);
 	if (ret)
 		return ret;
 

From 92fc2c469eb26060384e9b2cd4cb0cc228aba582 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Thu, 11 Jul 2024 15:59:52 -0700
Subject: [PATCH 101/346] nvme-pci: Fix the instructions for disabling power
 management

pcie_aspm=off tells the kernel not to modify the ASPM configuration. This
setting does not guarantee that ASPM (Active State Power Management) is
disabled. Hence add pcie_port_pm=off. This disables power management for
all PCIe ports.

This patch has been tested on a workstation with a Samsung SSD 970 EVO Plus
NVMe SSD.

Fixes: 4641a8e6e145 ("nvme-pci: add trouble shooting steps for timeouts")
Cc: Keith Busch <kbusch@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index bc24a85e96ce..8087b054da66 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1309,7 +1309,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 	dev_warn(dev->ctrl.device,
 		 "Does your device have a faulty power saving mode enabled?\n");
 	dev_warn(dev->ctrl.device,
-		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off\" and report a bug\n");
+		 "Try \"nvme_core.default_ps_max_latency_us=0 pcie_aspm=off pcie_port_pm=off\" and report a bug\n");
 }
 
 static enum blk_eh_timer_return nvme_timeout(struct request *req)

From 28814be8823002eca06d857d4bce70eb4c6fccd3 Mon Sep 17 00:00:00 2001
From: Aurabindo Pillai <aurabindo.pillai@amd.com>
Date: Wed, 10 Jul 2024 20:20:47 +0000
Subject: [PATCH 102/346] drm/amd: Bump KMS_DRIVER_MINOR version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Increase the KMS minor version to indicate GFX12 DCC support since this
contains a major change in how DCC is managed across IPs like GFX, DCN
etc. This will be used mainly by userspace like Mesa to figure out
DCC support on GFX12 hardware.

v2: fix version number (Alex)

Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 78089f2f79f5..094498a0964b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -116,9 +116,10 @@
  * - 3.55.0 - Add AMDGPU_INFO_GPUVM_FAULT query
  * - 3.56.0 - Update IB start address and size alignment for decode and encode
  * - 3.57.0 - Compute tunneling on GFX10+
+ * - 3.58.0 - Add GFX12 DCC support
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	57
+#define KMS_DRIVER_MINOR	58
 #define KMS_DRIVER_PATCHLEVEL	0
 
 /*

From 7bbae44cf1bda02537a84cd8ad75bd81694acfc7 Mon Sep 17 00:00:00 2001
From: Aurabindo Pillai <aurabindo.pillai@amd.com>
Date: Mon, 15 Jul 2024 15:02:20 -0400
Subject: [PATCH 103/346] drm/amd/display: fix doc entry for bb_from_dmub

Fixes the warning:

Function parameter or struct member 'bb_from_dmub' not described in 'amdgpu_display_manager'

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 5fd1b6b44577..369159c29bbe 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -587,7 +587,9 @@ struct amdgpu_display_manager {
 	 */
 	struct mutex dpia_aux_lock;
 
-	/*
+	/**
+	 * @bb_from_dmub:
+	 *
 	 * Bounding box data read from dmub during early initialization for DCN4+
 	 */
 	struct dml2_soc_bb *bb_from_dmub;

From d13e2a6e95e6b87f571c837c71a3d05691def9bb Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Mon, 8 Jul 2024 22:00:24 +0300
Subject: [PATCH 104/346] drm/i915/dp: Reset intel_dp->link_trained before
 retraining the link
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Regularly retraining a link during an atomic commit happens with the
given pipe/link already disabled and hence intel_dp->link_trained being
false. Ensure this also for retraining a DP SST link via direct calls to
the link training functions (vs. an actual commit as for DP MST). So far
nothing depended on this, however the next patch will depend on
link_trained==false for changing the LTTPR mode to non-transparent.

Cc: <stable@vger.kernel.org> # v5.15+
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240708190029.271247-2-imre.deak@intel.com
(cherry picked from commit a4d5ce61765c08ab364aa4b327f6739b646e6cfa)
Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
---
 drivers/gpu/drm/i915/display/intel_dp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 3903f6ead6e6..59f11af3b0a1 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -5314,6 +5314,8 @@ static int intel_dp_retrain_link(struct intel_encoder *encoder,
 		const struct intel_crtc_state *crtc_state =
 			to_intel_crtc_state(crtc->base.state);
 
+		intel_dp->link_trained = false;
+
 		intel_dp_check_frl_training(intel_dp);
 		intel_dp_pcon_dsc_configure(intel_dp, crtc_state);
 		intel_dp_start_link_train(NULL, intel_dp, crtc_state);

From 509580fad7323b6a5da27e8365cd488f3b57210e Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Mon, 8 Jul 2024 22:00:25 +0300
Subject: [PATCH 105/346] drm/i915/dp: Don't switch the LTTPR mode on an active
 link
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switching to transparent mode leads to a loss of link synchronization,
so prevent doing this on an active link. This happened at least on an
Intel N100 system / DELL UD22 dock, the LTTPR residing either on the
host or the dock. To fix the issue, keep the current mode on an active
link, adjusting the LTTPR count accordingly (resetting it to 0 in
transparent mode).

v2: Adjust code comment during link training about reiniting the LTTPRs.
   (Ville)

Fixes: 7b2a4ab8b0ef ("drm/i915: Switch to LTTPR transparent mode link training")
Reported-and-tested-by: Gareth Yu <gareth.yu@intel.com>
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/10902
Cc: <stable@vger.kernel.org> # v5.15+
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240708190029.271247-3-imre.deak@intel.com
(cherry picked from commit 211ad49cf8ccfdc798a719b4d1e000d0a8a9e588)
Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
---
 .../drm/i915/display/intel_dp_link_training.c | 55 ++++++++++++++++---
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dp_link_training.c b/drivers/gpu/drm/i915/display/intel_dp_link_training.c
index 1bc4ef84ff3b..d044c8e36bb3 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_link_training.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_link_training.c
@@ -117,10 +117,24 @@ intel_dp_set_lttpr_transparent_mode(struct intel_dp *intel_dp, bool enable)
 	return drm_dp_dpcd_write(&intel_dp->aux, DP_PHY_REPEATER_MODE, &val, 1) == 1;
 }
 
-static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+static bool intel_dp_lttpr_transparent_mode_enabled(struct intel_dp *intel_dp)
+{
+	return intel_dp->lttpr_common_caps[DP_PHY_REPEATER_MODE -
+					   DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV] ==
+		DP_PHY_REPEATER_MODE_TRANSPARENT;
+}
+
+/*
+ * Read the LTTPR common capabilities and switch the LTTPR PHYs to
+ * non-transparent mode if this is supported. Preserve the
+ * transparent/non-transparent mode on an active link.
+ *
+ * Return the number of detected LTTPRs in non-transparent mode or 0 if the
+ * LTTPRs are in transparent mode or the detection failed.
+ */
+static int intel_dp_init_lttpr_phys(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 {
 	int lttpr_count;
-	int i;
 
 	if (!intel_dp_read_lttpr_common_caps(intel_dp, dpcd))
 		return 0;
@@ -134,6 +148,19 @@ static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEI
 	if (lttpr_count == 0)
 		return 0;
 
+	/*
+	 * Don't change the mode on an active link, to prevent a loss of link
+	 * synchronization. See DP Standard v2.0 3.6.7. about the LTTPR
+	 * resetting its internal state when the mode is changed from
+	 * non-transparent to transparent.
+	 */
+	if (intel_dp->link_trained) {
+		if (lttpr_count < 0 || intel_dp_lttpr_transparent_mode_enabled(intel_dp))
+			goto out_reset_lttpr_count;
+
+		return lttpr_count;
+	}
+
 	/*
 	 * See DP Standard v2.0 3.6.6.1. about the explicit disabling of
 	 * non-transparent mode and the disable->enable non-transparent mode
@@ -154,11 +181,25 @@ static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEI
 		       "Switching to LTTPR non-transparent LT mode failed, fall-back to transparent mode\n");
 
 		intel_dp_set_lttpr_transparent_mode(intel_dp, true);
-		intel_dp_reset_lttpr_count(intel_dp);
 
-		return 0;
+		goto out_reset_lttpr_count;
 	}
 
+	return lttpr_count;
+
+out_reset_lttpr_count:
+	intel_dp_reset_lttpr_count(intel_dp);
+
+	return 0;
+}
+
+static int intel_dp_init_lttpr(struct intel_dp *intel_dp, const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+{
+	int lttpr_count;
+	int i;
+
+	lttpr_count = intel_dp_init_lttpr_phys(intel_dp, dpcd);
+
 	for (i = 0; i < lttpr_count; i++)
 		intel_dp_read_lttpr_phy_caps(intel_dp, dpcd, DP_PHY_LTTPR(i));
 
@@ -1482,10 +1523,10 @@ void intel_dp_start_link_train(struct intel_atomic_state *state,
 	struct intel_digital_port *dig_port = dp_to_dig_port(intel_dp);
 	struct intel_encoder *encoder = &dig_port->base;
 	bool passed;
-
 	/*
-	 * TODO: Reiniting LTTPRs here won't be needed once proper connector
-	 * HW state readout is added.
+	 * Reinit the LTTPRs here to ensure that they are switched to
+	 * non-transparent mode. During an earlier LTTPR detection this
+	 * could've been prevented by an active link.
 	 */
 	int lttpr_count = intel_dp_init_lttpr_and_dprx_caps(intel_dp);
 

From 9ee3f0d8c9999eb1ef2866e86f8d57d996fc0348 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 16 Jul 2024 10:45:30 +0200
Subject: [PATCH 106/346] ASOC: SOF: Intel: hda-loader: only wait for HDaudio
 IOC for IPC4 devices

Multiple users report a regression bisected to commit d5263dbbd8af
("ASoC: SOF: Intel: don't ignore IOC interrupts for non-audio
transfers"). The firmware version is the likely suspect, as these
users relied on SOF 2.0 while Intel only tested with the 2.2 release.

Rather than completely disable the wait_for_completion(), which can
help us gather timing information on the different stages of the boot
process, the simplest course of action is to just disable it for older
IPC versions which are no longer under active development.

Closes: https://github.com/thesofproject/linux/issues/5072
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218961
Fixes: d5263dbbd8af ("ASoC: SOF: Intel: don't ignore IOC interrupts for non-audio transfers")
Tested-by: Mike Krinkin <krinkin.m.u@gmail.com>
Tested-by: Todd Brandt <todd.e.brandt@linux.intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://patch.msgid.link/20240716084530.300829-1-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sof/intel/hda-loader.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/sound/soc/sof/intel/hda-loader.c b/sound/soc/sof/intel/hda-loader.c
index b8b914eaf7e0..75f6240cf3e1 100644
--- a/sound/soc/sof/intel/hda-loader.c
+++ b/sound/soc/sof/intel/hda-loader.c
@@ -310,15 +310,19 @@ int hda_cl_copy_fw(struct snd_sof_dev *sdev, struct hdac_ext_stream *hext_stream
 		return ret;
 	}
 
-	/* Wait for completion of transfer */
-	time_left = wait_for_completion_timeout(&hda_stream->ioc,
-						msecs_to_jiffies(HDA_CL_DMA_IOC_TIMEOUT_MS));
+	if (sdev->pdata->ipc_type == SOF_IPC_TYPE_4) {
+		/* Wait for completion of transfer */
+		time_left = wait_for_completion_timeout(&hda_stream->ioc,
+							msecs_to_jiffies(HDA_CL_DMA_IOC_TIMEOUT_MS));
 
-	if (!time_left) {
-		dev_err(sdev->dev, "Code loader DMA did not complete\n");
-		return -ETIMEDOUT;
+		if (!time_left) {
+			dev_err(sdev->dev, "Code loader DMA did not complete\n");
+			return -ETIMEDOUT;
+		}
+		dev_dbg(sdev->dev, "Code loader DMA done\n");
 	}
-	dev_dbg(sdev->dev, "Code loader DMA done, waiting for FW_ENTERED status\n");
+
+	dev_dbg(sdev->dev, "waiting for FW_ENTERED status\n");
 
 	status = snd_sof_dsp_read_poll_timeout(sdev, HDA_DSP_BAR,
 					chip->rom_status_reg, reg,

From 6f6a23d42bdfbbfe1b41a23e2c78319a0cc65db3 Mon Sep 17 00:00:00 2001
From: Curtis Malainey <cujomalainey@chromium.org>
Date: Tue, 16 Jul 2024 10:40:12 +0200
Subject: [PATCH 107/346] ASoC: Intel: Fix RT5650 SSP lookup

Commit 8efcd4864652 ("ASoC: Intel: sof_rt5682: use common module for
sof_card_private initialization") migrated the pin assignment in the
context struct up to soc-acpi-intel-ssp-common.c. This uses a lookup
table to see if a device has a amp/codec before assigning the pin. The
issue here arises when combination parts that serve both (with 2 ports)
are used.

sysfs: cannot create duplicate filename '/devices/pci0000:00/0000:00:1f.3/adl_rt5682_def/SSP0-Codec'
CPU: 1 PID: 2079 Comm: udevd Tainted: G     U             6.6.36-03391-g744739e00023 #1 3be1a2880a0970f65545a957db7d08ef4b3e2c0d
Hardware name: Google Anraggar/Anraggar, BIOS Google_Anraggar.15217.552.0 05/07/2024
Call Trace:
 <TASK>
 dump_stack_lvl+0x69/0xa0
 sysfs_warn_dup+0x5b/0x70
 sysfs_create_dir_ns+0xb0/0x100
 kobject_add_internal+0x133/0x3c0
 kobject_add+0x66/0xb0
 ? device_add+0x65/0x780
 device_add+0x164/0x780
 snd_soc_add_pcm_runtimes+0x2fa/0x800
 snd_soc_bind_card+0x35e/0xc20
 devm_snd_soc_register_card+0x48/0x90
 platform_probe+0x7b/0xb0
 really_probe+0xf7/0x2a0
 ...
kobject: kobject_add_internal failed for SSP0-Codec with -EEXIST, don't try to register things with the same name in the same directory.

The issue is that the ALC5650 was only defined in the codec table and
not the amp table which left the pin unassigned but the dai link was
still created by the machine driver.

Also patch the suffix filename code for the topology to prevent double
suffix names as a result of this change.

Fixes: 8efcd4864652 ("ASoC: Intel: sof_rt5682: use common module for sof_card_private initialization")
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Curtis Malainey <cujomalainey@chromium.org>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://patch.msgid.link/20240716084012.299257-1-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../intel/common/soc-acpi-intel-ssp-common.c    |  9 +++++++++
 sound/soc/sof/intel/hda.c                       | 17 +++++++++--------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
index 75d0b931d895..de7a3f7f47f1 100644
--- a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
+++ b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
@@ -64,6 +64,15 @@ static const struct codec_map amps[] = {
 	CODEC_MAP_ENTRY("RT1015P", "rt1015", RT1015P_ACPI_HID, CODEC_RT1015P),
 	CODEC_MAP_ENTRY("RT1019P", "rt1019", RT1019P_ACPI_HID, CODEC_RT1019P),
 	CODEC_MAP_ENTRY("RT1308", "rt1308", RT1308_ACPI_HID, CODEC_RT1308),
+
+	/*
+	 * Monolithic components
+	 *
+	 * Only put components that can serve as both the amp and the codec below this line.
+	 * This will ensure that if the part is used just as a codec and there is an amp as well
+	 * then the amp will be selected properly.
+	 */
+	CODEC_MAP_ENTRY("RT5650", "rt5650", RT5650_ACPI_HID, CODEC_RT5650),
 };
 
 enum snd_soc_acpi_intel_codec
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index daf364f773dd..5a40b8fbbbd3 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -1307,9 +1307,10 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 	const struct sof_dev_desc *desc = sof_pdata->desc;
 	struct hdac_bus *bus = sof_to_bus(sdev);
 	struct snd_soc_acpi_mach *mach = NULL;
-	enum snd_soc_acpi_intel_codec codec_type;
+	enum snd_soc_acpi_intel_codec codec_type, amp_type;
 	const char *tplg_filename;
 	const char *tplg_suffix;
+	bool amp_name_valid;
 
 	/* Try I2S or DMIC if it is supported */
 	if (interface_mask & (BIT(SOF_DAI_INTEL_SSP) | BIT(SOF_DAI_INTEL_DMIC)))
@@ -1413,15 +1414,16 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 			}
 		}
 
-		codec_type = snd_soc_acpi_intel_detect_amp_type(sdev->dev);
+		amp_type = snd_soc_acpi_intel_detect_amp_type(sdev->dev);
+		codec_type = snd_soc_acpi_intel_detect_codec_type(sdev->dev);
+		amp_name_valid = amp_type != CODEC_NONE && amp_type != codec_type;
 
-		if (tplg_fixup &&
-		    mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_AMP_NAME &&
-		    codec_type != CODEC_NONE) {
-			tplg_suffix = snd_soc_acpi_intel_get_amp_tplg_suffix(codec_type);
+		if (tplg_fixup && amp_name_valid &&
+		    mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_AMP_NAME) {
+			tplg_suffix = snd_soc_acpi_intel_get_amp_tplg_suffix(amp_type);
 			if (!tplg_suffix) {
 				dev_err(sdev->dev, "no tplg suffix found, amp %d\n",
-					codec_type);
+					amp_type);
 				return NULL;
 			}
 
@@ -1436,7 +1438,6 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 			add_extension = true;
 		}
 
-		codec_type = snd_soc_acpi_intel_detect_codec_type(sdev->dev);
 
 		if (tplg_fixup &&
 		    mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_CODEC_NAME &&

From 415fb383ec2bbe4d4bb1a1b5593cd67eb058f1de Mon Sep 17 00:00:00 2001
From: Francis Pravin <francis.p@samsung.com>
Date: Mon, 15 Jul 2024 06:31:57 +0530
Subject: [PATCH 108/346] nvme-core: choose PIF from QPIF if QPIFS supports and
 PIF is QTYPE

As per TP4141a:
"If the Qualified Protection Information Format Support(QPIFS) bit is
set to 1 and the Protection Information Format(PIF) field is set to 11b
(i.e., Qualified Type), then the pif is as defined in the Qualified
Protection Information Format (QPIF) field."
So, choose PIF from QPIF if QPIFS supports and PIF is QTYPE.

Signed-off-by: Francis Pravin <francis.p@samsung.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 8 +++++++-
 include/linux/nvme.h     | 9 +++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 448b8239bc99..e78ef31eeef0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1875,12 +1875,18 @@ static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
 		struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
 {
 	u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
+	u8 guard_type;
 
 	/* no support for storage tag formats right now */
 	if (nvme_elbaf_sts(elbaf))
 		return;
 
-	head->guard_type = nvme_elbaf_guard_type(elbaf);
+	guard_type = nvme_elbaf_guard_type(elbaf);
+	if ((nvm->pic & NVME_ID_NS_NVM_QPIFS) &&
+	     guard_type == NVME_NVM_NS_QTYPE_GUARD)
+		guard_type = nvme_elbaf_qualified_guard_type(elbaf);
+
+	head->guard_type = guard_type;
 	switch (head->guard_type) {
 	case NVME_NVM_NS_64B_GUARD:
 		head->pi_size = sizeof(struct crc64_pi_tuple);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 57e27e48c913..5c2290f0d5af 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -483,6 +483,9 @@ enum {
 	NVME_ID_NS_NVM_STS_MASK		= 0x7f,
 	NVME_ID_NS_NVM_GUARD_SHIFT	= 7,
 	NVME_ID_NS_NVM_GUARD_MASK	= 0x3,
+	NVME_ID_NS_NVM_QPIF_SHIFT	= 9,
+	NVME_ID_NS_NVM_QPIF_MASK	= 0xf,
+	NVME_ID_NS_NVM_QPIFS		= 1 << 3,
 };
 
 static inline __u8 nvme_elbaf_sts(__u32 elbaf)
@@ -495,6 +498,11 @@ static inline __u8 nvme_elbaf_guard_type(__u32 elbaf)
 	return (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) & NVME_ID_NS_NVM_GUARD_MASK;
 }
 
+static inline __u8 nvme_elbaf_qualified_guard_type(__u32 elbaf)
+{
+	return (elbaf >> NVME_ID_NS_NVM_QPIF_SHIFT) & NVME_ID_NS_NVM_QPIF_MASK;
+}
+
 struct nvme_id_ctrl_nvm {
 	__u8	vsl;
 	__u8	wzsl;
@@ -574,6 +582,7 @@ enum {
 	NVME_NVM_NS_16B_GUARD	= 0,
 	NVME_NVM_NS_32B_GUARD	= 1,
 	NVME_NVM_NS_64B_GUARD	= 2,
+	NVME_NVM_NS_QTYPE_GUARD	= 3,
 };
 
 static inline __u8 nvme_lbaf_index(__u8 flbas)

From ecfa23c8df7ef3ea2a429dfe039341bf792e95b4 Mon Sep 17 00:00:00 2001
From: Boyuan Zhang <boyuan.zhang@amd.com>
Date: Thu, 11 Jul 2024 16:19:54 -0400
Subject: [PATCH 109/346] drm/amdgpu/vcn: identify unified queue in sw init

Determine whether VCN using unified queue in sw_init, instead of calling
functions later on.

v2: fix coding style

Signed-off-by: Boyuan Zhang <boyuan.zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Ruijing Dong <ruijing.dong@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 39 ++++++++++---------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h |  1 +
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 8d65b096db90..69b2c2503d55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -147,6 +147,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
 		}
 	}
 
+	/* from vcn4 and above, only unified queue is used */
+	adev->vcn.using_unified_queue =
+		amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0);
+
 	hdr = (const struct common_firmware_header *)adev->vcn.fw[0]->data;
 	adev->vcn.fw_version = le32_to_cpu(hdr->ucode_version);
 
@@ -275,18 +279,6 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev)
 	return 0;
 }
 
-/* from vcn4 and above, only unified queue is used */
-static bool amdgpu_vcn_using_unified_queue(struct amdgpu_ring *ring)
-{
-	struct amdgpu_device *adev = ring->adev;
-	bool ret = false;
-
-	if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0))
-		ret = true;
-
-	return ret;
-}
-
 bool amdgpu_vcn_is_disabled_vcn(struct amdgpu_device *adev, enum vcn_ring_type type, uint32_t vcn_instance)
 {
 	bool ret = false;
@@ -724,12 +716,11 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
 	uint64_t addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr);
-	bool sq = amdgpu_vcn_using_unified_queue(ring);
 	uint32_t *ib_checksum;
 	uint32_t ib_pack_in_dw;
 	int i, r;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		ib_size_dw += 8;
 
 	r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL,
@@ -742,7 +733,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
 	ib->length_dw = 0;
 
 	/* single queue headers */
-	if (sq) {
+	if (adev->vcn.using_unified_queue) {
 		ib_pack_in_dw = sizeof(struct amdgpu_vcn_decode_buffer) / sizeof(uint32_t)
 						+ 4 + 2; /* engine info + decoding ib in dw */
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, ib_pack_in_dw, false);
@@ -761,7 +752,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring,
 	for (i = ib->length_dw; i < ib_size_dw; ++i)
 		ib->ptr[i] = 0x0;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, ib_pack_in_dw);
 
 	r = amdgpu_job_submit_direct(job, ring, &f);
@@ -851,15 +842,15 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand
 					 struct dma_fence **fence)
 {
 	unsigned int ib_size_dw = 16;
+	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
 	struct dma_fence *f = NULL;
 	uint32_t *ib_checksum = NULL;
 	uint64_t addr;
-	bool sq = amdgpu_vcn_using_unified_queue(ring);
 	int i, r;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		ib_size_dw += 8;
 
 	r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL,
@@ -873,7 +864,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand
 
 	ib->length_dw = 0;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true);
 
 	ib->ptr[ib->length_dw++] = 0x00000018;
@@ -895,7 +886,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand
 	for (i = ib->length_dw; i < ib_size_dw; ++i)
 		ib->ptr[i] = 0x0;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11);
 
 	r = amdgpu_job_submit_direct(job, ring, &f);
@@ -918,15 +909,15 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han
 					  struct dma_fence **fence)
 {
 	unsigned int ib_size_dw = 16;
+	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_job *job;
 	struct amdgpu_ib *ib;
 	struct dma_fence *f = NULL;
 	uint32_t *ib_checksum = NULL;
 	uint64_t addr;
-	bool sq = amdgpu_vcn_using_unified_queue(ring);
 	int i, r;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		ib_size_dw += 8;
 
 	r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL,
@@ -940,7 +931,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han
 
 	ib->length_dw = 0;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true);
 
 	ib->ptr[ib->length_dw++] = 0x00000018;
@@ -962,7 +953,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han
 	for (i = ib->length_dw; i < ib_size_dw; ++i)
 		ib->ptr[i] = 0x0;
 
-	if (sq)
+	if (adev->vcn.using_unified_queue)
 		amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11);
 
 	r = amdgpu_job_submit_direct(job, ring, &f);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
index 9f06def236fd..1a5439abd1a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h
@@ -329,6 +329,7 @@ struct amdgpu_vcn {
 
 	uint16_t inst_mask;
 	uint8_t	num_inst_per_aid;
+	bool using_unified_queue;
 };
 
 struct amdgpu_fw_shared_rb_ptrs_struct {

From 7d75ef3736a025db441be652c8cc8e84044a215f Mon Sep 17 00:00:00 2001
From: Boyuan Zhang <boyuan.zhang@amd.com>
Date: Wed, 10 Jul 2024 16:17:12 -0400
Subject: [PATCH 110/346] drm/amdgpu/vcn: not pause dpg for unified queue

For unified queue, DPG pause for encoding is done inside VCN firmware,
so there is no need to pause dpg based on ring type in kernel.

For VCN3 and below, pausing DPG for encoding in kernel is still needed.

v2: add more comments
v3: update commit message

Signed-off-by: Boyuan Zhang <boyuan.zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Ruijing Dong <ruijing.dong@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 69b2c2503d55..43f44cc201cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -389,7 +389,9 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work)
 		for (i = 0; i < adev->vcn.num_enc_rings; ++i)
 			fence[j] += amdgpu_fence_count_emitted(&adev->vcn.inst[j].ring_enc[i]);
 
-		if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)	{
+		/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
+		if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
+		    !adev->vcn.using_unified_queue) {
 			struct dpg_pause_state new_state;
 
 			if (fence[j] ||
@@ -435,7 +437,9 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 	       AMD_PG_STATE_UNGATE);
 
-	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG)	{
+	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
+	if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
+	    !adev->vcn.using_unified_queue) {
 		struct dpg_pause_state new_state;
 
 		if (ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC) {
@@ -461,8 +465,12 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring)
 
 void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring)
 {
+	struct amdgpu_device *adev = ring->adev;
+
+	/* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */
 	if (ring->adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG &&
-		ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC)
+	    ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC &&
+	    !adev->vcn.using_unified_queue)
 		atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt);
 
 	atomic_dec(&ring->adev->vcn.total_submission_cnt);

From 6e169c7e0f842c48c7bf683fb789dbf5a8b1dfd8 Mon Sep 17 00:00:00 2001
From: Alex Hung <alex.hung@amd.com>
Date: Mon, 15 Jul 2024 13:53:17 -0600
Subject: [PATCH 111/346] drm/amd/display: Add doc entry for program_3dlut_size

Fixes the warning:

Function parameter or struct member 'program_3dlut_size' not described in
'mpc_funcs'

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/dri-devel/20240715090445.7e9387ec@canb.auug.org.au/
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Signed-off-by: Alex Hung <alex.hung@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
index 40a9b3471208..3a89cc0cffc1 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
@@ -1039,6 +1039,20 @@ struct mpc_funcs {
 	*/
 	void (*program_lut_mode)(struct mpc *mpc, const enum MCM_LUT_ID id, const enum MCM_LUT_XABLE xable,
 			bool lut_bank_a, int mpcc_id);
+	/**
+	* @program_3dlut_size:
+	*
+	* Program 3D LUT size.
+	*
+	* Parameters:
+	* - [in/out] mpc - MPC context.
+	* - [in] is_17x17x17 - is 3dlut 17x17x17
+	* - [in] mpcc_id
+	*
+	* Return:
+	*
+	* void
+	*/
 	void (*program_3dlut_size)(struct mpc *mpc, bool is_17x17x17, int mpcc_id);
 };
 

From 0e2c796b49735ee141fbff355b9d02e0189c3c65 Mon Sep 17 00:00:00 2001
From: Roman Li <Roman.Li@amd.com>
Date: Mon, 15 Jul 2024 16:45:46 -0400
Subject: [PATCH 112/346] drm/amd/display: Add function banner for
 idle_workqueue

[Why]
htmldocs warning:
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h: warning:
Function parameter or struct member 'idle_workqueue' not described in
'amdgpu_display_manager'.

[How]
Add comment section for idle_workqueue with param description.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/dri-devel/20240715090211.736a9b4d@canb.auug.org.au/
Signed-off-by: Roman Li <Roman.Li@amd.com>
Reviewed-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 369159c29bbe..2d7755e2b6c3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -137,6 +137,13 @@ struct vblank_control_work {
 	bool enable;
 };
 
+/**
+ * struct idle_workqueue - Work data for periodic action in idle
+ * @work: Kernel work data for the work event
+ * @dm: amdgpu display manager device
+ * @enable: true if idle worker is enabled
+ * @running: true if idle worker is running
+ */
 struct idle_workqueue {
 	struct work_struct work;
 	struct amdgpu_display_manager *dm;
@@ -502,6 +509,12 @@ struct amdgpu_display_manager {
 	 * Deferred work for vblank control events.
 	 */
 	struct workqueue_struct *vblank_control_workqueue;
+
+	/**
+	 * @idle_workqueue:
+	 *
+	 * Periodic work for idle events.
+	 */
 	struct idle_workqueue *idle_workqueue;
 
 	struct drm_atomic_state *cached_state;

From b3fb79cda5688a44a423c27b791f5456d801e49c Mon Sep 17 00:00:00 2001
From: YiPeng Chai <YiPeng.Chai@amd.com>
Date: Fri, 12 Apr 2024 13:46:03 +0800
Subject: [PATCH 113/346] drm/amdgpu: add mutex to protect ras shared memory

Add mutex to protect ras shared memory.

v2:
  Add TA_RAS_COMMAND__TRIGGER_ERROR command call
  status check.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 123 ++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
 3 files changed, 86 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 800cc7a148b2..7cdff355cedb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1591,6 +1591,68 @@ static void psp_ras_ta_check_status(struct psp_context *psp)
 	}
 }
 
+static int psp_ras_send_cmd(struct psp_context *psp,
+		enum ras_command cmd_id, void *in, void *out)
+{
+	struct ta_ras_shared_memory *ras_cmd;
+	uint32_t cmd = cmd_id;
+	int ret = 0;
+
+	if (!in)
+		return -EINVAL;
+
+	mutex_lock(&psp->ras_context.mutex);
+	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
+	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+	switch (cmd) {
+	case TA_RAS_COMMAND__ENABLE_FEATURES:
+	case TA_RAS_COMMAND__DISABLE_FEATURES:
+		memcpy(&ras_cmd->ras_in_message,
+			in, sizeof(ras_cmd->ras_in_message));
+		break;
+	case TA_RAS_COMMAND__TRIGGER_ERROR:
+		memcpy(&ras_cmd->ras_in_message.trigger_error,
+			in, sizeof(ras_cmd->ras_in_message.trigger_error));
+		break;
+	case TA_RAS_COMMAND__QUERY_ADDRESS:
+		memcpy(&ras_cmd->ras_in_message.address,
+			in, sizeof(ras_cmd->ras_in_message.address));
+		break;
+	default:
+		dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	ras_cmd->cmd_id = cmd;
+	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+
+	switch (cmd) {
+	case TA_RAS_COMMAND__TRIGGER_ERROR:
+		if (ret || psp->cmd_buf_mem->resp.status)
+			ret = -EINVAL;
+		else if (out)
+			memcpy(out, &ras_cmd->ras_status, sizeof(ras_cmd->ras_status));
+		break;
+	case TA_RAS_COMMAND__QUERY_ADDRESS:
+		if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+			ret = -EINVAL;
+		else if (out)
+			memcpy(out,
+				&ras_cmd->ras_out_message.address,
+				sizeof(ras_cmd->ras_out_message.address));
+		break;
+	default:
+		break;
+	}
+
+err_out:
+	mutex_unlock(&psp->ras_context.mutex);
+
+	return ret;
+}
+
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 {
 	struct ta_ras_shared_memory *ras_cmd;
@@ -1632,23 +1694,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 int psp_ras_enable_features(struct psp_context *psp,
 		union ta_ras_cmd_input *info, bool enable)
 {
-	struct ta_ras_shared_memory *ras_cmd;
+	enum ras_command cmd_id;
 	int ret;
 
-	if (!psp->ras_context.context.initialized)
+	if (!psp->ras_context.context.initialized || !info)
 		return -EINVAL;
 
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-	if (enable)
-		ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
-	else
-		ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
-
-	ras_cmd->ras_in_message = *info;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+	cmd_id = enable ?
+		TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES;
+	ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
 	if (ret)
 		return -EINVAL;
 
@@ -1672,6 +1726,8 @@ int psp_ras_terminate(struct psp_context *psp)
 
 	psp->ras_context.context.initialized = false;
 
+	mutex_destroy(&psp->ras_context.mutex);
+
 	return ret;
 }
 
@@ -1756,9 +1812,10 @@ int psp_ras_initialize(struct psp_context *psp)
 
 	ret = psp_ta_load(psp, &psp->ras_context.context);
 
-	if (!ret && !ras_cmd->ras_status)
+	if (!ret && !ras_cmd->ras_status) {
 		psp->ras_context.context.initialized = true;
-	else {
+		mutex_init(&psp->ras_context.mutex);
+	} else {
 		if (ras_cmd->ras_status)
 			dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status);
 
@@ -1772,12 +1829,12 @@ int psp_ras_initialize(struct psp_context *psp)
 int psp_ras_trigger_error(struct psp_context *psp,
 			  struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
 {
-	struct ta_ras_shared_memory *ras_cmd;
 	struct amdgpu_device *adev = psp->adev;
 	int ret;
 	uint32_t dev_mask;
+	uint32_t ras_status = 0;
 
-	if (!psp->ras_context.context.initialized)
+	if (!psp->ras_context.context.initialized || !info)
 		return -EINVAL;
 
 	switch (info->block_id) {
@@ -1801,13 +1858,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
 	dev_mask &= AMDGPU_RAS_INST_MASK;
 	info->sub_block_index |= dev_mask;
 
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-	ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
-	ras_cmd->ras_in_message.trigger_error = *info;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+	ret = psp_ras_send_cmd(psp,
+			TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
 	if (ret)
 		return -EINVAL;
 
@@ -1817,9 +1869,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
 	if (amdgpu_ras_intr_triggered())
 		return 0;
 
-	if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
+	if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
 		return -EACCES;
-	else if (ras_cmd->ras_status)
+	else if (ras_status)
 		return -EINVAL;
 
 	return 0;
@@ -1829,25 +1881,16 @@ int psp_ras_query_address(struct psp_context *psp,
 			  struct ta_ras_query_address_input *addr_in,
 			  struct ta_ras_query_address_output *addr_out)
 {
-	struct ta_ras_shared_memory *ras_cmd;
 	int ret;
 
-	if (!psp->ras_context.context.initialized)
+	if (!psp->ras_context.context.initialized ||
+		!addr_in || !addr_out)
 		return -EINVAL;
 
-	ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
-	memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+	ret = psp_ras_send_cmd(psp,
+			TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);
 
-	ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
-	ras_cmd->ras_in_message.address = *addr_in;
-
-	ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
-	if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
-		return -EINVAL;
-
-	*addr_out = ras_cmd->ras_out_message.address;
-
-	return 0;
+	return ret;
 }
 // ras end
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 3635303e6548..74a96516c913 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -200,6 +200,7 @@ struct psp_xgmi_context {
 struct psp_ras_context {
 	struct ta_context		context;
 	struct amdgpu_ras		*ras;
+	struct mutex			mutex;
 };
 
 #define MEM_TRAIN_SYSTEM_SIGNATURE		0x54534942
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 8e8afbd237bc..0c856005df6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
 
 	context->session_id = ta_id;
 
+	mutex_lock(&psp->ras_context.mutex);
 	ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len);
 	if (ret)
 		goto err_free_shared_buf;
@@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size
 		ret = -EFAULT;
 
 err_free_shared_buf:
+	mutex_unlock(&psp->ras_context.mutex);
 	kfree(shared_buf);
 
 	return ret;

From 2fdc99b96ea86c178eb14bc948dac93feffc8936 Mon Sep 17 00:00:00 2001
From: Tim Huang <tim.huang@amd.com>
Date: Fri, 12 Jul 2024 11:05:07 +0800
Subject: [PATCH 114/346] drm/amd/pm: early return if disabling DPMS for GFX IP
 v11.5.2

This was intended to add support for GFX IP v11.5.2, but it needs
to be applied to all GFX11 and subsequent APUs. Therefore the code
should be revised to accommodate this.

Signed-off-by: Tim Huang <tim.huang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index fb8643d25d1b..9d7454b3c314 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1924,20 +1924,12 @@ static int smu_disable_dpms(struct smu_context *smu)
 	}
 
 	/*
-	 * For SMU 13.0.4/11 and 14.0.0, PMFW will handle the features disablement properly
+	 * For GFX11 and subsequent APUs, PMFW will handle the features disablement properly
 	 * for gpu reset and S0i3 cases. Driver involvement is unnecessary.
 	 */
-	if (amdgpu_in_reset(adev) || adev->in_s0ix) {
-		switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
-		case IP_VERSION(13, 0, 4):
-		case IP_VERSION(13, 0, 11):
-		case IP_VERSION(14, 0, 0):
-		case IP_VERSION(14, 0, 1):
-			return 0;
-		default:
-			break;
-		}
-	}
+	if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) >= 11 &&
+	    smu->is_apu && (amdgpu_in_reset(adev) || adev->in_s0ix))
+		return 0;
 
 	/*
 	 * For gpu reset, runpm and hibernation through BACO,

From 5ae8fb971201d281cc594dba58787406e46c696a Mon Sep 17 00:00:00 2001
From: Li Ma <li.ma@amd.com>
Date: Wed, 10 Jul 2024 17:29:59 +0800
Subject: [PATCH 115/346] drm/amd/swsmu: enable Pstates profile levels for SMU
 v14.0.4

Enables following UMD stable Pstates profile levels
of power_dpm_force_performance_level for SMU v14.0.4.

    - profile_peak
    - profile_min_mclk
    - profile_min_sclk
    - profile_standard

Signed-off-by: Li Ma <li.ma@amd.com>
Reviewed-by: Tim Huang <tim.huang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c   | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
index 5d47d58944f6..8798ebfcea83 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
@@ -69,6 +69,9 @@
 #define SMU_14_0_0_UMD_PSTATE_SOCCLK			678
 #define SMU_14_0_0_UMD_PSTATE_FCLK			1800
 
+#define SMU_14_0_4_UMD_PSTATE_GFXCLK			938
+#define SMU_14_0_4_UMD_PSTATE_SOCCLK			938
+
 #define FEATURE_MASK(feature) (1ULL << feature)
 #define SMC_DPM_FEATURE ( \
 	FEATURE_MASK(FEATURE_CCLK_DPM_BIT) | \
@@ -1296,19 +1299,28 @@ static int smu_v14_0_common_get_dpm_profile_freq(struct smu_context *smu,
 	switch (clk_type) {
 	case SMU_GFXCLK:
 	case SMU_SCLK:
-		clk_limit = SMU_14_0_0_UMD_PSTATE_GFXCLK;
+		if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4))
+			clk_limit = SMU_14_0_4_UMD_PSTATE_GFXCLK;
+		else
+			clk_limit = SMU_14_0_0_UMD_PSTATE_GFXCLK;
 		if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK)
 			smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, NULL, &clk_limit);
 		else if (level == AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK)
 			smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SCLK, &clk_limit, NULL);
 		break;
 	case SMU_SOCCLK:
-		clk_limit = SMU_14_0_0_UMD_PSTATE_SOCCLK;
+		if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4))
+			clk_limit = SMU_14_0_4_UMD_PSTATE_SOCCLK;
+		else
+			clk_limit = SMU_14_0_0_UMD_PSTATE_SOCCLK;
 		if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK)
 			smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_SOCCLK, NULL, &clk_limit);
 		break;
 	case SMU_FCLK:
-		clk_limit = SMU_14_0_0_UMD_PSTATE_FCLK;
+		if (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(14, 0, 4))
+			smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &clk_limit);
+		else
+			clk_limit = SMU_14_0_0_UMD_PSTATE_FCLK;
 		if (level == AMD_DPM_FORCED_LEVEL_PROFILE_PEAK)
 			smu_v14_0_common_get_dpm_ultimate_freq(smu, SMU_FCLK, NULL, &clk_limit);
 		else if (level == AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK)

From 4cf300f604fe894e4bd734f87fa4502faf1b8af3 Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 14:57:49 -0600
Subject: [PATCH 116/346] drm/amd/display: Move DIO documentation to the right
 place

When building the kernel-doc, it complains with the below warning:

./drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h:1: warning: no structured comments found
./drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h:1: warning: no structured comments found

This warning was caused by the wrong use of the ':export:' and the lack
of function documentation in the file pointed under the ':internal:'.
This commit addresses those issues by relocating the overview
documentation to the correct C file, removing the ':export:' options,
and adding two simple kernel-doc to ensure that ':internal:' does not
have any warning.

Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/dri-devel/20240715085918.68f5ecc9@canb.auug.org.au/
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/amdgpu/display/dcn-blocks.rst         |  7 ++---
 .../amd/display/dc/link/hwss/link_hwss_dio.c  | 29 +++++++++++++++++++
 .../amd/display/dc/link/hwss/link_hwss_dio.h  |  9 ------
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
index a3fbd3ea028b..118aeb9fd2b4 100644
--- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst
+++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
@@ -68,11 +68,8 @@ OPP
 DIO
 ---
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
-   :export:
-
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c
    :internal:
diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c
index 50459d7a0f85..b76737b7b9e4 100644
--- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c
+++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.c
@@ -26,6 +26,16 @@
 #include "core_types.h"
 #include "link_enc_cfg.h"
 
+/**
+ * DOC: overview
+ *
+ * Display Input Output (DIO), is the display input and output unit in DCN. It
+ * includes output encoders to support different display output, like
+ * DisplayPort, HDMI, DVI interface, and others. It also includes the control
+ * and status channels for these interfaces.
+ */
+
+
 void set_dio_throttled_vcp_size(struct pipe_ctx *pipe_ctx,
 		struct fixed31_32 throttled_vcp_size)
 {
@@ -254,12 +264,31 @@ static const struct link_hwss dio_link_hwss = {
 	},
 };
 
+/**
+ * can_use_dio_link_hwss - Check if the link_hwss is accessible
+ *
+ * @link: Reference a link struct containing one or more sinks and the
+ *	  connective status.
+ * @link_res: Mappable hardware resource used to enable a link.
+ *
+ * Returns:
+ * Return true if the link encoder is accessible from link.
+ */
 bool can_use_dio_link_hwss(const struct dc_link *link,
 		const struct link_resource *link_res)
 {
 	return link->link_enc != NULL;
 }
 
+/**
+ * get_dio_link_hwss - Return link_hwss reference
+ *
+ * This function behaves like a get function to return the link_hwss populated
+ * in the link_hwss_dio.c file.
+ *
+ * Returns:
+ * Return the reference to the filled struct of link_hwss.
+ */
 const struct link_hwss *get_dio_link_hwss(void)
 {
 	return &dio_link_hwss;
diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
index a1f72fe378ee..45f0e091fcb0 100644
--- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
+++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dio.h
@@ -23,15 +23,6 @@
  *
  */
 
-/**
- * DOC: overview
- *
- * Display Input Output (DIO), is the display input and output unit in DCN. It
- * includes output encoders to support different display output, like
- * DisplayPort, HDMI, DVI interface, and others. It also includes the control
- * and status channels for these interfaces.
- */
-
 #ifndef __LINK_HWSS_DIO_H__
 #define __LINK_HWSS_DIO_H__
 

From 9947dbf984dd79d644a5468a47a76d9fef9f0884 Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 15:10:32 -0600
Subject: [PATCH 117/346] Documentation/gpu: Remove ':export:' option from DCN
 documentation

This commit reduces, but does not fix, all the occurrences and some of
the documentation warnings related to the 'no structured comments.' This
was caused by the wrong use of the ':export:' option in the DCN
kernel-doc, so this commit drops the usage of those options.

Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 Documentation/gpu/amdgpu/display/dcn-blocks.rst | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
index 118aeb9fd2b4..582a5fee7f29 100644
--- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst
+++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
@@ -11,9 +11,6 @@ DCHUBBUB
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
-   :export:
-
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :internal:
 
@@ -23,9 +20,6 @@ HUBP
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
-   :export:
-
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :internal:
 
@@ -35,9 +29,6 @@ DPP
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
-   :export:
-
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :internal:
 
@@ -47,9 +38,6 @@ MPC
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
-   :export:
-
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
    :internal:
 
@@ -59,9 +47,6 @@ OPP
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
-   :export:
-
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
    :internal:
 

From 1200bce4de8798ff73d1a0011e84b5ff68ffc25c Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 15:58:40 -0600
Subject: [PATCH 118/346] Documentation/gpu: Adjust DCN documentation paths

When building the kernel-doc, it has the following complaints:

Documentation/gpu/amdgpu/display/dcn-blocks:23:
drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h:3: WARNING: Duplicate C
declaration, also defined at gpu/amdgpu/display/dcn-blocks:3.

Declaration is '.. c:struct:: surface_flip_registers'.

Documentation/gpu/amdgpu/display/dcn-blocks:35:
drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h:3: WARNING: Duplicate C
declaration, also defined at gpu/amdgpu/display/dcn-blocks:3.
Declaration is '.. c:struct:: surface_flip_registers'.

This error happened due to a copy-and-paste where the same file path was
duplicated multiple times to a different set of blocks. This commit
addresses this issue by using the correct file path.

Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 Documentation/gpu/amdgpu/display/dcn-blocks.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
index 582a5fee7f29..00bc0607e98c 100644
--- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst
+++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
@@ -8,10 +8,10 @@ and the code documentation when it is automatically generated.
 DCHUBBUB
 --------
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h
    :internal:
 
 HUBP
@@ -26,10 +26,10 @@ HUBP
 DPP
 ---
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
+.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
    :internal:
 
 MPC

From d938ec1a12a22a4eedff319aa41cba48e9c5e544 Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 16:16:12 -0600
Subject: [PATCH 119/346] drm/amd/display: Add simple struct doc to remove doc
 build warning

This commit is a part of a series that addresses the following build
warning for opp:

./drivers/gpu/drm/amd/display/dc/inc/hw/opp.h:1: warning: no structured
comments found
./drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h:1: warning: no structured
comments found

This commit fixes this issue by adding a simple kernel-doc to a struct
in the opp.h and the dpp.h files.

Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h | 22 ++++++++++++++++-----
 drivers/gpu/drm/amd/display/dc/inc/hw/opp.h | 15 ++++++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
index 9ac7fc717a92..0150f2581ee4 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/dpp.h
@@ -147,16 +147,28 @@ struct cnv_color_keyer_params {
 	int color_keyer_blue_high;
 };
 
-/* new for dcn2: set the 8bit alpha values based on the 2 bit alpha
- *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT0   default: 0b00000000
- *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT1   default: 0b01010101
- *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT2   default: 0b10101010
- *ALPHA_2BIT_LUT. ALPHA_2BIT_LUT3   default: 0b11111111
+/**
+ * struct cnv_alpha_2bit_lut - Set the 8bit alpha values based on the 2 bit alpha
  */
 struct cnv_alpha_2bit_lut {
+	/**
+	* @lut0: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT0. Default: 0b00000000
+	*/
 	int lut0;
+
+	/**
+	 * @lut1: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT1. Default: 0b01010101
+	 */
 	int lut1;
+
+	/**
+	 * @lut2: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT2. Default: 0b10101010
+	 */
 	int lut2;
+
+	/**
+	 * @lut3: ALPHA_2BIT_LUT. ALPHA_2BIT_LUT3. Default: 0b11111111
+	 */
 	int lut3;
 };
 
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
index 127fb1a51654..747679cb4944 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h
@@ -205,9 +205,24 @@ struct gamma_coefficients {
 	struct fixed31_32 user_brightness;
 };
 
+/**
+ * struct pwl_float_data - Fixed point RGB color
+ */
 struct pwl_float_data {
+	/**
+	 * @r: Component Red.
+	 */
 	struct fixed31_32 r;
+
+	/**
+	 * @g: Component Green.
+	 */
+
 	struct fixed31_32 g;
+
+	/**
+	 * @b: Component Blue.
+	 */
 	struct fixed31_32 b;
 };
 

From 91ca34cba3b6a13df6304f80bbcfdf6cad328c2b Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 16:29:41 -0600
Subject: [PATCH 120/346] Documentation/gpu: Remove undocumented files from
 dcn-blockshubbub.h

The dchubbub.h and hubp.h do not have any meaningful documentation; for
this reason, this commit removes those files from the dcn-blocks
documentation.

Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 Documentation/gpu/amdgpu/display/dcn-blocks.rst | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
index 00bc0607e98c..f80df596ef5c 100644
--- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst
+++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
@@ -11,18 +11,12 @@ DCHUBBUB
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/dchubbub.h
-   :internal:
-
 HUBP
 ----
 
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
    :doc: overview
 
-.. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/hubp.h
-   :internal:
-
 DPP
 ---
 

From 4b0eb9ce15912ec303cc515888691b6ec6c32196 Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Date: Mon, 15 Jul 2024 16:57:47 -0600
Subject: [PATCH 121/346] Documentation/amdgpu: Fix duplicate declaration

Address the below kernel doc warning:

Documentation/gpu/amdgpu/display/display-manager:134:
drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h:3: WARNING: Duplicate C
declaration, also defined at gpu/amdgpu/display/dcn-blocks:101.
Declaration is '.. c:struct:: mpcc_blnd_cfg'.
Documentation/gpu/amdgpu/display/display-manager:146:
drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h:3: WARNING: Duplicate C
declaration, also defined at gpu/amdgpu/display/dcn-blocks:3.
Declaration is '.. c:enum:: mpcc_alpha_blend_mode'.

To address the above warnings, this commit uses the 'no-identifiers'
option in the dcn-blocks to avoid duplication with the previous use of
this function doc in the display-manager file. Finally, replaces the
deprecated ':function:' in favor of ':identifiers:'.

Cc: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 Documentation/gpu/amdgpu/display/dcn-blocks.rst      | 1 +
 Documentation/gpu/amdgpu/display/display-manager.rst | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/gpu/amdgpu/display/dcn-blocks.rst b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
index f80df596ef5c..5e34366f6dbe 100644
--- a/Documentation/gpu/amdgpu/display/dcn-blocks.rst
+++ b/Documentation/gpu/amdgpu/display/dcn-blocks.rst
@@ -34,6 +34,7 @@ MPC
 
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
    :internal:
+   :no-identifiers: mpcc_blnd_cfg mpcc_alpha_blend_mode
 
 OPP
 ---
diff --git a/Documentation/gpu/amdgpu/display/display-manager.rst b/Documentation/gpu/amdgpu/display/display-manager.rst
index 67a811e6891f..b269ff3f7a54 100644
--- a/Documentation/gpu/amdgpu/display/display-manager.rst
+++ b/Documentation/gpu/amdgpu/display/display-manager.rst
@@ -132,7 +132,7 @@ The DRM blend mode and its elements are then mapped by AMDGPU display manager
 (MPC), as follows:
 
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
-   :functions: mpcc_blnd_cfg
+   :identifiers: mpcc_blnd_cfg
 
 Therefore, the blending configuration for a single MPCC instance on the MPC
 tree is defined by :c:type:`mpcc_blnd_cfg`, where
@@ -144,7 +144,7 @@ alpha and plane alpha values. It sets one of the three modes for
 :c:type:`MPCC_ALPHA_BLND_MODE`, as described below.
 
 .. kernel-doc:: drivers/gpu/drm/amd/display/dc/inc/hw/mpc.h
-   :functions: mpcc_alpha_blend_mode
+   :identifiers: mpcc_alpha_blend_mode
 
 DM then maps the elements of `enum mpcc_alpha_blend_mode` to those in the DRM
 blend formula, as follows:

From a0328b397f3339d8d17a6ec356e94b3c110b010c Mon Sep 17 00:00:00 2001
From: Foryun Ma <foryun.ma@jaguarmicro.com>
Date: Tue, 4 Jun 2024 11:21:51 +0800
Subject: [PATCH 122/346] cxl/core/pci: Move reading of control register to
 immediately before usage

Relocate the reading of the DVSEC control register to immediately
before usage and avoid unnecessary PCI config access from the read
if DVSEC capability check, hdm_count check, or device validity check
results in failure.

Signed-off-by: Foryun Ma <foryun.ma@jaguarmicro.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/20240604032151.655-1-foryun.ma@jaguarmicro.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/pci.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 8567dd11eaac..a663e7566c48 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -338,10 +338,6 @@ int cxl_dvsec_rr_decode(struct device *dev, int d,
 	if (rc)
 		return rc;
 
-	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
-	if (rc)
-		return rc;
-
 	if (!(cap & CXL_DVSEC_MEM_CAPABLE)) {
 		dev_dbg(dev, "Not MEM Capable\n");
 		return -ENXIO;
@@ -368,6 +364,10 @@ int cxl_dvsec_rr_decode(struct device *dev, int d,
 	 * disabled, and they will remain moot after the HDM Decoder
 	 * capability is enabled.
 	 */
+	rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+	if (rc)
+		return rc;
+
 	info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
 	if (!info->mem_enabled)
 		return 0;

From e3615bd198289f319172c428f20857accb46b830 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 16 Jul 2024 12:49:25 -0400
Subject: [PATCH 123/346] drm/amd/display: fix corruption with high refresh
 rates on DCN 3.0

This reverts commit bc87d666c05a13e6d4ae1ddce41fc43d2567b9a2 and the
register changes from commit 6d4279cb99ac4f51d10409501d29969f687ac8dc.

Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3412
Cc: mikhail.v.gavrilov@gmail.com
Cc: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org # 6.10.x
---
 .../drm/amd/display/dc/optc/dcn10/dcn10_optc.c    | 15 +++------------
 .../drm/amd/display/dc/optc/dcn20/dcn20_optc.c    | 10 ++++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c
index 336488c0574e..94427875bcdd 100644
--- a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c
+++ b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c
@@ -945,19 +945,10 @@ void optc1_set_drr(
 				OTG_FORCE_LOCK_ON_EVENT, 0,
 				OTG_SET_V_TOTAL_MIN_MASK_EN, 0,
 				OTG_SET_V_TOTAL_MIN_MASK, 0);
-
-		// Setup manual flow control for EOF via TRIG_A
-		optc->funcs->setup_manual_trigger(optc);
-
-	} else {
-		REG_UPDATE_4(OTG_V_TOTAL_CONTROL,
-				OTG_SET_V_TOTAL_MIN_MASK, 0,
-				OTG_V_TOTAL_MIN_SEL, 0,
-				OTG_V_TOTAL_MAX_SEL, 0,
-				OTG_FORCE_LOCK_ON_EVENT, 0);
-
-		optc->funcs->set_vtotal_min_max(optc, 0, 0);
 	}
+
+	// Setup manual flow control for EOF via TRIG_A
+	optc->funcs->setup_manual_trigger(optc);
 }
 
 void optc1_set_vtotal_min_max(struct timing_generator *optc, int vtotal_min, int vtotal_max)
diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c
index 43417cff2c9b..b4694985a40a 100644
--- a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c
+++ b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c
@@ -453,6 +453,16 @@ void optc2_setup_manual_trigger(struct timing_generator *optc)
 {
 	struct optc *optc1 = DCN10TG_FROM_TG(optc);
 
+	/* Set the min/max selectors unconditionally so that
+	 * DMCUB fw may change OTG timings when necessary
+	 * TODO: Remove the w/a after fixing the issue in DMCUB firmware
+	 */
+	REG_UPDATE_4(OTG_V_TOTAL_CONTROL,
+				 OTG_V_TOTAL_MIN_SEL, 1,
+				 OTG_V_TOTAL_MAX_SEL, 1,
+				 OTG_FORCE_LOCK_ON_EVENT, 0,
+				 OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */
+
 	REG_SET_8(OTG_TRIGA_CNTL, 0,
 			OTG_TRIGA_SOURCE_SELECT, 21,
 			OTG_TRIGA_SOURCE_PIPE_SELECT, optc->inst,

From f2038c12e8133bf4c6bd4d1127a23310d55d9e21 Mon Sep 17 00:00:00 2001
From: Venkata Prasad Potturu <venkataprasad.potturu@amd.com>
Date: Thu, 18 Jul 2024 11:50:02 +0530
Subject: [PATCH 124/346] ASoC: sof: amd: fix for firmware reload failure in
 Vangogh platform

Setting ACP ACLK as clock source when ACP enters D0 state causing
firmware load failure, as per design clock source should be internal
clock.

Remove acp_clkmux_sel field so that ACP will use internal clock
source when ACP enters into D0 state.

Fixes: d0dab6b76a9f ("ASoC: SOF: amd: Add sof support for vangogh platform")

Signed-off-by: Venkata Prasad Potturu <venkataprasad.potturu@amd.com>
Link: https://patch.msgid.link/20240718062004.581685-1-venkataprasad.potturu@amd.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sof/amd/pci-vangogh.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sound/soc/sof/amd/pci-vangogh.c b/sound/soc/sof/amd/pci-vangogh.c
index 16eb2994fbab..eba580840100 100644
--- a/sound/soc/sof/amd/pci-vangogh.c
+++ b/sound/soc/sof/amd/pci-vangogh.c
@@ -34,7 +34,6 @@ static const struct sof_amd_acp_desc vangogh_chip_info = {
 	.dsp_intr_base	= ACP5X_DSP_SW_INTR_BASE,
 	.sram_pte_offset = ACP5X_SRAM_PTE_OFFSET,
 	.hw_semaphore_offset = ACP5X_AXI2DAGB_SEM_0,
-	.acp_clkmux_sel = ACP5X_CLKMUX_SEL,
 	.probe_reg_offset = ACP5X_FUTURE_REG_ACLK_0,
 };
 

From 5170dae5591036dba7daa519ea3126169300e275 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 17 Jul 2024 10:59:48 +0100
Subject: [PATCH 125/346] dt-bindings: trivial-devices: fix Rohm BH2228FV
 compatible string

When Maxime originally added the BH2228FV to the spidev driver, he spelt
it incorrectly - the d should have been a b. That spelling was then
propagated to the binding when written by Krzysztof. Add a new, correctly
spelt compatible and advise against using the incorrectly spelling.

Fixes: 025aea27732d ("dt-bindings: trivial-devices: document SPI dev compatibles")
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Link: https://patch.msgid.link/20240717-exuberant-enlighten-f890fabcd247@spud
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/devicetree/bindings/trivial-devices.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml
index 0a419453d183..5d9de7f68afc 100644
--- a/Documentation/devicetree/bindings/trivial-devices.yaml
+++ b/Documentation/devicetree/bindings/trivial-devices.yaml
@@ -318,7 +318,9 @@ properties:
           - renesas,hs3001
             # Renesas ISL29501 time-of-flight sensor
           - renesas,isl29501
-            # Rohm DH2228FV
+            # Rohm BH2228FV 8 channel DAC
+          - rohm,bh2228fv
+            # Rohm DH2228FV - This device does not exist, use rohm,bh2228fv instead.
           - rohm,dh2228fv
             # S524AD0XF1 (128K/256K-bit Serial EEPROM for Low Power)
           - samsung,24ad0xd1

From fc28d1c1fe3b3e2fbc50834c8f73dda72f6af9fc Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 17 Jul 2024 10:59:49 +0100
Subject: [PATCH 126/346] spi: spidev: add correct compatible for Rohm BH2228FV

When Maxime originally added the BH2228FV to the spidev driver, he spelt
it incorrectly - the d should have been a b. Add the correctly spelt
compatible to the driver. Although the majority of users of this
compatible are abusers, there is at least one board that validly uses
the incorrect spelt compatible, so keep it in the driver to avoid
breaking the few real users it has.

Fixes: 8fad805bdc52 ("spi: spidev: Add Rohm DH2228FV DAC compatible string")
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Link: https://patch.msgid.link/20240717-ventricle-strewn-a7678c509e85@spud
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spidev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index 95fb5f1c91c1..05e6d007f9a7 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -734,6 +734,7 @@ static const struct of_device_id spidev_dt_ids[] = {
 	{ .compatible = "lwn,bk4", .data = &spidev_of_check },
 	{ .compatible = "menlo,m53cpld", .data = &spidev_of_check },
 	{ .compatible = "micron,spi-authenta", .data = &spidev_of_check },
+	{ .compatible = "rohm,bh2228fv", .data = &spidev_of_check },
 	{ .compatible = "rohm,dh2228fv", .data = &spidev_of_check },
 	{ .compatible = "semtech,sx1301", .data = &spidev_of_check },
 	{ .compatible = "silabs,em3581", .data = &spidev_of_check },

From 408c2f14a5d3d7ac4824b96e52693ab271efb738 Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Thu, 11 Jul 2024 14:12:03 -0700
Subject: [PATCH 127/346] drm/xe/exec: Fix minor bug related to
 xe_sync_entry_cleanup

Increment num_syncs after xe_sync_entry_parse() is successful to ensure
the xe_sync_entry_cleanup() logic under "err_syncs" label works correctly.

v2: Use the same pattern as that in xe_vm.c (Matt Brost)

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711211203.3728180-1-ashutosh.dixit@intel.com
(cherry picked from commit 43a6faa6d9b5e9139758200a79fe9c8f4aaa0c8d)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_exec.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 2d72cdec3a0b..f36980aa26e6 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -118,7 +118,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
 	struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
 	struct drm_exec *exec = &vm_exec.exec;
-	u32 i, num_syncs = 0, num_ufence = 0;
+	u32 i, num_syncs, num_ufence = 0;
 	struct xe_sched_job *job;
 	struct xe_vm *vm;
 	bool write_locked, skip_retry = false;
@@ -156,15 +156,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	vm = q->vm;
 
-	for (i = 0; i < args->num_syncs; i++) {
-		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
-					  &syncs_user[i], SYNC_PARSE_FLAG_EXEC |
+	for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
+		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
+					  &syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC |
 					  (xe_vm_in_lr_mode(vm) ?
 					   SYNC_PARSE_FLAG_LR_MODE : 0));
 		if (err)
 			goto err_syncs;
 
-		if (xe_sync_is_ufence(&syncs[i]))
+		if (xe_sync_is_ufence(&syncs[num_syncs]))
 			num_ufence++;
 	}
 
@@ -325,8 +325,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (err == -EAGAIN && !skip_retry)
 		goto retry;
 err_syncs:
-	for (i = 0; i < num_syncs; i++)
-		xe_sync_entry_cleanup(&syncs[i]);
+	while (num_syncs--)
+		xe_sync_entry_cleanup(&syncs[num_syncs]);
 	kfree(syncs);
 err_exec_queue:
 	xe_exec_queue_put(q);

From bf07ca963d4fd11c88a9d4b058f2bd62e8d46a98 Mon Sep 17 00:00:00 2001
From: Michal Wajdeczko <michal.wajdeczko@intel.com>
Date: Thu, 11 Jul 2024 21:23:19 +0200
Subject: [PATCH 128/346] drm/xe/pf: Limit fair VF LMEM provisioning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to the current design of the BO and VRAM manager, any object
with XE_BO_FLAG_PINNED flag, which the PF driver uses during VF
LMEM provisionining, is created with the TTM_PL_FLAG_CONTIGUOUS
flag, which may cause VRAM fragmentation that prevents subsequent
allocations of larger objects, like fair VF LMEM provisioning.

To avoid such failures, round down fair VF LMEM provisioning size
to next power of two size, to compensate what xe_ttm_vram_mgr is
doing to achieve contiguous allocations.

Fixes: ac6598aed1b3 ("drm/xe/pf: Add support to configure SR-IOV VFs")
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Reviewed-by: Piotr Piórkowski <piotr.piorkowski@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711192320.1198-2-michal.wajdeczko@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
(cherry picked from commit 4c3fe5eae46b92e2fd961b19f7779608352e5368)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
index db6c213da847..4699b7836001 100644
--- a/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
+++ b/drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c
@@ -1543,6 +1543,7 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs)
 	u64 fair;
 
 	fair = div_u64(available, num_vfs);
+	fair = rounddown_pow_of_two(fair);	/* XXX: ttm_vram_mgr & drm_buddy limitation */
 	fair = ALIGN_DOWN(fair, alignment);
 #ifdef MAX_FAIR_LMEM
 	fair = min_t(u64, MAX_FAIR_LMEM, fair);

From c9474b726b932b5d555effd9ed0ae19f4da2367c Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Mon, 15 Jul 2024 23:39:01 -0700
Subject: [PATCH 129/346] drm/xe: Wedge the entire device

Wedge the entire device, not just GT which may have triggered the wedge.
To implement this, cleanup the layering so xe_device_declare_wedged()
calls into the lower layers (GT) to ensure entire device is wedged.

While we are here, also signal any pending GT TLB invalidations upon
wedging device.

Lastly, short circuit reset wait if device is wedged.

v2:
 - Short circuit reset wait if device is wedged (Local testing)

Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-1-matthew.brost@intel.com
(cherry picked from commit 7dbe8af13c189f5937e87e9fb924d5bbc49e6f71)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c     |  6 ++++
 drivers/gpu/drm/xe/xe_gt.c         | 15 ++++++++++
 drivers/gpu/drm/xe/xe_gt.h         |  1 +
 drivers/gpu/drm/xe/xe_guc.c        | 16 ++++++++++
 drivers/gpu/drm/xe/xe_guc.h        |  1 +
 drivers/gpu/drm/xe/xe_guc_submit.c | 48 +++++++++++++++++++-----------
 drivers/gpu/drm/xe/xe_guc_submit.h |  1 +
 drivers/gpu/drm/xe/xe_uc.c         | 14 +++++++++
 drivers/gpu/drm/xe/xe_uc.h         |  1 +
 9 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 03492fbcb8fb..41548eff953d 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -870,6 +870,9 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
  */
 void xe_device_declare_wedged(struct xe_device *xe)
 {
+	struct xe_gt *gt;
+	u8 id;
+
 	if (xe->wedged.mode == 0) {
 		drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n");
 		return;
@@ -883,4 +886,7 @@ void xe_device_declare_wedged(struct xe_device *xe)
 			"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
 			dev_name(xe->drm.dev));
 	}
+
+	for_each_gt(gt, xe, id)
+		xe_gt_declare_wedged(gt);
 }
diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
index 0ba2e2d0289b..31b2e64c70c6 100644
--- a/drivers/gpu/drm/xe/xe_gt.c
+++ b/drivers/gpu/drm/xe/xe_gt.c
@@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt)
 
 	return NULL;
 }
+
+/**
+ * xe_gt_declare_wedged() - Declare GT wedged
+ * @gt: the GT object
+ *
+ * Wedge the GT which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_gt_declare_wedged(struct xe_gt *gt)
+{
+	xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode);
+
+	xe_uc_declare_wedged(&gt->uc);
+	xe_gt_tlb_invalidation_reset(gt);
+}
diff --git a/drivers/gpu/drm/xe/xe_gt.h b/drivers/gpu/drm/xe/xe_gt.h
index 1123fdfc4ebc..8b1a5027dcf2 100644
--- a/drivers/gpu/drm/xe/xe_gt.h
+++ b/drivers/gpu/drm/xe/xe_gt.h
@@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
 int xe_gt_init_hwconfig(struct xe_gt *gt);
 int xe_gt_init_early(struct xe_gt *gt);
 int xe_gt_init(struct xe_gt *gt);
+void xe_gt_declare_wedged(struct xe_gt *gt);
 int xe_gt_record_default_lrcs(struct xe_gt *gt);
 
 /**
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index eb655cee19f7..de0fe9e65746 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
 	xe_guc_ct_print(&guc->ct, p, false);
 	xe_guc_submit_print(guc, p);
 }
+
+/**
+ * xe_guc_declare_wedged() - Declare GuC wedged
+ * @guc: the GuC object
+ *
+ * Wedge the GuC which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_guc_declare_wedged(struct xe_guc *guc)
+{
+	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
+
+	xe_guc_reset_prepare(guc);
+	xe_guc_ct_stop(&guc->ct);
+	xe_guc_submit_wedge(guc);
+}
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index af59c9545753..e0bbf98f849d 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
 void xe_guc_stop_prepare(struct xe_guc *guc);
 void xe_guc_stop(struct xe_guc *guc);
 int xe_guc_start(struct xe_guc *guc);
+void xe_guc_declare_wedged(struct xe_guc *guc);
 
 static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
 {
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 373447758a60..8d7e7f4bbff7 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -861,13 +861,40 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
 		xe_sched_tdr_queue_imm(&q->guc->sched);
 }
 
-static bool guc_submit_hint_wedged(struct xe_guc *guc)
+/**
+ * xe_guc_submit_wedge() - Wedge GuC submission
+ * @guc: the GuC object
+ *
+ * Save exec queue's registered with GuC state by taking a ref to each queue.
+ * Register a DRMM handler to drop refs upon driver unload.
+ */
+void xe_guc_submit_wedge(struct xe_guc *guc)
 {
 	struct xe_device *xe = guc_to_xe(guc);
 	struct xe_exec_queue *q;
 	unsigned long index;
 	int err;
 
+	xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
+
+	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
+				       guc_submit_wedged_fini, guc);
+	if (err) {
+		drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
+		return;
+	}
+
+	mutex_lock(&guc->submission_state.lock);
+	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
+		if (xe_exec_queue_get_unless_zero(q))
+			set_exec_queue_wedged(q);
+	mutex_unlock(&guc->submission_state.lock);
+}
+
+static bool guc_submit_hint_wedged(struct xe_guc *guc)
+{
+	struct xe_device *xe = guc_to_xe(guc);
+
 	if (xe->wedged.mode != 2)
 		return false;
 
@@ -876,22 +903,6 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc)
 
 	xe_device_declare_wedged(xe);
 
-	xe_guc_submit_reset_prepare(guc);
-	xe_guc_ct_stop(&guc->ct);
-
-	err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
-				       guc_submit_wedged_fini, guc);
-	if (err) {
-		drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
-		return true; /* Device is wedged anyway */
-	}
-
-	mutex_lock(&guc->submission_state.lock);
-	xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
-		if (xe_exec_queue_get_unless_zero(q))
-			set_exec_queue_wedged(q);
-	mutex_unlock(&guc->submission_state.lock);
-
 	return true;
 }
 
@@ -1677,7 +1688,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
 
 void xe_guc_submit_reset_wait(struct xe_guc *guc)
 {
-	wait_event(guc->ct.wq, !guc_read_stopped(guc));
+	wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
+		   !guc_read_stopped(guc));
 }
 
 void xe_guc_submit_stop(struct xe_guc *guc)
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 4ad5f4c1b084..bdf8c9f3d24a 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -18,6 +18,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
 void xe_guc_submit_reset_wait(struct xe_guc *guc);
 void xe_guc_submit_stop(struct xe_guc *guc);
 int xe_guc_submit_start(struct xe_guc *guc);
+void xe_guc_submit_wedge(struct xe_guc *guc);
 
 int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
 int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
diff --git a/drivers/gpu/drm/xe/xe_uc.c b/drivers/gpu/drm/xe/xe_uc.c
index 0f240534fb72..0d073a9987c2 100644
--- a/drivers/gpu/drm/xe/xe_uc.c
+++ b/drivers/gpu/drm/xe/xe_uc.c
@@ -300,3 +300,17 @@ void xe_uc_remove(struct xe_uc *uc)
 {
 	xe_gsc_remove(&uc->gsc);
 }
+
+/**
+ * xe_uc_declare_wedged() - Declare UC wedged
+ * @uc: the UC object
+ *
+ * Wedge the UC which stops all submission, saves desired debug state, and
+ * cleans up anything which could timeout.
+ */
+void xe_uc_declare_wedged(struct xe_uc *uc)
+{
+	xe_gt_assert(uc_to_gt(uc), uc_to_xe(uc)->wedged.mode);
+
+	xe_guc_declare_wedged(&uc->guc);
+}
diff --git a/drivers/gpu/drm/xe/xe_uc.h b/drivers/gpu/drm/xe/xe_uc.h
index 11856f24e6f9..506517c11333 100644
--- a/drivers/gpu/drm/xe/xe_uc.h
+++ b/drivers/gpu/drm/xe/xe_uc.h
@@ -21,5 +21,6 @@ int xe_uc_start(struct xe_uc *uc);
 int xe_uc_suspend(struct xe_uc *uc);
 int xe_uc_sanitize_reset(struct xe_uc *uc);
 void xe_uc_remove(struct xe_uc *uc);
+void xe_uc_declare_wedged(struct xe_uc *uc);
 
 #endif

From 90936a0a4c54f0a1cdf4538f9128821ad70c36ab Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Mon, 15 Jul 2024 23:39:02 -0700
Subject: [PATCH 130/346] drm/xe: Don't suspend device upon wedge

When wedging a device we shouldn't be suspending device as state for
debug will be lost.

Also this appears to not work as the below stack trace pops upon trying
to resume a wedged device:

[  304.245044] INFO: task cat:12115 blocked for more than 151 seconds.
[  304.251333]       Tainted: G        W          6.10.0-rc7-xe+ #3518
[  304.257617] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  304.265459] task:cat             state:D stack:13384 pid:12115 tgid:12115 ppid:3986   flags:0x00000006
[  304.265465] Call Trace:
[  304.265467]  <TASK>
[  304.265469]  __schedule+0x3c4/0xdf0
[  304.265478]  schedule+0x3c/0x140
[  304.265481]  rpm_resume+0x1cc/0x740
[  304.265484]  ? __pfx_autoremove_wake_function+0x10/0x10
[  304.265489]  __pm_runtime_resume+0x49/0x80
[  304.265494]  guc_info+0x6b/0xb0 [xe]
[  304.265538]  ? __pfx___drm_printfn_seq_file+0x10/0x10
[  304.265541]  ? __pfx___drm_puts_seq_file+0x10/0x10
[  304.265545]  seq_read_iter+0x111/0x4c0
[  304.265551]  seq_read+0xfc/0x140
[  304.265556]  full_proxy_read+0x58/0x80
[  304.265560]  vfs_read+0xa7/0x360
[  304.265563]  ? find_held_lock+0x2b/0x80
[  304.265568]  ksys_read+0x64/0xe0
[  304.265571]  do_syscall_64+0x68/0x140
[  304.265575]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  304.265578] RIP: 0033:0x7f4254d14992
[  304.265580] RSP: 002b:00007ffc558666f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  304.265583] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f4254d14992
[  304.265584] RDX: 0000000000020000 RSI: 00007f4254ebb000 RDI: 0000000000000003
[  304.265586] RBP: 00007f4254ebb000 R08: 00007f4254eba010 R09: 00007f4254eba010
[  304.265587] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000022000
[  304.265588] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[  304.265593]  </TASK>
[  304.265594]
               Showing all locks held in the system:
[  304.265598] 1 lock held by khungtaskd/57:
[  304.265599]  #0: ffffffff8273b860 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x36/0x1c0
[  304.265607] 3 locks held by kworker/6:1/90:
[  304.265610] 1 lock held by in:imklog/547:
[  304.265611]  #0: ffff88810498cd88 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x76/0xc0
[  304.265620] 1 lock held by dmesg/1310:

v2: Drop local 'err' variable (Jonathan)

Fixes: 8ed9aaae39f3 ("drm/xe: Force wedged state and block GT reset upon any GPU hang")
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240716063902.1390130-2-matthew.brost@intel.com
(cherry picked from commit 452bca0edbd0764ca0284239d5438b3edd305ab3)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_device.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index 41548eff953d..76109415eba6 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -854,6 +854,13 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
 	return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
 }
 
+static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
+{
+	struct xe_device *xe = arg;
+
+	xe_pm_runtime_put(xe);
+}
+
 /**
  * xe_device_declare_wedged - Declare device wedged
  * @xe: xe device instance
@@ -878,6 +885,13 @@ void xe_device_declare_wedged(struct xe_device *xe)
 		return;
 	}
 
+	if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) {
+		drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n");
+		return;
+	}
+
+	xe_pm_runtime_get_noresume(xe);
+
 	if (!atomic_xchg(&xe->wedged.flag, 1)) {
 		xe->needs_flr_on_fini = true;
 		drm_err(&xe->drm,

From a83b22754e351f13fb46596c85f667dc33da71ec Mon Sep 17 00:00:00 2001
From: Bastien Curutchet <bastien.curutchet@bootlin.com>
Date: Thu, 18 Jul 2024 13:55:34 +0200
Subject: [PATCH 131/346] clk: davinci: da8xx-cfgchip: Initialize clk_init_data
 before use

The flag attribute of the struct clk_init_data isn't initialized before
the devm_clk_hw_register() call. This can lead to unexpected behavior
during registration.

Initialize the entire clk_init_data to zero at declaration.

Cc: stable@vger.kernel.org
Fixes: 58e1e2d2cd89 ("clk: davinci: cfgchip: Add TI DA8XX USB PHY clocks")
Signed-off-by: Bastien Curutchet <bastien.curutchet@bootlin.com>
Reviewed-by: David Lechner <david@lechnology.com>
Link: https://lore.kernel.org/r/20240718115534.41513-1-bastien.curutchet@bootlin.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/davinci/da8xx-cfgchip.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/davinci/da8xx-cfgchip.c b/drivers/clk/davinci/da8xx-cfgchip.c
index ad2d0df43dc6..ec60ecb517f1 100644
--- a/drivers/clk/davinci/da8xx-cfgchip.c
+++ b/drivers/clk/davinci/da8xx-cfgchip.c
@@ -508,7 +508,7 @@ da8xx_cfgchip_register_usb0_clk48(struct device *dev,
 	const char * const parent_names[] = { "usb_refclkin", "pll0_auxclk" };
 	struct clk *fck_clk;
 	struct da8xx_usb0_clk48 *usb0;
-	struct clk_init_data init;
+	struct clk_init_data init = {};
 	int ret;
 
 	fck_clk = devm_clk_get(dev, "fck");
@@ -583,7 +583,7 @@ da8xx_cfgchip_register_usb1_clk48(struct device *dev,
 {
 	const char * const parent_names[] = { "usb0_clk48", "usb_refclkin" };
 	struct da8xx_usb1_clk48 *usb1;
-	struct clk_init_data init;
+	struct clk_init_data init = {};
 	int ret;
 
 	usb1 = devm_kzalloc(dev, sizeof(*usb1), GFP_KERNEL);

From 5a6a25ea5bcd5bdf80fb13acd65a03fc6b8794b1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Wed, 17 Jul 2024 21:25:53 -0500
Subject: [PATCH 132/346] clk: sophgo: clk-sg2042-pll: Fix uninitialized
 variable in debug output

If sg2042_get_pll_ctl_setting() fails then "value" isn't initialized and
it is printed in the debug output.  Initialize it to zero.

Fixes: 48cf7e01386e ("clk: sophgo: Add SG2042 clock driver")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/baf0a490-d5ba-4528-90ba-80399684692d@stanley.mountain
Reviewed-by: Chen Wang <unicorn_wang@outlook.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/sophgo/clk-sg2042-pll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clk/sophgo/clk-sg2042-pll.c b/drivers/clk/sophgo/clk-sg2042-pll.c
index 9695e64fc23b..ff9deeef509b 100644
--- a/drivers/clk/sophgo/clk-sg2042-pll.c
+++ b/drivers/clk/sophgo/clk-sg2042-pll.c
@@ -387,7 +387,7 @@ static int sg2042_clk_pll_set_rate(struct clk_hw *hw,
 	struct sg2042_pll_clock *pll = to_sg2042_pll_clk(hw);
 	struct sg2042_pll_ctrl pctrl_table;
 	unsigned long flags;
-	u32 value;
+	u32 value = 0;
 	int ret;
 
 	spin_lock_irqsave(pll->lock, flags);

From 74dba240881820b46b9b1c62ef4de3bfff47fbd4 Mon Sep 17 00:00:00 2001
From: wangdicheng <wangdicheng@kylinos.cn>
Date: Fri, 19 Jul 2024 10:09:06 +0800
Subject: [PATCH 133/346] ALSA: usb-audio: Fix microphone sound on HD webcam.

I own an external usb Webcam, HD webcam, which had low mic volume and
inconsistent sound quality. Video works as expected.

(snip)
[   95.473820][ 1] [   T73] usb 5-2.2: new high-speed USB device number 7 using xhci_hcd
[   95.773974][ 1] [   T73] usb 5-2.2: New USB device found, idVendor=1bcf, idProduct=2281, bcdDevice= 0.05
[   95.783445][ 1] [   T73] usb 5-2.2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
[   95.791872][ 1] [   T73] usb 5-2.2: Product: HD webcam
[   95.797001][ 1] [   T73] usb 5-2.2: Manufacturer: Sunplus IT Co
[   95.802996][ 1] [   T73] usb 5-2.2: SerialNumber: 20200513
[   96.092610][ 2] [ T3680] usb 5-2.2: Warning! Unlikely big volume range (=4096), cval->res is probably wrong.
[   96.102436][ 2] [ T3680] usb 5-2.2: [5] FU [Mic Capture Volume] ch = 1, val = 0/4096/1

Set up quirk cval->res to 16 for 256 levels,
Set GET_SAMPLE_RATE quirk flag to stop trying to get the sample rate.
Confirmed that happened anyway later due to the backoff mechanism,
After 3 failures.

All audio stream on device interfaces share the same values,
apart from wMaxPacketSize and tSamFreq :

      bLength                 9
      bDescriptorType         4
      bInterfaceNumber        3
      bAlternateSetting       4
      bNumEndpoints           1
      bInterfaceClass         1 Audio

Interface Descriptor:
  bLength                 9
  bDescriptorType         4
  bInterfaceNumber        3
  bAlternateSetting       4
  bNumEndpoints           1
  bInterfaceClass         1 Audio
  bInterfaceSubClass      2 Streaming
  bInterfaceProtocol      0
  iInterface              0
  AudioStreaming Interface Descriptor:
    bLength                 7
    bDescriptorType        36
    bDescriptorSubtype      1 (AS_GENERAL)
    bTerminalLink           3
    bDelay                  1 frames
    wFormatTag         0x0001 PCM
  AudioStreaming Interface Descriptor:
    bLength                11
    bDescriptorType        36
    bDescriptorSubtype      2 (FORMAT_TYPE)
    bFormatType             1 (FORMAT_TYPE_I)
    bNrChannels             1
    bSubframeSize           2
    bBitResolution         16
    bSamFreqType            1 Discrete
    tSamFreq[ 0]        48000
  Endpoint Descriptor:
    bLength                 9
    bDescriptorType         5
    bEndpointAddress     0x86  EP 6 IN
    bmAttributes            5
      Transfer Type            Isochronous
      Synch Type               Asynchronous
      Usage Type               Data
    wMaxPacketSize     0x0064  1x 100 bytes
    bInterval               4
    bRefresh                0
    bSynchAddress           0
    AudioStreaming Endpoint Descriptor:
      bLength                 7
      bDescriptorType        37
      bDescriptorSubtype      1 (EP_GENERAL)
      bmAttributes         0x01
        Sampling Frequency
      bLockDelayUnits         0 Undefined
      wLockDelay         0x0000
(snip)

Testing patch provides consistent good sound recording quality and volume range.

(snip)
[   95.473820][ 1] [   T73] usb 5-2.2: new high-speed USB device number 7 using xhci_hcd
[   95.773974][ 1] [   T73] usb 5-2.2: New USB device found, idVendor=1bcf, idProduct=2281, bcdDevice= 0.05
[   95.783445][ 1] [   T73] usb 5-2.2: New USB device strings: Mfr=1, Product=2, SerialNumber=3
[   95.791872][ 1] [   T73] usb 5-2.2: Product: HD webcam
[   95.797001][ 1] [   T73] usb 5-2.2: Manufacturer: Sunplus IT Co
[   95.802996][ 1] [   T73] usb 5-2.2: SerialNumber: 20200513
[   96.110630][ 3] [ T3680] usbcore: registered new interface driver snd-usb-audio
[   96.114329][ 7] [ T3677] usb 5-2.2: Found UVC 1.00 device HD webcam (1bcf:2281)
[   96.167555][ 7] [ T3677] usbcore: registered new interface driver uvcvideo

Signed-off-by: wangdicheng <wangdicheng@kylinos.cn>
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240719020906.8078-1-wangdich9700@163.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/mixer.c  | 7 +++++++
 sound/usb/quirks.c | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index c00009b545c0..f7ce8e8c3c3e 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -1211,6 +1211,13 @@ static void volume_control_quirks(struct usb_mixer_elem_info *cval,
 			cval->res = 16;
 		}
 		break;
+	case USB_ID(0x1bcf, 0x2281): /* HD Webcam */
+		if (!strcmp(kctl->id.name, "Mic Capture Volume")) {
+			usb_audio_info(chip,
+				"set resolution quirk: cval->res = 16\n");
+			cval->res = 16;
+		}
+		break;
 	}
 }
 
diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 58156fbca02c..19c1fc9504c3 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2225,6 +2225,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
 		   QUIRK_FLAG_ALIGN_TRANSFER),
 	DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */
 		   QUIRK_FLAG_ALIGN_TRANSFER),
+	DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */
+		   QUIRK_FLAG_GET_SAMPLE_RATE),
 
 	/* Vendor matches */
 	VENDOR_FLG(0x045e, /* MS Lifecam */

From 3ae08e47742eeebf2190900d31ddac53fdd13a5b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 19 Jul 2024 16:44:10 +0200
Subject: [PATCH 134/346] gpio: virtuser: avoid non-constant format string

Using a string variable as an sprintf format is potentially
dangerous, and gcc can warn about this:

drivers/gpio/gpio-virtuser.c: In function 'gpio_virtuser_dbgfs_init_line_attrs':
drivers/gpio/gpio-virtuser.c:808:9: error: format not a string literal and no format arguments [-Werror=format-security]
  808 |         sprintf(data->consumer, id);
      |         ^~~~~~~

Change it to a simpler strscpy() instead to just copy it and check
the destination buffer size.

Fixes: 91581c4b3f29 ("gpio: virtuser: new virtual testing driver for the GPIO API")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20240719144422.2082394-1-arnd@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpio-virtuser.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-virtuser.c b/drivers/gpio/gpio-virtuser.c
index 0e0d55da4f01..ccc47ea0b3e1 100644
--- a/drivers/gpio/gpio-virtuser.c
+++ b/drivers/gpio/gpio-virtuser.c
@@ -805,7 +805,7 @@ static int gpio_virtuser_dbgfs_init_line_attrs(struct device *dev,
 		return -ENOMEM;
 
 	data->ad.desc = desc;
-	sprintf(data->consumer, id);
+	strscpy(data->consumer, id);
 	atomic_set(&data->irq, 0);
 	atomic_set(&data->irq_count, 0);
 

From a2d6d8aee4a4f694920bfaf6f6be3690e0f8d302 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 19 Jul 2024 11:56:34 +0200
Subject: [PATCH 135/346] ALSA: hda: tas2781: mark const variables as
 __maybe_unused

An earlier patch changed the DECLARE_TLV_DB_SCALE declaration, but
now there are additional static const variables that cause
the same build warnings:

In file included from sound/pci/hda/tas2781_hda_i2c.c:23:
include/sound/tas2781-tlv.h:23:28: error: 'tas2563_dvc_table' defined but not used [-Werror=unused-const-variable=]
   23 | static const unsigned char tas2563_dvc_table[][4] = {
      |                            ^~~~~~~~~~~~~~~~~
In file included from include/sound/tlv.h:10,
                 from sound/pci/hda/tas2781_hda_i2c.c:22:
include/sound/tas2781-tlv.h:20:35: error: 'tas2563_dvc_tlv' defined but not used [-Werror=unused-const-variable=]
   20 | static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
      |                                   ^~~~~~~~~~~~~~~

Mark them all as unused as well.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20240719095640.3741247-1-arnd@kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/tas2781-tlv.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h
index 99c41bfc7827..00fd4d449ff3 100644
--- a/include/sound/tas2781-tlv.h
+++ b/include/sound/tas2781-tlv.h
@@ -16,11 +16,11 @@
 #define __TAS2781_TLV_H__
 
 static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0);
-static const DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
-static const DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2563_dvc_tlv, -12150, 50, 1);
 
 /* pow(10, db/20) * pow(2,30) */
-static const unsigned char tas2563_dvc_table[][4] = {
+static const __maybe_unused unsigned char tas2563_dvc_table[][4] = {
 	{ 0X00, 0X00, 0X00, 0X00 }, /* -121.5db */
 	{ 0X00, 0X00, 0X03, 0XBC }, /* -121.0db */
 	{ 0X00, 0X00, 0X03, 0XF5 }, /* -120.5db */

From 7010d9464f7ca3ee2d75095ea2e642a9009a41ff Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 22 Jul 2024 10:06:04 +0200
Subject: [PATCH 136/346] ALSA: usb-audio: Move HD Webcam quirk to the right
 place

The quirk_flags_table[] is sorted in the USB ID order, while the last
fix was put at a wrong position.  Adjust the entry at the right
position.

Fixes: 74dba2408818 ("ALSA: usb-audio: Fix microphone sound on HD webcam.")
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240722080605.23481-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 19c1fc9504c3..f070ecf75ce8 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2167,6 +2167,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
 		   QUIRK_FLAG_GET_SAMPLE_RATE),
 	DEVICE_FLG(0x19f7, 0x0035, /* RODE NT-USB+ */
 		   QUIRK_FLAG_GET_SAMPLE_RATE),
+	DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */
+		   QUIRK_FLAG_GET_SAMPLE_RATE),
 	DEVICE_FLG(0x1bcf, 0x2283, /* NexiGo N930AF FHD Webcam */
 		   QUIRK_FLAG_GET_SAMPLE_RATE),
 	DEVICE_FLG(0x2040, 0x7200, /* Hauppauge HVR-950Q */
@@ -2225,8 +2227,6 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
 		   QUIRK_FLAG_ALIGN_TRANSFER),
 	DEVICE_FLG(0x534d, 0x2109, /* MacroSilicon MS2109 */
 		   QUIRK_FLAG_ALIGN_TRANSFER),
-	DEVICE_FLG(0x1bcf, 0x2281, /* HD Webcam */
-		   QUIRK_FLAG_GET_SAMPLE_RATE),
 
 	/* Vendor matches */
 	VENDOR_FLG(0x045e, /* MS Lifecam */

From 542440fd7b30983cae23e32bd22f69a076ec7ef4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 19 Jul 2024 12:40:24 +0200
Subject: [PATCH 137/346] regmap: maple: work around gcc-14.1 false-positive
 warning

With gcc-14.1, there is a false-postive -Wuninitialized warning in
regcache_maple_drop:

drivers/base/regmap/regcache-maple.c: In function 'regcache_maple_drop':
drivers/base/regmap/regcache-maple.c:113:23: error: 'lower_index' is used uninitialized [-Werror=uninitialized]
  113 |         unsigned long lower_index, lower_last;
      |                       ^~~~~~~~~~~
drivers/base/regmap/regcache-maple.c:113:36: error: 'lower_last' is used uninitialized [-Werror=uninitialized]
  113 |         unsigned long lower_index, lower_last;
      |                                    ^~~~~~~~~~

I've created a reduced test case to see if this needs to be reported
as a gcc, but it appears that the gcc-14.x branch already has a change
that turns this into a more sensible -Wmaybe-uninitialized warning, so
I ended up not reporting it so far.

The reduced test case also produces a warning for gcc-13 and gcc-12
but I don't see that with the version in the kernel.

Link: https://godbolt.org/z/oKbohKqd3
Link: https://lore.kernel.org/all/CAMuHMdWj=FLmkazPbYKPevDrcym2_HDb_U7Mb9YE9ovrP0jJfA@mail.gmail.com/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20240719104030.1382465-1-arnd@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regcache-maple.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/base/regmap/regcache-maple.c b/drivers/base/regmap/regcache-maple.c
index f0df2da6d522..2dea9d259c49 100644
--- a/drivers/base/regmap/regcache-maple.c
+++ b/drivers/base/regmap/regcache-maple.c
@@ -110,7 +110,8 @@ static int regcache_maple_drop(struct regmap *map, unsigned int min,
 	struct maple_tree *mt = map->cache;
 	MA_STATE(mas, mt, min, max);
 	unsigned long *entry, *lower, *upper;
-	unsigned long lower_index, lower_last;
+	/* initialized to work around false-positive -Wuninitialized warning */
+	unsigned long lower_index = 0, lower_last = 0;
 	unsigned long upper_index, upper_last;
 	int ret = 0;
 

From 9931f7d5d251882a147cc5811060097df43e79f5 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 22 Jul 2024 10:30:02 +0200
Subject: [PATCH 138/346] ASoC: Intel: use soc_intel_is_byt_cr() only when
 IOSF_MBI is reachable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

the Intel kbuild bot reports a link failure when IOSF_MBI is built-in
but the Merrifield driver is configured as a module. The
soc-intel-quirks.h is included for Merrifield platforms, but IOSF_MBI
is not selected for that platform.

ld.lld: error: undefined symbol: iosf_mbi_read
>>> referenced by atom.c
>>>               sound/soc/sof/intel/atom.o:(atom_machine_select) in archive vmlinux.a

This patch forces the use of the fallback static inline when IOSF_MBI is not reachable.

Fixes: 536cfd2f375d ("ASoC: Intel: use common helpers to detect CPUs")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202407160704.zpdhJ8da-lkp@intel.com/
Suggested-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://patch.msgid.link/20240722083002.10800-1-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/intel/common/soc-intel-quirks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/intel/common/soc-intel-quirks.h b/sound/soc/intel/common/soc-intel-quirks.h
index de4e550c5b34..42bd51456b94 100644
--- a/sound/soc/intel/common/soc-intel-quirks.h
+++ b/sound/soc/intel/common/soc-intel-quirks.h
@@ -11,7 +11,7 @@
 
 #include <linux/platform_data/x86/soc.h>
 
-#if IS_ENABLED(CONFIG_X86)
+#if IS_REACHABLE(CONFIG_IOSF_MBI)
 
 #include <linux/dmi.h>
 #include <asm/iosf_mbi.h>

From 83340b855d222f257354afd272dc8d315fecc3ee Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 19 Jul 2024 09:48:03 +0200
Subject: [PATCH 139/346] ASoC: tegra: select CONFIG_SND_SIMPLE_CARD_UTILS

This I2S client driver now uses functions exported from a helper module
but fails to link when the helper is disabled:

ERROR: modpost: "simple_util_parse_convert" [sound/soc/tegra/snd-soc-tegra210-i2s.ko] undefined!
ERROR: modpost: "simple_util_get_sample_fmt" [sound/soc/tegra/snd-soc-tegra210-i2s.ko] undefined!

Add a Kconfig select line to ensure it's always turned on here.

Fixes: 2502f8dd8c30 ("ASoC: tegra: I2S client convert formats handling")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Sameer Pujar <spujar@nvidia.com>
Link: https://patch.msgid.link/20240719074831.3253995-1-arnd@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/tegra/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/soc/tegra/Kconfig b/sound/soc/tegra/Kconfig
index 74effc57a7a0..2463c22e9cf6 100644
--- a/sound/soc/tegra/Kconfig
+++ b/sound/soc/tegra/Kconfig
@@ -78,6 +78,7 @@ config SND_SOC_TEGRA210_DMIC
 
 config SND_SOC_TEGRA210_I2S
 	tristate "Tegra210 I2S module"
+	select SND_SIMPLE_CARD_UTILS
 	help
 	  Config to enable the Inter-IC Sound (I2S) Controller which
 	  implements full-duplex and bidirectional and single direction

From 92c78222168e9035a9bfb8841c2e56ce23e51f73 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Fri, 19 Jul 2024 18:53:48 -0500
Subject: [PATCH 140/346] ASoC: TAS2781: Fix tasdev_load_calibrated_data()

This function has a reversed if statement so it's either a no-op or it
leads to a NULL dereference.

Fixes: b195acf5266d ("ASoC: tas2781: Fix wrong loading calibrated data sequence")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://patch.msgid.link/18a29b68-cc85-4139-b7c7-2514e8409a42@stanley.mountain
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/codecs/tas2781-fmwlib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c
index 63626b982d04..8f9a3ae7153e 100644
--- a/sound/soc/codecs/tas2781-fmwlib.c
+++ b/sound/soc/codecs/tas2781-fmwlib.c
@@ -2162,7 +2162,7 @@ static void tasdev_load_calibrated_data(struct tasdevice_priv *priv, int i)
 		return;
 
 	cal = cal_fmw->calibrations;
-	if (cal)
+	if (!cal)
 		return;
 
 	load_calib_data(priv, &cal->dev_data);

From 21451dfd853e7d8e6e3fbd7ef1fbdb2f2ead12f5 Mon Sep 17 00:00:00 2001
From: wangdicheng <wangdicheng@kylinos.cn>
Date: Mon, 22 Jul 2024 16:48:22 +0800
Subject: [PATCH 141/346] ALSA: usb-audio: Add a quirk for Sonix HD USB Camera

Sonix HD USB Camera does not support reading the sample rate which leads
to many lines of "cannot get freq at ep 0x84".
This patch adds the USB ID to quirks.c and avoids those error messages.

(snip)
[1.789698] usb 3-3: new high-speed USB device number 2 using xhci_hcd
[1.984121] usb 3-3: New USB device found, idVendor=0c45, idProduct=6340, bcdDevice= 0.00
[1.984124] usb 3-3: New USB device strings: Mfr=2, Product=1, SerialNumber=0
[1.984127] usb 3-3: Product: USB 2.0 Camera
[1.984128] usb 3-3: Manufacturer: Sonix Technology Co., Ltd.
[5.440957] usb 3-3: 3:1: cannot get freq at ep 0x84
[12.130679] usb 3-3: 3:1: cannot get freq at ep 0x84
[12.175065] usb 3-3: 3:1: cannot get freq at ep 0x84

Signed-off-by: wangdicheng <wangdicheng@kylinos.cn>
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240722084822.31620-1-wangdich9700@163.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index f070ecf75ce8..ea063a14cdd8 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -2125,6 +2125,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = {
 		   QUIRK_FLAG_CTL_MSG_DELAY_1M),
 	DEVICE_FLG(0x0b0e, 0x0349, /* Jabra 550a */
 		   QUIRK_FLAG_CTL_MSG_DELAY_1M),
+	DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */
+		   QUIRK_FLAG_GET_SAMPLE_RATE),
 	DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */
 		   QUIRK_FLAG_FIXED_RATE),
 	DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */

From eabd9db64ea8ba64d2a0b1d70da38e1a95dcd08b Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Thu, 13 Jun 2024 16:54:33 +0800
Subject: [PATCH 142/346] ACPI: RISCV: Add NUMA support based on SRAT and SLIT

Add acpi_numa.c file to enable parse NUMA information from
ACPI SRAT and SLIT tables. SRAT table provide CPUs(Hart) and
memory nodes to proximity domain mapping, while SLIT table
provide the distance metrics between proximity domains.

Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/65dbad1fda08a32922c44886e4581e49b4a2fecc.1718268003.git.haibo1.xu@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/acpi.h |  15 +++-
 arch/riscv/kernel/Makefile    |   1 +
 arch/riscv/kernel/acpi.c      |   5 --
 arch/riscv/kernel/acpi_numa.c | 131 ++++++++++++++++++++++++++++++++++
 arch/riscv/kernel/setup.c     |   4 +-
 arch/riscv/kernel/smpboot.c   |   2 -
 include/linux/acpi.h          |   6 ++
 7 files changed, 154 insertions(+), 10 deletions(-)
 create mode 100644 arch/riscv/kernel/acpi_numa.c

diff --git a/arch/riscv/include/asm/acpi.h b/arch/riscv/include/asm/acpi.h
index 7dad0cf9d701..e0a1f84404f3 100644
--- a/arch/riscv/include/asm/acpi.h
+++ b/arch/riscv/include/asm/acpi.h
@@ -61,11 +61,14 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) { }
 
 void acpi_init_rintc_map(void);
 struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu);
-u32 get_acpi_id_for_cpu(int cpu);
+static inline u32 get_acpi_id_for_cpu(int cpu)
+{
+	return acpi_cpu_get_madt_rintc(cpu)->uid;
+}
+
 int acpi_get_riscv_isa(struct acpi_table_header *table,
 		       unsigned int cpu, const char **isa);
 
-static inline int acpi_numa_get_nid(unsigned int cpu) { return NUMA_NO_NODE; }
 void acpi_get_cbo_block_size(struct acpi_table_header *table, u32 *cbom_size,
 			     u32 *cboz_size, u32 *cbop_size);
 #else
@@ -87,4 +90,12 @@ static inline void acpi_get_cbo_block_size(struct acpi_table_header *table,
 
 #endif /* CONFIG_ACPI */
 
+#ifdef CONFIG_ACPI_NUMA
+int acpi_numa_get_nid(unsigned int cpu);
+void acpi_map_cpus_to_nodes(void);
+#else
+static inline int acpi_numa_get_nid(unsigned int cpu) { return NUMA_NO_NODE; }
+static inline void acpi_map_cpus_to_nodes(void) { }
+#endif /* CONFIG_ACPI_NUMA */
+
 #endif /*_ASM_ACPI_H*/
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 5b243d46f4b1..1e2afec141b5 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -110,3 +110,4 @@ obj-$(CONFIG_COMPAT)		+= compat_vdso/
 
 obj-$(CONFIG_64BIT)		+= pi/
 obj-$(CONFIG_ACPI)		+= acpi.o
+obj-$(CONFIG_ACPI_NUMA)	+= acpi_numa.o
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index e619edc8b0cc..040bdbfea2b4 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -191,11 +191,6 @@ struct acpi_madt_rintc *acpi_cpu_get_madt_rintc(int cpu)
 	return &cpu_madt_rintc[cpu];
 }
 
-u32 get_acpi_id_for_cpu(int cpu)
-{
-	return acpi_cpu_get_madt_rintc(cpu)->uid;
-}
-
 /*
  * __acpi_map_table() will be called before paging_init(), so early_ioremap()
  * or early_memremap() should be called here to for ACPI table mapping.
diff --git a/arch/riscv/kernel/acpi_numa.c b/arch/riscv/kernel/acpi_numa.c
new file mode 100644
index 000000000000..0231482d6946
--- /dev/null
+++ b/arch/riscv/kernel/acpi_numa.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACPI 6.6 based NUMA setup for RISCV
+ * Lots of code was borrowed from arch/arm64/kernel/acpi_numa.c
+ *
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ * Copyright (C) 2013-2016, Linaro Ltd.
+ *		Author: Hanjun Guo <hanjun.guo@linaro.org>
+ * Copyright (C) 2024 Intel Corporation.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#define pr_fmt(fmt) "ACPI: NUMA: " fmt
+
+#include <linux/acpi.h>
+#include <linux/bitmap.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+
+#include <asm/numa.h>
+
+static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE };
+
+int __init acpi_numa_get_nid(unsigned int cpu)
+{
+	return acpi_early_node_map[cpu];
+}
+
+static inline int get_cpu_for_acpi_id(u32 uid)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		if (uid == get_acpi_id_for_cpu(cpu))
+			return cpu;
+
+	return -EINVAL;
+}
+
+static int __init acpi_parse_rintc_pxm(union acpi_subtable_headers *header,
+				       const unsigned long end)
+{
+	struct acpi_srat_rintc_affinity *pa;
+	int cpu, pxm, node;
+
+	if (srat_disabled())
+		return -EINVAL;
+
+	pa = (struct acpi_srat_rintc_affinity *)header;
+	if (!pa)
+		return -EINVAL;
+
+	if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED))
+		return 0;
+
+	pxm = pa->proximity_domain;
+	node = pxm_to_node(pxm);
+
+	/*
+	 * If we can't map the UID to a logical cpu this
+	 * means that the UID is not part of possible cpus
+	 * so we do not need a NUMA mapping for it, skip
+	 * the SRAT entry and keep parsing.
+	 */
+	cpu = get_cpu_for_acpi_id(pa->acpi_processor_uid);
+	if (cpu < 0)
+		return 0;
+
+	acpi_early_node_map[cpu] = node;
+	pr_info("SRAT: PXM %d -> HARTID 0x%lx -> Node %d\n", pxm,
+		cpuid_to_hartid_map(cpu), node);
+
+	return 0;
+}
+
+void __init acpi_map_cpus_to_nodes(void)
+{
+	int i;
+
+	/*
+	 * In ACPI, SMP and CPU NUMA information is provided in separate
+	 * static tables, namely the MADT and the SRAT.
+	 *
+	 * Thus, it is simpler to first create the cpu logical map through
+	 * an MADT walk and then map the logical cpus to their node ids
+	 * as separate steps.
+	 */
+	acpi_table_parse_entries(ACPI_SIG_SRAT, sizeof(struct acpi_table_srat),
+				 ACPI_SRAT_TYPE_RINTC_AFFINITY, acpi_parse_rintc_pxm, 0);
+
+	for (i = 0; i < nr_cpu_ids; i++)
+		early_map_cpu_to_node(i, acpi_numa_get_nid(i));
+}
+
+/* Callback for Proximity Domain -> logical node ID mapping */
+void __init acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa)
+{
+	int pxm, node;
+
+	if (srat_disabled())
+		return;
+
+	if (pa->header.length < sizeof(struct acpi_srat_rintc_affinity)) {
+		pr_err("SRAT: Invalid SRAT header length: %d\n", pa->header.length);
+		bad_srat();
+		return;
+	}
+
+	if (!(pa->flags & ACPI_SRAT_RINTC_ENABLED))
+		return;
+
+	pxm = pa->proximity_domain;
+	node = acpi_map_pxm_to_node(pxm);
+
+	if (node == NUMA_NO_NODE) {
+		pr_err("SRAT: Too many proximity domains %d\n", pxm);
+		bad_srat();
+		return;
+	}
+
+	node_set(node, numa_nodes_parsed);
+}
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 4f73c0ae44b2..a2cde65b69e9 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -281,8 +281,10 @@ void __init setup_arch(char **cmdline_p)
 	setup_smp();
 #endif
 
-	if (!acpi_disabled)
+	if (!acpi_disabled) {
 		acpi_init_rintc_map();
+		acpi_map_cpus_to_nodes();
+	}
 
 	riscv_init_cbo_blocksizes();
 	riscv_fill_hwcap();
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 1319b29ce3b5..d49c0bf0e2cd 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -96,7 +96,6 @@ static int __init acpi_parse_rintc(union acpi_subtable_headers *header, const un
 	if (hart == cpuid_to_hartid_map(0)) {
 		BUG_ON(found_boot_cpu);
 		found_boot_cpu = true;
-		early_map_cpu_to_node(0, acpi_numa_get_nid(cpu_count));
 		return 0;
 	}
 
@@ -106,7 +105,6 @@ static int __init acpi_parse_rintc(union acpi_subtable_headers *header, const un
 	}
 
 	cpuid_to_hartid_map(cpu_count) = hart;
-	early_map_cpu_to_node(cpu_count, acpi_numa_get_nid(cpu_count));
 	cpu_count++;
 
 	return 0;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 28c3fb2bef0d..5a2b620ccd58 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -264,6 +264,12 @@ static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
 #endif
 
+#ifdef CONFIG_RISCV
+void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa);
+#else
+static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { }
+#endif
+
 #ifndef PHYS_CPUID_INVALID
 typedef u32 phys_cpuid_t;
 #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1)

From 39494aec8a0454f9e8a68d2422d3bd4e5c5ee0c7 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Thu, 13 Jun 2024 16:54:34 +0800
Subject: [PATCH 143/346] ACPI: NUMA: Add handler for SRAT RINTC affinity
 structure

Add RINTC affinity structure handler during parsing SRAT table.

Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/e076514d78d92f104a5f2d8c82b8921f6aa26fdd.1718268003.git.haibo1.xu@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 drivers/acpi/numa/srat.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index e3f26e71637a..44f91f2c6c5d 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -167,6 +167,19 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header)
 		}
 	}
 	break;
+
+	case ACPI_SRAT_TYPE_RINTC_AFFINITY:
+		{
+			struct acpi_srat_rintc_affinity *p =
+			    (struct acpi_srat_rintc_affinity *)header;
+			pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n",
+				 p->acpi_processor_uid,
+				 p->proximity_domain,
+				 (p->flags & ACPI_SRAT_RINTC_ENABLED) ?
+				 "enabled" : "disabled");
+		}
+		break;
+
 	default:
 		pr_warn("Found unsupported SRAT entry (type = 0x%x)\n",
 			header->type);
@@ -450,6 +463,21 @@ acpi_parse_gi_affinity(union acpi_subtable_headers *header,
 }
 #endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
 
+static int __init
+acpi_parse_rintc_affinity(union acpi_subtable_headers *header,
+			  const unsigned long end)
+{
+	struct acpi_srat_rintc_affinity *rintc_affinity;
+
+	rintc_affinity = (struct acpi_srat_rintc_affinity *)header;
+	acpi_table_print_srat_entry(&header->common);
+
+	/* let architecture-dependent part to do it */
+	acpi_numa_rintc_affinity_init(rintc_affinity);
+
+	return 0;
+}
+
 static int __init acpi_parse_srat(struct acpi_table_header *table)
 {
 	struct acpi_table_srat *srat = (struct acpi_table_srat *)table;
@@ -485,7 +513,7 @@ int __init acpi_numa_init(void)
 
 	/* SRAT: System Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
-		struct acpi_subtable_proc srat_proc[4];
+		struct acpi_subtable_proc srat_proc[5];
 
 		memset(srat_proc, 0, sizeof(srat_proc));
 		srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY;
@@ -496,6 +524,8 @@ int __init acpi_numa_init(void)
 		srat_proc[2].handler = acpi_parse_gicc_affinity;
 		srat_proc[3].id = ACPI_SRAT_TYPE_GENERIC_AFFINITY;
 		srat_proc[3].handler = acpi_parse_gi_affinity;
+		srat_proc[4].id = ACPI_SRAT_TYPE_RINTC_AFFINITY;
+		srat_proc[4].handler = acpi_parse_rintc_affinity;
 
 		acpi_table_parse_entries_array(ACPI_SIG_SRAT,
 					sizeof(struct acpi_table_srat),

From adc3e82d253786a5b63c821108709de23317a90d Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Thu, 13 Jun 2024 16:54:35 +0800
Subject: [PATCH 144/346] ACPI: NUMA: change the ACPI_NUMA to a hidden option

x86/arm64/loongarch would select ACPI_NUMA by default and riscv
would do the same thing, so change it to a hidden option and the
select statements except for the X86_64_ACPI_NUMA can also go away.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: Sunil V L <sunilvl@ventanamicro.com>
Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/f1f96377b8ecd6e3183f28abf5c9ac21cb9855ea.1718268003.git.haibo1.xu@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/arm64/Kconfig        | 1 -
 arch/loongarch/Kconfig    | 1 -
 drivers/acpi/numa/Kconfig | 5 +----
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5d91259ee7b5..5079ad4e21a5 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1484,7 +1484,6 @@ config HOTPLUG_CPU
 config NUMA
 	bool "NUMA Memory Allocation and Scheduler Support"
 	select GENERIC_ARCH_NUMA
-	select ACPI_NUMA if ACPI
 	select OF_NUMA
 	select HAVE_SETUP_PER_CPU_AREA
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e38139c576ee..8d9e06e4ad84 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -470,7 +470,6 @@ config NR_CPUS
 config NUMA
 	bool "NUMA Support"
 	select SMP
-	select ACPI_NUMA if ACPI
 	help
 	  Say Y to compile the kernel with NUMA (Non-Uniform Memory Access)
 	  support.  This option improves performance on systems with more
diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig
index 849c2bd820b9..f33194d1e43f 100644
--- a/drivers/acpi/numa/Kconfig
+++ b/drivers/acpi/numa/Kconfig
@@ -1,9 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 config ACPI_NUMA
-	bool "NUMA support"
-	depends on NUMA
-	depends on (X86 || ARM64 || LOONGARCH)
-	default y if ARM64
+	def_bool NUMA && !X86
 
 config ACPI_HMAT
 	bool "ACPI Heterogeneous Memory Attribute Table Support"

From 5f76d4211ee44dcd9b2dc8a9065002bf8ecee81e Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Thu, 13 Jun 2024 16:54:36 +0800
Subject: [PATCH 145/346] ACPI: NUMA: replace pr_info with pr_debug in
 arch_acpi_numa_init

There are lots of ACPI enabled systems that aren't NUMA and If the
firmware didn't provide the SRAT/SLIT, then there will be a message
"Failed to initialise from firmware" from arch_acpi_numa_init() which
adding noise to the boot on all of those kind of systems. Replace the
pr_info with pr_debug in arch_acpi_numa_init() to avoid it.

Suggested-by: Sunil V L <sunilvl@ventanamicro.com>
Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/109354315a02cd22145d2effa4a8c571b69d3e56.1718268003.git.haibo1.xu@intel.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 drivers/base/arch_numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c
index 5b59d133b6af..555aee3ee8e7 100644
--- a/drivers/base/arch_numa.c
+++ b/drivers/base/arch_numa.c
@@ -445,7 +445,7 @@ static int __init arch_acpi_numa_init(void)
 
 	ret = acpi_numa_init();
 	if (ret) {
-		pr_info("Failed to initialise from firmware\n");
+		pr_debug("Failed to initialise from firmware\n");
 		return ret;
 	}
 

From 9a4ab167cfb1dea1df0c0c948205a62c7eb3b85b Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 22 Jul 2024 15:59:28 +0200
Subject: [PATCH 146/346] ALSA: ump: Don't update FB name for static blocks

When a device tries to update the FB name string even if its Endpoint
is declared as static, we should skip it, just already done for the FB
info update reply.

Fixes: 37e0e14128e0 ("ALSA: ump: Support UMP Endpoint and Function Block parsing")
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240722135929.8612-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/ump.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sound/core/ump.c b/sound/core/ump.c
index 3f61220c23b4..b325fcfa77d0 100644
--- a/sound/core/ump.c
+++ b/sound/core/ump.c
@@ -806,6 +806,13 @@ static int ump_handle_fb_name_msg(struct snd_ump_endpoint *ump,
 	if (!fb)
 		return -ENODEV;
 
+	if (ump->parsed &&
+	    (ump->info.flags & SNDRV_UMP_EP_INFO_STATIC_BLOCKS)) {
+		ump_dbg(ump, "Skipping static FB name update (blk#%d)\n",
+			fb->info.block_id);
+		return 0;
+	}
+
 	ret = ump_append_string(ump, fb->info.name, sizeof(fb->info.name),
 				buf->raw, 3);
 	/* notify the FB name update to sequencer, too */

From ac29d8ae05b770ed3f52d7a60908ab9b126f69d7 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 22 Jul 2024 16:06:06 +0200
Subject: [PATCH 147/346] ALSA: ump: Force 1 Group for MIDI1 FBs

When a Function Block declares it being a legacy MIDI1 device, it has
to be only with a single UMP Group.  Correct the attribute when a
device declares it wrongly.

Fixes: 37e0e14128e0 ("ALSA: ump: Support UMP Endpoint and Function Block parsing")
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240722140610.10845-1-tiwai@suse.de
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/ump.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sound/core/ump.c b/sound/core/ump.c
index b325fcfa77d0..0f0d7e895c5a 100644
--- a/sound/core/ump.c
+++ b/sound/core/ump.c
@@ -733,6 +733,12 @@ static void fill_fb_info(struct snd_ump_endpoint *ump,
 		info->block_id, info->direction, info->active,
 		info->first_group, info->num_groups, info->midi_ci_version,
 		info->sysex8_streams, info->flags);
+
+	if ((info->flags & SNDRV_UMP_BLOCK_IS_MIDI1) && info->num_groups != 1) {
+		info->num_groups = 1;
+		ump_dbg(ump, "FB %d: corrected groups to 1 for MIDI1\n",
+			info->block_id);
+	}
 }
 
 /* check whether the FB info gets updated by the current message */

From b6f7d984ebf826069d3dc6fa187b4d1cfb90f965 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan.gerhold@linaro.org>
Date: Mon, 15 Jul 2024 14:15:37 +0200
Subject: [PATCH 148/346] dt-bindings: display: panel: samsung,atna33xc20:
 Document ATNA45AF01

The Samsung ATNA45AF01 panel is an AMOLED eDP panel that has backlight
control over the DP AUX channel. While it works almost correctly with the
generic "edp-panel" compatible, the backlight needs special handling to
work correctly. It is similar to the existing ATNA33XC20 panel, just with
a larger resolution and size.

Add a new "samsung,atna45af01" compatible to describe this panel in the DT.
Use the existing "samsung,atna33xc20" as fallback compatible since existing
drivers should work as-is, given that resolution and size are discoverable
through the eDP link.

Signed-off-by: Stephan Gerhold <stephan.gerhold@linaro.org>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240715-x1e80100-crd-backlight-v2-1-31b7f2f658a3@linaro.org
---
 .../bindings/display/panel/samsung,atna33xc20.yaml        | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml
index 765ca155c83a..5192c93fbd67 100644
--- a/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml
+++ b/Documentation/devicetree/bindings/display/panel/samsung,atna33xc20.yaml
@@ -14,7 +14,13 @@ allOf:
 
 properties:
   compatible:
-    const: samsung,atna33xc20
+    oneOf:
+      # Samsung 13.3" FHD (1920x1080 pixels) eDP AMOLED panel
+      - const: samsung,atna33xc20
+      # Samsung 14.5" WQXGA+ (2880x1800 pixels) eDP AMOLED panel
+      - items:
+          - const: samsung,atna45af01
+          - const: samsung,atna33xc20
 
   enable-gpios: true
   port: true

From facd40aa5c4699f94014012e4e58414c082f2c01 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:19 +0200
Subject: [PATCH 149/346] timers/migration: Do not rely always on group->parent

When reading group->parent without holding the group lock it is racy
against CPUs coming online the first time and thereby creating another
level of the hierarchy. This is not a problem when this value is read once
to decide whether to abort a propagation or not. The worst outcome is an
unnecessary/early CPU wake up. But it is racy when reading it several times
during a single 'action' (like activation, deactivation, checking for
remote timer expiry,...) and relying on the consitency of this value
without holding the lock. This happens at the moment e.g. in
tmigr_inactive_up() which is also calling tmigr_udpate_events(). Code relys
on group->parent not to change during this 'action'.

Update parent struct member description to explain the above only
once. Remove parent pointer checks when they are not mandatory (like update
of data->childmask). Remove a warning, which would be nice but the trigger
of this warning is not reliable and add expand the data structure member
description instead. Expand a comment, why it is safe to rely on parent
pointer here (inside hierarchy update).

Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model")
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-1-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 33 +++++++++++++++------------------
 kernel/time/timer_migration.h | 12 +++++++++++-
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 84413114db5c..d91efe1dc3bf 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -507,7 +507,14 @@ static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc)
  *			(get_next_timer_interrupt())
  * @firstexp:		Contains the first event expiry information when last
  *			active CPU of hierarchy is on the way to idle to make
- *			sure CPU will be back in time.
+ *			sure CPU will be back in time. It is updated in top
+ *			level group only. Be aware, there could occur a new top
+ *			level of the hierarchy between the 'top level call' in
+ *			tmigr_update_events() and the check for the parent group
+ *			in walk_groups(). Then @firstexp might contain a value
+ *			!= KTIME_MAX even if it was not the final top
+ *			level. This is not a problem, as the worst outcome is a
+ *			CPU which might wake up a little early.
  * @evt:		Pointer to tmigr_event which needs to be queued (of idle
  *			child group)
  * @childmask:		childmask of child group
@@ -649,7 +656,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
 
 	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
 
-	if ((walk_done == false) && group->parent)
+	if (walk_done == false)
 		data->childmask = group->childmask;
 
 	/*
@@ -1317,20 +1324,9 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
 	/* Event Handling */
 	tmigr_update_events(group, child, data);
 
-	if (group->parent && (walk_done == false))
+	if (walk_done == false)
 		data->childmask = group->childmask;
 
-	/*
-	 * data->firstexp was set by tmigr_update_events() and contains the
-	 * expiry of the first global event which needs to be handled. It
-	 * differs from KTIME_MAX if:
-	 * - group is the top level group and
-	 * - group is idle (which means CPU was the last active CPU in the
-	 *   hierarchy) and
-	 * - there is a pending event in the hierarchy
-	 */
-	WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent);
-
 	trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
 
 	return walk_done;
@@ -1552,10 +1548,11 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 		data.childmask = child->childmask;
 
 		/*
-		 * There is only one new level per time. When connecting the
-		 * child and the parent and set the child active when the parent
-		 * is inactive, the parent needs to be the uppermost
-		 * level. Otherwise there went something wrong!
+		 * There is only one new level per time (which is protected by
+		 * tmigr_mutex). When connecting the child and the parent and
+		 * set the child active when the parent is inactive, the parent
+		 * needs to be the uppermost level. Otherwise there went
+		 * something wrong!
 		 */
 		WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
 	}
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 6c37d94a37d9..494f68cc13f4 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -22,7 +22,17 @@ struct tmigr_event {
  * struct tmigr_group - timer migration hierarchy group
  * @lock:		Lock protecting the event information and group hierarchy
  *			information during setup
- * @parent:		Pointer to the parent group
+ * @parent:		Pointer to the parent group. Pointer is updated when a
+ *			new hierarchy level is added because of a CPU coming
+ *			online the first time. Once it is set, the pointer will
+ *			not be removed or updated. When accessing parent pointer
+ *			lock less to decide whether to abort a propagation or
+ *			not, it is not a problem. The worst outcome is an
+ *			unnecessary/early CPU wake up. But do not access parent
+ *			pointer several times in the same 'action' (like
+ *			activation, deactivation, check for remote expiry,...)
+ *			without holding the lock as it is not ensured that value
+ *			will not change.
  * @groupevt:		Next event of the group which is only used when the
  *			group is !active. The group event is then queued into
  *			the parent timer queue.

From 10a0e6f3d3db7dcfe36e578923e5f038f1d2b72a Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Wed, 17 Jul 2024 11:49:40 +0200
Subject: [PATCH 150/346] timers/migration: Move hierarchy setup into
 cpuhotplug prepare callback

When a CPU comes online the first time, it is possible that a new top level
group will be created. In general all propagation is done from the bottom
to top. This minimizes complexity and prevents possible races. But when a
new top level group is created, the formely top level group needs to be
connected to the new level. This is the only time, when the direction to
propagate changes is changed: the changes are propagated from top (new top
level group) to bottom (formerly top level group).

This introduces two races (see (A) and (B)) as reported by Frederic:

(A) This race happens, when marking the formely top level group as active,
but the last active CPU of the formerly top level group goes idle. Then
it's likely that formerly group is no longer active, but marked
nevertheless as active in new top level group:

		  [GRP0:0]
	       migrator = 0
	       active   = 0
	       nextevt  = KTIME_MAX
	       /         \
	      0         1 .. 7
	  active         idle

0) Hierarchy has for now only 8 CPUs and CPU 0 is the only active CPU.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
					 \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0              migrator = TMIGR_NONE
	      active   = 0              active   = NONE
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	  active         idle                !online

1) CPU 8 is booting and creates a new group in first level GRP0:1 and
   therefore also a new top group GRP1:0. For now the setup code proceeded
   only until the connected between GRP0:1 to the new top group. The
   connection between CPU8 and GRP0:1 is not yet established and CPU 8 is
   still !online.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0              migrator = TMIGR_NONE
	      active   = 0              active   = NONE
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	  active         idle                !online

2) Setup code now connects GRP0:0 to GRP1:0 and observes while in
   tmigr_connect_child_parent() that GRP0:0 is not TMIGR_NONE. So it
   prepares to call tmigr_active_up() on it. It hasn't done it yet.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = NONE
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE        migrator = TMIGR_NONE
	      active   = NONE              active   = NONE
	      nextevt  = KTIME_MAX         nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	    idle         idle                !online

3) CPU 0 goes idle. Since GRP0:0->parent has been updated by CPU 8 with
   GRP0:0->lock held, CPU 0 observes GRP1:0 after calling
   tmigr_update_events() and it propagates the change to the top (no change
   there and no wakeup programmed since there is no timer).

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE       migrator = TMIGR_NONE
	      active   = NONE             active   = NONE
	      nextevt  = KTIME_MAX        nextevt  = KTIME_MAX
		/         \
	      0          1 .. 7                8
	    idle         idle                !online

4) Now the setup code finally calls tmigr_active_up() to and sets GRP0:0
   active in GRP1:0

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0, GRP0:1
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE       migrator = 8
	      active   = NONE             active   = 8
	      nextevt  = KTIME_MAX        nextevt  = KTIME_MAX
		/         \                    |
	      0          1 .. 7                8
	    idle         idle                active

5) Now CPU 8 is connected with GRP0:1 and CPU 8 calls tmigr_active_up() out
   of tmigr_cpu_online().

			     [GRP1:0]
			migrator = GRP0:0
			active   = GRP0:0
			nextevt  = T8
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE         migrator = TMIGR_NONE
	      active   = NONE               active   = NONE
	      nextevt  = KTIME_MAX          nextevt  = T8
		/         \                    |
	      0          1 .. 7                8
	    idle         idle                  idle

5) CPU 8 goes idle with a timer T8 and relies on GRP0:0 as the migrator.
   But it's not really active, so T8 gets ignored.

--> The update which is done in third step is not noticed by setup code. So
    a wrong migrator is set to top level group and a timer could get
    ignored.

(B) Reading group->parent and group->childmask when an hierarchy update is
ongoing and reaches the formerly top level group is racy as those values
could be inconsistent. (The notation of migrator and active now slightly
changes in contrast to the above example, as now the childmasks are used.)

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
					 \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE     migrator = TMIGR_NONE
	      active   = 0x00           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = NULL		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  idle           idle                !online
	  childmask=1

1) Hierarchy has 8 CPUs. CPU 8 is at the moment in the process of onlining
   but did not yet connect GRP0:0 to GRP1:0.

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = TMIGR_NONE     migrator = TMIGR_NONE
	      active   = 0x00           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  idle           idle                !online
	  childmask=1

2) Setup code (running on CPU 8) now connects GRP0:0 to GRP1:0, updates
   parent pointer of GRP0:0 and ...

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 0		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

3) ... CPU 0 comes active in the same time. As migrator in GRP0:0 was
   TMIGR_NONE, childmask of GRP0:0 is stored in update propagation data
   structure tmigr_walk (as update of childmask is not yet
   visible/updated). And now ...

			     [GRP1:0]
			migrator = TMIGR_NONE
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 2		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

4) ... childmask of GRP0:0 is updated by CPU 8 (still part of setup
   code).

			     [GRP1:0]
			migrator = 0x00
			active   = 0x00
			nextevt  = KTIME_MAX
		       /                  \
		 [GRP0:0]                  [GRP0:1]
	      migrator = 0x01           migrator = TMIGR_NONE
	      active   = 0x01           active   = 0x00
	      nextevt  = KTIME_MAX      nextevt  = KTIME_MAX
	      childmask= 2		childmask= 1
	      parent   = GRP1:0		parent   = GRP1:0
		/         \
	      0          1 .. 7                8
	  active          idle                !online
	  childmask=1

	  tmigr_walk.childmask = 0

5) CPU 0 sees the connection to GRP1:0 and now propagates active state to
   GRP1:0 but with childmask = 0 as stored in propagation data structure.

--> Now GRP1:0 always has a migrator as 0x00 != TMIGR_NONE and for all CPUs
    it looks like GRP1:0 is always active.

To prevent those races, the setup of the hierarchy is moved into the
cpuhotplug prepare callback. The prepare callback is not executed by the
CPU which will come online, it is executed by the CPU which prepares
onlining of the other CPU. This CPU is active while it is connecting the
formerly top level to the new one. This prevents from (A) to happen and it
also prevents from any further walk above the formerly top level until that
active CPU becomes inactive, releasing the new ->parent and ->childmask
updates to be visible by any subsequent walk up above the formerly top
level hierarchy. This prevents from (B) to happen. The direction for the
updates is now forced to look like "from bottom to top".

However if the active CPU prevents from tmigr_cpu_(in)active() to walk up
with the update not-or-half visible, nothing prevents walking up to the new
top with a 0 childmask in tmigr_handle_remote_up() or
tmigr_requires_handle_remote_up() if the active CPU doing the prepare is
not the migrator. But then it looks fine because:

  * tmigr_check_migrator() should just return false
  * The migrator is active and should eventually observe the new childmask
    at some point in a future tick.

Split setup functionality of online callback into the cpuhotplug prepare
callback and setup hotplug state. Change init call into early_initcall() to
make sure an already active CPU prepares everything for newly upcoming
CPUs. Reorder the code, that all prepare related functions are close to
each other and online and offline callbacks are also close together.

Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model")
Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240717094940.18687-1-anna-maria@linutronix.de
---
 include/linux/cpuhotplug.h    |   1 +
 kernel/time/timer_migration.c | 206 ++++++++++++++++++++--------------
 2 files changed, 120 insertions(+), 87 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 7a5785f405b6..df59666a2a66 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -122,6 +122,7 @@ enum cpuhp_state {
 	CPUHP_KVM_PPC_BOOK3S_PREPARE,
 	CPUHP_ZCOMP_PREPARE,
 	CPUHP_TIMERS_PREPARE,
+	CPUHP_TMIGR_PREPARE,
 	CPUHP_MIPS_SOC_PREPARE,
 	CPUHP_BP_PREPARE_DYN,
 	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index d91efe1dc3bf..867f0ecf2e74 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1438,6 +1438,66 @@ u64 tmigr_quick_check(u64 nextevt)
 	return KTIME_MAX;
 }
 
+/*
+ * tmigr_trigger_active() - trigger a CPU to become active again
+ *
+ * This function is executed on a CPU which is part of cpu_online_mask, when the
+ * last active CPU in the hierarchy is offlining. With this, it is ensured that
+ * the other CPU is active and takes over the migrator duty.
+ */
+static long tmigr_trigger_active(void *unused)
+{
+	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+
+	WARN_ON_ONCE(!tmc->online || tmc->idle);
+
+	return 0;
+}
+
+static int tmigr_cpu_offline(unsigned int cpu)
+{
+	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+	int migrator;
+	u64 firstexp;
+
+	raw_spin_lock_irq(&tmc->lock);
+	tmc->online = false;
+	WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+	/*
+	 * CPU has to handle the local events on his own, when on the way to
+	 * offline; Therefore nextevt value is set to KTIME_MAX
+	 */
+	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
+	trace_tmigr_cpu_offline(tmc);
+	raw_spin_unlock_irq(&tmc->lock);
+
+	if (firstexp != KTIME_MAX) {
+		migrator = cpumask_any_but(cpu_online_mask, cpu);
+		work_on_cpu(migrator, tmigr_trigger_active, NULL);
+	}
+
+	return 0;
+}
+
+static int tmigr_cpu_online(unsigned int cpu)
+{
+	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+
+	/* Check whether CPU data was successfully initialized */
+	if (WARN_ON_ONCE(!tmc->tmgroup))
+		return -EINVAL;
+
+	raw_spin_lock_irq(&tmc->lock);
+	trace_tmigr_cpu_online(tmc);
+	tmc->idle = timer_base_is_idle();
+	if (!tmc->idle)
+		__tmigr_cpu_activate(tmc);
+	tmc->online = true;
+	raw_spin_unlock_irq(&tmc->lock);
+	return 0;
+}
+
 static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 			     int node)
 {
@@ -1510,9 +1570,10 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
 }
 
 static void tmigr_connect_child_parent(struct tmigr_group *child,
-				       struct tmigr_group *parent)
+				       struct tmigr_group *parent,
+				       bool activate)
 {
-	union tmigr_state childstate;
+	struct tmigr_walk data;
 
 	raw_spin_lock_irq(&child->lock);
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
@@ -1525,6 +1586,9 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 
 	trace_tmigr_connect_child_parent(child);
 
+	if (!activate)
+		return;
+
 	/*
 	 * To prevent inconsistent states, active children need to be active in
 	 * the new parent as well. Inactive children are already marked inactive
@@ -1540,22 +1604,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	 *   child to the new parent. So tmigr_connect_child_parent() is
 	 *   executed with the formerly top level group (child) and the newly
 	 *   created group (parent).
+	 *
+	 * * It is ensured that the child is active, as this setup path is
+	 *   executed in hotplug prepare callback. This is exectued by an
+	 *   already connected and !idle CPU. Even if all other CPUs go idle,
+	 *   the CPU executing the setup will be responsible up to current top
+	 *   level group. And the next time it goes inactive, it will release
+	 *   the new childmask and parent to subsequent walkers through this
+	 *   @child. Therefore propagate active state unconditionally.
 	 */
-	childstate.state = atomic_read(&child->migr_state);
-	if (childstate.migrator != TMIGR_NONE) {
-		struct tmigr_walk data;
+	data.childmask = child->childmask;
 
-		data.childmask = child->childmask;
-
-		/*
-		 * There is only one new level per time (which is protected by
-		 * tmigr_mutex). When connecting the child and the parent and
-		 * set the child active when the parent is inactive, the parent
-		 * needs to be the uppermost level. Otherwise there went
-		 * something wrong!
-		 */
-		WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
-	}
+	/*
+	 * There is only one new level per time (which is protected by
+	 * tmigr_mutex). When connecting the child and the parent and set the
+	 * child active when the parent is inactive, the parent needs to be the
+	 * uppermost level. Otherwise there went something wrong!
+	 */
+	WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
 }
 
 static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
@@ -1608,7 +1674,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 		 * Update tmc -> group / child -> group connection
 		 */
 		if (i == 0) {
-			struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
+			struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
 
 			raw_spin_lock_irq(&group->lock);
 
@@ -1623,7 +1689,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 			continue;
 		} else {
 			child = stack[i - 1];
-			tmigr_connect_child_parent(child, group);
+			/* Will be activated at online time */
+			tmigr_connect_child_parent(child, group, false);
 		}
 
 		/* check if uppermost level was newly created */
@@ -1634,12 +1701,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 
 		lvllist = &tmigr_level_list[top];
 		if (group->num_children == 1 && list_is_singular(lvllist)) {
+			/*
+			 * The target CPU must never do the prepare work, except
+			 * on early boot when the boot CPU is the target. Otherwise
+			 * it may spuriously activate the old top level group inside
+			 * the new one (nevertheless whether old top level group is
+			 * active or not) and/or release an uninitialized childmask.
+			 */
+			WARN_ON_ONCE(cpu == raw_smp_processor_id());
+
 			lvllist = &tmigr_level_list[top - 1];
 			list_for_each_entry(child, lvllist, list) {
 				if (child->parent)
 					continue;
 
-				tmigr_connect_child_parent(child, group);
+				tmigr_connect_child_parent(child, group, true);
 			}
 		}
 	}
@@ -1661,80 +1737,31 @@ static int tmigr_add_cpu(unsigned int cpu)
 	return ret;
 }
 
-static int tmigr_cpu_online(unsigned int cpu)
+static int tmigr_cpu_prepare(unsigned int cpu)
 {
-	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
-	int ret;
+	struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
+	int ret = 0;
 
-	/* First online attempt? Initialize CPU data */
-	if (!tmc->tmgroup) {
-		raw_spin_lock_init(&tmc->lock);
+	/* Not first online attempt? */
+	if (tmc->tmgroup)
+		return ret;
 
-		ret = tmigr_add_cpu(cpu);
-		if (ret < 0)
-			return ret;
-
-		if (tmc->childmask == 0)
-			return -EINVAL;
-
-		timerqueue_init(&tmc->cpuevt.nextevt);
-		tmc->cpuevt.nextevt.expires = KTIME_MAX;
-		tmc->cpuevt.ignore = true;
-		tmc->cpuevt.cpu = cpu;
-
-		tmc->remote = false;
-		WRITE_ONCE(tmc->wakeup, KTIME_MAX);
-	}
-	raw_spin_lock_irq(&tmc->lock);
-	trace_tmigr_cpu_online(tmc);
-	tmc->idle = timer_base_is_idle();
-	if (!tmc->idle)
-		__tmigr_cpu_activate(tmc);
-	tmc->online = true;
-	raw_spin_unlock_irq(&tmc->lock);
-	return 0;
-}
-
-/*
- * tmigr_trigger_active() - trigger a CPU to become active again
- *
- * This function is executed on a CPU which is part of cpu_online_mask, when the
- * last active CPU in the hierarchy is offlining. With this, it is ensured that
- * the other CPU is active and takes over the migrator duty.
- */
-static long tmigr_trigger_active(void *unused)
-{
-	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
-
-	WARN_ON_ONCE(!tmc->online || tmc->idle);
-
-	return 0;
-}
-
-static int tmigr_cpu_offline(unsigned int cpu)
-{
-	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
-	int migrator;
-	u64 firstexp;
-
-	raw_spin_lock_irq(&tmc->lock);
-	tmc->online = false;
+	raw_spin_lock_init(&tmc->lock);
+	timerqueue_init(&tmc->cpuevt.nextevt);
+	tmc->cpuevt.nextevt.expires = KTIME_MAX;
+	tmc->cpuevt.ignore = true;
+	tmc->cpuevt.cpu = cpu;
+	tmc->remote = false;
 	WRITE_ONCE(tmc->wakeup, KTIME_MAX);
 
-	/*
-	 * CPU has to handle the local events on his own, when on the way to
-	 * offline; Therefore nextevt value is set to KTIME_MAX
-	 */
-	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
-	trace_tmigr_cpu_offline(tmc);
-	raw_spin_unlock_irq(&tmc->lock);
+	ret = tmigr_add_cpu(cpu);
+	if (ret < 0)
+		return ret;
 
-	if (firstexp != KTIME_MAX) {
-		migrator = cpumask_any_but(cpu_online_mask, cpu);
-		work_on_cpu(migrator, tmigr_trigger_active, NULL);
-	}
+	if (tmc->childmask == 0)
+		return -EINVAL;
 
-	return 0;
+	return ret;
 }
 
 static int __init tmigr_init(void)
@@ -1793,6 +1820,11 @@ static int __init tmigr_init(void)
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
 		tmigr_crossnode_level);
 
+	ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
+				tmigr_cpu_prepare, NULL);
+	if (ret)
+		goto err;
+
 	ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
 				tmigr_cpu_online, tmigr_cpu_offline);
 	if (ret)
@@ -1804,4 +1836,4 @@ static int __init tmigr_init(void)
 	pr_err("Timer migration setup failed\n");
 	return ret;
 }
-late_initcall(tmigr_init);
+early_initcall(tmigr_init);

From 92506741521fd09dfaa9d6ef3c3620a9dd6bbafd Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:21 +0200
Subject: [PATCH 151/346] timers/migration: Improve tracing

Trace points of inactive and active propagation are located at the end of
the related functions. The interesting information of those trace points is
the updated group state. When trace points are not located directly at the
place where group state changed, order of trace points in traces could be
confusing.

Move inactive and active propagation trace points directly after update of
group state values.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-3-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 867f0ecf2e74..4fbd9304e896 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -656,6 +656,8 @@ static bool tmigr_active_up(struct tmigr_group *group,
 
 	} while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state));
 
+	trace_tmigr_group_set_cpu_active(group, newstate, childmask);
+
 	if (walk_done == false)
 		data->childmask = group->childmask;
 
@@ -673,8 +675,6 @@ static bool tmigr_active_up(struct tmigr_group *group,
 	 */
 	group->groupevt.ignore = true;
 
-	trace_tmigr_group_set_cpu_active(group, newstate, childmask);
-
 	return walk_done;
 }
 
@@ -1306,9 +1306,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
 
 		WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active));
 
-		if (atomic_try_cmpxchg(&group->migr_state, &curstate.state,
-				       newstate.state))
+		if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) {
+			trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
 			break;
+		}
 
 		/*
 		 * The memory barrier is paired with the cmpxchg() in
@@ -1327,8 +1328,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
 	if (walk_done == false)
 		data->childmask = group->childmask;
 
-	trace_tmigr_group_set_cpu_inactive(group, newstate, childmask);
-
 	return walk_done;
 }
 

From 3ba111032bc1d8a0f04e6d2a5d8fb4ddc96eeae7 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:22 +0200
Subject: [PATCH 152/346] timers/migration: Use a single struct for hierarchy
 walk data

Two different structs are defined for propagating data from one to another
level when walking the hierarchy. Several struct members exist in both
structs which makes generalization harder.

Merge those two structs into a single one and use it directly in
walk_groups() and the corresponding function pointers instead of
introducing pointer casting all over the place.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-4-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 126 +++++++++++++++-------------------
 1 file changed, 55 insertions(+), 71 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 4fbd9304e896..9f0c284c440f 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
 	return bitmap_weight(&active, BIT_CNT) <= 1;
 }
 
-typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *);
+/**
+ * struct tmigr_walk - data required for walking the hierarchy
+ * @nextexp:		Next CPU event expiry information which is handed into
+ *			the timer migration code by the timer code
+ *			(get_next_timer_interrupt())
+ * @firstexp:		Contains the first event expiry information when
+ *			hierarchy is completely idle.  When CPU itself was the
+ *			last going idle, information makes sure, that CPU will
+ *			be back in time. When using this value in the remote
+ *			expiry case, firstexp is stored in the per CPU tmigr_cpu
+ *			struct of CPU which expires remote timers. It is updated
+ *			in top level group only. Be aware, there could occur a
+ *			new top level of the hierarchy between the 'top level
+ *			call' in tmigr_update_events() and the check for the
+ *			parent group in walk_groups(). Then @firstexp might
+ *			contain a value != KTIME_MAX even if it was not the
+ *			final top level. This is not a problem, as the worst
+ *			outcome is a CPU which might wake up a little early.
+ * @evt:		Pointer to tmigr_event which needs to be queued (of idle
+ *			child group)
+ * @childmask:		childmask of child group
+ * @remote:		Is set, when the new timer path is executed in
+ *			tmigr_handle_remote_cpu()
+ * @basej:		timer base in jiffies
+ * @now:		timer base monotonic
+ * @check:		is set if there is the need to handle remote timers;
+ *			required in tmigr_requires_handle_remote() only
+ * @tmc_active:		this flag indicates, whether the CPU which triggers
+ *			the hierarchy walk is !idle in the timer migration
+ *			hierarchy. When the CPU is idle and the whole hierarchy is
+ *			idle, only the first event of the top level has to be
+ *			considered.
+ */
+struct tmigr_walk {
+	u64			nextexp;
+	u64			firstexp;
+	struct tmigr_event	*evt;
+	u8			childmask;
+	bool			remote;
+	unsigned long		basej;
+	u64			now;
+	bool			check;
+	bool			tmc_active;
+};
 
-static void __walk_groups(up_f up, void *data,
+typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
+
+static void __walk_groups(up_f up, struct tmigr_walk *data,
 			  struct tmigr_cpu *tmc)
 {
 	struct tmigr_group *child = NULL, *group = tmc->tmgroup;
@@ -493,68 +538,13 @@ static void __walk_groups(up_f up, void *data,
 	} while (group);
 }
 
-static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc)
+static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
 {
 	lockdep_assert_held(&tmc->lock);
 
 	__walk_groups(up, data, tmc);
 }
 
-/**
- * struct tmigr_walk - data required for walking the hierarchy
- * @nextexp:		Next CPU event expiry information which is handed into
- *			the timer migration code by the timer code
- *			(get_next_timer_interrupt())
- * @firstexp:		Contains the first event expiry information when last
- *			active CPU of hierarchy is on the way to idle to make
- *			sure CPU will be back in time. It is updated in top
- *			level group only. Be aware, there could occur a new top
- *			level of the hierarchy between the 'top level call' in
- *			tmigr_update_events() and the check for the parent group
- *			in walk_groups(). Then @firstexp might contain a value
- *			!= KTIME_MAX even if it was not the final top
- *			level. This is not a problem, as the worst outcome is a
- *			CPU which might wake up a little early.
- * @evt:		Pointer to tmigr_event which needs to be queued (of idle
- *			child group)
- * @childmask:		childmask of child group
- * @remote:		Is set, when the new timer path is executed in
- *			tmigr_handle_remote_cpu()
- */
-struct tmigr_walk {
-	u64			nextexp;
-	u64			firstexp;
-	struct tmigr_event	*evt;
-	u8			childmask;
-	bool			remote;
-};
-
-/**
- * struct tmigr_remote_data - data required for remote expiry hierarchy walk
- * @basej:		timer base in jiffies
- * @now:		timer base monotonic
- * @firstexp:		returns expiry of the first timer in the idle timer
- *			migration hierarchy to make sure the timer is handled in
- *			time; it is stored in the per CPU tmigr_cpu struct of
- *			CPU which expires remote timers
- * @childmask:		childmask of child group
- * @check:		is set if there is the need to handle remote timers;
- *			required in tmigr_requires_handle_remote() only
- * @tmc_active:		this flag indicates, whether the CPU which triggers
- *			the hierarchy walk is !idle in the timer migration
- *			hierarchy. When the CPU is idle and the whole hierarchy is
- *			idle, only the first event of the top level has to be
- *			considered.
- */
-struct tmigr_remote_data {
-	unsigned long	basej;
-	u64		now;
-	u64		firstexp;
-	u8		childmask;
-	bool		check;
-	bool		tmc_active;
-};
-
 /*
  * Returns the next event of the timerqueue @group->events
  *
@@ -625,10 +615,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group)
 
 static bool tmigr_active_up(struct tmigr_group *group,
 			    struct tmigr_group *child,
-			    void *ptr)
+			    struct tmigr_walk *data)
 {
 	union tmigr_state curstate, newstate;
-	struct tmigr_walk *data = ptr;
 	bool walk_done;
 	u8 childmask;
 
@@ -867,10 +856,8 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
 
 static bool tmigr_new_timer_up(struct tmigr_group *group,
 			       struct tmigr_group *child,
-			       void *ptr)
+			       struct tmigr_walk *data)
 {
-	struct tmigr_walk *data = ptr;
-
 	return tmigr_update_events(group, child, data);
 }
 
@@ -1002,9 +989,8 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
 
 static bool tmigr_handle_remote_up(struct tmigr_group *group,
 				   struct tmigr_group *child,
-				   void *ptr)
+				   struct tmigr_walk *data)
 {
-	struct tmigr_remote_data *data = ptr;
 	struct tmigr_event *evt;
 	unsigned long jif;
 	u8 childmask;
@@ -1062,7 +1048,7 @@ static bool tmigr_handle_remote_up(struct tmigr_group *group,
 void tmigr_handle_remote(void)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
-	struct tmigr_remote_data data;
+	struct tmigr_walk data;
 
 	if (tmigr_is_not_available(tmc))
 		return;
@@ -1104,9 +1090,8 @@ void tmigr_handle_remote(void)
 
 static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
 					    struct tmigr_group *child,
-					    void *ptr)
+					    struct tmigr_walk *data)
 {
-	struct tmigr_remote_data *data = ptr;
 	u8 childmask;
 
 	childmask = data->childmask;
@@ -1164,7 +1149,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
 bool tmigr_requires_handle_remote(void)
 {
 	struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
-	struct tmigr_remote_data data;
+	struct tmigr_walk data;
 	unsigned long jif;
 	bool ret = false;
 
@@ -1252,10 +1237,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp)
 
 static bool tmigr_inactive_up(struct tmigr_group *group,
 			      struct tmigr_group *child,
-			      void *ptr)
+			      struct tmigr_walk *data)
 {
 	union tmigr_state curstate, newstate, childstate;
-	struct tmigr_walk *data = ptr;
 	bool walk_done;
 	u8 childmask;
 

From d47be589844224a3ef13b55ff6f15211ab20f1d1 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:23 +0200
Subject: [PATCH 153/346] timers/migration: Read childmask and parent pointer
 in a single place

Reading the childmask and parent pointer is required when propagating
changes through the hierarchy. At the moment this reads are spread all over
the place which makes it harder to follow.

Move those reads to a single place to keep code clean.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-5-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 9f0c284c440f..f5652b0aa90e 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -535,6 +535,7 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 
 		child = group;
 		group = group->parent;
+		data->childmask = child->childmask;
 	} while (group);
 }
 
@@ -647,9 +648,6 @@ static bool tmigr_active_up(struct tmigr_group *group,
 
 	trace_tmigr_group_set_cpu_active(group, newstate, childmask);
 
-	if (walk_done == false)
-		data->childmask = group->childmask;
-
 	/*
 	 * The group is active (again). The group event might be still queued
 	 * into the parent group's timerqueue but can now be handled by the
@@ -1027,12 +1025,10 @@ static bool tmigr_handle_remote_up(struct tmigr_group *group,
 	}
 
 	/*
-	 * Update of childmask for the next level and keep track of the expiry
-	 * of the first event that needs to be handled (group->next_expiry was
-	 * updated by tmigr_next_expired_groupevt(), next was set by
-	 * tmigr_handle_remote_cpu()).
+	 * Keep track of the expiry of the first event that needs to be handled
+	 * (group->next_expiry was updated by tmigr_next_expired_groupevt(),
+	 * next was set by tmigr_handle_remote_cpu()).
 	 */
-	data->childmask = group->childmask;
 	data->firstexp = group->next_expiry;
 
 	raw_spin_unlock_irq(&group->lock);
@@ -1110,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
 	 * group before reading the next_expiry value.
 	 */
 	if (group->parent && !data->tmc_active)
-		goto out;
+		return false;
 
 	/*
 	 * The lock is required on 32bit architectures to read the variable
@@ -1135,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
 		raw_spin_unlock(&group->lock);
 	}
 
-out:
-	/* Update of childmask for the next level */
-	data->childmask = group->childmask;
 	return false;
 }
 
@@ -1309,9 +1302,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group,
 	/* Event Handling */
 	tmigr_update_events(group, child, data);
 
-	if (walk_done == false)
-		data->childmask = group->childmask;
-
 	return walk_done;
 }
 

From 835a9a67f54f01033417a254e53a1391f99db708 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:24 +0200
Subject: [PATCH 154/346] timers/migration: Rename childmask by groupmask to
 make naming more obvious

childmask in the group reflects the mask that is required to 'reference'
this group in the parent. When reading childmask, this might be confusing,
as this suggests, that this is the mask of the child of the group.

Clarify this by renaming childmask in the tmigr_group and tmc_group by
groupmask.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-6-757baa7803fe@linutronix.de
---
 include/trace/events/timer_migration.h | 16 ++++++++--------
 kernel/time/timer_migration.c          | 24 ++++++++++++------------
 kernel/time/timer_migration.h          | 15 +++++++--------
 3 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h
index 79f19e76a80b..47db5eaf2f9a 100644
--- a/include/trace/events/timer_migration.h
+++ b/include/trace/events/timer_migration.h
@@ -43,7 +43,7 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__field( unsigned int,	lvl		)
 		__field( unsigned int,	numa_node	)
 		__field( unsigned int,	num_children	)
-		__field( u32,		childmask	)
+		__field( u32,		groupmask	)
 	),
 
 	TP_fast_assign(
@@ -52,11 +52,11 @@ TRACE_EVENT(tmigr_connect_child_parent,
 		__entry->lvl		= child->parent->level;
 		__entry->numa_node	= child->parent->numa_node;
 		__entry->num_children	= child->parent->num_children;
-		__entry->childmask	= child->childmask;
+		__entry->groupmask	= child->groupmask;
 	),
 
-	TP_printk("group=%p childmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
-		  __entry->child,  __entry->childmask, __entry->parent,
+	TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+		  __entry->child,  __entry->groupmask, __entry->parent,
 		  __entry->lvl, __entry->numa_node, __entry->num_children)
 );
 
@@ -72,7 +72,7 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__field( unsigned int,	lvl		)
 		__field( unsigned int,	numa_node	)
 		__field( unsigned int,	num_children	)
-		__field( u32,		childmask	)
+		__field( u32,		groupmask	)
 	),
 
 	TP_fast_assign(
@@ -81,11 +81,11 @@ TRACE_EVENT(tmigr_connect_cpu_parent,
 		__entry->lvl		= tmc->tmgroup->level;
 		__entry->numa_node	= tmc->tmgroup->numa_node;
 		__entry->num_children	= tmc->tmgroup->num_children;
-		__entry->childmask	= tmc->childmask;
+		__entry->groupmask	= tmc->groupmask;
 	),
 
-	TP_printk("cpu=%d childmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
-		  __entry->cpu,	 __entry->childmask, __entry->parent,
+	TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d",
+		  __entry->cpu,	 __entry->groupmask, __entry->parent,
 		  __entry->lvl, __entry->numa_node, __entry->num_children)
 );
 
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index f5652b0aa90e..ca76120ee7c6 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -495,7 +495,7 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
  *			outcome is a CPU which might wake up a little early.
  * @evt:		Pointer to tmigr_event which needs to be queued (of idle
  *			child group)
- * @childmask:		childmask of child group
+ * @childmask:		groupmask of child group
  * @remote:		Is set, when the new timer path is executed in
  *			tmigr_handle_remote_cpu()
  * @basej:		timer base in jiffies
@@ -535,7 +535,7 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 
 		child = group;
 		group = group->parent;
-		data->childmask = child->childmask;
+		data->childmask = child->groupmask;
 	} while (group);
 }
 
@@ -669,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc)
 {
 	struct tmigr_walk data;
 
-	data.childmask = tmc->childmask;
+	data.childmask = tmc->groupmask;
 
 	trace_tmigr_cpu_active(tmc);
 
@@ -1049,7 +1049,7 @@ void tmigr_handle_remote(void)
 	if (tmigr_is_not_available(tmc))
 		return;
 
-	data.childmask = tmc->childmask;
+	data.childmask = tmc->groupmask;
 	data.firstexp = KTIME_MAX;
 
 	/*
@@ -1057,7 +1057,7 @@ void tmigr_handle_remote(void)
 	 * in tmigr_handle_remote_up() anyway. Keep this check to speed up the
 	 * return when nothing has to be done.
 	 */
-	if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) {
+	if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) {
 		/*
 		 * If this CPU was an idle migrator, make sure to clear its wakeup
 		 * value so it won't chase timers that have already expired elsewhere.
@@ -1150,7 +1150,7 @@ bool tmigr_requires_handle_remote(void)
 		return ret;
 
 	data.now = get_jiffies_update(&jif);
-	data.childmask = tmc->childmask;
+	data.childmask = tmc->groupmask;
 	data.firstexp = KTIME_MAX;
 	data.tmc_active = !tmc->idle;
 	data.check = false;
@@ -1310,7 +1310,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp)
 	struct tmigr_walk data = { .nextexp = nextexp,
 				   .firstexp = KTIME_MAX,
 				   .evt = &tmc->cpuevt,
-				   .childmask = tmc->childmask };
+				   .childmask = tmc->groupmask };
 
 	/*
 	 * If nextexp is KTIME_MAX, the CPU event will be ignored because the
@@ -1388,7 +1388,7 @@ u64 tmigr_quick_check(u64 nextevt)
 	if (WARN_ON_ONCE(tmc->idle))
 		return nextevt;
 
-	if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask))
+	if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask))
 		return KTIME_MAX;
 
 	do {
@@ -1552,7 +1552,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
 	child->parent = parent;
-	child->childmask = BIT(parent->num_children++);
+	child->groupmask = BIT(parent->num_children++);
 
 	raw_spin_unlock(&parent->lock);
 	raw_spin_unlock_irq(&child->lock);
@@ -1586,7 +1586,7 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	 *   the new childmask and parent to subsequent walkers through this
 	 *   @child. Therefore propagate active state unconditionally.
 	 */
-	data.childmask = child->childmask;
+	data.childmask = child->groupmask;
 
 	/*
 	 * There is only one new level per time (which is protected by
@@ -1652,7 +1652,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 			raw_spin_lock_irq(&group->lock);
 
 			tmc->tmgroup = group;
-			tmc->childmask = BIT(group->num_children++);
+			tmc->groupmask = BIT(group->num_children++);
 
 			raw_spin_unlock_irq(&group->lock);
 
@@ -1731,7 +1731,7 @@ static int tmigr_cpu_prepare(unsigned int cpu)
 	if (ret < 0)
 		return ret;
 
-	if (tmc->childmask == 0)
+	if (tmc->groupmask == 0)
 		return -EINVAL;
 
 	return ret;
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 494f68cc13f4..154accc7a543 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -51,9 +51,8 @@ struct tmigr_event {
  * @num_children:	Counter of group children to make sure the group is only
  *			filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
  *			only
- * @childmask:		childmask of the group in the parent group; is set
- *			during setup and will never change; can be read
- *			lockless
+ * @groupmask:		mask of the group in the parent group; is set during
+ *			setup and will never change; can be read lockless
  * @list:		List head that is added to the per level
  *			tmigr_level_list; is required during setup when a
  *			new group needs to be connected to the existing
@@ -69,7 +68,7 @@ struct tmigr_group {
 	unsigned int		level;
 	int			numa_node;
 	unsigned int		num_children;
-	u8			childmask;
+	u8			groupmask;
 	struct list_head	list;
 };
 
@@ -89,7 +88,7 @@ struct tmigr_group {
  *			hierarchy
  * @remote:		Is set when timers of the CPU are expired remotely
  * @tmgroup:		Pointer to the parent group
- * @childmask:		childmask of tmigr_cpu in the parent group
+ * @groupmask:		mask of tmigr_cpu in the parent group
  * @wakeup:		Stores the first timer when the timer migration
  *			hierarchy is completely idle and remote expiry was done;
  *			is returned to timer code in the idle path and is only
@@ -102,7 +101,7 @@ struct tmigr_cpu {
 	bool			idle;
 	bool			remote;
 	struct tmigr_group	*tmgroup;
-	u8			childmask;
+	u8			groupmask;
 	u64			wakeup;
 	struct tmigr_event	cpuevt;
 };
@@ -118,8 +117,8 @@ union tmigr_state {
 	u32 state;
 	/**
 	 * struct - split state of tmigr_group
-	 * @active:	Contains each childmask bit of the active children
-	 * @migrator:	Contains childmask of the child which is migrator
+	 * @active:	Contains each mask bit of the active children
+	 * @migrator:	Contains mask of the child which is migrator
 	 * @seq:	Sequence counter needs to be increased when an update
 	 *		to the tmigr_state is done. It prevents a race when
 	 *		updates in the child groups are propagated in changed

From 2367e28e231af05243b92325de9a38956ad0b565 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:25 +0200
Subject: [PATCH 155/346] timers/migration: Spare write when nothing changed

The wakeup value is written unconditionally in tmigr_cpu_new_timer(). When
there was no new next timer expiry that needs to be propagated, then the
value that was read before is written. This is not required.

Move the write to the place where wakeup value is changed changed.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-7-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index ca76120ee7c6..9c15ae8d8a25 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1215,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp)
 		if (nextexp != tmc->cpuevt.nextevt.expires ||
 		    tmc->cpuevt.ignore) {
 			ret = tmigr_new_timer(tmc, nextexp);
+			/*
+			 * Make sure the reevaluation of timers in idle path
+			 * will not miss an event.
+			 */
+			WRITE_ONCE(tmc->wakeup, ret);
 		}
 	}
-	/*
-	 * Make sure the reevaluation of timers in idle path will not miss an
-	 * event.
-	 */
-	WRITE_ONCE(tmc->wakeup, ret);
-
 	trace_tmigr_cpu_new_timer_idle(tmc, nextexp);
 	raw_spin_unlock(&tmc->lock);
 	return ret;

From f004bf9de057004f7ccea4239317aec2fbd8240b Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Tue, 16 Jul 2024 16:19:26 +0200
Subject: [PATCH 156/346] timers/migration: Fix grammar in comment

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20240716-tmigr-fixes-v4-8-757baa7803fe@linutronix.de
---
 kernel/time/timer_migration.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 9c15ae8d8a25..8d57f7686bb0 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1368,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp)
  *			  the only one in the level 0 group; and if it is the
  *			  only one in level 0 group, but there are more than a
  *			  single group active on the way to top level)
- * * nextevt		- when CPU is offline and has to handle timer on his own
+ * * nextevt		- when CPU is offline and has to handle timer on its own
  *			  or when on the way to top in every group only a single
  *			  child is active but @nextevt is before the lowest
  *			  next_expiry encountered while walking up to top level.

From 3d8d459c8ba282a67df3b518c5f80bcb217a5b38 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 17 Jul 2024 09:54:38 +0100
Subject: [PATCH 157/346] RISC-V: hwprobe: sort EXT_KEY()s in
 hwprobe_isa_ext0() alphabetically
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the entries appear to be in a random order (although according
to Palmer he has tried to sort them by key value) which makes it harder
to find entries in a growing list, and more likely to have conflicts as
all patches are adding to the end of the list. Sort them alphabetically
instead.

Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Clément Léger <cleger@rivosinc.com>
Link: https://lore.kernel.org/r/20240717-dedicate-squeamish-7e4ab54df58f@spud
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/sys_hwprobe.c | 43 ++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 685594769535..8a1c9ce170e8 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -93,44 +93,45 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
 		 * regardless of the kernel's configuration, as no other checks, besides
 		 * presence in the hart_isa bitmap, are made.
 		 */
+		EXT_KEY(ZACAS);
+		EXT_KEY(ZAWRS);
 		EXT_KEY(ZBA);
 		EXT_KEY(ZBB);
-		EXT_KEY(ZBS);
-		EXT_KEY(ZICBOZ);
 		EXT_KEY(ZBC);
-
 		EXT_KEY(ZBKB);
 		EXT_KEY(ZBKC);
 		EXT_KEY(ZBKX);
+		EXT_KEY(ZBS);
+		EXT_KEY(ZCA);
+		EXT_KEY(ZCB);
+		EXT_KEY(ZCMOP);
+		EXT_KEY(ZICBOZ);
+		EXT_KEY(ZICOND);
+		EXT_KEY(ZIHINTNTL);
+		EXT_KEY(ZIHINTPAUSE);
+		EXT_KEY(ZIMOP);
 		EXT_KEY(ZKND);
 		EXT_KEY(ZKNE);
 		EXT_KEY(ZKNH);
 		EXT_KEY(ZKSED);
 		EXT_KEY(ZKSH);
 		EXT_KEY(ZKT);
-		EXT_KEY(ZIHINTNTL);
 		EXT_KEY(ZTSO);
-		EXT_KEY(ZACAS);
-		EXT_KEY(ZICOND);
-		EXT_KEY(ZIHINTPAUSE);
-		EXT_KEY(ZIMOP);
-		EXT_KEY(ZCA);
-		EXT_KEY(ZCB);
-		EXT_KEY(ZCMOP);
-		EXT_KEY(ZAWRS);
 
 		/*
 		 * All the following extensions must depend on the kernel
 		 * support of V.
 		 */
 		if (has_vector()) {
-			EXT_KEY(ZVE32X);
-			EXT_KEY(ZVE32F);
-			EXT_KEY(ZVE64X);
-			EXT_KEY(ZVE64F);
-			EXT_KEY(ZVE64D);
 			EXT_KEY(ZVBB);
 			EXT_KEY(ZVBC);
+			EXT_KEY(ZVE32F);
+			EXT_KEY(ZVE32X);
+			EXT_KEY(ZVE64D);
+			EXT_KEY(ZVE64F);
+			EXT_KEY(ZVE64X);
+			EXT_KEY(ZVFH);
+			EXT_KEY(ZVFHMIN);
 			EXT_KEY(ZVKB);
 			EXT_KEY(ZVKG);
 			EXT_KEY(ZVKNED);
@@ -139,16 +140,14 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
 			EXT_KEY(ZVKSED);
 			EXT_KEY(ZVKSH);
 			EXT_KEY(ZVKT);
-			EXT_KEY(ZVFH);
-			EXT_KEY(ZVFHMIN);
 		}
 
 		if (has_fpu()) {
-			EXT_KEY(ZFH);
-			EXT_KEY(ZFHMIN);
-			EXT_KEY(ZFA);
 			EXT_KEY(ZCD);
 			EXT_KEY(ZCF);
+			EXT_KEY(ZFA);
+			EXT_KEY(ZFH);
+			EXT_KEY(ZFHMIN);
 		}
 #undef EXT_KEY
 	}

From 82b461680651ac452fa773f271000ddbc4e4db14 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 17 Jul 2024 14:29:24 +0100
Subject: [PATCH 158/346] RISC-V: run savedefconfig for defconfig

It's been a while since this was run, and there's a few things that have
changed. Firstly, almost all of the Renesas stuff vanishes because the
config for the RZ/Five is gated behind NONPORTABLE. Several options
(like CONFIG_PM) are removed as they are the default values.

To retain DEFVFREQ_THERMAL and BLK_DEV_THROTTLING, add PM_DEVFREQ and
BLK_CGROUP respectively.

Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240717-shrubs-concise-51600886babf@spud
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/configs/defconfig | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 3f1f055866af..0d678325444f 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -7,6 +7,7 @@ CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
 CONFIG_CGROUPS=y
 CONFIG_MEMCG=y
+CONFIG_BLK_CGROUP=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_CFS_BANDWIDTH=y
 CONFIG_RT_GROUP_SCHED=y
@@ -35,9 +36,6 @@ CONFIG_ARCH_THEAD=y
 CONFIG_ARCH_VIRT=y
 CONFIG_ARCH_CANAAN=y
 CONFIG_SMP=y
-CONFIG_HOTPLUG_CPU=y
-CONFIG_PM=y
-CONFIG_CPU_IDLE=y
 CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_STAT=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=m
@@ -52,13 +50,11 @@ CONFIG_ACPI=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_SPARSEMEM_MANUAL=y
 CONFIG_BLK_DEV_THROTTLING=y
+CONFIG_SPARSEMEM_MANUAL=y
 CONFIG_NET=y
 CONFIG_PACKET=y
-CONFIG_UNIX=y
 CONFIG_XFRM_USER=m
-CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_ADVANCED_ROUTER=y
 CONFIG_IP_PNP=y
@@ -102,9 +98,9 @@ CONFIG_NET_SCHED=y
 CONFIG_NET_CLS_CGROUP=m
 CONFIG_NETLINK_DIAG=y
 CONFIG_CGROUP_NET_PRIO=y
+CONFIG_CAN=m
 CONFIG_NET_9P=y
 CONFIG_NET_9P_VIRTIO=y
-CONFIG_CAN=m
 CONFIG_PCI=y
 CONFIG_PCIEPORTBUS=y
 CONFIG_PCI_HOST_GENERIC=y
@@ -153,8 +149,8 @@ CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_8250_DW=y
 CONFIG_SERIAL_OF_PLATFORM=y
-CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_EARLYCON_RISCV_SBI=y
+CONFIG_SERIAL_SH_SCI=y
 CONFIG_VIRTIO_CONSOLE=y
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_VIRTIO=y
@@ -179,7 +175,6 @@ CONFIG_DEVFREQ_THERMAL=y
 CONFIG_RZG2L_THERMAL=y
 CONFIG_WATCHDOG=y
 CONFIG_SUNXI_WATCHDOG=y
-CONFIG_RENESAS_RZG2LWDT=y
 CONFIG_MFD_AXP20X_I2C=y
 CONFIG_REGULATOR=y
 CONFIG_REGULATOR_FIXED_VOLTAGE=y
@@ -193,11 +188,9 @@ CONFIG_DRM_NOUVEAU=m
 CONFIG_DRM_SUN4I=m
 CONFIG_DRM_VIRTIO_GPU=m
 CONFIG_FB=y
-CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_SOUND=y
 CONFIG_SND=y
 CONFIG_SND_SOC=y
-CONFIG_SND_SOC_RZ=m
 CONFIG_SND_DESIGNWARE_I2S=m
 CONFIG_SND_SOC_STARFIVE=m
 CONFIG_SND_SOC_JH7110_PWMDAC=m
@@ -239,34 +232,31 @@ CONFIG_USB_CONFIGFS_F_FS=y
 CONFIG_MMC=y
 CONFIG_MMC_SDHCI=y
 CONFIG_MMC_SDHCI_PLTFM=y
-CONFIG_MMC_SDHCI_CADENCE=y
 CONFIG_MMC_SDHCI_OF_DWCMSHC=y
+CONFIG_MMC_SDHCI_CADENCE=y
 CONFIG_MMC_SPI=y
+CONFIG_MMC_SDHI=y
 CONFIG_MMC_DW=y
 CONFIG_MMC_DW_STARFIVE=y
-CONFIG_MMC_SDHI=y
 CONFIG_MMC_SUNXI=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_SUN6I=y
 CONFIG_DMADEVICES=y
 CONFIG_DMA_SUN6I=m
 CONFIG_DW_AXI_DMAC=y
-CONFIG_RZ_DMAC=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BALLOON=y
 CONFIG_VIRTIO_INPUT=y
 CONFIG_VIRTIO_MMIO=y
-CONFIG_RENESAS_OSTM=y
 CONFIG_CLK_SOPHGO_CV1800=y
 CONFIG_SUN8I_DE2_CCU=m
+CONFIG_RENESAS_OSTM=y
 CONFIG_SUN50I_IOMMU=y
 CONFIG_RPMSG_CHAR=y
 CONFIG_RPMSG_CTRL=y
 CONFIG_RPMSG_VIRTIO=y
-CONFIG_ARCH_R9A07G043=y
+CONFIG_PM_DEVFREQ=y
 CONFIG_IIO=y
-CONFIG_RZG2L_ADC=m
-CONFIG_RESET_RZG2L_USBPHY_CTRL=y
 CONFIG_PHY_SUN4I_USB=m
 CONFIG_PHY_RCAR_GEN3_USB2=y
 CONFIG_PHY_STARFIVE_JH7110_DPHY_RX=m

From 0e91ac701c69c44840e8d36566604cac1b638880 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@rivosinc.com>
Date: Fri, 19 Jul 2024 08:10:27 -0700
Subject: [PATCH 159/346] clk: T-Head: Disable on 32-bit Targets

This fails to build on 32-bit targets because of a missing __udivdi3.
IIRC the right way to fix that is to avoid the division, but I just want
a tree that builds and the only real T-Head platforms are 64-bit right
now.

Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
Link: https://lore.kernel.org/r/20240719151027.16152-1-palmer@rivosinc.com
Acked-by: Drew Fustini <drew@pdp7.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/thead/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/clk/thead/Kconfig b/drivers/clk/thead/Kconfig
index 1710d50bf9d4..95e0d9eb965e 100644
--- a/drivers/clk/thead/Kconfig
+++ b/drivers/clk/thead/Kconfig
@@ -3,6 +3,7 @@
 config CLK_THEAD_TH1520_AP
 	bool "T-HEAD TH1520 AP clock support"
 	depends on ARCH_THEAD || COMPILE_TEST
+	depends on 64BIT
 	default ARCH_THEAD
 	select REGMAP_MMIO
 	help

From 23c996fc2bc1978a02c64eddb90b4ab5d309c8df Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Fri, 19 Jul 2024 09:15:18 -0700
Subject: [PATCH 160/346] riscv: Extend cpufeature.c to detect vendor
 extensions

Instead of grouping all vendor extensions into the same riscv_isa_ext
that standard instructions use, create a struct
"riscv_isa_vendor_ext_data_list" that allows each vendor to maintain
their vendor extensions independently of the standard extensions.
xandespmu is currently the only vendor extension so that is the only
extension that is affected by this change.

An additional benefit of this is that the extensions of each vendor can
be conditionally enabled. A config RISCV_ISA_VENDOR_EXT_ANDES has been
added to allow for that.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Andy Chiu <andy.chiu@sifive.com>
Tested-by: Yu Chien Peter Lin <peterlin@andestech.com>
Reviewed-by: Yu Chien Peter Lin <peterlin@andestech.com>
Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-1-0af7587bbec0@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig                            |   2 +
 arch/riscv/Kconfig.vendor                     |  19 +++
 arch/riscv/errata/andes/errata.c              |   3 +
 arch/riscv/errata/sifive/errata.c             |   3 +
 arch/riscv/errata/thead/errata.c              |   3 +
 arch/riscv/include/asm/cpufeature.h           |  25 +++
 arch/riscv/include/asm/hwcap.h                |  25 ++-
 arch/riscv/include/asm/vendor_extensions.h    |  49 ++++++
 .../include/asm/vendor_extensions/andes.h     |  19 +++
 arch/riscv/kernel/Makefile                    |   2 +
 arch/riscv/kernel/cpufeature.c                | 143 +++++++++++++-----
 arch/riscv/kernel/vendor_extensions.c         |  56 +++++++
 arch/riscv/kernel/vendor_extensions/Makefile  |   3 +
 arch/riscv/kernel/vendor_extensions/andes.c   |  18 +++
 drivers/perf/riscv_pmu_sbi.c                  |  10 +-
 15 files changed, 324 insertions(+), 56 deletions(-)
 create mode 100644 arch/riscv/Kconfig.vendor
 create mode 100644 arch/riscv/include/asm/vendor_extensions.h
 create mode 100644 arch/riscv/include/asm/vendor_extensions/andes.h
 create mode 100644 arch/riscv/kernel/vendor_extensions.c
 create mode 100644 arch/riscv/kernel/vendor_extensions/Makefile
 create mode 100644 arch/riscv/kernel/vendor_extensions/andes.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a96a8dadf662..2b12d97eca63 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -808,6 +808,8 @@ config RISCV_EFFICIENT_UNALIGNED_ACCESS
 
 endchoice
 
+source "arch/riscv/Kconfig.vendor"
+
 endmenu # "Platform type"
 
 menu "Kernel features"
diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor
new file mode 100644
index 000000000000..6f1cdd32ed29
--- /dev/null
+++ b/arch/riscv/Kconfig.vendor
@@ -0,0 +1,19 @@
+menu "Vendor extensions"
+
+config RISCV_ISA_VENDOR_EXT
+	bool
+
+menu "Andes"
+config RISCV_ISA_VENDOR_EXT_ANDES
+	bool "Andes vendor extension support"
+	select RISCV_ISA_VENDOR_EXT
+	default y
+	help
+	  Say N here if you want to disable all Andes vendor extension
+	  support. This will cause any Andes vendor extensions that are
+	  requested by hardware probing to be ignored.
+
+	  If you don't know what to do here, say Y.
+endmenu
+
+endmenu
diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c
index f2708a9494a1..fc1a34faa5f3 100644
--- a/arch/riscv/errata/andes/errata.c
+++ b/arch/riscv/errata/andes/errata.c
@@ -17,6 +17,7 @@
 #include <asm/processor.h>
 #include <asm/sbi.h>
 #include <asm/vendorid_list.h>
+#include <asm/vendor_extensions.h>
 
 #define ANDES_AX45MP_MARCHID		0x8000000000008a45UL
 #define ANDES_AX45MP_MIMPID		0x500UL
@@ -65,6 +66,8 @@ void __init_or_module andes_errata_patch_func(struct alt_entry *begin, struct al
 					      unsigned long archid, unsigned long impid,
 					      unsigned int stage)
 {
+	BUILD_BUG_ON(ERRATA_ANDES_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
 	if (stage == RISCV_ALTERNATIVES_BOOT)
 		errata_probe_iocp(stage, archid, impid);
 
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
index 716cfedad3a2..cea3b96ade11 100644
--- a/arch/riscv/errata/sifive/errata.c
+++ b/arch/riscv/errata/sifive/errata.c
@@ -12,6 +12,7 @@
 #include <asm/alternative.h>
 #include <asm/vendorid_list.h>
 #include <asm/errata_list.h>
+#include <asm/vendor_extensions.h>
 
 struct errata_info_t {
 	char name[32];
@@ -96,6 +97,8 @@ void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 	u32 cpu_apply_errata = 0;
 	u32 tmp;
 
+	BUILD_BUG_ON(ERRATA_SIFIVE_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
 	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
 		return;
 
diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
index bf6a0a6318ee..f5120e07c318 100644
--- a/arch/riscv/errata/thead/errata.c
+++ b/arch/riscv/errata/thead/errata.c
@@ -18,6 +18,7 @@
 #include <asm/io.h>
 #include <asm/patch.h>
 #include <asm/vendorid_list.h>
+#include <asm/vendor_extensions.h>
 
 #define CSR_TH_SXSTATUS		0x5c0
 #define SXSTATUS_MAEE		_AC(0x200000, UL)
@@ -166,6 +167,8 @@ void thead_errata_patch_func(struct alt_entry *begin, struct alt_entry *end,
 	u32 tmp;
 	void *oldptr, *altptr;
 
+	BUILD_BUG_ON(ERRATA_THEAD_NUMBER >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
 	for (alt = begin; alt < end; alt++) {
 		if (alt->vendor_id != THEAD_VENDOR_ID)
 			continue;
diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 000796c2d0b1..bfe7c0b881e9 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -33,6 +33,31 @@ extern struct riscv_isainfo hart_isa[NR_CPUS];
 
 void riscv_user_isa_enable(void);
 
+#define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) {	\
+	.name = #_name,									\
+	.property = #_name,								\
+	.id = _id,									\
+	.subset_ext_ids = _subset_exts,							\
+	.subset_ext_size = _subset_exts_size,						\
+	.validate = _validate								\
+}
+
+#define __RISCV_ISA_EXT_DATA(_name, _id) _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, NULL)
+
+#define __RISCV_ISA_EXT_DATA_VALIDATE(_name, _id, _validate) \
+			_RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, _validate)
+
+/* Used to declare pure "lasso" extension (Zk for instance) */
+#define __RISCV_ISA_EXT_BUNDLE(_name, _bundled_exts) \
+	_RISCV_ISA_EXT_DATA(_name, RISCV_ISA_EXT_INVALID, _bundled_exts, \
+			    ARRAY_SIZE(_bundled_exts), NULL)
+
+/* Used to declare extensions that are a superset of other extensions (Zvbb for instance) */
+#define __RISCV_ISA_EXT_SUPERSET(_name, _id, _sub_exts) \
+	_RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), NULL)
+#define __RISCV_ISA_EXT_SUPERSET_VALIDATE(_name, _id, _sub_exts, _validate) \
+	_RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate)
+
 #if defined(CONFIG_RISCV_MISALIGNED)
 bool check_unaligned_access_emulated_all_cpus(void);
 void unaligned_emulation_finish(void);
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index b18b202ca141..5a0bd27fd11a 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -80,19 +80,18 @@
 #define RISCV_ISA_EXT_ZFA		71
 #define RISCV_ISA_EXT_ZTSO		72
 #define RISCV_ISA_EXT_ZACAS		73
-#define RISCV_ISA_EXT_XANDESPMU		74
-#define RISCV_ISA_EXT_ZVE32X		75
-#define RISCV_ISA_EXT_ZVE32F		76
-#define RISCV_ISA_EXT_ZVE64X		77
-#define RISCV_ISA_EXT_ZVE64F		78
-#define RISCV_ISA_EXT_ZVE64D		79
-#define RISCV_ISA_EXT_ZIMOP		80
-#define RISCV_ISA_EXT_ZCA		81
-#define RISCV_ISA_EXT_ZCB		82
-#define RISCV_ISA_EXT_ZCD		83
-#define RISCV_ISA_EXT_ZCF		84
-#define RISCV_ISA_EXT_ZCMOP		85
-#define RISCV_ISA_EXT_ZAWRS		86
+#define RISCV_ISA_EXT_ZVE32X		74
+#define RISCV_ISA_EXT_ZVE32F		75
+#define RISCV_ISA_EXT_ZVE64X		76
+#define RISCV_ISA_EXT_ZVE64F		77
+#define RISCV_ISA_EXT_ZVE64D		78
+#define RISCV_ISA_EXT_ZIMOP		79
+#define RISCV_ISA_EXT_ZCA		80
+#define RISCV_ISA_EXT_ZCB		81
+#define RISCV_ISA_EXT_ZCD		82
+#define RISCV_ISA_EXT_ZCF		83
+#define RISCV_ISA_EXT_ZCMOP		84
+#define RISCV_ISA_EXT_ZAWRS		85
 
 #define RISCV_ISA_EXT_XLINUXENVCFG	127
 
diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h
new file mode 100644
index 000000000000..5fca550fc1f6
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2024 Rivos, Inc
+ */
+
+#ifndef _ASM_VENDOR_EXTENSIONS_H
+#define _ASM_VENDOR_EXTENSIONS_H
+
+#include <asm/cpufeature.h>
+
+#include <linux/array_size.h>
+#include <linux/types.h>
+
+/*
+ * The extension keys of each vendor must be strictly less than this value.
+ */
+#define RISCV_ISA_VENDOR_EXT_MAX 32
+
+struct riscv_isavendorinfo {
+	DECLARE_BITMAP(isa, RISCV_ISA_VENDOR_EXT_MAX);
+};
+
+struct riscv_isa_vendor_ext_data_list {
+	bool is_initialized;
+	const size_t ext_data_count;
+	const struct riscv_isa_ext_data *ext_data;
+	struct riscv_isavendorinfo per_hart_isa_bitmap[NR_CPUS];
+	struct riscv_isavendorinfo all_harts_isa_bitmap;
+};
+
+extern struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[];
+
+extern const size_t riscv_isa_vendor_ext_list_size;
+
+/*
+ * The alternatives need some way of distinguishing between vendor extensions
+ * and errata. Incrementing all of the vendor extension keys so they are at
+ * least 0x8000 accomplishes that.
+ */
+#define RISCV_VENDOR_EXT_ALTERNATIVES_BASE	0x8000
+
+#define VENDOR_EXT_ALL_CPUS			-1
+
+bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit);
+#define riscv_isa_vendor_extension_available(vendor, ext)	\
+	__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \
+					       RISCV_ISA_VENDOR_EXT_##ext)
+
+#endif /* _ASM_VENDOR_EXTENSIONS_H */
diff --git a/arch/riscv/include/asm/vendor_extensions/andes.h b/arch/riscv/include/asm/vendor_extensions/andes.h
new file mode 100644
index 000000000000..7bb2fc43438f
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions/andes.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_ANDES_H
+#define _ASM_RISCV_VENDOR_EXTENSIONS_ANDES_H
+
+#include <asm/vendor_extensions.h>
+
+#include <linux/types.h>
+
+#define RISCV_ISA_VENDOR_EXT_XANDESPMU		0
+
+/*
+ * Extension keys should be strictly less than max.
+ * It is safe to increment this when necessary.
+ */
+#define RISCV_ISA_VENDOR_EXT_MAX_ANDES			32
+
+extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_andes;
+
+#endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 5b243d46f4b1..b0aea202273d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -58,6 +58,8 @@ obj-y	+= riscv_ksyms.o
 obj-y	+= stacktrace.o
 obj-y	+= cacheinfo.o
 obj-y	+= patch.o
+obj-y	+= vendor_extensions.o
+obj-y	+= vendor_extensions/
 obj-y	+= probes/
 obj-y	+= tests/
 obj-$(CONFIG_MMU) += vdso.o vdso/
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 0366dc3baf33..8f20607adb40 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -24,6 +24,7 @@
 #include <asm/processor.h>
 #include <asm/sbi.h>
 #include <asm/vector.h>
+#include <asm/vendor_extensions.h>
 
 #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
 
@@ -100,31 +101,6 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data,
 	return 0;
 }
 
-#define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) {	\
-	.name = #_name,									\
-	.property = #_name,								\
-	.id = _id,									\
-	.subset_ext_ids = _subset_exts,							\
-	.subset_ext_size = _subset_exts_size,						\
-	.validate = _validate								\
-}
-
-#define __RISCV_ISA_EXT_DATA(_name, _id) _RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, NULL)
-
-#define __RISCV_ISA_EXT_DATA_VALIDATE(_name, _id, _validate) \
-			_RISCV_ISA_EXT_DATA(_name, _id, NULL, 0, _validate)
-
-/* Used to declare pure "lasso" extension (Zk for instance) */
-#define __RISCV_ISA_EXT_BUNDLE(_name, _bundled_exts) \
-	_RISCV_ISA_EXT_DATA(_name, RISCV_ISA_EXT_INVALID, _bundled_exts, \
-			    ARRAY_SIZE(_bundled_exts), NULL)
-
-/* Used to declare extensions that are a superset of other extensions (Zvbb for instance) */
-#define __RISCV_ISA_EXT_SUPERSET(_name, _id, _sub_exts) \
-	_RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), NULL)
-#define __RISCV_ISA_EXT_SUPERSET_VALIDATE(_name, _id, _sub_exts, _validate) \
-	_RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate)
-
 static int riscv_ext_zca_depends(const struct riscv_isa_ext_data *data,
 				 const unsigned long *isa_bitmap)
 {
@@ -405,7 +381,6 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
 	__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
 	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
 	__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
-	__RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU),
 };
 
 const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);
@@ -512,6 +487,21 @@ static void __init riscv_parse_isa_string(const char *isa, unsigned long *bitmap
 		bool ext_err = false;
 
 		switch (*ext) {
+		case 'x':
+		case 'X':
+			if (acpi_disabled)
+				pr_warn_once("Vendor extensions are ignored in riscv,isa. Use riscv,isa-extensions instead.");
+			/*
+			 * To skip an extension, we find its end.
+			 * As multi-letter extensions must be split from other multi-letter
+			 * extensions with an "_", the end of a multi-letter extension will
+			 * either be the null character or the "_" at the start of the next
+			 * multi-letter extension.
+			 */
+			for (; *isa && *isa != '_'; ++isa)
+				;
+			ext_err = true;
+			break;
 		case 's':
 			/*
 			 * Workaround for invalid single-letter 's' & 'u' (QEMU).
@@ -527,8 +517,6 @@ static void __init riscv_parse_isa_string(const char *isa, unsigned long *bitmap
 			}
 			fallthrough;
 		case 'S':
-		case 'x':
-		case 'X':
 		case 'z':
 		case 'Z':
 			/*
@@ -728,6 +716,61 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap)
 		acpi_put_table((struct acpi_table_header *)rhct);
 }
 
+static void __init riscv_fill_cpu_vendor_ext(struct device_node *cpu_node, int cpu)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return;
+
+	for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) {
+		struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i];
+
+		for (int j = 0; j < ext_list->ext_data_count; j++) {
+			const struct riscv_isa_ext_data ext = ext_list->ext_data[j];
+			struct riscv_isavendorinfo *isavendorinfo = &ext_list->per_hart_isa_bitmap[cpu];
+
+			if (of_property_match_string(cpu_node, "riscv,isa-extensions",
+						     ext.property) < 0)
+				continue;
+
+			/*
+			 * Assume that subset extensions are all members of the
+			 * same vendor.
+			 */
+			if (ext.subset_ext_size)
+				for (int k = 0; k < ext.subset_ext_size; k++)
+					set_bit(ext.subset_ext_ids[k], isavendorinfo->isa);
+
+			set_bit(ext.id, isavendorinfo->isa);
+		}
+	}
+}
+
+/*
+ * Populate all_harts_isa_bitmap for each vendor with all of the extensions that
+ * are shared across CPUs for that vendor.
+ */
+static void __init riscv_fill_vendor_ext_list(int cpu)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return;
+
+	for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) {
+		struct riscv_isa_vendor_ext_data_list *ext_list = riscv_isa_vendor_ext_list[i];
+
+		if (!ext_list->is_initialized) {
+			bitmap_copy(ext_list->all_harts_isa_bitmap.isa,
+				    ext_list->per_hart_isa_bitmap[cpu].isa,
+				    RISCV_ISA_VENDOR_EXT_MAX);
+			ext_list->is_initialized = true;
+		} else {
+			bitmap_and(ext_list->all_harts_isa_bitmap.isa,
+				   ext_list->all_harts_isa_bitmap.isa,
+				   ext_list->per_hart_isa_bitmap[cpu].isa,
+				   RISCV_ISA_VENDOR_EXT_MAX);
+		}
+	}
+}
+
 static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap)
 {
 	unsigned int cpu;
@@ -760,6 +803,7 @@ static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap)
 		}
 
 		riscv_resolve_isa(source_isa, isainfo->isa, &this_hwcap, isa2hwcap);
+		riscv_fill_cpu_vendor_ext(cpu_node, cpu);
 
 		of_node_put(cpu_node);
 
@@ -776,6 +820,8 @@ static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap)
 			bitmap_copy(riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX);
 		else
 			bitmap_and(riscv_isa, riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX);
+
+		riscv_fill_vendor_ext_list(cpu);
 	}
 
 	if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX))
@@ -918,29 +964,46 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
 {
 	struct alt_entry *alt;
 	void *oldptr, *altptr;
-	u16 id, value;
+	u16 id, value, vendor;
 
 	if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
 		return;
 
 	for (alt = begin; alt < end; alt++) {
-		if (alt->vendor_id != 0)
-			continue;
-
 		id = PATCH_ID_CPUFEATURE_ID(alt->patch_id);
+		vendor = PATCH_ID_CPUFEATURE_ID(alt->vendor_id);
 
-		if (id >= RISCV_ISA_EXT_MAX) {
+		/*
+		 * Any alternative with a patch_id that is less than
+		 * RISCV_ISA_EXT_MAX is interpreted as a standard extension.
+		 *
+		 * Any alternative with patch_id that is greater than or equal
+		 * to RISCV_VENDOR_EXT_ALTERNATIVES_BASE is interpreted as a
+		 * vendor extension.
+		 */
+		if (id < RISCV_ISA_EXT_MAX) {
+			/*
+			 * This patch should be treated as errata so skip
+			 * processing here.
+			 */
+			if (alt->vendor_id != 0)
+				continue;
+
+			if (!__riscv_isa_extension_available(NULL, id))
+				continue;
+
+			value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id);
+			if (!riscv_cpufeature_patch_check(id, value))
+				continue;
+		} else if (id >= RISCV_VENDOR_EXT_ALTERNATIVES_BASE) {
+			if (!__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor,
+								    id - RISCV_VENDOR_EXT_ALTERNATIVES_BASE))
+				continue;
+		} else {
 			WARN(1, "This extension id:%d is not in ISA extension list", id);
 			continue;
 		}
 
-		if (!__riscv_isa_extension_available(NULL, id))
-			continue;
-
-		value = PATCH_ID_CPUFEATURE_VALUE(alt->patch_id);
-		if (!riscv_cpufeature_patch_check(id, value))
-			continue;
-
 		oldptr = ALT_OLD_PTR(alt);
 		altptr = ALT_ALT_PTR(alt);
 
diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c
new file mode 100644
index 000000000000..b6c1e7b5d34b
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Rivos, Inc
+ */
+
+#include <asm/vendorid_list.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/andes.h>
+
+#include <linux/array_size.h>
+#include <linux/types.h>
+
+struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = {
+#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES
+	&riscv_isa_vendor_ext_list_andes,
+#endif
+};
+
+const size_t riscv_isa_vendor_ext_list_size = ARRAY_SIZE(riscv_isa_vendor_ext_list);
+
+/**
+ * __riscv_isa_vendor_extension_available() - Check whether given vendor
+ * extension is available or not.
+ *
+ * @cpu: check if extension is available on this cpu
+ * @vendor: vendor that the extension is a member of
+ * @bit: bit position of the desired extension
+ * Return: true or false
+ *
+ * NOTE: When cpu is -1, will check if extension is available on all cpus
+ */
+bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit)
+{
+	struct riscv_isavendorinfo *bmap;
+	struct riscv_isavendorinfo *cpu_bmap;
+
+	switch (vendor) {
+	#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES
+	case ANDES_VENDOR_ID:
+		bmap = &riscv_isa_vendor_ext_list_andes.all_harts_isa_bitmap;
+		cpu_bmap = &riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap[cpu];
+		break;
+	#endif
+	default:
+		return false;
+	}
+
+	if (cpu != -1)
+		bmap = &cpu_bmap[cpu];
+
+	if (bit >= RISCV_ISA_VENDOR_EXT_MAX)
+		return false;
+
+	return test_bit(bit, bmap->isa) ? true : false;
+}
+EXPORT_SYMBOL_GPL(__riscv_isa_vendor_extension_available);
diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile
new file mode 100644
index 000000000000..6a61aed944f1
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES)	+= andes.o
diff --git a/arch/riscv/kernel/vendor_extensions/andes.c b/arch/riscv/kernel/vendor_extensions/andes.c
new file mode 100644
index 000000000000..ec688c88456a
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/andes.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/cpufeature.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/andes.h>
+
+#include <linux/array_size.h>
+#include <linux/types.h>
+
+/* All Andes vendor extensions supported in Linux */
+const struct riscv_isa_ext_data riscv_isa_vendor_ext_andes[] = {
+	__RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_VENDOR_EXT_XANDESPMU),
+};
+
+struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_andes = {
+	.ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_andes),
+	.ext_data = riscv_isa_vendor_ext_andes,
+};
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index a2e4005e1fd0..02719e0c6368 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -24,6 +24,8 @@
 #include <asm/errata_list.h>
 #include <asm/sbi.h>
 #include <asm/cpufeature.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/andes.h>
 
 #define ALT_SBI_PMU_OVERFLOW(__ovl)					\
 asm volatile(ALTERNATIVE_2(						\
@@ -32,7 +34,8 @@ asm volatile(ALTERNATIVE_2(						\
 		THEAD_VENDOR_ID, ERRATA_THEAD_PMU,			\
 		CONFIG_ERRATA_THEAD_PMU,				\
 	"csrr %0, " __stringify(ANDES_CSR_SCOUNTEROF),			\
-		0, RISCV_ISA_EXT_XANDESPMU,				\
+		ANDES_VENDOR_ID,					\
+		RISCV_ISA_VENDOR_EXT_XANDESPMU + RISCV_VENDOR_EXT_ALTERNATIVES_BASE, \
 		CONFIG_ANDES_CUSTOM_PMU)				\
 	: "=r" (__ovl) :						\
 	: "memory")
@@ -41,7 +44,8 @@ asm volatile(ALTERNATIVE_2(						\
 asm volatile(ALTERNATIVE(						\
 	"csrc " __stringify(CSR_IP) ", %0\n\t",				\
 	"csrc " __stringify(ANDES_CSR_SLIP) ", %0\n\t",			\
-		0, RISCV_ISA_EXT_XANDESPMU,				\
+		ANDES_VENDOR_ID,					\
+		RISCV_ISA_VENDOR_EXT_XANDESPMU + RISCV_VENDOR_EXT_ALTERNATIVES_BASE, \
 		CONFIG_ANDES_CUSTOM_PMU)				\
 	: : "r"(__irq_mask)						\
 	: "memory")
@@ -1060,7 +1064,7 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
 		   riscv_cached_mimpid(0) == 0) {
 		riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU;
 		riscv_pmu_use_irq = true;
-	} else if (riscv_isa_extension_available(NULL, XANDESPMU) &&
+	} else if (riscv_isa_vendor_extension_available(ANDES_VENDOR_ID, XANDESPMU) &&
 		   IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) {
 		riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI;
 		riscv_pmu_use_irq = true;

From 9448d9accdd8f1483df6fe6692e3c50cfe507d7a Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Fri, 19 Jul 2024 09:15:19 -0700
Subject: [PATCH 161/346] riscv: Add vendor extensions to /proc/cpuinfo

All of the supported vendor extensions that have been listed in
riscv_isa_vendor_ext_list can be exported through /proc/cpuinfo.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Evan Green <evan@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-2-0af7587bbec0@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/cpu.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index c1f3655238fd..f6b13e9f5e6c 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -16,6 +16,7 @@
 #include <asm/sbi.h>
 #include <asm/smp.h>
 #include <asm/pgtable.h>
+#include <asm/vendor_extensions.h>
 
 bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
 {
@@ -235,7 +236,33 @@ arch_initcall(riscv_cpuinfo_init);
 
 #ifdef CONFIG_PROC_FS
 
-static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap)
+#define ALL_CPUS -1
+
+static void print_vendor_isa(struct seq_file *f, int cpu)
+{
+	struct riscv_isavendorinfo *vendor_bitmap;
+	struct riscv_isa_vendor_ext_data_list *ext_list;
+	const struct riscv_isa_ext_data *ext_data;
+
+	for (int i = 0; i < riscv_isa_vendor_ext_list_size; i++) {
+		ext_list = riscv_isa_vendor_ext_list[i];
+		ext_data = riscv_isa_vendor_ext_list[i]->ext_data;
+
+		if (cpu == ALL_CPUS)
+			vendor_bitmap = &ext_list->all_harts_isa_bitmap;
+		else
+			vendor_bitmap = &ext_list->per_hart_isa_bitmap[cpu];
+
+		for (int j = 0; j < ext_list->ext_data_count; j++) {
+			if (!__riscv_isa_extension_available(vendor_bitmap->isa, ext_data[j].id))
+				continue;
+
+			seq_printf(f, "_%s", ext_data[j].name);
+		}
+	}
+}
+
+static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap, int cpu)
 {
 
 	if (IS_ENABLED(CONFIG_32BIT))
@@ -254,6 +281,8 @@ static void print_isa(struct seq_file *f, const unsigned long *isa_bitmap)
 		seq_printf(f, "%s", riscv_isa_ext[i].name);
 	}
 
+	print_vendor_isa(f, cpu);
+
 	seq_puts(f, "\n");
 }
 
@@ -316,7 +345,7 @@ static int c_show(struct seq_file *m, void *v)
 	 * line.
 	 */
 	seq_puts(m, "isa\t\t: ");
-	print_isa(m, NULL);
+	print_isa(m, NULL, ALL_CPUS);
 	print_mmu(m);
 
 	if (acpi_disabled) {
@@ -338,7 +367,7 @@ static int c_show(struct seq_file *m, void *v)
 	 * additional extensions not present across all harts.
 	 */
 	seq_puts(m, "hart isa\t: ");
-	print_isa(m, hart_isa[cpu_id].isa);
+	print_isa(m, hart_isa[cpu_id].isa, cpu_id);
 	seq_puts(m, "\n");
 
 	return 0;

From 0f2425411101128a38ad0166268975ca11fbcaeb Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Fri, 19 Jul 2024 09:15:20 -0700
Subject: [PATCH 162/346] riscv: Introduce vendor variants of extension helpers

Vendor extensions are maintained in per-vendor structs (separate from
standard extensions which live in riscv_isa). Create vendor variants for
the existing extension helpers to interface with the riscv_isa_vendor
bitmaps.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-3-0af7587bbec0@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/vendor_extensions.h | 83 ++++++++++++++++++++++
 drivers/perf/riscv_pmu_sbi.c               |  3 +-
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h
index 5fca550fc1f6..04d72b02ae6b 100644
--- a/arch/riscv/include/asm/vendor_extensions.h
+++ b/arch/riscv/include/asm/vendor_extensions.h
@@ -42,8 +42,91 @@ extern const size_t riscv_isa_vendor_ext_list_size;
 #define VENDOR_EXT_ALL_CPUS			-1
 
 bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsigned int bit);
+#define riscv_cpu_isa_vendor_extension_available(cpu, vendor, ext)	\
+	__riscv_isa_vendor_extension_available(cpu, vendor, RISCV_ISA_VENDOR_EXT_##ext)
 #define riscv_isa_vendor_extension_available(vendor, ext)	\
 	__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \
 					       RISCV_ISA_VENDOR_EXT_##ext)
 
+static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor,
+							 const unsigned long ext)
+{
+	asm goto(ALTERNATIVE("j	%l[l_no]", "nop", %[vendor], %[ext], 1)
+	:
+	: [vendor] "i" (vendor), [ext] "i" (ext)
+	:
+	: l_no);
+
+	return true;
+l_no:
+	return false;
+}
+
+static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor,
+							   const unsigned long ext)
+{
+	asm goto(ALTERNATIVE("nop", "j	%l[l_yes]", %[vendor], %[ext], 1)
+	:
+	: [vendor] "i" (vendor), [ext] "i" (ext)
+	:
+	: l_yes);
+
+	return false;
+l_yes:
+	return true;
+}
+
+static __always_inline bool riscv_has_vendor_extension_likely(const unsigned long vendor,
+							      const unsigned long ext)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return false;
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE))
+		return __riscv_has_extension_likely(vendor,
+						    ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
+	return __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, ext);
+}
+
+static __always_inline bool riscv_has_vendor_extension_unlikely(const unsigned long vendor,
+								const unsigned long ext)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return false;
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE))
+		return __riscv_has_extension_unlikely(vendor,
+						      ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE);
+
+	return __riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, ext);
+}
+
+static __always_inline bool riscv_cpu_has_vendor_extension_likely(const unsigned long vendor,
+								  int cpu, const unsigned long ext)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return false;
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) &&
+	    __riscv_has_extension_likely(vendor, ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE))
+		return true;
+
+	return __riscv_isa_vendor_extension_available(cpu, vendor, ext);
+}
+
+static __always_inline bool riscv_cpu_has_vendor_extension_unlikely(const unsigned long vendor,
+								    int cpu,
+								    const unsigned long ext)
+{
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_VENDOR_EXT))
+		return false;
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) &&
+	    __riscv_has_extension_unlikely(vendor, ext + RISCV_VENDOR_EXT_ALTERNATIVES_BASE))
+		return true;
+
+	return __riscv_isa_vendor_extension_available(cpu, vendor, ext);
+}
+
 #endif /* _ASM_VENDOR_EXTENSIONS_H */
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 02719e0c6368..e1f0edfb7dcc 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -1064,7 +1064,8 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
 		   riscv_cached_mimpid(0) == 0) {
 		riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU;
 		riscv_pmu_use_irq = true;
-	} else if (riscv_isa_vendor_extension_available(ANDES_VENDOR_ID, XANDESPMU) &&
+	} else if (riscv_has_vendor_extension_unlikely(ANDES_VENDOR_ID,
+						       RISCV_ISA_VENDOR_EXT_XANDESPMU) &&
 		   IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) {
 		riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI;
 		riscv_pmu_use_irq = true;

From d4c8d79f5199055da38f880f782a3e62c599ff5d Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Fri, 19 Jul 2024 09:15:21 -0700
Subject: [PATCH 163/346] riscv: cpufeature: Extract common elements from
 extension checking

The __riscv_has_extension_likely() and __riscv_has_extension_unlikely()
functions from the vendor_extensions.h can be used to simplify the
standard extension checking code as well. Migrate those functions to
cpufeature.h and reorganize the code in the file to use the functions.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20240719-support_vendor_extensions-v3-4-0af7587bbec0@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/cpufeature.h        | 78 ++++++++++++----------
 arch/riscv/include/asm/vendor_extensions.h | 28 --------
 2 files changed, 44 insertions(+), 62 deletions(-)

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index bfe7c0b881e9..45f9c1171a48 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -104,59 +104,66 @@ extern bool riscv_isa_fallback;
 
 unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap);
 
+#define STANDARD_EXT		0
+
 bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit);
 #define riscv_isa_extension_available(isa_bitmap, ext)	\
 	__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext)
 
-static __always_inline bool
-riscv_has_extension_likely(const unsigned long ext)
+static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor,
+							 const unsigned long ext)
 {
-	compiletime_assert(ext < RISCV_ISA_EXT_MAX,
-			   "ext must be < RISCV_ISA_EXT_MAX");
-
-	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
-		asm goto(
-		ALTERNATIVE("j	%l[l_no]", "nop", 0, %[ext], 1)
-		:
-		: [ext] "i" (ext)
-		:
-		: l_no);
-	} else {
-		if (!__riscv_isa_extension_available(NULL, ext))
-			goto l_no;
-	}
+	asm goto(ALTERNATIVE("j	%l[l_no]", "nop", %[vendor], %[ext], 1)
+	:
+	: [vendor] "i" (vendor), [ext] "i" (ext)
+	:
+	: l_no);
 
 	return true;
 l_no:
 	return false;
 }
 
-static __always_inline bool
-riscv_has_extension_unlikely(const unsigned long ext)
+static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor,
+							   const unsigned long ext)
 {
-	compiletime_assert(ext < RISCV_ISA_EXT_MAX,
-			   "ext must be < RISCV_ISA_EXT_MAX");
-
-	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
-		asm goto(
-		ALTERNATIVE("nop", "j	%l[l_yes]", 0, %[ext], 1)
-		:
-		: [ext] "i" (ext)
-		:
-		: l_yes);
-	} else {
-		if (__riscv_isa_extension_available(NULL, ext))
-			goto l_yes;
-	}
+	asm goto(ALTERNATIVE("nop", "j	%l[l_yes]", %[vendor], %[ext], 1)
+	:
+	: [vendor] "i" (vendor), [ext] "i" (ext)
+	:
+	: l_yes);
 
 	return false;
 l_yes:
 	return true;
 }
 
+static __always_inline bool riscv_has_extension_unlikely(const unsigned long ext)
+{
+	compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX");
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE))
+		return __riscv_has_extension_unlikely(STANDARD_EXT, ext);
+
+	return __riscv_isa_extension_available(NULL, ext);
+}
+
+static __always_inline bool riscv_has_extension_likely(const unsigned long ext)
+{
+	compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX");
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE))
+		return __riscv_has_extension_likely(STANDARD_EXT, ext);
+
+	return __riscv_isa_extension_available(NULL, ext);
+}
+
 static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsigned long ext)
 {
-	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_likely(ext))
+	compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX");
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) &&
+	    __riscv_has_extension_likely(STANDARD_EXT, ext))
 		return true;
 
 	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
@@ -164,7 +171,10 @@ static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsign
 
 static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsigned long ext)
 {
-	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_unlikely(ext))
+	compiletime_assert(ext < RISCV_ISA_EXT_MAX, "ext must be < RISCV_ISA_EXT_MAX");
+
+	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) &&
+	    __riscv_has_extension_unlikely(STANDARD_EXT, ext))
 		return true;
 
 	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
diff --git a/arch/riscv/include/asm/vendor_extensions.h b/arch/riscv/include/asm/vendor_extensions.h
index 04d72b02ae6b..7437304a71b9 100644
--- a/arch/riscv/include/asm/vendor_extensions.h
+++ b/arch/riscv/include/asm/vendor_extensions.h
@@ -48,34 +48,6 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig
 	__riscv_isa_vendor_extension_available(VENDOR_EXT_ALL_CPUS, vendor, \
 					       RISCV_ISA_VENDOR_EXT_##ext)
 
-static __always_inline bool __riscv_has_extension_likely(const unsigned long vendor,
-							 const unsigned long ext)
-{
-	asm goto(ALTERNATIVE("j	%l[l_no]", "nop", %[vendor], %[ext], 1)
-	:
-	: [vendor] "i" (vendor), [ext] "i" (ext)
-	:
-	: l_no);
-
-	return true;
-l_no:
-	return false;
-}
-
-static __always_inline bool __riscv_has_extension_unlikely(const unsigned long vendor,
-							   const unsigned long ext)
-{
-	asm goto(ALTERNATIVE("nop", "j	%l[l_yes]", %[vendor], %[ext], 1)
-	:
-	: [vendor] "i" (vendor), [ext] "i" (ext)
-	:
-	: l_yes);
-
-	return false;
-l_yes:
-	return true;
-}
-
 static __always_inline bool riscv_has_vendor_extension_likely(const unsigned long vendor,
 							      const unsigned long ext)
 {

From 65d284a38c0ded391ae04a29640c34221c934cd2 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 6 Jun 2024 18:41:57 +0200
Subject: [PATCH 164/346] ceph: use cap_wait_list only if debugfs is enabled

Only debugfs uses this list.  By omitting it, we save some memory and
reduce lock contention on `caps_list_lock`.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c       | 6 ++++++
 fs/ceph/mds_client.c | 2 ++
 fs/ceph/mds_client.h | 6 ++++++
 3 files changed, 14 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c4941ba245ac..e98aa8219303 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3067,10 +3067,13 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
 				       flags, &_got);
 		WARN_ON_ONCE(ret == -EAGAIN);
 		if (!ret) {
+#ifdef CONFIG_DEBUG_FS
 			struct ceph_mds_client *mdsc = fsc->mdsc;
 			struct cap_wait cw;
+#endif
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
+#ifdef CONFIG_DEBUG_FS
 			cw.ino = ceph_ino(inode);
 			cw.tgid = current->tgid;
 			cw.need = need;
@@ -3079,6 +3082,7 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
 			spin_lock(&mdsc->caps_list_lock);
 			list_add(&cw.list, &mdsc->cap_wait_list);
 			spin_unlock(&mdsc->caps_list_lock);
+#endif
 
 			/* make sure used fmode not timeout */
 			ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
@@ -3097,9 +3101,11 @@ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
 			remove_wait_queue(&ci->i_cap_wq, &wait);
 			ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
 
+#ifdef CONFIG_DEBUG_FS
 			spin_lock(&mdsc->caps_list_lock);
 			list_del(&cw.list);
 			spin_unlock(&mdsc->caps_list_lock);
+#endif
 
 			if (ret == -EAGAIN)
 				continue;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c2157f6e0c69..62238f3e6e19 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5505,7 +5505,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
 	mdsc->last_renew_caps = jiffies;
 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
+#ifdef CONFIG_DEBUG_FS
 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
+#endif
 	spin_lock_init(&mdsc->cap_delay_lock);
 	INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index cfa18cf915a0..9bcc7f181bfe 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -416,6 +416,8 @@ struct ceph_quotarealm_inode {
 	struct inode *inode;
 };
 
+#ifdef CONFIG_DEBUG_FS
+
 struct cap_wait {
 	struct list_head	list;
 	u64			ino;
@@ -424,6 +426,8 @@ struct cap_wait {
 	int			want;
 };
 
+#endif
+
 enum {
 	CEPH_MDSC_STOPPING_BEGIN = 1,
 	CEPH_MDSC_STOPPING_FLUSHING = 2,
@@ -512,7 +516,9 @@ struct ceph_mds_client {
 	spinlock_t	caps_list_lock;
 	struct		list_head caps_list; /* unused (reserved or
 						unreserved) */
+#ifdef CONFIG_DEBUG_FS
 	struct		list_head cap_wait_list;
+#endif
 	int		caps_total_count;    /* total caps allocated */
 	int		caps_use_count;      /* in use */
 	int		caps_use_max;	     /* max used caps */

From 77bb4a501a7756e6f5428b72fb0e420deb7ae562 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Tue, 9 Jul 2024 14:44:00 +0800
Subject: [PATCH 165/346] ceph: convert comma to semicolon in
 __ceph_dentry_dir_lease_touch()

Replace a comma between expression statements by a semicolon.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82a2e2a06a65..e4e7a856e9c2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1589,7 +1589,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
 	}
 
 	spin_lock(&mdsc->dentry_list_lock);
-	__dentry_dir_lease_touch(mdsc, di),
+	__dentry_dir_lease_touch(mdsc, di);
 	spin_unlock(&mdsc->dentry_list_lock);
 }
 

From 578eb54c4a16713d99eed736660e30ae6eb1877f Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 10 Jul 2024 20:16:54 +0800
Subject: [PATCH 166/346] ceph: periodically flush the cap releases

The MDS could be waiting the caps releases infinitely in some corner
case and then reporting the caps revoke stuck warning. To fix this
we should periodically flush the cap releases.

Link: https://tracker.ceph.com/issues/57244
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Venky Shankar <vshankar@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 62238f3e6e19..276e34ab3e2c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5446,6 +5446,8 @@ static void delayed_work(struct work_struct *work)
 		}
 		mutex_unlock(&mdsc->mutex);
 
+		ceph_flush_cap_releases(mdsc, s);
+
 		mutex_lock(&s->s_mutex);
 		if (renew_caps)
 			send_renew_caps(mdsc, s);

From 03230edb0bd831662a7c08b6fef66b2a9a817774 Mon Sep 17 00:00:00 2001
From: ethanwu <ethanwu@synology.com>
Date: Thu, 11 Jul 2024 14:47:56 +0800
Subject: [PATCH 167/346] ceph: fix incorrect kmalloc size of pagevec mempool

The kmalloc size of pagevec mempool is incorrectly calculated.
It misses the size of page pointer and only accounts the number for the array.

Fixes: a0102bda5bc0 ("ceph: move sb->wb_pagevec_pool to be a global mempool")
Signed-off-by: ethanwu <ethanwu@synology.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 885cb5d4e771..0cdf84cd1791 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -961,7 +961,8 @@ static int __init init_caches(void)
 	if (!ceph_mds_request_cachep)
 		goto bad_mds_req;
 
-	ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10, CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT);
+	ceph_wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+	    (CEPH_MAX_WRITE_SIZE >> PAGE_SHIFT) * sizeof(struct page *));
 	if (!ceph_wb_pagevec_pool)
 		goto bad_pagevec_pool;
 

From 65564157ae64cec0f527583f96e32f484f730f92 Mon Sep 17 00:00:00 2001
From: Nitin Gote <nitin.r.gote@intel.com>
Date: Thu, 11 Jul 2024 22:02:08 +0530
Subject: [PATCH 168/346] drm/i915/gt: Do not consider preemption during
 execlists_dequeue for gen8

We're seeing a GPU hang issue on a CHV platform, which was caused by commit
bac24f59f454 ("drm/i915/execlists: Enable coarse preemption boundaries for
Gen8").

The Gen8 platform only supports timeslicing and doesn't have a preemption
mechanism, as its engines do not have a preemption timer.

Commit 751f82b353a6 ("drm/i915/gt: Only disable preemption on Gen8 render
engines") addressed this issue only for render engines. This patch extends
that fix by ensuring that preemption is not considered for all engines on
Gen8 platforms.

v4:
 - Use the correct Fixes tag (Rodrigo Vivi)
 - Reworded commit log (Andi Shyti)

v3:
 - Inside need_preempt(), condition of can_preempt() is not required
   as simplified can_preempt() is enough. (Chris Wilson)

v2: Simplify can_preempt() function (Tvrtko Ursulin)

Fixes: 751f82b353a6 ("drm/i915/gt: Only disable preemption on gen8 render engines")
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11396
Suggested-by: Andi Shyti <andi.shyti@intel.com>
Signed-off-by: Nitin Gote <nitin.r.gote@intel.com>
Cc: Chris Wilson <chris.p.wilson@linux.intel.com>
CC: <stable@vger.kernel.org> # v5.12+
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240711163208.1355736-1-nitin.r.gote@intel.com
(cherry picked from commit 7df0be6e6280c6fca01d039864bb123e5e36604b)
Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
---
 drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 21829439e686..72090f52fb85 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3315,11 +3315,7 @@ static void remove_from_engine(struct i915_request *rq)
 
 static bool can_preempt(struct intel_engine_cs *engine)
 {
-	if (GRAPHICS_VER(engine->i915) > 8)
-		return true;
-
-	/* GPGPU on bdw requires extra w/a; not implemented */
-	return engine->class != RENDER_CLASS;
+	return GRAPHICS_VER(engine->i915) > 8;
 }
 
 static void kick_execlists(const struct i915_request *rq, int prio)

From 26720dd2b5a1d088bff8f7e6355fca021c83718f Mon Sep 17 00:00:00 2001
From: Jonathan Cavitt <jonathan.cavitt@intel.com>
Date: Fri, 12 Jul 2024 14:41:56 -0700
Subject: [PATCH 169/346] drm/i915: Allow NULL memory region

Prevent a NULL pointer access in intel_memory_regions_hw_probe.

Fixes: 05da7d9f717b ("drm/i915/gem: Downgrade stolen lmem setup warning")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11704
Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Nirmoy Das <nirmoy.das@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240712214156.3969584-1-jonathan.cavitt@intel.com
(cherry picked from commit d75dec1fcbcb05b021c08b62551649567ab8955c)
Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
---
 drivers/gpu/drm/i915/intel_memory_region.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c
index 172dfa7c3588..d40ee1b42110 100644
--- a/drivers/gpu/drm/i915/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/intel_memory_region.c
@@ -368,8 +368,10 @@ int intel_memory_regions_hw_probe(struct drm_i915_private *i915)
 			goto out_cleanup;
 		}
 
-		mem->id = i;
-		i915->mm.regions[i] = mem;
+		if (mem) { /* Skip on non-fatal errors */
+			mem->id = i;
+			i915->mm.regions[i] = mem;
+		}
 	}
 
 	for (i = 0; i < ARRAY_SIZE(i915->mm.regions); i++) {

From 368990a7fe30737c990f628a60d26d9854a9e690 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 15 Jul 2024 09:47:08 +0200
Subject: [PATCH 170/346] xen: fix multicall debug data referencing

The recent adding of multicall debug mixed up the referencing of
the debug data. A __percpu tagged pointer can't be initialized with a
plain pointer, so use another percpu variable for the pointer and set
it on each new cpu via a function.

Fixes: 942d917cb92a ("xen: make multicall debug boot time selectable")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202407151106.5s7Mnfpz-lkp@intel.com/
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/multicalls.c | 19 ++++++++++++-------
 arch/x86/xen/smp_pv.c     |  1 +
 arch/x86/xen/xen-ops.h    |  3 +++
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index d4cefd8a9af4..10c660fae8b3 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -54,8 +54,9 @@ struct mc_debug_data {
 
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
 static struct mc_debug_data mc_debug_data_early __initdata;
-static struct mc_debug_data __percpu *mc_debug_data __refdata =
+static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) =
 	&mc_debug_data_early;
+static struct mc_debug_data __percpu *mc_debug_data_ptr;
 DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
 static struct static_key mc_debug __ro_after_init;
@@ -70,16 +71,20 @@ static int __init xen_parse_mc_debug(char *arg)
 }
 early_param("xen_mc_debug", xen_parse_mc_debug);
 
+void mc_percpu_init(unsigned int cpu)
+{
+	per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu);
+}
+
 static int __init mc_debug_enable(void)
 {
-	struct mc_debug_data __percpu *mcdb;
 	unsigned long flags;
 
 	if (!mc_debug_enabled)
 		return 0;
 
-	mcdb = alloc_percpu(struct mc_debug_data);
-	if (!mcdb) {
+	mc_debug_data_ptr = alloc_percpu(struct mc_debug_data);
+	if (!mc_debug_data_ptr) {
 		pr_err("xen_mc_debug inactive\n");
 		static_key_slow_dec(&mc_debug);
 		return -ENOMEM;
@@ -88,7 +93,7 @@ static int __init mc_debug_enable(void)
 	/* Be careful when switching to percpu debug data. */
 	local_irq_save(flags);
 	xen_mc_flush();
-	mc_debug_data = mcdb;
+	mc_percpu_init(0);
 	local_irq_restore(flags);
 
 	pr_info("xen_mc_debug active\n");
@@ -150,7 +155,7 @@ void xen_mc_flush(void)
 	trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
 
 	if (static_key_false(&mc_debug)) {
-		mcdb = this_cpu_ptr(mc_debug_data);
+		mcdb = __this_cpu_read(mc_debug_data);
 		memcpy(mcdb->entries, b->entries,
 		       b->mcidx * sizeof(struct multicall_entry));
 	}
@@ -230,7 +235,7 @@ struct multicall_space __xen_mc_entry(size_t args)
 
 	ret.mc = &b->entries[b->mcidx];
 	if (static_key_false(&mc_debug)) {
-		struct mc_debug_data *mcdb = this_cpu_ptr(mc_debug_data);
+		struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data);
 
 		mcdb->caller[b->mcidx] = __builtin_return_address(0);
 		mcdb->argsz[b->mcidx] = args;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 7ea57f728b89..6863d3da7dec 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -305,6 +305,7 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle)
 		return rc;
 
 	xen_pmu_init(cpu);
+	mc_percpu_init(cpu);
 
 	/*
 	 * Why is this a BUG? If the hypercall fails then everything can be
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index e7775dff9452..0cf16fc79e0b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -257,6 +257,9 @@ void xen_mc_callback(void (*fn)(void *), void *data);
  */
 struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
 
+/* Do percpu data initialization for multicalls. */
+void mc_percpu_init(unsigned int cpu);
+
 extern bool is_xen_pmu;
 
 irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);

From e5f98896efb3b6350cb6f1c241394966dcbcf240 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 18 Jul 2024 21:00:35 +0200
Subject: [PATCH 171/346] thermal: trip: Split thermal_zone_device_set_mode()

Pull a wrapper around thermal zone .change_mode() callback out of
thermal_zone_device_set_mode() because it will be used elsewhere
subsequently.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://patch.msgid.link/2206793.irdbgypaU6@rjwysocki.net
---
 drivers/thermal/thermal_core.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index f6e700e48aad..48bf267d090d 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -272,6 +272,22 @@ static int __init thermal_register_governors(void)
 	return ret;
 }
 
+static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
+					  enum thermal_device_mode mode)
+{
+	if (tz->ops.change_mode) {
+		int ret;
+
+		ret = tz->ops.change_mode(tz, mode);
+		if (ret)
+			return ret;
+	}
+
+	tz->mode = mode;
+
+	return 0;
+}
+
 /*
  * Zone update section: main control loop applied to each zone while monitoring
  * in polling mode. The monitoring is done using a workqueue.
@@ -540,7 +556,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
 					enum thermal_device_mode mode)
 {
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&tz->lock);
 
@@ -548,14 +564,15 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
 	if (mode == tz->mode) {
 		mutex_unlock(&tz->lock);
 
-		return ret;
+		return 0;
 	}
 
-	if (tz->ops.change_mode)
-		ret = tz->ops.change_mode(tz, mode);
+	ret = __thermal_zone_device_set_mode(tz, mode);
+	if (ret) {
+		mutex_unlock(&tz->lock);
 
-	if (!ret)
-		tz->mode = mode;
+		return ret;
+	}
 
 	__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
@@ -566,7 +583,7 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
 	else
 		thermal_notify_tz_disable(tz);
 
-	return ret;
+	return 0;
 }
 
 int thermal_zone_device_enable(struct thermal_zone_device *tz)

From b8a4518b5c2d1164f9bb2e586733a658c5239adf Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Tue, 23 Jul 2024 10:41:52 +0100
Subject: [PATCH 172/346] drbd: Add peer_device to Kernel doc

Add missing documentation of peer_device parameter to Kernel doc.

These parameters were added in commit 8164dd6c8ae1 ("drbd: Add peer
device parameter to whole-bitmap I/O handlers")

Flagged by W=1 builds.

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240723-drbd-doc-v1-1-a04d9b7a9688@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index f92673f05c7a..a9e49b212341 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3422,6 +3422,7 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
 /**
  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
  * @device:	DRBD device.
+ * @peer_device: Peer DRBD device.
  *
  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
  */
@@ -3448,6 +3449,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device,
 /**
  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
  * @device:	DRBD device.
+ * @peer_device: Peer DRBD device.
  *
  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
  */
@@ -3501,6 +3503,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
  * @done:	callback to be called after the bitmap IO was performed
  * @why:	Descriptive text of the reason for doing the IO
  * @flags:	Bitmap flags
+ * @peer_device: Peer DRBD device.
  *
  * While IO on the bitmap happens we freeze application IO thus we ensure
  * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
@@ -3549,6 +3552,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
  * @io_fn:	IO callback to be called when bitmap IO is possible
  * @why:	Descriptive text of the reason for doing the IO
  * @flags:	Bitmap flags
+ * @peer_device: Peer DRBD device.
  *
  * freezes application IO while that the actual IO operations runs. This
  * functions MAY NOT be called from worker context.

From 57e29f4c8919af7276c35c2da0ae5efb4c36c33e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 15 Jul 2024 10:33:06 +0200
Subject: [PATCH 173/346] s390: Add runtime constant support

Implement the runtime constant infrastructure for s390, allowing the
dcache d_hash() function to be generated using as a constant for hash
table address followed by shift by a constant of the hash index.

This is the s390 variant of commit 94a2bc0f611c ("arm64: add 'runtime
constant' support") and commit e3c92e81711d ("runtime constants: add
x86 architecture support").

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/runtime-const.h | 77 +++++++++++++++++++++++++++
 arch/s390/kernel/vmlinux.lds.S        |  3 ++
 2 files changed, 80 insertions(+)
 create mode 100644 arch/s390/include/asm/runtime-const.h

diff --git a/arch/s390/include/asm/runtime-const.h b/arch/s390/include/asm/runtime-const.h
new file mode 100644
index 000000000000..17878b1d048c
--- /dev/null
+++ b/arch/s390/include/asm/runtime-const.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_RUNTIME_CONST_H
+#define _ASM_S390_RUNTIME_CONST_H
+
+#include <linux/uaccess.h>
+
+#define runtime_const_ptr(sym)					\
+({								\
+	typeof(sym) __ret;					\
+								\
+	asm_inline(						\
+		"0:	iihf	%[__ret],%[c1]\n"		\
+		"	iilf	%[__ret],%[c2]\n"		\
+		".pushsection runtime_ptr_" #sym ",\"a\"\n"	\
+		".long 0b - .\n"				\
+		".popsection"					\
+		: [__ret] "=d" (__ret)				\
+		: [c1] "i" (0x01234567UL),			\
+		  [c2] "i" (0x89abcdefUL));			\
+	__ret;							\
+})
+
+#define runtime_const_shift_right_32(val, sym)			\
+({								\
+	unsigned int __ret = (val);				\
+								\
+	asm_inline(						\
+		"0:	srl	%[__ret],12\n"			\
+		".pushsection runtime_shift_" #sym ",\"a\"\n"	\
+		".long 0b - .\n"				\
+		".popsection"					\
+		: [__ret] "+d" (__ret));			\
+	__ret;							\
+})
+
+#define runtime_const_init(type, sym) do {			\
+	extern s32 __start_runtime_##type##_##sym[];		\
+	extern s32 __stop_runtime_##type##_##sym[];		\
+								\
+	runtime_const_fixup(__runtime_fixup_##type,		\
+			    (unsigned long)(sym),		\
+			    __start_runtime_##type##_##sym,	\
+			    __stop_runtime_##type##_##sym);	\
+} while (0)
+
+/* 32-bit immediate for iihf and iilf in bits in I2 field */
+static inline void __runtime_fixup_32(u32 *p, unsigned int val)
+{
+	s390_kernel_write(p, &val, sizeof(val));
+}
+
+static inline void __runtime_fixup_ptr(void *where, unsigned long val)
+{
+	__runtime_fixup_32(where + 2, val >> 32);
+	__runtime_fixup_32(where + 8, val);
+}
+
+/* Immediate value is lower 12 bits of D2 field of srl */
+static inline void __runtime_fixup_shift(void *where, unsigned long val)
+{
+	u32 insn = *(u32 *)where;
+
+	insn &= 0xfffff000;
+	insn |= (val & 63);
+	s390_kernel_write(where, &insn, sizeof(insn));
+}
+
+static inline void runtime_const_fixup(void (*fn)(void *, unsigned long),
+				       unsigned long val, s32 *start, s32 *end)
+{
+	while (start < end) {
+		fn(*start + (void *)start, val);
+		start++;
+	}
+}
+
+#endif /* _ASM_S390_RUNTIME_CONST_H */
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index a1ce3925ec71..5128ccee9c67 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -190,6 +190,9 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	INIT_DATA_SECTION(0x100)
 
+	RUNTIME_CONST(shift, d_hash_shift)
+	RUNTIME_CONST(ptr, dentry_hashtable)
+
 	PERCPU_SECTION(0x100)
 
 	. = ALIGN(PAGE_SIZE);

From 5fd11b96b43708f2f6e3964412c301c1bd20ec0f Mon Sep 17 00:00:00 2001
From: Gerd Bayer <gbayer@linux.ibm.com>
Date: Thu, 11 Jul 2024 15:45:26 +0200
Subject: [PATCH 174/346] s390/pci: Refactor arch_setup_msi_irqs()

Factor out adapter interrupt allocation from arch_setup_msi_irqs() in
preparation for enabling registration of multiple MSIs. Code movement
only, no change of functionality intended.

Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/pci/pci_irq.c | 54 ++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index 0ef83b6ac0db..979f776b09b8 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -268,33 +268,20 @@ static void zpci_floating_irq_handler(struct airq_struct *airq,
 	}
 }
 
-int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
+			unsigned long *bit)
 {
-	struct zpci_dev *zdev = to_zpci(pdev);
-	unsigned int hwirq, msi_vecs, cpu;
-	unsigned long bit;
-	struct msi_desc *msi;
-	struct msi_msg msg;
-	int cpu_addr;
-	int rc, irq;
-
-	zdev->aisb = -1UL;
-	zdev->msi_first_bit = -1U;
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
-	msi_vecs = min_t(unsigned int, nvec, zdev->max_msi);
-
 	if (irq_delivery == DIRECTED) {
 		/* Allocate cpu vector bits */
-		bit = airq_iv_alloc(zpci_ibv[0], msi_vecs);
-		if (bit == -1UL)
+		*bit = airq_iv_alloc(zpci_ibv[0], msi_vecs);
+		if (*bit == -1UL)
 			return -EIO;
 	} else {
 		/* Allocate adapter summary indicator bit */
-		bit = airq_iv_alloc_bit(zpci_sbv);
-		if (bit == -1UL)
+		*bit = airq_iv_alloc_bit(zpci_sbv);
+		if (*bit == -1UL)
 			return -EIO;
-		zdev->aisb = bit;
+		zdev->aisb = *bit;
 
 		/* Create adapter interrupt vector */
 		zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
@@ -302,10 +289,33 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 			return -ENOMEM;
 
 		/* Wire up shortcut pointer */
-		zpci_ibv[bit] = zdev->aibv;
+		zpci_ibv[*bit] = zdev->aibv;
 		/* Each function has its own interrupt vector */
-		bit = 0;
+		*bit = 0;
 	}
+	return 0;
+}
+
+int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct zpci_dev *zdev = to_zpci(pdev);
+	unsigned int hwirq, msi_vecs, cpu;
+	struct msi_desc *msi;
+	struct msi_msg msg;
+	unsigned long bit;
+	int cpu_addr;
+	int rc, irq;
+
+	zdev->aisb = -1UL;
+	zdev->msi_first_bit = -1U;
+
+	if (type == PCI_CAP_ID_MSI && nvec > 1)
+		return 1;
+	msi_vecs = min_t(unsigned int, nvec, zdev->max_msi);
+
+	rc = __alloc_airq(zdev, msi_vecs, &bit);
+	if (rc < 0)
+		return rc;
 
 	/* Request MSI interrupts */
 	hwirq = bit;

From ab42fcb511fd9d241bbab7cc3ca04e34e9fc0666 Mon Sep 17 00:00:00 2001
From: Gerd Bayer <gbayer@linux.ibm.com>
Date: Thu, 11 Jul 2024 15:45:27 +0200
Subject: [PATCH 175/346] s390/pci: Allow allocation of more than 1 MSI
 interrupt

On a PCI adapter that provides up to 8 MSI interrupt sources the s390
implementation of PCI interrupts rejected to accommodate them, although
the underlying hardware is able to support that.

For MSI-X it is sufficient to allocate a single irq_desc per msi_desc,
but for MSI multiple irq descriptors are attached to and controlled by
a single msi descriptor. Add the appropriate loops to maintain multiple
irq descriptors and tie/untie them to/from the appropriate AIBV bit, if
a device driver allocates more than 1 MSI interrupt.

Common PCI code passes on requests to allocate a number of interrupt
vectors based on the device drivers' demand and the PCI functions'
capabilities. However, the root-complex of s390 systems support just a
limited number of interrupt vectors per PCI function.
Produce a kernel log message to inform about any architecture-specific
capping that might be done.

With this change, we had a PCI adapter successfully raising
interrupts to its device driver via all 8 sources.

Fixes: a384c8924a8b ("s390/PCI: Fix single MSI only check")
Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com>
Reviewed-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/pci/pci_irq.c | 62 ++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
index 979f776b09b8..84482a921332 100644
--- a/arch/s390/pci/pci_irq.c
+++ b/arch/s390/pci/pci_irq.c
@@ -298,8 +298,8 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
 
 int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
+	unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
 	struct zpci_dev *zdev = to_zpci(pdev);
-	unsigned int hwirq, msi_vecs, cpu;
 	struct msi_desc *msi;
 	struct msi_msg msg;
 	unsigned long bit;
@@ -309,30 +309,46 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 	zdev->aisb = -1UL;
 	zdev->msi_first_bit = -1U;
 
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
 	msi_vecs = min_t(unsigned int, nvec, zdev->max_msi);
+	if (msi_vecs < nvec) {
+		pr_info("%s requested %d irqs, allocate system limit of %d",
+			pci_name(pdev), nvec, zdev->max_msi);
+	}
 
 	rc = __alloc_airq(zdev, msi_vecs, &bit);
 	if (rc < 0)
 		return rc;
 
-	/* Request MSI interrupts */
+	/*
+	 * Request MSI interrupts:
+	 * When using MSI, nvec_used interrupt sources and their irq
+	 * descriptors are controlled through one msi descriptor.
+	 * Thus the outer loop over msi descriptors shall run only once,
+	 * while two inner loops iterate over the interrupt vectors.
+	 * When using MSI-X, each interrupt vector/irq descriptor
+	 * is bound to exactly one msi descriptor (nvec_used is one).
+	 * So the inner loops are executed once, while the outer iterates
+	 * over the MSI-X descriptors.
+	 */
 	hwirq = bit;
 	msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) {
-		rc = -EIO;
 		if (hwirq - bit >= msi_vecs)
 			break;
-		irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE,
-				(irq_delivery == DIRECTED) ?
-				msi->affinity : NULL);
+		irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used);
+		irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE,
+					(irq_delivery == DIRECTED) ?
+					msi->affinity : NULL);
 		if (irq < 0)
 			return -ENOMEM;
-		rc = irq_set_msi_desc(irq, msi);
-		if (rc)
-			return rc;
-		irq_set_chip_and_handler(irq, &zpci_irq_chip,
-					 handle_percpu_irq);
+
+		for (i = 0; i < irqs_per_msi; i++) {
+			rc = irq_set_msi_desc_off(irq, i, msi);
+			if (rc)
+				return rc;
+			irq_set_chip_and_handler(irq + i, &zpci_irq_chip,
+						 handle_percpu_irq);
+		}
+
 		msg.data = hwirq - bit;
 		if (irq_delivery == DIRECTED) {
 			if (msi->affinity)
@@ -345,31 +361,35 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 			msg.address_lo |= (cpu_addr << 8);
 
 			for_each_possible_cpu(cpu) {
-				airq_iv_set_data(zpci_ibv[cpu], hwirq, irq);
+				for (i = 0; i < irqs_per_msi; i++)
+					airq_iv_set_data(zpci_ibv[cpu],
+							 hwirq + i, irq + i);
 			}
 		} else {
 			msg.address_lo = zdev->msi_addr & 0xffffffff;
-			airq_iv_set_data(zdev->aibv, hwirq, irq);
+			for (i = 0; i < irqs_per_msi; i++)
+				airq_iv_set_data(zdev->aibv, hwirq + i, irq + i);
 		}
 		msg.address_hi = zdev->msi_addr >> 32;
 		pci_write_msi_msg(irq, &msg);
-		hwirq++;
+		hwirq += irqs_per_msi;
 	}
 
 	zdev->msi_first_bit = bit;
-	zdev->msi_nr_irqs = msi_vecs;
+	zdev->msi_nr_irqs = hwirq - bit;
 
 	rc = zpci_set_irq(zdev);
 	if (rc)
 		return rc;
 
-	return (msi_vecs == nvec) ? 0 : msi_vecs;
+	return (zdev->msi_nr_irqs == nvec) ? 0 : zdev->msi_nr_irqs;
 }
 
 void arch_teardown_msi_irqs(struct pci_dev *pdev)
 {
 	struct zpci_dev *zdev = to_zpci(pdev);
 	struct msi_desc *msi;
+	unsigned int i;
 	int rc;
 
 	/* Disable interrupts */
@@ -379,8 +399,10 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev)
 
 	/* Release MSI interrupts */
 	msi_for_each_desc(msi, &pdev->dev, MSI_DESC_ASSOCIATED) {
-		irq_set_msi_desc(msi->irq, NULL);
-		irq_free_desc(msi->irq);
+		for (i = 0; i < msi->nvec_used; i++) {
+			irq_set_msi_desc(msi->irq + i, NULL);
+			irq_free_desc(msi->irq + i);
+		}
 		msi->msg.address_lo = 0;
 		msi->msg.address_hi = 0;
 		msi->msg.data = 0;

From e188e5d5ffd01d484b5255b88739fcf67b300223 Mon Sep 17 00:00:00 2001
From: Vasily Gorbik <gor@linux.ibm.com>
Date: Thu, 11 Jul 2024 15:50:26 +0200
Subject: [PATCH 176/346] s390/setup: Fix __pa/__va for modules under non-GPL
 licenses

The struct vm_layout contains fields used in __pa/__va calculations. Such
fundamental things have to be exported with EXPORT_SYMBOL to avoid
breakages of out-of-tree modules under non-GPL licenses.

Fixes: 7de0446f0b26 ("s390/boot: Make identity mapping base address explicit")
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 3993f4caf224..1faba11d5f0b 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -149,7 +149,7 @@ unsigned long __bootdata_preserved(max_mappable);
 struct physmem_info __bootdata(physmem_info);
 
 struct vm_layout __bootdata_preserved(vm_layout);
-EXPORT_SYMBOL_GPL(vm_layout);
+EXPORT_SYMBOL(vm_layout);
 int __bootdata_preserved(__kaslr_enabled);
 unsigned int __bootdata_preserved(zlib_dfltcc_support);
 EXPORT_SYMBOL(zlib_dfltcc_support);

From ec25f99cc834644e6577fa11582f7691589ed8cc Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Tue, 23 Jul 2024 14:44:11 +0200
Subject: [PATCH 177/346] s390/kmsan: Fix merge conflict with get_lowcore()
 introduction

Resolve the conflict between commit 2a48c8c9cf87 ("s390/kmsan:
implement the architecture-specific functions") and commit 39976f1278a9
("s390: Remove S390_lowcore").

Fixes: 2a48c8c9cf87 ("s390/kmsan: implement the architecture-specific functions")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20240723124441.120044-2-iii@linux.ibm.com
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/kmsan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/kmsan.h b/arch/s390/include/asm/kmsan.h
index 27db65fbf3f6..f73e181d09ae 100644
--- a/arch/s390/include/asm/kmsan.h
+++ b/arch/s390/include/asm/kmsan.h
@@ -12,8 +12,8 @@
 
 static inline bool is_lowcore_addr(void *addr)
 {
-	return addr >= (void *)&S390_lowcore &&
-	       addr < (void *)(&S390_lowcore + 1);
+	return addr >= (void *)get_lowcore() &&
+	       addr < (void *)(get_lowcore() + 1);
 }
 
 static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin)
@@ -25,7 +25,7 @@ static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin)
 		 * order to get a distinct struct page.
 		 */
 		addr += (void *)lowcore_ptr[raw_smp_processor_id()] -
-			(void *)&S390_lowcore;
+			(void *)get_lowcore();
 		if (KMSAN_WARN_ON(is_lowcore_addr(addr)))
 			return NULL;
 		return kmsan_get_metadata(addr, is_origin);

From 19af288706b25f2213e85b8b2df140c04fd7c63d Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Tue, 23 Jul 2024 14:44:12 +0200
Subject: [PATCH 178/346] s390/ptdump: Add KMSAN page markers

Add KMSAN vmalloc metadata areas to
/sys/kernel/debug/kernel_page_tables. Example output:

    0x000003a95fff9000-0x000003a960000000        28K PTE I
    ---[ vmalloc Area End ]---
    ---[ Kmsan vmalloc Shadow Start ]---
    0x000003a960000000-0x000003a960010000        64K PTE RW NX
    [...]
    0x000003d3dfff9000-0x000003d3e0000000        28K PTE I
    ---[ Kmsan vmalloc Shadow End ]---
    ---[ Kmsan vmalloc Origins Start ]---
    0x000003d3e0000000-0x000003d3e0010000        64K PTE RW NX
    [...]
    0x000003fe5fff9000-0x000003fe60000000        28K PTE I
    ---[ Kmsan vmalloc Origins End ]---
    ---[ Kmsan Modules Shadow Start ]---
    0x000003fe60000000-0x000003fe60001000         4K PTE RW NX
    [...]
    0x000003fe60100000-0x000003fee0000000      2047M PMD I
    ---[ Kmsan Modules Shadow End ]---
    ---[ Kmsan Modules Origins Start ]---
    0x000003fee0000000-0x000003fee0001000         4K PTE RW NX
    [...]
    0x000003fee0100000-0x000003ff60000000      2047M PMD I
    ---[ Kmsan Modules Origins End ]---
    ---[ Modules Area Start ]---
    0x000003ff60000000-0x000003ff60001000         4K PTE RO X

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Link: https://lore.kernel.org/r/20240723124441.120044-3-iii@linux.ibm.com
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/mm/dump_pagetables.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 45db5f47b22d..98dab3e049de 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -36,6 +36,16 @@ enum address_markers_idx {
 	VMEMMAP_END_NR,
 	VMALLOC_NR,
 	VMALLOC_END_NR,
+#ifdef CONFIG_KMSAN
+	KMSAN_VMALLOC_SHADOW_START_NR,
+	KMSAN_VMALLOC_SHADOW_END_NR,
+	KMSAN_VMALLOC_ORIGIN_START_NR,
+	KMSAN_VMALLOC_ORIGIN_END_NR,
+	KMSAN_MODULES_SHADOW_START_NR,
+	KMSAN_MODULES_SHADOW_END_NR,
+	KMSAN_MODULES_ORIGIN_START_NR,
+	KMSAN_MODULES_ORIGIN_END_NR,
+#endif
 	MODULES_NR,
 	MODULES_END_NR,
 	ABS_LOWCORE_NR,
@@ -65,6 +75,16 @@ static struct addr_marker address_markers[] = {
 	[VMEMMAP_END_NR]	= {0, "vmemmap Area End"},
 	[VMALLOC_NR]		= {0, "vmalloc Area Start"},
 	[VMALLOC_END_NR]	= {0, "vmalloc Area End"},
+#ifdef CONFIG_KMSAN
+	[KMSAN_VMALLOC_SHADOW_START_NR]	= {0, "Kmsan vmalloc Shadow Start"},
+	[KMSAN_VMALLOC_SHADOW_END_NR]	= {0, "Kmsan vmalloc Shadow End"},
+	[KMSAN_VMALLOC_ORIGIN_START_NR]	= {0, "Kmsan vmalloc Origins Start"},
+	[KMSAN_VMALLOC_ORIGIN_END_NR]	= {0, "Kmsan vmalloc Origins End"},
+	[KMSAN_MODULES_SHADOW_START_NR]	= {0, "Kmsan Modules Shadow Start"},
+	[KMSAN_MODULES_SHADOW_END_NR]	= {0, "Kmsan Modules Shadow End"},
+	[KMSAN_MODULES_ORIGIN_START_NR]	= {0, "Kmsan Modules Origins Start"},
+	[KMSAN_MODULES_ORIGIN_END_NR]	= {0, "Kmsan Modules Origins End"},
+#endif
 	[MODULES_NR]		= {0, "Modules Area Start"},
 	[MODULES_END_NR]	= {0, "Modules Area End"},
 	[ABS_LOWCORE_NR]	= {0, "Lowcore Area Start"},
@@ -306,6 +326,16 @@ static int pt_dump_init(void)
 #ifdef CONFIG_KFENCE
 	address_markers[KFENCE_START_NR].start_address = kfence_start;
 	address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE;
+#endif
+#ifdef CONFIG_KMSAN
+	address_markers[KMSAN_VMALLOC_SHADOW_START_NR].start_address = KMSAN_VMALLOC_SHADOW_START;
+	address_markers[KMSAN_VMALLOC_SHADOW_END_NR].start_address = KMSAN_VMALLOC_SHADOW_END;
+	address_markers[KMSAN_VMALLOC_ORIGIN_START_NR].start_address = KMSAN_VMALLOC_ORIGIN_START;
+	address_markers[KMSAN_VMALLOC_ORIGIN_END_NR].start_address = KMSAN_VMALLOC_ORIGIN_END;
+	address_markers[KMSAN_MODULES_SHADOW_START_NR].start_address = KMSAN_MODULES_SHADOW_START;
+	address_markers[KMSAN_MODULES_SHADOW_END_NR].start_address = KMSAN_MODULES_SHADOW_END;
+	address_markers[KMSAN_MODULES_ORIGIN_START_NR].start_address = KMSAN_MODULES_ORIGIN_START;
+	address_markers[KMSAN_MODULES_ORIGIN_END_NR].start_address = KMSAN_MODULES_ORIGIN_END;
 #endif
 	sort_address_markers();
 #ifdef CONFIG_PTDUMP_DEBUGFS

From e6ce1f12d777f6ee22b20e10ae6a771e7e6f44f5 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Mon, 15 Jul 2024 12:07:29 +0200
Subject: [PATCH 179/346] s390/cpum_cf: Fix endless loop in CF_DIAG event stop

Event CF_DIAG reads out complete counter sets using stcctm
instruction. This is done at event start time when the process
starts execution and at event stop time when the process is
removed from the CPU. During removal the difference of each
counter in the counter sets is calculated and saved as raw data
in the ring buffer. This works fine unless the number of counters
in a counter set is zero. This may happen for the extended counter
set. This set is machine specific and the size of the counter
set can be zero even when extended counter set is authorized for
read access.

This case is not handled. cfdiag_diffctr() checks authorization
of the extended counter set. If true the functions assumes
the extended counter set has been saved in a data buffer. However
this is not the case, cfdiag_getctrset() does not save a counter
set with counter set size of zero. This mismatch causes an endless
loop in the counter set readout during event stop handling.

The calculation of the difference of the counters in each counter
now verifies the size of the counter set is non-zero. A counter set
with size zero is skipped.

Fixes: a029a4eab39e ("s390/cpumf: Allow concurrent access for CPU Measurement Counter Facility")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Acked-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 1434642e9cba..6968be98af11 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -556,25 +556,31 @@ static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth)
 	struct cf_trailer_entry *trailer_start, *trailer_stop;
 	struct cf_ctrset_entry *ctrstart, *ctrstop;
 	size_t offset = 0;
+	int i;
 
-	auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1;
-	do {
+	for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) {
 		ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset);
 		ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset);
 
+		/* Counter set not authorized */
+		if (!(auth & cpumf_ctr_ctl[i]))
+			continue;
+		/* Counter set size zero was not saved */
+		if (!cpum_cf_read_setsize(i))
+			continue;
+
 		if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) {
 			pr_err_once("cpum_cf_diag counter set compare error "
 				    "in set %i\n", ctrstart->set);
 			return 0;
 		}
-		auth &= ~cpumf_ctr_ctl[ctrstart->set];
 		if (ctrstart->def == CF_DIAG_CTRSET_DEF) {
 			cfdiag_diffctrset((u64 *)(ctrstart + 1),
 					  (u64 *)(ctrstop + 1), ctrstart->ctr);
 			offset += ctrstart->ctr * sizeof(u64) +
 							sizeof(*ctrstart);
 		}
-	} while (ctrstart->def && auth);
+	}
 
 	/* Save time_stamp from start of event in stop's trailer */
 	trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset);

From b798b685b42c9dbe508e59a74250d97c41bec35e Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Wed, 17 Jul 2024 21:43:22 +0200
Subject: [PATCH 180/346] s390/boot: Do not assume the decompressor range is
 reserved

When allocating a random memory range for .amode31 sections
the minimal randomization address is 0. That does not lead
to a possible overlap with the decompressor image (which also
starts from 0) since by that time the image range is already
reserved.

Do not assume the decompressor range is reserved and always
provide the minimal randomization address for .amode31
sections beyond the decompressor. That is a prerequisite
for moving the lowcore memory address from NULL elsewhere.

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/boot/startup.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index c59014945af0..cc8753c0c121 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -478,8 +478,12 @@ void startup_kernel(void)
 	 * before the kernel started. Therefore, in case the two sections
 	 * overlap there is no risk of corrupting any data.
 	 */
-	if (kaslr_enabled())
-		amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, 0, SZ_2G);
+	if (kaslr_enabled()) {
+		unsigned long amode31_min;
+
+		amode31_min = (unsigned long)_decompressor_end;
+		amode31_lma = randomize_within_range(vmlinux.amode31_size, PAGE_SIZE, amode31_min, SZ_2G);
+	}
 	if (!amode31_lma)
 		amode31_lma = __kaslr_offset_phys - vmlinux.amode31_size;
 	physmem_reserve(RR_AMODE31, amode31_lma, vmlinux.amode31_size);

From a795eeaf851b91c79fc357a245ca72fd1e7df906 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 16 Jul 2024 09:26:13 +0200
Subject: [PATCH 181/346] s390/smp: Handle restart interrupt on ipl cpu

The current smp code allows to trigger a restart interrupt on CPUs
offline in linux. To allow using the percpu infrastructure instead
of the pcpu_devices array, switch to the ipl cpu which is always
online before calling do_restart().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/smp.h |  1 -
 arch/s390/kernel/ipl.c      |  2 +-
 arch/s390/kernel/smp.c      | 15 ---------------
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index c13c79025348..cd835f4fb11a 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -24,7 +24,6 @@ extern int __cpu_up(unsigned int cpu, struct task_struct *tidle);
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
-extern void smp_call_online_cpu(void (*func)(void *), void *);
 extern void smp_call_ipl_cpu(void (*func)(void *), void *);
 extern void smp_emergency_stop(void);
 
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 3a7d6e172211..f17bb7bf9392 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -2112,7 +2112,7 @@ void do_restart(void *arg)
 	tracing_off();
 	debug_locks_off();
 	lgr_info_log();
-	smp_call_online_cpu(__do_restart, arg);
+	smp_call_ipl_cpu(__do_restart, arg);
 }
 
 /* on halt */
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index c3c54adf67bc..1e1290525423 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -382,21 +382,6 @@ static int pcpu_set_smt(unsigned int mtid)
 	return cc;
 }
 
-/*
- * Call function on an online CPU.
- */
-void smp_call_online_cpu(void (*func)(void *), void *data)
-{
-	struct pcpu *pcpu;
-
-	/* Use the current cpu if it is online. */
-	pcpu = pcpu_find_address(cpu_online_mask, stap());
-	if (!pcpu)
-		/* Use the first online cpu. */
-		pcpu = pcpu_devices + cpumask_first(cpu_online_mask);
-	pcpu_delegate(pcpu, func, data, (unsigned long) restart_stack);
-}
-
 /*
  * Call function on the ipl CPU.
  */

From 90fc5ac28235843b1d5070c5dac9b01e7d39b24c Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 16 Jul 2024 09:26:14 +0200
Subject: [PATCH 182/346] s390/smp: Switch pcpu_devices to percpu

In preparation of moving the CIF flags from lowcore to pcpu_devices,
convert the pcpu_devices array to use the percpu infrastructure.
This is required because using the pcpu_devices array as it is would
introduce a performance penalty due to the fact that CPU flags for
multiple CPUs would end up in the same cacheline.

Note that a pointer to the pcpu struct of the IPL CPU is still required.
This is because a restart interrupt can be triggered on an offline CPU.
s390 stores the percpu offset in lowcore, but offline CPUs have no
lowcore area allocated. So percpu data cannot be used from an offline
CPU and we need to get the pcpu pointer for the IPL cpu from somewhere
else.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/smp.c | 113 +++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 1e1290525423..b36b089b9a26 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -83,7 +83,14 @@ struct pcpu {
 };
 
 static u8 boot_core_type;
-static struct pcpu pcpu_devices[NR_CPUS];
+static DEFINE_PER_CPU(struct pcpu, pcpu_devices);
+/*
+ * Pointer to the pcpu area of the boot CPU. This is required when a restart
+ * interrupt is triggered on an offline CPU. For that case accessing percpu
+ * data with the common primitives does not work, since the percpu offset is
+ * stored in a non existent lowcore.
+ */
+static struct pcpu *ipl_pcpu;
 
 unsigned int smp_cpu_mt_shift;
 EXPORT_SYMBOL(smp_cpu_mt_shift);
@@ -174,8 +181,8 @@ static struct pcpu *pcpu_find_address(const struct cpumask *mask, u16 address)
 	int cpu;
 
 	for_each_cpu(cpu, mask)
-		if (pcpu_devices[cpu].address == address)
-			return pcpu_devices + cpu;
+		if (per_cpu(pcpu_devices, cpu).address == address)
+			return &per_cpu(pcpu_devices, cpu);
 	return NULL;
 }
 
@@ -230,13 +237,11 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	return -ENOMEM;
 }
 
-static void pcpu_free_lowcore(struct pcpu *pcpu)
+static void pcpu_free_lowcore(struct pcpu *pcpu, int cpu)
 {
 	unsigned long async_stack, nodat_stack, mcck_stack;
 	struct lowcore *lc;
-	int cpu;
 
-	cpu = pcpu - pcpu_devices;
 	lc = lowcore_ptr[cpu];
 	nodat_stack = lc->nodat_stack - STACK_INIT_OFFSET;
 	async_stack = lc->async_stack - STACK_INIT_OFFSET;
@@ -277,12 +282,10 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	arch_spin_lock_setup(cpu);
 }
 
-static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
+static void pcpu_attach_task(int cpu, struct task_struct *tsk)
 {
 	struct lowcore *lc;
-	int cpu;
 
-	cpu = pcpu - pcpu_devices;
 	lc = lowcore_ptr[cpu];
 	lc->kernel_stack = (unsigned long)task_stack_page(tsk) + STACK_INIT_OFFSET;
 	lc->current_task = (unsigned long)tsk;
@@ -296,18 +299,16 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
 	lc->steal_timer = 0;
 }
 
-static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data)
+static void pcpu_start_fn(int cpu, void (*func)(void *), void *data)
 {
 	struct lowcore *lc;
-	int cpu;
 
-	cpu = pcpu - pcpu_devices;
 	lc = lowcore_ptr[cpu];
 	lc->restart_stack = lc->kernel_stack;
 	lc->restart_fn = (unsigned long) func;
 	lc->restart_data = (unsigned long) data;
 	lc->restart_source = -1U;
-	pcpu_sigp_retry(pcpu, SIGP_RESTART, 0);
+	pcpu_sigp_retry(per_cpu_ptr(&pcpu_devices, cpu), SIGP_RESTART, 0);
 }
 
 typedef void (pcpu_delegate_fn)(void *);
@@ -320,14 +321,14 @@ static void __pcpu_delegate(pcpu_delegate_fn *func, void *data)
 	func(data);	/* should not return */
 }
 
-static void pcpu_delegate(struct pcpu *pcpu,
+static void pcpu_delegate(struct pcpu *pcpu, int cpu,
 			  pcpu_delegate_fn *func,
 			  void *data, unsigned long stack)
 {
 	struct lowcore *lc, *abs_lc;
 	unsigned int source_cpu;
 
-	lc = lowcore_ptr[pcpu - pcpu_devices];
+	lc = lowcore_ptr[cpu];
 	source_cpu = stap();
 
 	if (pcpu->address == source_cpu) {
@@ -377,7 +378,7 @@ static int pcpu_set_smt(unsigned int mtid)
 		smp_cpu_mt_shift = 0;
 		while (smp_cpu_mtid >= (1U << smp_cpu_mt_shift))
 			smp_cpu_mt_shift++;
-		pcpu_devices[0].address = stap();
+		per_cpu(pcpu_devices, 0).address = stap();
 	}
 	return cc;
 }
@@ -389,11 +390,10 @@ void smp_call_ipl_cpu(void (*func)(void *), void *data)
 {
 	struct lowcore *lc = lowcore_ptr[0];
 
-	if (pcpu_devices[0].address == stap())
+	if (ipl_pcpu->address == stap())
 		lc = get_lowcore();
 
-	pcpu_delegate(&pcpu_devices[0], func, data,
-		      lc->nodat_stack);
+	pcpu_delegate(ipl_pcpu, 0, func, data, lc->nodat_stack);
 }
 
 int smp_find_processor_id(u16 address)
@@ -401,21 +401,21 @@ int smp_find_processor_id(u16 address)
 	int cpu;
 
 	for_each_present_cpu(cpu)
-		if (pcpu_devices[cpu].address == address)
+		if (per_cpu(pcpu_devices, cpu).address == address)
 			return cpu;
 	return -1;
 }
 
 void schedule_mcck_handler(void)
 {
-	pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_mcck_pending);
+	pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_mcck_pending);
 }
 
 bool notrace arch_vcpu_is_preempted(int cpu)
 {
 	if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu))
 		return false;
-	if (pcpu_running(pcpu_devices + cpu))
+	if (pcpu_running(per_cpu_ptr(&pcpu_devices, cpu)))
 		return false;
 	return true;
 }
@@ -427,7 +427,7 @@ void notrace smp_yield_cpu(int cpu)
 		return;
 	diag_stat_inc_norecursion(DIAG_STAT_X09C);
 	asm volatile("diag %0,0,0x9c"
-		     : : "d" (pcpu_devices[cpu].address));
+		     : : "d" (per_cpu(pcpu_devices, cpu).address));
 }
 EXPORT_SYMBOL_GPL(smp_yield_cpu);
 
@@ -448,7 +448,7 @@ void notrace smp_emergency_stop(void)
 
 	end = get_tod_clock() + (1000000UL << 12);
 	for_each_cpu(cpu, &cpumask) {
-		struct pcpu *pcpu = pcpu_devices + cpu;
+		struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 		set_bit(ec_stop_cpu, &pcpu->ec_mask);
 		while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL,
 				   0, NULL) == SIGP_CC_BUSY &&
@@ -457,7 +457,7 @@ void notrace smp_emergency_stop(void)
 	}
 	while (get_tod_clock() < end) {
 		for_each_cpu(cpu, &cpumask)
-			if (pcpu_stopped(pcpu_devices + cpu))
+			if (pcpu_stopped(per_cpu_ptr(&pcpu_devices, cpu)))
 				cpumask_clear_cpu(cpu, &cpumask);
 		if (cpumask_empty(&cpumask))
 			break;
@@ -472,6 +472,7 @@ NOKPROBE_SYMBOL(smp_emergency_stop);
  */
 void smp_send_stop(void)
 {
+	struct pcpu *pcpu;
 	int cpu;
 
 	/* Disable all interrupts/machine checks */
@@ -487,8 +488,9 @@ void smp_send_stop(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == smp_processor_id())
 			continue;
-		pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0);
-		while (!pcpu_stopped(pcpu_devices + cpu))
+		pcpu = per_cpu_ptr(&pcpu_devices, cpu);
+		pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
+		while (!pcpu_stopped(pcpu))
 			cpu_relax();
 	}
 }
@@ -502,7 +504,7 @@ static void smp_handle_ext_call(void)
 	unsigned long bits;
 
 	/* handle bit signal external calls */
-	bits = xchg(&pcpu_devices[smp_processor_id()].ec_mask, 0);
+	bits = this_cpu_xchg(pcpu_devices.ec_mask, 0);
 	if (test_bit(ec_stop_cpu, &bits))
 		smp_stop_cpu();
 	if (test_bit(ec_schedule, &bits))
@@ -527,12 +529,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 	int cpu;
 
 	for_each_cpu(cpu, mask)
-		pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single);
+		pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single);
 }
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	pcpu_ec_call(pcpu_devices + cpu, ec_call_function_single);
+	pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_call_function_single);
 }
 
 /*
@@ -542,13 +544,13 @@ void arch_send_call_function_single_ipi(int cpu)
  */
 void arch_smp_send_reschedule(int cpu)
 {
-	pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
+	pcpu_ec_call(per_cpu_ptr(&pcpu_devices, cpu), ec_schedule);
 }
 
 #ifdef CONFIG_IRQ_WORK
 void arch_irq_work_raise(void)
 {
-	pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work);
+	pcpu_ec_call(this_cpu_ptr(&pcpu_devices), ec_irq_work);
 }
 #endif
 
@@ -560,7 +562,7 @@ int smp_store_status(int cpu)
 	struct pcpu *pcpu;
 	unsigned long pa;
 
-	pcpu = pcpu_devices + cpu;
+	pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 	lc = lowcore_ptr[cpu];
 	pa = __pa(&lc->floating_pt_save_area);
 	if (__pcpu_sigp_relax(pcpu->address, SIGP_STORE_STATUS_AT_ADDRESS,
@@ -668,17 +670,17 @@ void __init smp_save_dump_secondary_cpus(void)
 
 void smp_cpu_set_polarization(int cpu, int val)
 {
-	pcpu_devices[cpu].polarization = val;
+	per_cpu(pcpu_devices, cpu).polarization = val;
 }
 
 int smp_cpu_get_polarization(int cpu)
 {
-	return pcpu_devices[cpu].polarization;
+	return per_cpu(pcpu_devices, cpu).polarization;
 }
 
 int smp_cpu_get_cpu_address(int cpu)
 {
-	return pcpu_devices[cpu].address;
+	return per_cpu(pcpu_devices, cpu).address;
 }
 
 static void __ref smp_get_core_info(struct sclp_core_info *info, int early)
@@ -717,7 +719,7 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail,
 	for (i = 0; (i <= smp_cpu_mtid) && (cpu < nr_cpu_ids); i++) {
 		if (pcpu_find_address(cpu_present_mask, address + i))
 			continue;
-		pcpu = pcpu_devices + cpu;
+		pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 		pcpu->address = address + i;
 		if (configured)
 			pcpu->state = CPU_STATE_CONFIGURED;
@@ -752,7 +754,7 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early)
 	 * that all SMT threads get subsequent logical CPU numbers.
 	 */
 	if (early) {
-		core_id = pcpu_devices[0].address >> smp_cpu_mt_shift;
+		core_id = per_cpu(pcpu_devices, 0).address >> smp_cpu_mt_shift;
 		for (i = 0; i < info->configured; i++) {
 			core = &info->core[i];
 			if (core->core_id == core_id) {
@@ -852,7 +854,7 @@ static void smp_start_secondary(void *cpuvoid)
 /* Upping and downing of CPUs */
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
-	struct pcpu *pcpu = pcpu_devices + cpu;
+	struct pcpu *pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 	int rc;
 
 	if (pcpu->state != CPU_STATE_CONFIGURED)
@@ -870,8 +872,8 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 	 */
 	system_ctlreg_lock();
 	pcpu_prepare_secondary(pcpu, cpu);
-	pcpu_attach_task(pcpu, tidle);
-	pcpu_start_fn(pcpu, smp_start_secondary, NULL);
+	pcpu_attach_task(cpu, tidle);
+	pcpu_start_fn(cpu, smp_start_secondary, NULL);
 	/* Wait until cpu puts itself in the online & active maps */
 	while (!cpu_online(cpu))
 		cpu_relax();
@@ -916,10 +918,10 @@ void __cpu_die(unsigned int cpu)
 	struct pcpu *pcpu;
 
 	/* Wait until target cpu is down */
-	pcpu = pcpu_devices + cpu;
+	pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 	while (!pcpu_stopped(pcpu))
 		cpu_relax();
-	pcpu_free_lowcore(pcpu);
+	pcpu_free_lowcore(pcpu, cpu);
 	cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
 	cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
 }
@@ -927,7 +929,7 @@ void __cpu_die(unsigned int cpu)
 void __noreturn cpu_die(void)
 {
 	idle_task_exit();
-	pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
+	pcpu_sigp_retry(this_cpu_ptr(&pcpu_devices), SIGP_STOP, 0);
 	for (;;) ;
 }
 
@@ -957,10 +959,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 void __init smp_prepare_boot_cpu(void)
 {
-	struct pcpu *pcpu = pcpu_devices;
-
 	WARN_ON(!cpu_present(0) || !cpu_online(0));
-	pcpu->state = CPU_STATE_CONFIGURED;
+	ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0);
+	ipl_pcpu->state = CPU_STATE_CONFIGURED;
 	get_lowcore()->percpu_offset = __per_cpu_offset[0];
 	smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
 }
@@ -969,8 +970,8 @@ void __init smp_setup_processor_id(void)
 {
 	struct lowcore *lc = get_lowcore();
 
-	pcpu_devices[0].address = stap();
 	lc->cpu_nr = 0;
+	per_cpu(pcpu_devices, 0).address = stap();
 	lc->spinlock_lockval = arch_spin_lockval(0);
 	lc->spinlock_index = 0;
 }
@@ -992,7 +993,7 @@ static ssize_t cpu_configure_show(struct device *dev,
 	ssize_t count;
 
 	mutex_lock(&smp_cpu_state_mutex);
-	count = sprintf(buf, "%d\n", pcpu_devices[dev->id].state);
+	count = sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state);
 	mutex_unlock(&smp_cpu_state_mutex);
 	return count;
 }
@@ -1018,7 +1019,7 @@ static ssize_t cpu_configure_store(struct device *dev,
 	for (i = 0; i <= smp_cpu_mtid; i++)
 		if (cpu_online(cpu + i))
 			goto out;
-	pcpu = pcpu_devices + cpu;
+	pcpu = per_cpu_ptr(&pcpu_devices, cpu);
 	rc = 0;
 	switch (val) {
 	case 0:
@@ -1030,7 +1031,7 @@ static ssize_t cpu_configure_store(struct device *dev,
 		for (i = 0; i <= smp_cpu_mtid; i++) {
 			if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i))
 				continue;
-			pcpu[i].state = CPU_STATE_STANDBY;
+			per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_STANDBY;
 			smp_cpu_set_polarization(cpu + i,
 						 POLARIZATION_UNKNOWN);
 		}
@@ -1045,7 +1046,7 @@ static ssize_t cpu_configure_store(struct device *dev,
 		for (i = 0; i <= smp_cpu_mtid; i++) {
 			if (cpu + i >= nr_cpu_ids || !cpu_present(cpu + i))
 				continue;
-			pcpu[i].state = CPU_STATE_CONFIGURED;
+			per_cpu(pcpu_devices, cpu + i).state = CPU_STATE_CONFIGURED;
 			smp_cpu_set_polarization(cpu + i,
 						 POLARIZATION_UNKNOWN);
 		}
@@ -1064,7 +1065,7 @@ static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
 static ssize_t show_cpu_address(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", pcpu_devices[dev->id].address);
+	return sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address);
 }
 static DEVICE_ATTR(address, 0444, show_cpu_address, NULL);
 
@@ -1090,14 +1091,14 @@ static struct attribute_group cpu_online_attr_group = {
 
 static int smp_cpu_online(unsigned int cpu)
 {
-	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct cpu *c = per_cpu_ptr(&cpu_devices, cpu);
 
 	return sysfs_create_group(&c->dev.kobj, &cpu_online_attr_group);
 }
 
 static int smp_cpu_pre_down(unsigned int cpu)
 {
-	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct cpu *c = per_cpu_ptr(&cpu_devices, cpu);
 
 	sysfs_remove_group(&c->dev.kobj, &cpu_online_attr_group);
 	return 0;
@@ -1110,7 +1111,7 @@ bool arch_cpu_is_hotpluggable(int cpu)
 
 int arch_register_cpu(int cpu)
 {
-	struct cpu *c = &per_cpu(cpu_devices, cpu);
+	struct cpu *c = per_cpu_ptr(&cpu_devices, cpu);
 	int rc;
 
 	c->hotpluggable = arch_cpu_is_hotpluggable(cpu);

From d3604ffba1521f59f312be3f19999084dddef446 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 16 Jul 2024 09:26:15 +0200
Subject: [PATCH 183/346] s390: Move CIF flags to struct pcpu

To allow testing flags for offline CPUs, move the CIF flags
to struct pcpu. To avoid having to calculate the array index
for each access, add a pointer to the pcpu member for the current
cpu to lowcore.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/lowcore.h   |  3 +--
 arch/s390/include/asm/processor.h | 26 ++++++++++++++++++++------
 arch/s390/kernel/asm-offsets.c    |  4 +++-
 arch/s390/kernel/entry.S          |  3 ++-
 arch/s390/kernel/setup.c          |  1 +
 arch/s390/kernel/smp.c            | 17 +++++++----------
 6 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index c724e71e1785..bce3a69ab2a3 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -97,8 +97,7 @@ struct lowcore {
 	__u64	save_area_async[8];		/* 0x0240 */
 	__u64	save_area_restart[1];		/* 0x0280 */
 
-	/* CPU flags. */
-	__u64	cpu_flags;			/* 0x0288 */
+	__u64	pcpu;				/* 0x0288 */
 
 	/* Return psws. */
 	psw_t	return_psw;			/* 0x0290 */
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index c87cf2b8e81a..5debb12614ad 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -42,21 +42,37 @@
 #include <asm/irqflags.h>
 #include <asm/alternative.h>
 
+struct pcpu {
+	unsigned long ec_mask;		/* bit mask for ec_xxx functions */
+	unsigned long ec_clk;		/* sigp timestamp for ec_xxx */
+	unsigned long flags;		/* per CPU flags */
+	signed char state;		/* physical cpu state */
+	signed char polarization;	/* physical polarization */
+	u16 address;			/* physical cpu address */
+};
+
+DECLARE_PER_CPU(struct pcpu, pcpu_devices);
+
 typedef long (*sys_call_ptr_t)(struct pt_regs *regs);
 
+static __always_inline struct pcpu *this_pcpu(void)
+{
+	return (struct pcpu *)(get_lowcore()->pcpu);
+}
+
 static __always_inline void set_cpu_flag(int flag)
 {
-	get_lowcore()->cpu_flags |= (1UL << flag);
+	this_pcpu()->flags |= (1UL << flag);
 }
 
 static __always_inline void clear_cpu_flag(int flag)
 {
-	get_lowcore()->cpu_flags &= ~(1UL << flag);
+	this_pcpu()->flags &= ~(1UL << flag);
 }
 
 static __always_inline bool test_cpu_flag(int flag)
 {
-	return get_lowcore()->cpu_flags & (1UL << flag);
+	return this_pcpu()->flags & (1UL << flag);
 }
 
 static __always_inline bool test_and_set_cpu_flag(int flag)
@@ -81,9 +97,7 @@ static __always_inline bool test_and_clear_cpu_flag(int flag)
  */
 static __always_inline bool test_cpu_flag_of(int flag, int cpu)
 {
-	struct lowcore *lc = lowcore_ptr[cpu];
-
-	return lc->cpu_flags & (1UL << flag);
+	return per_cpu(pcpu_devices, cpu).flags & (1UL << flag);
 }
 
 #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY)
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 26bb45d0e6f1..58fc6b93b475 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -114,7 +114,7 @@ int main(void)
 	OFFSET(__LC_SAVE_AREA_SYNC, lowcore, save_area_sync);
 	OFFSET(__LC_SAVE_AREA_ASYNC, lowcore, save_area_async);
 	OFFSET(__LC_SAVE_AREA_RESTART, lowcore, save_area_restart);
-	OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags);
+	OFFSET(__LC_PCPU, lowcore, pcpu);
 	OFFSET(__LC_RETURN_PSW, lowcore, return_psw);
 	OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw);
 	OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer);
@@ -186,5 +186,7 @@ int main(void)
 #endif
 	OFFSET(__FTRACE_REGS_PT_REGS, ftrace_regs, regs);
 	DEFINE(__FTRACE_REGS_SIZE, sizeof(struct ftrace_regs));
+
+	OFFSET(__PCPU_FLAGS, pcpu, flags);
 	return 0;
 }
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 454b6b92c7f8..fa58bd2c48c9 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -480,7 +480,8 @@ SYM_CODE_START(mcck_int_handler)
 	clgrjl	%r9,%r14, 4f
 	larl	%r14,.Lsie_leave
 	clgrjhe	%r9,%r14, 4f
-	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
+	lg	%r10,__LC_PCPU
+	oi	__PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST
 4:	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
 	SIEEXIT __SF_SIE_CONTROL(%r15)
 #endif
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 1faba11d5f0b..178daf4e3563 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -406,6 +406,7 @@ static void __init setup_lowcore(void)
 		panic("%s: Failed to allocate %zu bytes align=%zx\n",
 		      __func__, sizeof(*lc), sizeof(*lc));
 
+	lc->pcpu = (unsigned long)per_cpu_ptr(&pcpu_devices, 0);
 	lc->restart_psw.mask = PSW_KERNEL_BITS & ~PSW_MASK_DAT;
 	lc->restart_psw.addr = __pa(restart_int_handler);
 	lc->external_new_psw.mask = PSW_KERNEL_BITS;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b36b089b9a26..fbba37ec53cf 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -74,16 +74,8 @@ enum {
 	CPU_STATE_CONFIGURED,
 };
 
-struct pcpu {
-	unsigned long ec_mask;		/* bit mask for ec_xxx functions */
-	unsigned long ec_clk;		/* sigp timestamp for ec_xxx */
-	signed char state;		/* physical cpu state */
-	signed char polarization;	/* physical polarization */
-	u16 address;			/* physical cpu address */
-};
-
 static u8 boot_core_type;
-static DEFINE_PER_CPU(struct pcpu, pcpu_devices);
+DEFINE_PER_CPU(struct pcpu, pcpu_devices);
 /*
  * Pointer to the pcpu area of the boot CPU. This is required when a restart
  * interrupt is triggered on an offline CPU. For that case accessing percpu
@@ -264,6 +256,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
 	cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
 	lc->cpu_nr = cpu;
+	lc->pcpu = (unsigned long)pcpu;
 	lc->restart_flags = RESTART_FLAG_CTLREGS;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
 	lc->spinlock_index = 0;
@@ -924,6 +917,7 @@ void __cpu_die(unsigned int cpu)
 	pcpu_free_lowcore(pcpu, cpu);
 	cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
 	cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
+	pcpu->flags = 0;
 }
 
 void __noreturn cpu_die(void)
@@ -959,10 +953,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 void __init smp_prepare_boot_cpu(void)
 {
+	struct lowcore *lc = get_lowcore();
+
 	WARN_ON(!cpu_present(0) || !cpu_online(0));
+	lc->percpu_offset = __per_cpu_offset[0];
 	ipl_pcpu = per_cpu_ptr(&pcpu_devices, 0);
 	ipl_pcpu->state = CPU_STATE_CONFIGURED;
-	get_lowcore()->percpu_offset = __per_cpu_offset[0];
+	lc->pcpu = (unsigned long)ipl_pcpu;
 	smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
 }
 

From 035248a7843242d51f249444fbad7340b7336f68 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:48 +0200
Subject: [PATCH 184/346] s390/alternatives: Remove noaltinstr option

The current Kernel doesn't boot without alternative patching on
z16 machines. To avoid such bugs in the future, remove the option
disable alternative patching.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 .../admin-guide/kernel-parameters.txt         |  3 ---
 arch/s390/kernel/alternative.c                | 21 ++-----------------
 2 files changed, 2 insertions(+), 22 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c1134ad5f06d..f1384c7b59c9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3830,9 +3830,6 @@
 
 	noalign		[KNL,ARM]
 
-	noaltinstr	[S390,EARLY] Disables alternative instructions
-			patching (CPU alternatives feature).
-
 	noapic		[SMP,APIC,EARLY] Tells the kernel to not make use of any
 			IOAPICs that may be present in the system.
 
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 1ac5f707dd70..7971bc1bf496 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -7,18 +7,8 @@
 #include <asm/facility.h>
 #include <asm/nospec-branch.h>
 
-static int __initdata_or_module alt_instr_disabled;
-
-static int __init disable_alternative_instructions(char *str)
-{
-	alt_instr_disabled = 1;
-	return 0;
-}
-
-early_param("noaltinstr", disable_alternative_instructions);
-
-static void __init_or_module __apply_alternatives(struct alt_instr *start,
-						  struct alt_instr *end)
+void __init_or_module apply_alternatives(struct alt_instr *start,
+					 struct alt_instr *end)
 {
 	struct alt_instr *a;
 	u8 *instr, *replacement;
@@ -37,13 +27,6 @@ static void __init_or_module __apply_alternatives(struct alt_instr *start,
 	}
 }
 
-void __init_or_module apply_alternatives(struct alt_instr *start,
-					 struct alt_instr *end)
-{
-	if (!alt_instr_disabled)
-		__apply_alternatives(start, end);
-}
-
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 void __init apply_alternative_instructions(void)
 {

From 9be999a61232fd6748ebe8654c71bcde1a0fbed3 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:49 +0200
Subject: [PATCH 185/346] s390/alternatives: Use consistent naming

The alternative code is using the words facility and feature for the
same. Rename facility to more generic feature everywhere to have
consistent naming.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/alternative.h | 30 ++++++++++++++---------------
 arch/s390/kernel/alternative.c      |  2 +-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index dd93b92c3ab6..07459553d64f 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -11,7 +11,7 @@
 struct alt_instr {
 	s32 instr_offset;	/* original instruction */
 	s32 repl_offset;	/* offset to replacement instruction */
-	u16 facility;		/* facility bit set for replacement */
+	u16 feature;		/* feature required for replacement */
 	u8  instrlen;		/* length of original instruction */
 } __packed;
 
@@ -48,10 +48,10 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 #define OLDINSTR(oldinstr) \
 	"661:\n\t" oldinstr "\n662:\n"
 
-#define ALTINSTR_ENTRY(facility, num)					\
+#define ALTINSTR_ENTRY(feature, num)					\
 	"\t.long 661b - .\n"			/* old instruction */	\
 	"\t.long " b_altinstr(num)"b - .\n"	/* alt instruction */	\
-	"\t.word " __stringify(facility) "\n"	/* facility bit    */	\
+	"\t.word " __stringify(feature) "\n"	/* feature	   */	\
 	"\t.byte " oldinstr_len "\n"		/* instruction len */	\
 	"\t.org . - (" oldinstr_len ") & 1\n"				\
 	"\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n"	\
@@ -61,24 +61,24 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 	b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n"
 
 /* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, altinstr, facility) \
+#define ALTERNATIVE(oldinstr, altinstr, feature) \
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(altinstr, 1)				\
 	".popsection\n"							\
 	OLDINSTR(oldinstr)						\
 	".pushsection .altinstructions,\"a\"\n"				\
-	ALTINSTR_ENTRY(facility, 1)					\
+	ALTINSTR_ENTRY(feature, 1)					\
 	".popsection\n"
 
-#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\
+#define ALTERNATIVE_2(oldinstr, altinstr1, feature1, altinstr2, feature2)\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(altinstr1, 1)				\
 	ALTINSTR_REPLACEMENT(altinstr2, 2)				\
 	".popsection\n"							\
 	OLDINSTR(oldinstr)						\
 	".pushsection .altinstructions,\"a\"\n"				\
-	ALTINSTR_ENTRY(facility1, 1)					\
-	ALTINSTR_ENTRY(facility2, 2)					\
+	ALTINSTR_ENTRY(feature1, 1)					\
+	ALTINSTR_ENTRY(feature2, 2)					\
 	".popsection\n"
 
 /*
@@ -93,12 +93,12 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
  * For non barrier like inlines please define new variants
  * without volatile and memory clobber.
  */
-#define alternative(oldinstr, altinstr, facility)			\
-	asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory")
+#define alternative(oldinstr, altinstr, feature)			\
+	asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature) : : : "memory")
 
-#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \
-	asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1,   \
-				   altinstr2, facility2) ::: "memory")
+#define alternative_2(oldinstr, altinstr1, feature1, altinstr2, feature2) \
+	asm_inline volatile(ALTERNATIVE_2(oldinstr, altinstr1, feature1,   \
+				   altinstr2, feature2) ::: "memory")
 
 /* Alternative inline assembly with input. */
 #define alternative_input(oldinstr, newinstr, feature, input...)	\
@@ -106,8 +106,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 		: : input)
 
 /* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, altinstr, facility, output, input...)	\
-	asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, facility)	\
+#define alternative_io(oldinstr, altinstr, feature, output, input...)	\
+	asm_inline volatile(ALTERNATIVE(oldinstr, altinstr, feature)	\
 		: output : input)
 
 /* Use this macro if more than one output parameter is needed. */
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 7971bc1bf496..e2b6549f8eaf 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -21,7 +21,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 
-		if (!__test_facility(a->facility, alt_stfle_fac_list))
+		if (!__test_facility(a->feature, alt_stfle_fac_list))
 			continue;
 		s390_kernel_write(instr, replacement, a->instrlen);
 	}

From c77f7354c4478bf4560b546913e097b3d4ab50c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:50 +0200
Subject: [PATCH 186/346] s390/alternatives: Merge both alternative header
 files

The two alternative header files must stay in sync. This is easier to
achieve within one header file. Therefore merge both of them and have
only one file, like most other architectures.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/alternative-asm.h | 57 -------------------------
 arch/s390/include/asm/alternative.h     | 50 ++++++++++++++++++++++
 arch/s390/kernel/entry.S                |  2 +-
 3 files changed, 51 insertions(+), 58 deletions(-)
 delete mode 100644 arch/s390/include/asm/alternative-asm.h

diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h
deleted file mode 100644
index 608f6287ca9c..000000000000
--- a/arch/s390/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_S390_ALTERNATIVE_ASM_H
-#define _ASM_S390_ALTERNATIVE_ASM_H
-
-#ifdef __ASSEMBLY__
-
-/*
- * Issue one struct alt_instr descriptor entry (need to put it into
- * the section .altinstructions, see below). This entry contains
- * enough information for the alternatives patching code to patch an
- * instruction. See apply_alternatives().
- */
-.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature
-	.long	\orig_start - .
-	.long	\alt_start - .
-	.word	\feature
-	.byte	\orig_end - \orig_start
-	.org	. - ( \orig_end - \orig_start ) & 1
-	.org	. - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start )
-	.org	. - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start )
-.endm
-
-/*
- * Define an alternative between two instructions. If @feature is
- * present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr.
- */
-.macro ALTERNATIVE oldinstr, newinstr, feature
-	.pushsection .altinstr_replacement,"ax"
-770:	\newinstr
-771:	.popsection
-772:	\oldinstr
-773:	.pushsection .altinstructions,"a"
-	alt_entry 772b, 773b, 770b, 771b, \feature
-	.popsection
-.endm
-
-/*
- * Define an alternative between two instructions. If @feature is
- * present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr.
- */
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
-	.pushsection .altinstr_replacement,"ax"
-770:	\newinstr1
-771:	\newinstr2
-772:	.popsection
-773:	\oldinstr
-774:	.pushsection .altinstructions,"a"
-	alt_entry 773b, 774b, 770b, 771b,\feature1
-	alt_entry 773b, 774b, 771b, 772b,\feature2
-	.popsection
-.endm
-
-#endif	/*  __ASSEMBLY__  */
-
-#endif /* _ASM_S390_ALTERNATIVE_ASM_H */
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 07459553d64f..16de33750d6c 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -116,6 +116,56 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 /* Use this macro if clobbers are needed without inputs. */
 #define ASM_NO_INPUT_CLOBBER(clobber...) : clobber
 
+#else  /* __ASSEMBLY__ */
+
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature
+	.long	\orig_start - .
+	.long	\alt_start - .
+	.word	\feature
+	.byte	\orig_end - \orig_start
+	.org	. - ( \orig_end - \orig_start ) & 1
+	.org	. - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start )
+	.org	. - ( \alt_end - \alt_start ) + ( \orig_end - \orig_start )
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr.
+ */
+.macro ALTERNATIVE oldinstr, newinstr, feature
+	.pushsection .altinstr_replacement,"ax"
+770:	\newinstr
+771:	.popsection
+772:	\oldinstr
+773:	.pushsection .altinstructions,"a"
+	alt_entry 772b, 773b, 770b, 771b, \feature
+	.popsection
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+	.pushsection .altinstr_replacement,"ax"
+770:	\newinstr1
+771:	\newinstr2
+772:	.popsection
+773:	\oldinstr
+774:	.pushsection .altinstructions,"a"
+	alt_entry 773b, 774b, 770b, 771b,\feature1
+	alt_entry 773b, 774b, 771b, 772b,\feature2
+	.popsection
+.endm
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_S390_ALTERNATIVE_H */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index fa58bd2c48c9..866917ff013f 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/asm-extable.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 #include <asm/dwarf.h>

From ace76fac944b06e46427e0406f315609f278ef91 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:51 +0200
Subject: [PATCH 187/346] s390/alternatives: Move text sync functions

Move all text sync functions from alternative.c to processor.c. This
way there is only minimal code left in alternative.c left, which is a
prerequisite to use the C file within boot code as well.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/alternative.c | 20 --------------------
 arch/s390/kernel/processor.c   | 20 +++++++++++++++++++-
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index e2b6549f8eaf..33debc2a26c9 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,8 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
 #include <asm/nospec-branch.h>
@@ -32,20 +29,3 @@ void __init apply_alternative_instructions(void)
 {
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 }
-
-static void do_sync_core(void *info)
-{
-	sync_core();
-}
-
-void text_poke_sync(void)
-{
-	on_each_cpu(do_sync_core, NULL, 1);
-}
-
-void text_poke_sync_lock(void)
-{
-	cpus_read_lock();
-	text_poke_sync();
-	cpus_read_unlock();
-}
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 65c1464eea4f..5ce9a795a0fe 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -17,7 +17,8 @@
 #include <linux/mm_types.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
-
+#include <linux/smp.h>
+#include <asm/text-patching.h>
 #include <asm/diag.h>
 #include <asm/facility.h>
 #include <asm/elf.h>
@@ -79,6 +80,23 @@ void notrace stop_machine_yield(const struct cpumask *cpumask)
 	}
 }
 
+static void do_sync_core(void *info)
+{
+	sync_core();
+}
+
+void text_poke_sync(void)
+{
+	on_each_cpu(do_sync_core, NULL, 1);
+}
+
+void text_poke_sync_lock(void)
+{
+	cpus_read_lock();
+	text_poke_sync();
+	cpus_read_unlock();
+}
+
 /*
  * cpu_init - initializes state that is per-CPU.
  */

From 030f7951c5b293739301fb616add0f1d3fb46073 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:52 +0200
Subject: [PATCH 188/346] s390/uaccess: Make s390_kernel_write() usable for
 decompressor

To avoid lots of ifdefs in C code make s390_kernel_write() usable for
the decompressor: simply use memcpy() for this case since there is no
write protection enabled that early.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/uaccess.h | 9 ++++++++-
 arch/s390/mm/maccess.c          | 4 ++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 9213be0529ee..a81f897a81ce 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -332,7 +332,14 @@ static inline unsigned long __must_check clear_user(void __user *to, unsigned lo
 	return __clear_user(to, n);
 }
 
-void *s390_kernel_write(void *dst, const void *src, size_t size);
+void *__s390_kernel_write(void *dst, const void *src, size_t size);
+
+static inline void *s390_kernel_write(void *dst, const void *src, size_t size)
+{
+	if (__is_defined(__DECOMPRESSOR))
+		return memcpy(dst, src, size);
+	return __s390_kernel_write(dst, src, size);
+}
 
 int __noreturn __put_kernel_bad(void);
 
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 632c3a55feed..28a18c42ba99 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -48,7 +48,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
 }
 
 /*
- * s390_kernel_write - write to kernel memory bypassing DAT
+ * __s390_kernel_write - write to kernel memory bypassing DAT
  * @dst: destination address
  * @src: source address
  * @size: number of bytes to copy
@@ -61,7 +61,7 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
  */
 static DEFINE_SPINLOCK(s390_kernel_write_lock);
 
-notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
+notrace void *__s390_kernel_write(void *dst, const void *src, size_t size)
 {
 	void *tmp = dst;
 	unsigned long flags;

From b3e0c5f734f934dab1cfdef669e3baa165a0cbfe Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:53 +0200
Subject: [PATCH 189/346] s390/alternatives: Rework to allow for callbacks

Rework alternatives to allow for callbacks. With this every
alternative entry has additional data encoded:

- When (aka context) an alternative is supposed to be applied

- The type of an alternative, which allows for type specific handling
  and callbacks

- Extra type specific payload (patch information), which can be passed
  to callbacks in order to decide if an alternative should be applied
  or not

With this only the "late" context is implemented, which means there is
no change to the previous behaviour. All code is just converted to the
more generic new infrastructure.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/alternative.h | 66 ++++++++++++++++++++++++++---
 arch/s390/include/asm/processor.h   |  2 +-
 arch/s390/include/asm/spinlock.h    |  2 +-
 arch/s390/kernel/alternative.c      | 30 +++++++------
 arch/s390/kernel/entry.S            | 22 +++++-----
 arch/s390/lib/spinlock.c            |  4 +-
 6 files changed, 92 insertions(+), 34 deletions(-)

diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 16de33750d6c..5b931070be16 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -2,6 +2,44 @@
 #ifndef _ASM_S390_ALTERNATIVE_H
 #define _ASM_S390_ALTERNATIVE_H
 
+/*
+ * Each alternative comes with a 32 bit feature field:
+ *	union {
+ *		u32 feature;
+ *		struct {
+ *			u32 ctx	 : 4;
+ *			u32 type : 8;
+ *			u32 data : 20;
+ *		};
+ *	}
+ *
+ * @ctx is a bitfield, where only one bit must be set. Each bit defines
+ * in which context an alternative is supposed to be applied to the
+ * kernel image:
+ *
+ * - from the decompressor before the kernel itself is executed
+ * - from early kernel code from within the kernel
+ *
+ * @type is a number which defines the type and with that the type
+ * specific alternative patching.
+ *
+ * @data is additional type specific information which defines if an
+ * alternative should be applied.
+ */
+
+#define ALT_CTX_LATE		1
+#define ALT_CTX_ALL		ALT_CTX_LATE
+
+#define ALT_TYPE_FACILITY	0
+
+#define ALT_DATA_SHIFT		0
+#define ALT_TYPE_SHIFT		20
+#define ALT_CTX_SHIFT		28
+
+#define ALT_FACILITY(facility)		(ALT_CTX_LATE << ALT_CTX_SHIFT		| \
+					 ALT_TYPE_FACILITY << ALT_TYPE_SHIFT	| \
+					 (facility) << ALT_DATA_SHIFT)
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
@@ -11,12 +49,30 @@
 struct alt_instr {
 	s32 instr_offset;	/* original instruction */
 	s32 repl_offset;	/* offset to replacement instruction */
-	u16 feature;		/* feature required for replacement */
+	union {
+		u32 feature;	/* feature required for replacement */
+		struct {
+			u32 ctx	 : 4;  /* context */
+			u32 type : 8;  /* type of alternative */
+			u32 data : 20; /* patching information */
+		};
+	};
 	u8  instrlen;		/* length of original instruction */
 } __packed;
 
-void apply_alternative_instructions(void);
-void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
+void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx);
+
+static inline void apply_alternative_instructions(void)
+{
+	__apply_alternatives(__alt_instructions, __alt_instructions_end, ALT_CTX_LATE);
+}
+
+static inline void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
+{
+	__apply_alternatives(start, end, ALT_CTX_ALL);
+}
 
 /*
  * +---------------------------------+
@@ -51,7 +107,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 #define ALTINSTR_ENTRY(feature, num)					\
 	"\t.long 661b - .\n"			/* old instruction */	\
 	"\t.long " b_altinstr(num)"b - .\n"	/* alt instruction */	\
-	"\t.word " __stringify(feature) "\n"	/* feature	   */	\
+	"\t.long " __stringify(feature) "\n"	/* feature	   */	\
 	"\t.byte " oldinstr_len "\n"		/* instruction len */	\
 	"\t.org . - (" oldinstr_len ") & 1\n"				\
 	"\t.org . - (" oldinstr_len ") + (" altinstr_len(num) ")\n"	\
@@ -127,7 +183,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 .macro alt_entry orig_start, orig_end, alt_start, alt_end, feature
 	.long	\orig_start - .
 	.long	\alt_start - .
-	.word	\feature
+	.long	\feature
 	.byte	\orig_end - \orig_start
 	.org	. - ( \orig_end - \orig_start ) & 1
 	.org	. - ( \orig_end - \orig_start ) + ( \alt_end - \alt_start )
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 5debb12614ad..8a52554f49f0 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -419,7 +419,7 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 
 static __always_inline void bpon(void)
 {
-	asm volatile(ALTERNATIVE("nop", ".insn	rrf,0xb2e80000,0,0,13,0", 82));
+	asm volatile(ALTERNATIVE("nop", ".insn	rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)));
 }
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 3e43c90ff135..77d5e804af93 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -79,7 +79,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 	typecheck(int, lp->lock);
 	kcsan_release();
 	asm_inline volatile(
-		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", 49) /* NIAI 7 */
+		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,7,0", ALT_FACILITY(49)) /* NIAI 7 */
 		"	sth	%1,%0\n"
 		: "=R" (((unsigned short *) &lp->lock)[1])
 		: "d" (0) : "cc", "memory");
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 33debc2a26c9..ecabdff89bce 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,31 +1,33 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/module.h>
+
+#include <linux/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
-#include <asm/nospec-branch.h>
 
-void __init_or_module apply_alternatives(struct alt_instr *start,
-					 struct alt_instr *end)
+void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsigned int ctx)
 {
-	struct alt_instr *a;
 	u8 *instr, *replacement;
+	struct alt_instr *a;
+	bool replace;
 
 	/*
 	 * The scan order should be from start to end. A later scanned
 	 * alternative code can overwrite previously scanned alternative code.
 	 */
 	for (a = start; a < end; a++) {
+		if (!(a->ctx & ctx))
+			continue;
+		switch (a->type) {
+		case ALT_TYPE_FACILITY:
+			replace = __test_facility(a->data, alt_stfle_fac_list);
+			break;
+		default:
+			replace = false;
+		}
+		if (!replace)
+			continue;
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
-
-		if (!__test_facility(a->feature, alt_stfle_fac_list))
-			continue;
 		s390_kernel_write(instr, replacement, a->instrlen);
 	}
 }
-
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-void __init apply_alternative_instructions(void)
-{
-	apply_alternatives(__alt_instructions, __alt_instructions_end);
-}
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 866917ff013f..90027a57a524 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -32,19 +32,19 @@
 _LPP_OFFSET	= __LC_LPP
 
 	.macro STBEAR address
-	ALTERNATIVE "nop", ".insn s,0xb2010000,\address", 193
+	ALTERNATIVE "nop", ".insn s,0xb2010000,\address", ALT_FACILITY(193)
 	.endm
 
 	.macro LBEAR address
-	ALTERNATIVE "nop", ".insn s,0xb2000000,\address", 193
+	ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193)
 	.endm
 
 	.macro LPSWEY address,lpswe
-	ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", 193
+	ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193)
 	.endm
 
 	.macro MBEAR reg
-	ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), 193
+	ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), ALT_FACILITY(193)
 	.endm
 
 	.macro	CHECK_STACK savearea
@@ -100,22 +100,22 @@ _LPP_OFFSET	= __LC_LPP
 	.endm
 
 	.macro BPOFF
-	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", 82
+	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_FACILITY(82)
 	.endm
 
 	.macro BPON
-	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", 82
+	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)
 	.endm
 
 	.macro BPENTER tif_ptr,tif_mask
 	ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \
-		    "j .+12; nop; nop", 82
+		    "j .+12; nop; nop", ALT_FACILITY(82)
 	.endm
 
 	.macro BPEXIT tif_ptr,tif_mask
 	TSTMSK	\tif_ptr,\tif_mask
 	ALTERNATIVE "jz .+8;  .insn rrf,0xb2e80000,0,0,12,0", \
-		    "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", 82
+		    "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)
 	.endm
 
 #if IS_ENABLED(CONFIG_KVM)
@@ -169,7 +169,7 @@ SYM_FUNC_START(__switch_to_asm)
 	aghi	%r3,__TASK_pid
 	mvc	__LC_CURRENT_PID(4,%r0),0(%r3)	# store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
-	ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
+	ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40)
 	BR_EX	%r14
 SYM_FUNC_END(__switch_to_asm)
 
@@ -515,7 +515,7 @@ SYM_CODE_START(mcck_int_handler)
 	jno	0f
 	BPON
 	stpt	__LC_EXIT_TIMER
-0:	ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA),193
+0:	ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA), ALT_FACILITY(193)
 	LBEAR	0(%r12)
 	lmg	%r11,%r15,__PT_R11(%r11)
 	LPSWEY	__LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE
@@ -551,7 +551,7 @@ SYM_CODE_START(mcck_int_handler)
 SYM_CODE_END(mcck_int_handler)
 
 SYM_CODE_START(restart_int_handler)
-	ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
+	ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40)
 	stg	%r15,__LC_SAVE_AREA_RESTART
 	TSTMSK	__LC_RESTART_FLAGS,RESTART_FLAG_CTLREGS,4
 	jz	0f
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 0c9a73a18826..9f86ad8fa8b4 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -75,7 +75,7 @@ static inline int arch_load_niai4(int *lock)
 	int owner;
 
 	asm_inline volatile(
-		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", 49) /* NIAI 4 */
+		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,4,0", ALT_FACILITY(49)) /* NIAI 4 */
 		"	l	%0,%1\n"
 		: "=d" (owner) : "Q" (*lock) : "memory");
 	return owner;
@@ -86,7 +86,7 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
 	int expected = old;
 
 	asm_inline volatile(
-		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", 49) /* NIAI 8 */
+		ALTERNATIVE("nop", ".insn rre,0xb2fa0000,8,0", ALT_FACILITY(49)) /* NIAI 8 */
 		"	cs	%0,%3,%1\n"
 		: "=d" (old), "=Q" (*lock)
 		: "0" (old), "d" (new), "Q" (*lock)

From 7f9d85998f6c5b989796470fd1ac066232c60723 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:54 +0200
Subject: [PATCH 190/346] s390/alternatives: Allow early alternative patching
 in decompressor

Add the required code to patch alternatives early in the decompressor.
This is required for the upcoming lowcore relocation changes, where
alternatives for facility 193 need to get patched before lowcore
alternatives.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Co-developed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/boot/Makefile             |  2 +-
 arch/s390/boot/alternative.c        |  3 +++
 arch/s390/boot/boot.h               |  2 ++
 arch/s390/boot/startup.c            |  5 +++++
 arch/s390/include/asm/alternative.h | 12 +++++++++---
 arch/s390/kernel/alternative.c      |  5 +++++
 arch/s390/kernel/vmlinux.lds.S      |  2 ++
 7 files changed, 27 insertions(+), 4 deletions(-)
 create mode 100644 arch/s390/boot/alternative.c

diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index e7658997452b..5d8cb7e3b096 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -39,7 +39,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
 obj-y	:= head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
 obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o relocs.o
+obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o
 obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
 obj-y	+= $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
diff --git a/arch/s390/boot/alternative.c b/arch/s390/boot/alternative.c
new file mode 100644
index 000000000000..abc08d2c873d
--- /dev/null
+++ b/arch/s390/boot/alternative.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../kernel/alternative.c"
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 18027fdc92b0..ed2f0ec24f0d 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -30,6 +30,8 @@ struct vmlinux_info {
 	unsigned long init_mm_off;
 	unsigned long swapper_pg_dir_off;
 	unsigned long invalid_pg_dir_off;
+	unsigned long alt_instructions;
+	unsigned long alt_instructions_end;
 #ifdef CONFIG_KASAN
 	unsigned long kasan_early_shadow_page_off;
 	unsigned long kasan_early_shadow_pte_off;
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index cc8753c0c121..cca2f1bad33c 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -376,6 +376,8 @@ static void kaslr_adjust_vmlinux_info(long offset)
 	vmlinux.init_mm_off += offset;
 	vmlinux.swapper_pg_dir_off += offset;
 	vmlinux.invalid_pg_dir_off += offset;
+	vmlinux.alt_instructions += offset;
+	vmlinux.alt_instructions_end += offset;
 #ifdef CONFIG_KASAN
 	vmlinux.kasan_early_shadow_page_off += offset;
 	vmlinux.kasan_early_shadow_pte_off += offset;
@@ -507,6 +509,9 @@ void startup_kernel(void)
 	kaslr_adjust_got(__kaslr_offset);
 	setup_vmem(__kaslr_offset, __kaslr_offset + kernel_size, asce_limit);
 	copy_bootdata();
+	__apply_alternatives((struct alt_instr *)_vmlinux_info.alt_instructions,
+			     (struct alt_instr *)_vmlinux_info.alt_instructions_end,
+			     ALT_CTX_EARLY);
 
 	/*
 	 * Save KASLR offset for early dumps, before vmcore_info is set.
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 5b931070be16..32c208332e57 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -27,15 +27,21 @@
  * alternative should be applied.
  */
 
-#define ALT_CTX_LATE		1
-#define ALT_CTX_ALL		ALT_CTX_LATE
+#define ALT_CTX_EARLY		1
+#define ALT_CTX_LATE		2
+#define ALT_CTX_ALL		(ALT_CTX_EARLY | ALT_CTX_LATE)
 
-#define ALT_TYPE_FACILITY	0
+#define ALT_TYPE_FACILITY_EARLY	0
+#define ALT_TYPE_FACILITY	1
 
 #define ALT_DATA_SHIFT		0
 #define ALT_TYPE_SHIFT		20
 #define ALT_CTX_SHIFT		28
 
+#define ALT_FACILITY_EARLY(facility)	(ALT_CTX_EARLY << ALT_CTX_SHIFT		   | \
+					 ALT_TYPE_FACILITY_EARLY << ALT_TYPE_SHIFT | \
+					 (facility) << ALT_DATA_SHIFT)
+
 #define ALT_FACILITY(facility)		(ALT_CTX_LATE << ALT_CTX_SHIFT		| \
 					 ALT_TYPE_FACILITY << ALT_TYPE_SHIFT	| \
 					 (facility) << ALT_DATA_SHIFT)
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index ecabdff89bce..de89c9e8b1a3 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -18,9 +18,14 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign
 		if (!(a->ctx & ctx))
 			continue;
 		switch (a->type) {
+		case ALT_TYPE_FACILITY_EARLY:
+			replace = test_facility(a->data);
+			break;
+#ifndef __DECOMPRESSOR
 		case ALT_TYPE_FACILITY:
 			replace = __test_facility(a->data, alt_stfle_fac_list);
 			break;
+#endif
 		default:
 			replace = false;
 		}
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 5128ccee9c67..975c654cf5a5 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -222,6 +222,8 @@ SECTIONS
 		QUAD(init_mm)
 		QUAD(swapper_pg_dir)
 		QUAD(invalid_pg_dir)
+		QUAD(__alt_instructions)
+		QUAD(__alt_instructions_end)
 #ifdef CONFIG_KASAN
 		QUAD(kasan_early_shadow_page)
 		QUAD(kasan_early_shadow_pte)

From 47837a5c74f432ad992239cfa5966543f466d4df Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:55 +0200
Subject: [PATCH 191/346] s390/nospec: Push down alternative handling

The nospec implementation is deeply integrated into the alternatives
code: only for nospec an alternative facility list is implemented and
used by the alternative code, while it is modified by nospec specific
needs.

Push down the nospec alternative handling into the nospec by
introducing a new alternative type and a specific nospec callback to
decide if alternatives should be applied.

Also introduce a new global nobp variable which together with facility
82 can be used to decide if nobp is enabled or not.

Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/alternative.h   |  5 +++++
 arch/s390/include/asm/nospec-branch.h |  9 +++++++++
 arch/s390/include/asm/processor.h     |  2 +-
 arch/s390/kernel/alternative.c        |  4 ++++
 arch/s390/kernel/early.c              |  2 --
 arch/s390/kernel/entry.S              |  8 ++++----
 arch/s390/kernel/nospec-branch.c      | 16 +++++++++-------
 arch/s390/kernel/nospec-sysfs.c       |  2 +-
 8 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 32c208332e57..5f56a2f3aba6 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -33,6 +33,7 @@
 
 #define ALT_TYPE_FACILITY_EARLY	0
 #define ALT_TYPE_FACILITY	1
+#define ALT_TYPE_SPEC		2
 
 #define ALT_DATA_SHIFT		0
 #define ALT_TYPE_SHIFT		20
@@ -46,6 +47,10 @@
 					 ALT_TYPE_FACILITY << ALT_TYPE_SHIFT	| \
 					 (facility) << ALT_DATA_SHIFT)
 
+#define ALT_SPEC(facility)		(ALT_CTX_LATE << ALT_CTX_SHIFT		| \
+					 ALT_TYPE_SPEC << ALT_TYPE_SHIFT	| \
+					 (facility) << ALT_DATA_SHIFT)
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
index b9c1f3cae842..192835a3e24d 100644
--- a/arch/s390/include/asm/nospec-branch.h
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -5,8 +5,17 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <asm/facility.h>
 
 extern int nospec_disable;
+extern int nobp;
+
+static inline bool nobp_enabled(void)
+{
+	if (__is_defined(__DECOMPRESSOR))
+		return false;
+	return nobp && test_facility(82);
+}
 
 void nospec_init_branches(void);
 void nospec_auto_detect(void);
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 8a52554f49f0..3063488014eb 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -419,7 +419,7 @@ static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 
 static __always_inline void bpon(void)
 {
-	asm volatile(ALTERNATIVE("nop", ".insn	rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)));
+	asm volatile(ALTERNATIVE("nop", ".insn	rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82)));
 }
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index de89c9e8b1a3..05545669552f 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/uaccess.h>
+#include <asm/nospec-branch.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
 
@@ -26,6 +27,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign
 			replace = __test_facility(a->data, alt_stfle_fac_list);
 			break;
 #endif
+		case ALT_TYPE_SPEC:
+			replace = nobp_enabled();
+			break;
 		default:
 			replace = false;
 		}
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 467ed4dba817..d142598e0532 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -193,8 +193,6 @@ static noinline __init void setup_lowcore_early(void)
 static noinline __init void setup_facility_list(void)
 {
 	memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list));
-	if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
-		__clear_facility(82, alt_stfle_fac_list);
 }
 
 static __init void detect_diag9c(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 90027a57a524..8caf893d1b59 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -100,22 +100,22 @@ _LPP_OFFSET	= __LC_LPP
 	.endm
 
 	.macro BPOFF
-	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_FACILITY(82)
+	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,12,0", ALT_SPEC(82)
 	.endm
 
 	.macro BPON
-	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)
+	ALTERNATIVE "nop", ".insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82)
 	.endm
 
 	.macro BPENTER tif_ptr,tif_mask
 	ALTERNATIVE "TSTMSK \tif_ptr,\tif_mask; jz .+8; .insn rrf,0xb2e80000,0,0,13,0", \
-		    "j .+12; nop; nop", ALT_FACILITY(82)
+		    "j .+12; nop; nop", ALT_SPEC(82)
 	.endm
 
 	.macro BPEXIT tif_ptr,tif_mask
 	TSTMSK	\tif_ptr,\tif_mask
 	ALTERNATIVE "jz .+8;  .insn rrf,0xb2e80000,0,0,12,0", \
-		    "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_FACILITY(82)
+		    "jnz .+8; .insn rrf,0xb2e80000,0,0,13,0", ALT_SPEC(82)
 	.endm
 
 #if IS_ENABLED(CONFIG_KVM)
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
index 9b8c24ebb008..e11ec15960a1 100644
--- a/arch/s390/kernel/nospec-branch.c
+++ b/arch/s390/kernel/nospec-branch.c
@@ -4,6 +4,8 @@
 #include <linux/cpu.h>
 #include <asm/nospec-branch.h>
 
+int nobp = IS_ENABLED(CONFIG_KERNEL_NOBP);
+
 static int __init nobp_setup_early(char *str)
 {
 	bool enabled;
@@ -17,11 +19,11 @@ static int __init nobp_setup_early(char *str)
 		 * The user explicitly requested nobp=1, enable it and
 		 * disable the expoline support.
 		 */
-		__set_facility(82, alt_stfle_fac_list);
+		nobp = 1;
 		if (IS_ENABLED(CONFIG_EXPOLINE))
 			nospec_disable = 1;
 	} else {
-		__clear_facility(82, alt_stfle_fac_list);
+		nobp = 0;
 	}
 	return 0;
 }
@@ -29,7 +31,7 @@ early_param("nobp", nobp_setup_early);
 
 static int __init nospec_setup_early(char *str)
 {
-	__clear_facility(82, alt_stfle_fac_list);
+	nobp = 0;
 	return 0;
 }
 early_param("nospec", nospec_setup_early);
@@ -40,7 +42,7 @@ static int __init nospec_report(void)
 		pr_info("Spectre V2 mitigation: etokens\n");
 	if (nospec_uses_trampoline())
 		pr_info("Spectre V2 mitigation: execute trampolines\n");
-	if (__test_facility(82, alt_stfle_fac_list))
+	if (nobp_enabled())
 		pr_info("Spectre V2 mitigation: limited branch prediction\n");
 	return 0;
 }
@@ -66,14 +68,14 @@ void __init nospec_auto_detect(void)
 		 */
 		if (__is_defined(CC_USING_EXPOLINE))
 			nospec_disable = 1;
-		__clear_facility(82, alt_stfle_fac_list);
+		nobp = 0;
 	} else if (__is_defined(CC_USING_EXPOLINE)) {
 		/*
 		 * The kernel has been compiled with expolines.
 		 * Keep expolines enabled and disable nobp.
 		 */
 		nospec_disable = 0;
-		__clear_facility(82, alt_stfle_fac_list);
+		nobp = 0;
 	}
 	/*
 	 * If the kernel has not been compiled with expolines the
@@ -86,7 +88,7 @@ static int __init spectre_v2_setup_early(char *str)
 {
 	if (str && !strncmp(str, "on", 2)) {
 		nospec_disable = 0;
-		__clear_facility(82, alt_stfle_fac_list);
+		nobp = 0;
 	}
 	if (str && !strncmp(str, "off", 3))
 		nospec_disable = 1;
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
index 52d4353188ad..a95188818637 100644
--- a/arch/s390/kernel/nospec-sysfs.c
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -17,7 +17,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
 		return sprintf(buf, "Mitigation: etokens\n");
 	if (nospec_uses_trampoline())
 		return sprintf(buf, "Mitigation: execute trampolines\n");
-	if (__test_facility(82, alt_stfle_fac_list))
+	if (nobp_enabled())
 		return sprintf(buf, "Mitigation: limited branch prediction\n");
 	return sprintf(buf, "Vulnerable\n");
 }

From beb8cee06f9b8726616ba87783116cb8fb889c7a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 16 Jul 2024 13:50:56 +0200
Subject: [PATCH 192/346] s390/alternatives: Remove alternative facility list

The alternative and the normal facility list are always identical. Remove
the alternative facility list, which allows to simplify the alternatives
code.

Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Tested-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/alternative.h | 9 ++++-----
 arch/s390/include/asm/facility.h    | 1 -
 arch/s390/kernel/alternative.c      | 7 +------
 arch/s390/kernel/early.c            | 6 ------
 arch/s390/kernel/setup.c            | 1 -
 5 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 5f56a2f3aba6..3ddd6dbe5635 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -31,16 +31,15 @@
 #define ALT_CTX_LATE		2
 #define ALT_CTX_ALL		(ALT_CTX_EARLY | ALT_CTX_LATE)
 
-#define ALT_TYPE_FACILITY_EARLY	0
-#define ALT_TYPE_FACILITY	1
-#define ALT_TYPE_SPEC		2
+#define ALT_TYPE_FACILITY	0
+#define ALT_TYPE_SPEC		1
 
 #define ALT_DATA_SHIFT		0
 #define ALT_TYPE_SHIFT		20
 #define ALT_CTX_SHIFT		28
 
-#define ALT_FACILITY_EARLY(facility)	(ALT_CTX_EARLY << ALT_CTX_SHIFT		   | \
-					 ALT_TYPE_FACILITY_EARLY << ALT_TYPE_SHIFT | \
+#define ALT_FACILITY_EARLY(facility)	(ALT_CTX_EARLY << ALT_CTX_SHIFT		| \
+					 ALT_TYPE_FACILITY << ALT_TYPE_SHIFT	| \
 					 (facility) << ALT_DATA_SHIFT)
 
 #define ALT_FACILITY(facility)		(ALT_CTX_LATE << ALT_CTX_SHIFT		| \
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index d46cc725f024..b7d234838a36 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -20,7 +20,6 @@
 #define MAX_FACILITY_BIT (sizeof(stfle_fac_list) * 8)
 
 extern u64 stfle_fac_list[16];
-extern u64 alt_stfle_fac_list[16];
 
 static inline void __set_facility(unsigned long nr, void *facilities)
 {
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 05545669552f..eae254466192 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -19,14 +19,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign
 		if (!(a->ctx & ctx))
 			continue;
 		switch (a->type) {
-		case ALT_TYPE_FACILITY_EARLY:
+		case ALT_TYPE_FACILITY:
 			replace = test_facility(a->data);
 			break;
-#ifndef __DECOMPRESSOR
-		case ALT_TYPE_FACILITY:
-			replace = __test_facility(a->data, alt_stfle_fac_list);
-			break;
-#endif
 		case ALT_TYPE_SPEC:
 			replace = nobp_enabled();
 			break;
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index d142598e0532..3ce77cee272d 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -190,11 +190,6 @@ static noinline __init void setup_lowcore_early(void)
 	get_lowcore()->preempt_count = INIT_PREEMPT_COUNT;
 }
 
-static noinline __init void setup_facility_list(void)
-{
-	memcpy(alt_stfle_fac_list, stfle_fac_list, sizeof(alt_stfle_fac_list));
-}
-
 static __init void detect_diag9c(void)
 {
 	unsigned int cpu_address;
@@ -289,7 +284,6 @@ void __init startup_init(void)
 	lockdep_off();
 	sort_amode31_extable();
 	setup_lowcore_early();
-	setup_facility_list();
 	detect_machine_type();
 	setup_arch_string();
 	setup_boot_command_line();
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 178daf4e3563..700003e1bc76 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -155,7 +155,6 @@ unsigned int __bootdata_preserved(zlib_dfltcc_support);
 EXPORT_SYMBOL(zlib_dfltcc_support);
 u64 __bootdata_preserved(stfle_fac_list[16]);
 EXPORT_SYMBOL(stfle_fac_list);
-u64 alt_stfle_fac_list[16];
 struct oldmem_data __bootdata_preserved(oldmem_data);
 
 unsigned long VMALLOC_START;

From 213400c4afd5c89fd8bd17d06addf145f6c8f0d5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:11 +0200
Subject: [PATCH 193/346] s390/nmi: Simplify ptregs setup

The low level machine check handler code fills the ptregs structure
partially with the register contents present at machine check handler
entry and partially with contents from the machine check save area.

In case of a machine check the contents of all general purpose
registers are saved by the CPU to the machine check save area.
Therefore simplify the code and fill the ptregs structure by only
using the machine check save area as source.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 8caf893d1b59..a72d6494701d 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -491,8 +491,8 @@ SYM_CODE_START(mcck_int_handler)
 	stctg	%c1,%c1,__PT_CR1(%r11)
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-	lghi	%r14,__LC_GPREGS_SAVE_AREA+64
-	stmg	%r0,%r7,__PT_R0(%r11)
+	lghi	%r14,__LC_GPREGS_SAVE_AREA
+	mvc	__PT_R0(128,%r11),0(%r14)
 	# clear user controlled registers to prevent speculative use
 	xgr	%r0,%r0
 	xgr	%r1,%r1
@@ -502,7 +502,6 @@ SYM_CODE_START(mcck_int_handler)
 	xgr	%r6,%r6
 	xgr	%r7,%r7
 	xgr	%r10,%r10
-	mvc	__PT_R8(64,%r11),0(%r14)
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)

From fc8eac33ad93420b4e51cdd811e12d3fc9b531a5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:12 +0200
Subject: [PATCH 194/346] s390/entry: Move SIE indicator flag to thread info

CIF_SIE indicates if a thread is running in SIE context. This is the
state of a thread and not the CPU. Therefore move this indicator to
thread info.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/processor.h   |  2 --
 arch/s390/include/asm/thread_info.h |  1 +
 arch/s390/kernel/asm-offsets.c      |  1 +
 arch/s390/kernel/entry.S            | 20 ++++++++++++--------
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 3063488014eb..5ecd442535b9 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -14,13 +14,11 @@
 
 #include <linux/bits.h>
 
-#define CIF_SIE			0	/* CPU needs SIE exit cleanup */
 #define CIF_NOHZ_DELAY		2	/* delay HZ disable for a tick */
 #define CIF_ENABLED_WAIT	5	/* in enabled wait state */
 #define CIF_MCCK_GUEST		6	/* machine check happening in guest */
 #define CIF_DEDICATED_CPU	7	/* this CPU is dedicated */
 
-#define _CIF_SIE		BIT(CIF_SIE)
 #define _CIF_NOHZ_DELAY		BIT(CIF_NOHZ_DELAY)
 #define _CIF_ENABLED_WAIT	BIT(CIF_ENABLED_WAIT)
 #define _CIF_MCCK_GUEST		BIT(CIF_MCCK_GUEST)
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index d02a709717b8..00ac01874a12 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -40,6 +40,7 @@ struct thread_info {
 	unsigned long		flags;		/* low level flags */
 	unsigned long		syscall_work;	/* SYSCALL_WORK_ flags */
 	unsigned int		cpu;		/* current CPU */
+	unsigned char		sie;		/* running in SIE context */
 };
 
 /*
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 58fc6b93b475..ffa0dd2dbaac 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -28,6 +28,7 @@ int main(void)
 	BLANK();
 	/* thread info offsets */
 	OFFSET(__TI_flags, task_struct, thread_info.flags);
+	OFFSET(__TI_sie, task_struct, thread_info.sie);
 	BLANK();
 	/* pt_regs offsets */
 	OFFSET(__PT_PSW, pt_regs, psw);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index a72d6494701d..df351622c94c 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -123,7 +123,8 @@ _LPP_OFFSET	= __LC_LPP
 	lg	%r9,\sie_control		# get control block pointer
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	ni	__LC_CPU_FLAGS+7,255-_CIF_SIE
+	lg	%r9,__LC_CURRENT
+	mvi	__TI_sie(%r9),0
 	larl	%r9,sie_exit			# skip forward to sie_exit
 	.endm
 #endif
@@ -183,15 +184,15 @@ SYM_FUNC_END(__switch_to_asm)
  */
 SYM_FUNC_START(__sie64a)
 	stmg	%r6,%r14,__SF_GPRS(%r15)	# save kernel registers
-	lg	%r12,__LC_CURRENT
+	lg	%r14,__LC_CURRENT
 	stg	%r2,__SF_SIE_CONTROL_PHYS(%r15)	# save sie block physical..
 	stg	%r3,__SF_SIE_CONTROL(%r15)	# ...and virtual addresses
 	stg	%r4,__SF_SIE_SAVEAREA(%r15)	# save guest register save area
 	stg	%r5,__SF_SIE_GUEST_ASCE(%r15)	# save guest asce
 	xc	__SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0
-	mvc	__SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags
+	mvc	__SF_SIE_FLAGS(8,%r15),__TI_flags(%r14) # copy thread flags
 	lmg	%r0,%r13,0(%r4)			# load guest gprs 0-13
-	oi	__LC_CPU_FLAGS+7,_CIF_SIE
+	mvi	__TI_sie(%r14),1
 	lctlg	%c1,%c1,__SF_SIE_GUEST_ASCE(%r15) # load primary asce
 	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	oi	__SIE_PROG0C+3(%r14),1		# we are going into SIE now
@@ -211,7 +212,8 @@ SYM_FUNC_START(__sie64a)
 	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	ni	__LC_CPU_FLAGS+7,255-_CIF_SIE
+	lg	%r14,__LC_CURRENT
+	mvi	__TI_sie(%r14),0
 # some program checks are suppressing. C code (e.g. do_protection_exception)
 # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
 # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
@@ -394,7 +396,8 @@ SYM_CODE_START(\name)
 	tmhh	%r8,0x0001			# interrupting from user ?
 	jnz	1f
 #if IS_ENABLED(CONFIG_KVM)
-	TSTMSK	__LC_CPU_FLAGS,_CIF_SIE
+	lg	%r10,__LC_CURRENT
+	tm	__TI_sie(%r10),0xff
 	jz	0f
 	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
 	SIEEXIT __SF_SIE_CONTROL(%r15)
@@ -469,9 +472,10 @@ SYM_CODE_START(mcck_int_handler)
 	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
 	jno	.Lmcck_panic
 #if IS_ENABLED(CONFIG_KVM)
-	TSTMSK	__LC_CPU_FLAGS,_CIF_SIE
+	lg	%r10,__LC_CURRENT
+	tm	__TI_sie(%r10),0xff
 	jz	.Lmcck_user
-	# Need to compare the address instead of a CIF_SIE* flag.
+	# Need to compare the address instead of __TI_SIE flag.
 	# Otherwise there would be a race between setting the flag
 	# and entering SIE (or leaving and clearing the flag). This
 	# would cause machine checks targeted at the guest to be

From 13be21f39ab58184fb91844d2242e33805dda40e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:13 +0200
Subject: [PATCH 195/346] s390/atomic_ops: Disable flag outputs constraint for
 GCC versions below 14.2.0

GCC may die with an ICE if the flag outputs constraint is used in
combination with other inline assemblies. This will be fixed with
GCC 14.2.0.

Therefore disable the use of the constraint for now.

Link: https://gcc.gnu.org/git?p=gcc.git;a=commit;h=cd11413ff7c4353a3e336db415304f788d23a393
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/atomic_ops.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
index 2b379d1d9046..742c7919cbcd 100644
--- a/arch/s390/include/asm/atomic_ops.h
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -188,7 +188,8 @@ static __always_inline long __atomic64_cmpxchg(long *ptr, long old, long new)
 	return old;
 }
 
-#ifdef __GCC_ASM_FLAG_OUTPUTS__
+/* GCC versions before 14.2.0 may die with an ICE in some configurations. */
+#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_IS_GCC) && (GCC_VERSION < 140200))
 
 static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new)
 {

From 5ade5be4edf855245955108860d2016af3065a37 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:14 +0200
Subject: [PATCH 196/346] s390: Add infrastructure to patch lowcore accesses

The s390 architecture defines two special per-CPU data pages
called the "prefix area". In s390-linux terminology this is usually
called "lowcore". This memory area contains system configuration
data like old/new PSW's for system call/interrupt/machine check
handlers and lots of other data. It is normally mapped to logical
address 0. This area can only be accessed when in supervisor mode.

This means that kernel code can dereference NULL pointers, because
accesses to address 0 are allowed. Parts of lowcore can be write
protected, but read accesses and write accesses outside of the write
protected areas are not caught.

To remove this limitation for debugging and testing, remap lowcore to
another address and define a function get_lowcore() which simply
returns the address where lowcore is mapped at. This would normally
introduce a pointer dereference (=memory read). As lowcore is used
for several very often used variables, add code to patch this function
during runtime, so we avoid the memory reads.

For C code get_lowcore() has to be used, for assembly code it is
the GET_LC macro. When using this macro/function a reference is added
to alternative patching. All these locations will be patched to the
actual lowcore location when the kernel is booted or a module is loaded.

To make debugging/bisecting problems easier, this patch adds all the
infrastructure but the lowcore address is still hardwired to 0. This
way the code can be converted on a per function basis, and the
functionality is enabled in a patch after all the functions have
been converted.

Note that this requires at least z16 because the old lpsw instruction
only allowed a 12 bit displacement. z16 introduced lpswey which allows
20 bits (signed), so the lowcore can effectively be mapped from
address 0 - 0x7e000. To use 0x7e000 as address, a 6 byte lgfi
instruction would have to be used in the alternative. To save two
bytes, llilh can be used, but this only allows to set bits 16-31 of
the address. In order to use the llilh instruction, use 0x70000 as
alternative lowcore address. This is still large enough to catch
NULL pointer dereferences into large arrays.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/boot/boot.h               |  2 ++
 arch/s390/boot/ipl_parm.c           |  1 +
 arch/s390/boot/startup.c            |  1 +
 arch/s390/boot/vmem.c               | 11 ++++++++++-
 arch/s390/include/asm/abs_lowcore.h |  8 ++++++++
 arch/s390/include/asm/alternative.h |  4 ++++
 arch/s390/include/asm/lowcore.h     | 23 ++++++++++++++++++++++-
 arch/s390/kernel/abs_lowcore.c      |  1 +
 arch/s390/kernel/alternative.c      |  4 ++++
 arch/s390/kernel/alternative.h      |  0
 arch/s390/kernel/early.c            |  1 +
 arch/s390/kernel/setup.c            |  3 +++
 12 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 arch/s390/kernel/alternative.h

diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index ed2f0ec24f0d..83e2ce050b6c 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -91,8 +91,10 @@ extern char _end[], _decompressor_end[];
 extern unsigned char _compressed_start[];
 extern unsigned char _compressed_end[];
 extern struct vmlinux_info _vmlinux_info;
+
 #define vmlinux _vmlinux_info
 
+#define __lowcore_pa(x)		((unsigned long)(x) % sizeof(struct lowcore))
 #define __abs_lowcore_pa(x)	(((unsigned long)(x) - __abs_lowcore) % sizeof(struct lowcore))
 #define __kernel_va(x)		((void *)((unsigned long)(x) - __kaslr_offset_phys + __kaslr_offset))
 #define __kernel_pa(x)		((unsigned long)(x) - __kaslr_offset + __kaslr_offset_phys)
diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c
index a21f301acd29..337c14931ccb 100644
--- a/arch/s390/boot/ipl_parm.c
+++ b/arch/s390/boot/ipl_parm.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/ctype.h>
 #include <linux/pgtable.h>
+#include <asm/abs_lowcore.h>
 #include <asm/page-states.h>
 #include <asm/ebcdic.h>
 #include <asm/sclp.h>
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index cca2f1bad33c..ce232552bc1c 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -30,6 +30,7 @@ unsigned long __bootdata_preserved(vmemmap_size);
 unsigned long __bootdata_preserved(MODULES_VADDR);
 unsigned long __bootdata_preserved(MODULES_END);
 unsigned long __bootdata_preserved(max_mappable);
+int __bootdata_preserved(relocate_lowcore);
 
 u64 __bootdata_preserved(stfle_fac_list[16]);
 struct oldmem_data __bootdata_preserved(oldmem_data);
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
index a255ca189aaa..2847cc059ab7 100644
--- a/arch/s390/boot/vmem.c
+++ b/arch/s390/boot/vmem.c
@@ -26,6 +26,7 @@ atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]);
 enum populate_mode {
 	POPULATE_NONE,
 	POPULATE_DIRECT,
+	POPULATE_LOWCORE,
 	POPULATE_ABS_LOWCORE,
 	POPULATE_IDENTITY,
 	POPULATE_KERNEL,
@@ -242,6 +243,8 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m
 		return -1;
 	case POPULATE_DIRECT:
 		return addr;
+	case POPULATE_LOWCORE:
+		return __lowcore_pa(addr);
 	case POPULATE_ABS_LOWCORE:
 		return __abs_lowcore_pa(addr);
 	case POPULATE_KERNEL:
@@ -418,6 +421,7 @@ static void pgtable_populate(unsigned long addr, unsigned long end, enum populat
 
 void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned long asce_limit)
 {
+	unsigned long lowcore_address = 0;
 	unsigned long start, end;
 	unsigned long asce_type;
 	unsigned long asce_bits;
@@ -455,12 +459,17 @@ void setup_vmem(unsigned long kernel_start, unsigned long kernel_end, unsigned l
 	__arch_set_page_dat((void *)swapper_pg_dir, 1UL << CRST_ALLOC_ORDER);
 	__arch_set_page_dat((void *)invalid_pg_dir, 1UL << CRST_ALLOC_ORDER);
 
+	if (relocate_lowcore)
+		lowcore_address = LOWCORE_ALT_ADDRESS;
+
 	/*
 	 * To allow prefixing the lowcore must be mapped with 4KB pages.
 	 * To prevent creation of a large page at address 0 first map
 	 * the lowcore and create the identity mapping only afterwards.
 	 */
-	pgtable_populate(0, sizeof(struct lowcore), POPULATE_DIRECT);
+	pgtable_populate(lowcore_address,
+			 lowcore_address + sizeof(struct lowcore),
+			 POPULATE_LOWCORE);
 	for_each_physmem_usable_range(i, &start, &end) {
 		pgtable_populate((unsigned long)__identity_va(start),
 				 (unsigned long)__identity_va(end),
diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h
index 6f264b79e377..d20df8c923fc 100644
--- a/arch/s390/include/asm/abs_lowcore.h
+++ b/arch/s390/include/asm/abs_lowcore.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_S390_ABS_LOWCORE_H
 #define _ASM_S390_ABS_LOWCORE_H
 
+#include <asm/sections.h>
 #include <asm/lowcore.h>
 
 #define ABS_LOWCORE_MAP_SIZE	(NR_CPUS * sizeof(struct lowcore))
@@ -24,4 +25,11 @@ static inline void put_abs_lowcore(struct lowcore *lc)
 	put_cpu();
 }
 
+extern int __bootdata_preserved(relocate_lowcore);
+
+static inline int have_relocated_lowcore(void)
+{
+	return relocate_lowcore;
+}
+
 #endif /* _ASM_S390_ABS_LOWCORE_H */
diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h
index 3ddd6dbe5635..de980c938a3e 100644
--- a/arch/s390/include/asm/alternative.h
+++ b/arch/s390/include/asm/alternative.h
@@ -33,6 +33,7 @@
 
 #define ALT_TYPE_FACILITY	0
 #define ALT_TYPE_SPEC		1
+#define ALT_TYPE_LOWCORE	2
 
 #define ALT_DATA_SHIFT		0
 #define ALT_TYPE_SHIFT		20
@@ -50,6 +51,9 @@
 					 ALT_TYPE_SPEC << ALT_TYPE_SHIFT	| \
 					 (facility) << ALT_DATA_SHIFT)
 
+#define ALT_LOWCORE			(ALT_CTX_EARLY << ALT_CTX_SHIFT		| \
+					 ALT_TYPE_LOWCORE << ALT_TYPE_SHIFT)
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index bce3a69ab2a3..52c90b65a2b8 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -14,10 +14,15 @@
 #include <asm/ctlreg.h>
 #include <asm/cpu.h>
 #include <asm/types.h>
+#include <asm/alternative.h>
 
 #define LC_ORDER 1
 #define LC_PAGES 2
 
+#define LOWCORE_ALT_ADDRESS	_AC(0x70000, UL)
+
+#ifndef __ASSEMBLY__
+
 struct pgm_tdb {
 	u64 data[32];
 };
@@ -214,7 +219,14 @@ struct lowcore {
 
 static __always_inline struct lowcore *get_lowcore(void)
 {
-	return NULL;
+	struct lowcore *lc;
+
+	if (__is_defined(__DECOMPRESSOR))
+		return NULL;
+	asm(ALTERNATIVE("llilh %[lc],0", "llilh %[lc],%[alt]", ALT_LOWCORE)
+	    : [lc] "=d" (lc)
+	    : [alt] "i" (LOWCORE_ALT_ADDRESS >> 16));
+	return lc;
 }
 
 extern struct lowcore *lowcore_ptr[];
@@ -224,4 +236,13 @@ static inline void set_prefix(__u32 address)
 	asm volatile("spx %0" : : "Q" (address) : "memory");
 }
 
+#else /* __ASSEMBLY__ */
+
+.macro GET_LC reg
+	ALTERNATIVE "llilh	\reg,0",					\
+		__stringify(llilh	\reg, LOWCORE_ALT_ADDRESS >> 16),	\
+		ALT_LOWCORE
+.endm
+
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_LOWCORE_H */
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
index f9efc54ec4b7..09cd24cbe74e 100644
--- a/arch/s390/kernel/abs_lowcore.c
+++ b/arch/s390/kernel/abs_lowcore.c
@@ -4,6 +4,7 @@
 #include <asm/abs_lowcore.h>
 
 unsigned long __bootdata_preserved(__abs_lowcore);
+int __bootdata_preserved(relocate_lowcore);
 
 int abs_lowcore_map(int cpu, struct lowcore *lc, bool alloc)
 {
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index eae254466192..8d5d0de35de0 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -2,6 +2,7 @@
 
 #include <linux/uaccess.h>
 #include <asm/nospec-branch.h>
+#include <asm/abs_lowcore.h>
 #include <asm/alternative.h>
 #include <asm/facility.h>
 
@@ -25,6 +26,9 @@ void __apply_alternatives(struct alt_instr *start, struct alt_instr *end, unsign
 		case ALT_TYPE_SPEC:
 			replace = nobp_enabled();
 			break;
+		case ALT_TYPE_LOWCORE:
+			replace = have_relocated_lowcore();
+			break;
 		default:
 			replace = false;
 		}
diff --git a/arch/s390/kernel/alternative.h b/arch/s390/kernel/alternative.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 3ce77cee272d..14d324865e33 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -48,6 +48,7 @@ decompressor_handled_param(dfltcc);
 decompressor_handled_param(facilities);
 decompressor_handled_param(nokaslr);
 decompressor_handled_param(cmma);
+decompressor_handled_param(relocate_lowcore);
 #if IS_ENABLED(CONFIG_KVM)
 decompressor_handled_param(prot_virt);
 #endif
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 700003e1bc76..4ec99f73fa27 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -889,6 +889,9 @@ void __init setup_arch(char **cmdline_p)
 	else
 		pr_info("Linux is running as a guest in 64-bit mode\n");
 
+	if (have_relocated_lowcore())
+		pr_info("Lowcore relocated to 0x%px\n", get_lowcore());
+
 	log_component_list();
 
 	/* Have one command line that is parsed and saved in /proc/cmdline */

From 39e8c5d6a4ce6512af5178f70c0c5d735141fc10 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:15 +0200
Subject: [PATCH 197/346] s390/head64: Make startup code ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in startup_continue().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/head64.S | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 45413b04efc5..396034b2fe67 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/linkage.h>
+#include <asm/lowcore.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/page.h>
@@ -18,14 +19,15 @@
 __HEAD
 SYM_CODE_START(startup_continue)
 	larl	%r1,tod_clock_base
-	mvc	0(16,%r1),__LC_BOOT_CLOCK
+	GET_LC	%r2
+	mvc	0(16,%r1),__LC_BOOT_CLOCK(%r2)
 #
 # Setup stack
 #
 	larl	%r14,init_task
-	stg	%r14,__LC_CURRENT
+	stg	%r14,__LC_CURRENT(%r2)
 	larl	%r15,init_thread_union+STACK_INIT_OFFSET
-	stg	%r15,__LC_KERNEL_STACK
+	stg	%r15,__LC_KERNEL_STACK(%r2)
 	brasl	%r14,sclp_early_adjust_va	# allow sclp_early_printk
 	brasl	%r14,startup_init		# s390 specific early init
 	brasl	%r14,start_kernel		# common init code

From 12184a46767b40c1c9b022cd96a9b4019ebd368f Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:16 +0200
Subject: [PATCH 198/346] s390/entry: Make __sie64a() ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in __sie64a().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index df351622c94c..618b8b774932 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -28,6 +28,7 @@
 #include <asm/setup.h>
 #include <asm/nmi.h>
 #include <asm/nospec-insn.h>
+#include <asm/lowcore.h>
 
 _LPP_OFFSET	= __LC_LPP
 
@@ -184,7 +185,8 @@ SYM_FUNC_END(__switch_to_asm)
  */
 SYM_FUNC_START(__sie64a)
 	stmg	%r6,%r14,__SF_GPRS(%r15)	# save kernel registers
-	lg	%r14,__LC_CURRENT
+	GET_LC	%r13
+	lg	%r14,__LC_CURRENT(%r13)
 	stg	%r2,__SF_SIE_CONTROL_PHYS(%r15)	# save sie block physical..
 	stg	%r3,__SF_SIE_CONTROL(%r15)	# ...and virtual addresses
 	stg	%r4,__SF_SIE_SAVEAREA(%r15)	# save guest register save area
@@ -211,8 +213,9 @@ SYM_FUNC_START(__sie64a)
 .Lsie_skip:
 	lg	%r14,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	lg	%r14,__LC_CURRENT
+	GET_LC	%r14
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r14)	# load primary asce
+	lg	%r14,__LC_CURRENT(%r14)
 	mvi	__TI_sie(%r14),0
 # some program checks are suppressing. C code (e.g. do_protection_exception)
 # will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There

From ca2f0a26c498c42fcccdf09527e8755481801eea Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:17 +0200
Subject: [PATCH 199/346] s390/entry: Add base register to MBEAR macro

In preparation of having lowcore at different address than zero,
add the base register to MBEAR. No functional change, because
%r0 is passed to the macro.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 618b8b774932..0d624045f2a6 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -44,8 +44,9 @@ _LPP_OFFSET	= __LC_LPP
 	ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193)
 	.endm
 
-	.macro MBEAR reg
-	ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK), ALT_FACILITY(193)
+	.macro MBEAR reg, lowcore
+	ALTERNATIVE "brcl 0,0", __stringify(mvc __PT_LAST_BREAK(8,\reg),__LC_LAST_BREAK(\lowcore)),\
+		ALT_FACILITY(193)
 	.endm
 
 	.macro	CHECK_STACK savearea
@@ -282,7 +283,7 @@ SYM_CODE_START(system_call)
 	xgr	%r11,%r11
 	la	%r2,STACK_FRAME_OVERHEAD(%r15)	# pointer to pt_regs
 	mvc	__PT_R8(64,%r2),__LC_SAVE_AREA_SYNC
-	MBEAR	%r2
+	MBEAR	%r2,%r0
 	lgr	%r3,%r14
 	brasl	%r14,__do_syscall
 	STACKLEAK_ERASE
@@ -424,7 +425,7 @@ SYM_CODE_START(\name)
 	xgr	%r10,%r10
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
 	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
-	MBEAR	%r11
+	MBEAR	%r11,%r0
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	brasl	%r14,\handler

From 6908f8f916f24636621a5b4c300bdf9a0155f07e Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:18 +0200
Subject: [PATCH 200/346] s390/entry: Add base register to SIEEXIT macro

In preparation of having lowcore at different address than zero,
add the base register to SIEEXIT. No functional change, because
%r0 is passed to the macro.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 0d624045f2a6..7a5e11d9db8b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -121,11 +121,11 @@ _LPP_OFFSET	= __LC_LPP
 	.endm
 
 #if IS_ENABLED(CONFIG_KVM)
-	.macro SIEEXIT sie_control
-	lg	%r9,\sie_control		# get control block pointer
-	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE	# load primary asce
-	lg	%r9,__LC_CURRENT
+	.macro SIEEXIT sie_control,lowcore
+	lg	%r9,\sie_control			# get control block pointer
+	ni	__SIE_PROG0C+3(%r9),0xfe		# no longer in SIE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE(\lowcore)	# load primary asce
+	lg	%r9,__LC_CURRENT(\lowcore)
 	mvi	__TI_sie(%r9),0
 	larl	%r9,sie_exit			# skip forward to sie_exit
 	.endm
@@ -349,7 +349,7 @@ SYM_CODE_START(pgm_check_handler)
 	clc	__GMAP_ASCE(8,%r12), __PT_CR1(%r11)
 	jne	5f
 	BPENTER	__SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r10)
+	SIEEXIT __SF_SIE_CONTROL(%r10),%r0
 #endif
 5:	stmg	%r8,%r9,__PT_PSW(%r11)
 	# clear user controlled registers to prevent speculative use
@@ -404,7 +404,7 @@ SYM_CODE_START(\name)
 	tm	__TI_sie(%r10),0xff
 	jz	0f
 	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r15)
+	SIEEXIT __SF_SIE_CONTROL(%r15),%r0
 #endif
 0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
@@ -491,7 +491,7 @@ SYM_CODE_START(mcck_int_handler)
 	lg	%r10,__LC_PCPU
 	oi	__PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST
 4:	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r15)
+	SIEEXIT __SF_SIE_CONTROL(%r15),%r0
 #endif
 .Lmcck_user:
 	lg	%r15,__LC_MCCK_STACK

From 86e08d64eec35cbe6a85798add4bfc1218ca9513 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:19 +0200
Subject: [PATCH 201/346] s390/entry: Add base register to
 CHECK_VMAP_STACK/CHECK_STACK macro

In preparation of having lowcore at different address than zero,
add the base register to CHECK_VMAP_STACK and CHECK_STACK. No
functional change, because %r0 is passed to the macro.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 7a5e11d9db8b..1d12f3c29a43 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -49,30 +49,30 @@ _LPP_OFFSET	= __LC_LPP
 		ALT_FACILITY(193)
 	.endm
 
-	.macro	CHECK_STACK savearea
+	.macro	CHECK_STACK savearea, lowcore
 #ifdef CONFIG_CHECK_STACK
 	tml	%r15,THREAD_SIZE - CONFIG_STACK_GUARD
-	lghi	%r14,\savearea
+	la	%r14,\savearea(\lowcore)
 	jz	stack_overflow
 #endif
 	.endm
 
-	.macro	CHECK_VMAP_STACK savearea,oklabel
+	.macro	CHECK_VMAP_STACK savearea, lowcore, oklabel
 #ifdef CONFIG_VMAP_STACK
 	lgr	%r14,%r15
 	nill	%r14,0x10000 - THREAD_SIZE
 	oill	%r14,STACK_INIT_OFFSET
-	clg	%r14,__LC_KERNEL_STACK
+	clg	%r14,__LC_KERNEL_STACK(\lowcore)
 	je	\oklabel
-	clg	%r14,__LC_ASYNC_STACK
+	clg	%r14,__LC_ASYNC_STACK(\lowcore)
 	je	\oklabel
-	clg	%r14,__LC_MCCK_STACK
+	clg	%r14,__LC_MCCK_STACK(\lowcore)
 	je	\oklabel
-	clg	%r14,__LC_NODAT_STACK
+	clg	%r14,__LC_NODAT_STACK(\lowcore)
 	je	\oklabel
-	clg	%r14,__LC_RESTART_STACK
+	clg	%r14,__LC_RESTART_STACK(\lowcore)
 	je	\oklabel
-	lghi	%r14,\savearea
+	la	%r14,\savearea(\lowcore)
 	j	stack_overflow
 #else
 	j	\oklabel
@@ -331,10 +331,10 @@ SYM_CODE_START(pgm_check_handler)
 	jnz	2f			# -> enabled, can't be a double fault
 	tm	__LC_PGM_ILC+3,0x80	# check for per exception
 	jnz	.Lpgm_svcper		# -> single stepped svc
-2:	CHECK_STACK __LC_SAVE_AREA_SYNC
+2:	CHECK_STACK __LC_SAVE_AREA_SYNC,%r0
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	# CHECK_VMAP_STACK branches to stack_overflow or 4f
-	CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f
+	CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r0,4f
 3:	lg	%r15,__LC_KERNEL_STACK
 4:	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
@@ -406,7 +406,7 @@ SYM_CODE_START(\name)
 	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
 	SIEEXIT __SF_SIE_CONTROL(%r15),%r0
 #endif
-0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
+0:	CHECK_STACK __LC_SAVE_AREA_ASYNC,%r0
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	j	2f
 1:	lctlg	%c1,%c1,__LC_KERNEL_ASCE

From 9e1e275fa28d5896ca7cdf8afa5eb58c0117a303 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:20 +0200
Subject: [PATCH 202/346] s390/entry: Make pgm_check_handler() ready for
 lowcore relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in pgm_check_handler().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/lowcore.h |  6 +++++
 arch/s390/kernel/entry.S        | 47 ++++++++++++++++++---------------
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 52c90b65a2b8..183ac29afaf8 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -244,5 +244,11 @@ static inline void set_prefix(__u32 address)
 		ALT_LOWCORE
 .endm
 
+.macro STMG_LC start, end, savearea
+	ALTERNATIVE "stmg	\start, \end, \savearea",				\
+		__stringify(stmg	\start, \end, LOWCORE_ALT_ADDRESS + \savearea),	\
+		ALT_LOWCORE
+.endm
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_LOWCORE_H */
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 1d12f3c29a43..5f63f3fbb34c 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -40,8 +40,11 @@ _LPP_OFFSET	= __LC_LPP
 	ALTERNATIVE "nop", ".insn s,0xb2000000,\address", ALT_FACILITY(193)
 	.endm
 
-	.macro LPSWEY address,lpswe
-	ALTERNATIVE "b \lpswe; nopr", ".insn siy,0xeb0000000071,\address,0", ALT_FACILITY(193)
+	.macro LPSWEY address, lpswe
+	ALTERNATIVE_2 "b \lpswe;nopr", \
+		".insn siy,0xeb0000000071,\address,0", ALT_FACILITY_EARLY(193),		\
+		__stringify(.insn siy,0xeb0000000071,LOWCORE_ALT_ADDRESS+\address,0),	\
+		ALT_LOWCORE
 	.endm
 
 	.macro MBEAR reg, lowcore
@@ -317,39 +320,40 @@ SYM_CODE_END(ret_from_fork)
  */
 
 SYM_CODE_START(pgm_check_handler)
-	stpt	__LC_SYS_ENTER_TIMER
+	STMG_LC	%r8,%r15,__LC_SAVE_AREA_SYNC
+	GET_LC	%r13
+	stpt	__LC_SYS_ENTER_TIMER(%r13)
 	BPOFF
-	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
 	lgr	%r10,%r15
-	lmg	%r8,%r9,__LC_PGM_OLD_PSW
+	lmg	%r8,%r9,__LC_PGM_OLD_PSW(%r13)
 	tmhh	%r8,0x0001		# coming from user space?
 	jno	.Lpgm_skip_asce
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
 	j	3f			# -> fault in user space
 .Lpgm_skip_asce:
 1:	tmhh	%r8,0x4000		# PER bit set in old PSW ?
 	jnz	2f			# -> enabled, can't be a double fault
-	tm	__LC_PGM_ILC+3,0x80	# check for per exception
+	tm	__LC_PGM_ILC+3(%r13),0x80	# check for per exception
 	jnz	.Lpgm_svcper		# -> single stepped svc
-2:	CHECK_STACK __LC_SAVE_AREA_SYNC,%r0
+2:	CHECK_STACK __LC_SAVE_AREA_SYNC,%r13
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	# CHECK_VMAP_STACK branches to stack_overflow or 4f
-	CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r0,4f
-3:	lg	%r15,__LC_KERNEL_STACK
+	CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,%r13,4f
+3:	lg	%r15,__LC_KERNEL_STACK(%r13)
 4:	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	stmg	%r0,%r7,__PT_R0(%r11)
-	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
-	mvc	__PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK
+	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_SYNC(%r13)
+	mvc	__PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13)
 	stctg	%c1,%c1,__PT_CR1(%r11)
 #if IS_ENABLED(CONFIG_KVM)
-	ltg	%r12,__LC_GMAP
+	ltg	%r12,__LC_GMAP(%r13)
 	jz	5f
 	clc	__GMAP_ASCE(8,%r12), __PT_CR1(%r11)
 	jne	5f
 	BPENTER	__SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r10),%r0
+	SIEEXIT __SF_SIE_CONTROL(%r10),%r13
 #endif
 5:	stmg	%r8,%r9,__PT_PSW(%r11)
 	# clear user controlled registers to prevent speculative use
@@ -365,11 +369,11 @@ SYM_CODE_START(pgm_check_handler)
 	tmhh	%r8,0x0001		# returning to user space?
 	jno	.Lpgm_exit_kernel
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE
+	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	BPON
-	stpt	__LC_EXIT_TIMER
+	stpt	__LC_EXIT_TIMER(%r13)
 .Lpgm_exit_kernel:
-	mvc	__LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+	mvc	__LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
 	LBEAR	STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
 	lmg	%r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
 	LPSWEY	__LC_RETURN_PSW,__LC_RETURN_LPSWE
@@ -378,11 +382,11 @@ SYM_CODE_START(pgm_check_handler)
 # single stepped system call
 #
 .Lpgm_svcper:
-	mvc	__LC_RETURN_PSW(8),__LC_SVC_NEW_PSW
+	mvc	__LC_RETURN_PSW(8,%r13),__LC_SVC_NEW_PSW(%r13)
 	larl	%r14,.Lsysc_per
-	stg	%r14,__LC_RETURN_PSW+8
+	stg	%r14,__LC_RETURN_PSW+8(%r13)
 	lghi	%r14,1
-	LBEAR	__LC_PGM_LAST_BREAK
+	LBEAR	__LC_PGM_LAST_BREAK(%r13)
 	LPSWEY	__LC_RETURN_PSW,__LC_RETURN_LPSWE # branch to .Lsysc_per
 SYM_CODE_END(pgm_check_handler)
 
@@ -596,7 +600,8 @@ SYM_CODE_END(restart_int_handler)
  * Setup a pt_regs so that show_trace can provide a good call trace.
  */
 SYM_CODE_START(stack_overflow)
-	lg	%r15,__LC_NODAT_STACK	# change to panic stack
+	GET_LC	%r15
+	lg	%r15,__LC_NODAT_STACK(%r15) # change to panic stack
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stmg	%r0,%r7,__PT_R0(%r11)
 	stmg	%r8,%r9,__PT_PSW(%r11)

From bd2c55b307f77fbf19d76250672266ff06f4a324 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:21 +0200
Subject: [PATCH 203/346] s390/entry: Make int handlers ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in the ext/io interrupt
handlers.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 5f63f3fbb34c..5c303eff08e0 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -395,26 +395,27 @@ SYM_CODE_END(pgm_check_handler)
  */
 .macro INT_HANDLER name,lc_old_psw,handler
 SYM_CODE_START(\name)
-	stckf	__LC_INT_CLOCK
-	stpt	__LC_SYS_ENTER_TIMER
-	STBEAR	__LC_LAST_BREAK
+	STMG_LC	%r8,%r15,__LC_SAVE_AREA_ASYNC
+	GET_LC	%r13
+	stckf	__LC_INT_CLOCK(%r13)
+	stpt	__LC_SYS_ENTER_TIMER(%r13)
+	STBEAR	__LC_LAST_BREAK(%r13)
 	BPOFF
-	stmg	%r8,%r15,__LC_SAVE_AREA_ASYNC
-	lmg	%r8,%r9,\lc_old_psw
+	lmg	%r8,%r9,\lc_old_psw(%r13)
 	tmhh	%r8,0x0001			# interrupting from user ?
 	jnz	1f
 #if IS_ENABLED(CONFIG_KVM)
-	lg	%r10,__LC_CURRENT
+	lg	%r10,__LC_CURRENT(%r13)
 	tm	__TI_sie(%r10),0xff
 	jz	0f
 	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r15),%r0
+	SIEEXIT __SF_SIE_CONTROL(%r15),%r13
 #endif
-0:	CHECK_STACK __LC_SAVE_AREA_ASYNC,%r0
+0:	CHECK_STACK __LC_SAVE_AREA_ASYNC,%r13
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	j	2f
-1:	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	lg	%r15,__LC_KERNEL_STACK
+1:	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
+	lg	%r15,__LC_KERNEL_STACK(%r13)
 2:	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stmg	%r0,%r7,__PT_R0(%r11)
@@ -428,18 +429,18 @@ SYM_CODE_START(\name)
 	xgr	%r7,%r7
 	xgr	%r10,%r10
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
-	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
-	MBEAR	%r11,%r0
+	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC(%r13)
+	MBEAR	%r11,%r13
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	brasl	%r14,\handler
-	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+	mvc	__LC_RETURN_PSW(16,%r13),__PT_PSW(%r11)
 	tmhh	%r8,0x0001		# returning to user ?
 	jno	2f
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE
+	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
 	BPON
-	stpt	__LC_EXIT_TIMER
+	stpt	__LC_EXIT_TIMER(%r13)
 2:	LBEAR	__PT_LAST_BREAK(%r11)
 	lmg	%r0,%r15,__PT_R0(%r11)
 	LPSWEY	__LC_RETURN_PSW,__LC_RETURN_LPSWE

From 0001b7bbc53aeb8d31f650701d2a55e498634a2d Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:22 +0200
Subject: [PATCH 204/346] s390/entry: Make mchk_int_handler() ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in mcck_int_handler().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 48 +++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 5c303eff08e0..a855f901f6e6 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -455,33 +455,34 @@ INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq
  */
 SYM_CODE_START(mcck_int_handler)
 	BPOFF
-	lmg	%r8,%r9,__LC_MCK_OLD_PSW
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
+	GET_LC	%r13
+	lmg	%r8,%r9,__LC_MCK_OLD_PSW(%r13)
+	TSTMSK	__LC_MCCK_CODE(%r13),MCCK_CODE_SYSTEM_DAMAGE
 	jo	.Lmcck_panic		# yes -> rest of mcck code invalid
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_CR_VALID
+	TSTMSK	__LC_MCCK_CODE(%r13),MCCK_CODE_CR_VALID
 	jno	.Lmcck_panic		# control registers invalid -> panic
 	ptlb
-	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
-	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
+	lay	%r14,__LC_CPU_TIMER_SAVE_AREA(%r13)
+	mvc	__LC_MCCK_ENTER_TIMER(8,%r13),0(%r14)
+	TSTMSK	__LC_MCCK_CODE(%r13),MCCK_CODE_CPU_TIMER_VALID
 	jo	3f
-	la	%r14,__LC_SYS_ENTER_TIMER
-	clc	0(8,%r14),__LC_EXIT_TIMER
+	la	%r14,__LC_SYS_ENTER_TIMER(%r13)
+	clc	0(8,%r14),__LC_EXIT_TIMER(%r13)
 	jl	1f
-	la	%r14,__LC_EXIT_TIMER
-1:	clc	0(8,%r14),__LC_LAST_UPDATE_TIMER
+	la	%r14,__LC_EXIT_TIMER(%r13)
+1:	clc	0(8,%r14),__LC_LAST_UPDATE_TIMER(%r13)
 	jl	2f
-	la	%r14,__LC_LAST_UPDATE_TIMER
+	la	%r14,__LC_LAST_UPDATE_TIMER(%r13)
 2:	spt	0(%r14)
-	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
-3:	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID
+	mvc	__LC_MCCK_ENTER_TIMER(8,%r13),0(%r14)
+3:	TSTMSK	__LC_MCCK_CODE(%r13),MCCK_CODE_PSW_MWP_VALID
 	jno	.Lmcck_panic
 	tmhh	%r8,0x0001		# interrupting from user ?
 	jnz	.Lmcck_user
-	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID
+	TSTMSK	__LC_MCCK_CODE(%r13),MCCK_CODE_PSW_IA_VALID
 	jno	.Lmcck_panic
 #if IS_ENABLED(CONFIG_KVM)
-	lg	%r10,__LC_CURRENT
+	lg	%r10,__LC_CURRENT(%r13)
 	tm	__TI_sie(%r10),0xff
 	jz	.Lmcck_user
 	# Need to compare the address instead of __TI_SIE flag.
@@ -496,15 +497,15 @@ SYM_CODE_START(mcck_int_handler)
 	lg	%r10,__LC_PCPU
 	oi	__PCPU_FLAGS+7(%r10), _CIF_MCCK_GUEST
 4:	BPENTER	__SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
-	SIEEXIT __SF_SIE_CONTROL(%r15),%r0
+	SIEEXIT __SF_SIE_CONTROL(%r15),%r13
 #endif
 .Lmcck_user:
-	lg	%r15,__LC_MCCK_STACK
+	lg	%r15,__LC_MCCK_STACK(%r13)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stctg	%c1,%c1,__PT_CR1(%r11)
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-	lghi	%r14,__LC_GPREGS_SAVE_AREA
+	lay	%r14,__LC_GPREGS_SAVE_AREA(%r13)
 	mvc	__PT_R0(128,%r11),0(%r14)
 	# clear user controlled registers to prevent speculative use
 	xgr	%r0,%r0
@@ -522,12 +523,13 @@ SYM_CODE_START(mcck_int_handler)
 	brasl	%r14,s390_do_machine_check
 	lctlg	%c1,%c1,__PT_CR1(%r11)
 	lmg	%r0,%r10,__PT_R0(%r11)
-	mvc	__LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
-	tm	__LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
+	mvc	__LC_RETURN_MCCK_PSW(16,%r13),__PT_PSW(%r11) # move return PSW
+	tm	__LC_RETURN_MCCK_PSW+1(%r13),0x01 # returning to user ?
 	jno	0f
 	BPON
-	stpt	__LC_EXIT_TIMER
-0:	ALTERNATIVE "nop", __stringify(lghi %r12,__LC_LAST_BREAK_SAVE_AREA), ALT_FACILITY(193)
+	stpt	__LC_EXIT_TIMER(%r13)
+0:	ALTERNATIVE "brcl 0,0", __stringify(lay %r12,__LC_LAST_BREAK_SAVE_AREA(%r13)),\
+		ALT_FACILITY(193)
 	LBEAR	0(%r12)
 	lmg	%r11,%r15,__PT_R11(%r11)
 	LPSWEY	__LC_RETURN_MCCK_PSW,__LC_RETURN_MCCK_LPSWE

From 4064b711127e1eb5b5fd42d539fd45e3e33c9b6f Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:23 +0200
Subject: [PATCH 205/346] s390/entry: Make restart_int_handler() ready for
 lowcore relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in restart_int_handler().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index a855f901f6e6..ca58b3da3916 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -573,15 +573,17 @@ SYM_CODE_START(restart_int_handler)
 0:	larl	%r15,daton_psw
 	lpswe	0(%r15)				# turn dat on, keep irqs off
 .Ldaton:
-	lg	%r15,__LC_RESTART_STACK
+	GET_LC	%r15
+	lg	%r15,__LC_RESTART_STACK(%r15)
 	xc	STACK_FRAME_OVERHEAD(__PT_SIZE,%r15),STACK_FRAME_OVERHEAD(%r15)
 	stmg	%r0,%r14,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
-	mvc	STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART
-	mvc	STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW
+	GET_LC	%r13
+	mvc	STACK_FRAME_OVERHEAD+__PT_R15(8,%r15),__LC_SAVE_AREA_RESTART(%r13)
+	mvc	STACK_FRAME_OVERHEAD+__PT_PSW(16,%r15),__LC_RST_OLD_PSW(%r13)
 	xc	0(STACK_FRAME_OVERHEAD,%r15),0(%r15)
-	lg	%r1,__LC_RESTART_FN		# load fn, parm & source cpu
-	lg	%r2,__LC_RESTART_DATA
-	lgf	%r3,__LC_RESTART_SOURCE
+	lg	%r1,__LC_RESTART_FN(%r13)	# load fn, parm & source cpu
+	lg	%r2,__LC_RESTART_DATA(%r13)
+	lgf	%r3,__LC_RESTART_SOURCE(%r13)
 	ltgr	%r3,%r3				# test source cpu address
 	jm	1f				# negative -> skip source stop
 0:	sigp	%r4,%r3,SIGP_SENSE		# sigp sense to source cpu

From 7cc86dee44a47d961fd6195fd91f75ce176b992d Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:24 +0200
Subject: [PATCH 206/346] s390/entry: Make __switch_to() ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in __switch_to().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index ca58b3da3916..bbdbe3c3a770 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -169,13 +169,14 @@ SYM_FUNC_START(__switch_to_asm)
 	stg	%r15,__THREAD_ksp(%r1,%r2)	# store kernel stack of prev
 	lg	%r15,0(%r4,%r3)			# start of kernel stack of next
 	agr	%r15,%r5			# end of kernel stack of next
-	stg	%r3,__LC_CURRENT		# store task struct of next
-	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
+	GET_LC	%r13
+	stg	%r3,__LC_CURRENT(%r13)		# store task struct of next
+	stg	%r15,__LC_KERNEL_STACK(%r13)	# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1,%r3)	# load kernel stack of next
 	aghi	%r3,__TASK_pid
-	mvc	__LC_CURRENT_PID(4,%r0),0(%r3)	# store pid of next
+	mvc	__LC_CURRENT_PID(4,%r13),0(%r3)	# store pid of next
+	ALTERNATIVE "nop", "lpp _LPP_OFFSET(%r13)", ALT_FACILITY(40)
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
-	ALTERNATIVE "nop", "lpp _LPP_OFFSET", ALT_FACILITY(40)
 	BR_EX	%r14
 SYM_FUNC_END(__switch_to_asm)
 

From 9b3dcae128f8803950d646329f2301cae3fe8f4d Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:25 +0200
Subject: [PATCH 207/346] s390/entry: Make ret_from_fork() ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in ret_from_fork().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index bbdbe3c3a770..2bd9ef24ace3 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -307,12 +307,13 @@ SYM_CODE_START(ret_from_fork)
 	lgr	%r3,%r11
 	brasl	%r14,__ret_from_fork
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE
-	mvc	__LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+	GET_LC	%r13
+	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
+	mvc	__LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
 	BPON
 	LBEAR	STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+	stpt	__LC_EXIT_TIMER(%r13)
 	lmg	%r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
-	stpt	__LC_EXIT_TIMER
 	LPSWEY	__LC_RETURN_PSW,__LC_RETURN_LPSWE
 SYM_CODE_END(ret_from_fork)
 

From 361f6ec2fe203760353c708480099e0325295b21 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:26 +0200
Subject: [PATCH 208/346] s390/entry: Make system_call() ready for lowcore
 relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in system_call().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 2bd9ef24ace3..749410cfdbc0 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -264,14 +264,15 @@ EXPORT_SYMBOL(sie_exit)
  */
 
 SYM_CODE_START(system_call)
-	stpt	__LC_SYS_ENTER_TIMER
-	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
+	STMG_LC	%r8,%r15,__LC_SAVE_AREA_SYNC
+	GET_LC	%r13
+	stpt	__LC_SYS_ENTER_TIMER(%r13)
 	BPOFF
 	lghi	%r14,0
 .Lsysc_per:
-	STBEAR	__LC_LAST_BREAK
-	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	lg	%r15,__LC_KERNEL_STACK
+	STBEAR	__LC_LAST_BREAK(%r13)
+	lctlg	%c1,%c1,__LC_KERNEL_ASCE(%r13)
+	lg	%r15,__LC_KERNEL_STACK(%r13)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	stmg	%r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
 	# clear user controlled register to prevent speculative use
@@ -286,17 +287,17 @@ SYM_CODE_START(system_call)
 	xgr	%r10,%r10
 	xgr	%r11,%r11
 	la	%r2,STACK_FRAME_OVERHEAD(%r15)	# pointer to pt_regs
-	mvc	__PT_R8(64,%r2),__LC_SAVE_AREA_SYNC
-	MBEAR	%r2,%r0
+	mvc	__PT_R8(64,%r2),__LC_SAVE_AREA_SYNC(%r13)
+	MBEAR	%r2,%r13
 	lgr	%r3,%r14
 	brasl	%r14,__do_syscall
 	STACKLEAK_ERASE
-	lctlg	%c1,%c1,__LC_USER_ASCE
-	mvc	__LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
+	lctlg	%c1,%c1,__LC_USER_ASCE(%r13)
+	mvc	__LC_RETURN_PSW(16,%r13),STACK_FRAME_OVERHEAD+__PT_PSW(%r15)
 	BPON
 	LBEAR	STACK_FRAME_OVERHEAD+__PT_LAST_BREAK(%r15)
+	stpt	__LC_EXIT_TIMER(%r13)
 	lmg	%r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15)
-	stpt	__LC_EXIT_TIMER
 	LPSWEY	__LC_RETURN_PSW,__LC_RETURN_LPSWE
 SYM_CODE_END(system_call)
 

From 97cee3dd4a07413a4175e247f550a4931d39cee1 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:27 +0200
Subject: [PATCH 209/346] s390/kdump: Make kdump ready for lowcore relocation

In preparation of having lowcore at different address than zero,
add the base register to all lowcore accesses in store_status()
and __do_machine_kdump().

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/kernel/machine_kexec.c |  2 +-
 arch/s390/kernel/reipl.S         | 26 +++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index f4cf65da6d49..8f681ccfb83a 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -62,7 +62,7 @@ static void __do_machine_kdump(void *data)
 	 * This need to be done *after* s390_reset_system set the
 	 * prefix register of this CPU to zero
 	 */
-	memcpy(absolute_pointer(__LC_FPREGS_SAVE_AREA),
+	memcpy(absolute_pointer(get_lowcore()->floating_pt_save_area),
 	       phys_to_virt(prefix + __LC_FPREGS_SAVE_AREA), 512);
 
 	call_nodat(1, int, purgatory, int, 1);
diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S
index 88087a32ebc6..69fcaf54d5ca 100644
--- a/arch/s390/kernel/reipl.S
+++ b/arch/s390/kernel/reipl.S
@@ -9,6 +9,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/nospec-insn.h>
 #include <asm/sigp.h>
+#include <asm/lowcore.h>
 
 	GEN_BR_THUNK %r9
 
@@ -20,20 +21,15 @@
 # r3 = Parameter for function
 #
 SYM_CODE_START(store_status)
-	/* Save register one and load save area base */
-	stg	%r1,__LC_SAVE_AREA_RESTART
+	STMG_LC	%r0,%r15,__LC_GPREGS_SAVE_AREA
 	/* General purpose registers */
-	lghi	%r1,__LC_GPREGS_SAVE_AREA
-	stmg	%r0,%r15,0(%r1)
-	mvc	8(8,%r1),__LC_SAVE_AREA_RESTART
+	GET_LC	%r13
 	/* Control registers */
-	lghi	%r1,__LC_CREGS_SAVE_AREA
-	stctg	%c0,%c15,0(%r1)
+	stctg	%c0,%c15,__LC_CREGS_SAVE_AREA(%r13)
 	/* Access registers */
-	lghi	%r1,__LC_AREGS_SAVE_AREA
-	stam	%a0,%a15,0(%r1)
+	stamy	%a0,%a15,__LC_AREGS_SAVE_AREA(%r13)
 	/* Floating point registers */
-	lghi	%r1,__LC_FPREGS_SAVE_AREA
+	lay	%r1,__LC_FPREGS_SAVE_AREA(%r13)
 	std	%f0, 0x00(%r1)
 	std	%f1, 0x08(%r1)
 	std	%f2, 0x10(%r1)
@@ -51,21 +47,21 @@ SYM_CODE_START(store_status)
 	std	%f14,0x70(%r1)
 	std	%f15,0x78(%r1)
 	/* Floating point control register */
-	lghi	%r1,__LC_FP_CREG_SAVE_AREA
+	lay	%r1,__LC_FP_CREG_SAVE_AREA(%r13)
 	stfpc	0(%r1)
 	/* CPU timer */
-	lghi	%r1,__LC_CPU_TIMER_SAVE_AREA
+	lay	%r1,__LC_CPU_TIMER_SAVE_AREA(%r13)
 	stpt	0(%r1)
 	/* Store prefix register */
-	lghi	%r1,__LC_PREFIX_SAVE_AREA
+	lay	%r1,__LC_PREFIX_SAVE_AREA(%r13)
 	stpx	0(%r1)
 	/* Clock comparator - seven bytes */
-	lghi	%r1,__LC_CLOCK_COMP_SAVE_AREA
 	larl	%r4,clkcmp
 	stckc	0(%r4)
+	lay	%r1,__LC_CLOCK_COMP_SAVE_AREA(%r13)
 	mvc	1(7,%r1),1(%r4)
 	/* Program status word */
-	lghi	%r1,__LC_PSW_SAVE_AREA
+	lay	%r1,__LC_PSW_SAVE_AREA(%r13)
 	epsw	%r4,%r5
 	st	%r4,0(%r1)
 	st	%r5,4(%r1)

From 8f1e70adb1a3ecb982bb6c475209b080bf985074 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Mon, 22 Jul 2024 15:41:28 +0200
Subject: [PATCH 210/346] s390/boot: Add cmdline option to relocate lowcore

Now that everything has been converted, add the option
'relocate_lowcore' to enable relocating the lowcore.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/boot/ipl_parm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c
index 337c14931ccb..1773b72a6a7b 100644
--- a/arch/s390/boot/ipl_parm.c
+++ b/arch/s390/boot/ipl_parm.c
@@ -311,5 +311,7 @@ void parse_boot_command_line(void)
 				prot_virt_host = 1;
 		}
 #endif
+		if (!strcmp(param, "relocate_lowcore") && test_facility(193))
+			relocate_lowcore = 1;
 	}
 }

From 6dc2e98d5f1de162d1777aee97e59d75d70d07c5 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.ibm.com>
Date: Thu, 4 Jul 2024 11:02:46 +0000
Subject: [PATCH 211/346] s390: Remove protvirt and kvm config guards for uv
 code

Removing the CONFIG_PROTECTED_VIRTUALIZATION_GUEST ifdefs and config
option as well as CONFIG_KVM ifdefs in uv files.

Having this configurable has been more of a pain than a help.
It's time to remove the ifdefs and the config option.

Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/Kconfig                 | 11 ----------
 arch/s390/boot/Makefile           |  3 +--
 arch/s390/boot/uv.c               |  8 -------
 arch/s390/boot/uv.h               | 13 ------------
 arch/s390/configs/debug_defconfig |  1 -
 arch/s390/configs/defconfig       |  1 -
 arch/s390/include/asm/page.h      |  2 --
 arch/s390/include/asm/uv.h        | 32 ----------------------------
 arch/s390/kernel/Makefile         |  3 +--
 arch/s390/kernel/uv.c             | 35 +++++++++++++------------------
 drivers/s390/char/Kconfig         |  2 +-
 11 files changed, 18 insertions(+), 93 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 59e0d861e26f..a822f952f64a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -799,17 +799,6 @@ config HAVE_PNETID
 
 menu "Virtualization"
 
-config PROTECTED_VIRTUALIZATION_GUEST
-	def_bool n
-	prompt "Protected virtualization guest support"
-	help
-	  Select this option, if you want to be able to run this
-	  kernel as a protected virtualization KVM guest.
-	  Protected virtualization capable machines have a mini hypervisor
-	  located at machine level (an ultravisor). With help of the
-	  Ultravisor, KVM will be able to run "protected" VMs, special
-	  VMs whose memory and management data are unavailable to KVM.
-
 config PFAULT
 	def_bool y
 	prompt "Pseudo page fault support"
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index 5d8cb7e3b096..4f476884d340 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -39,8 +39,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
 obj-y	:= head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
 obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o
-obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
+obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o relocs.o alternative.o uv.o
 obj-$(CONFIG_RANDOMIZE_BASE)	+= kaslr.o
 obj-y	+= $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
 obj-$(CONFIG_KERNEL_ZSTD) += clz_ctz.o
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 1e66d2cbb096..318e6ba95bfd 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -8,12 +8,8 @@
 #include "uv.h"
 
 /* will be used in arch/s390/kernel/uv.c */
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 int __bootdata_preserved(prot_virt_guest);
-#endif
-#if IS_ENABLED(CONFIG_KVM)
 int __bootdata_preserved(prot_virt_host);
-#endif
 struct uv_info __bootdata_preserved(uv_info);
 
 void uv_query_info(void)
@@ -53,14 +49,11 @@ void uv_query_info(void)
 		uv_info.max_secrets = uvcb.max_secrets;
 	}
 
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 	if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) &&
 	    test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list))
 		prot_virt_guest = 1;
-#endif
 }
 
-#if IS_ENABLED(CONFIG_KVM)
 unsigned long adjust_to_uv_max(unsigned long limit)
 {
 	if (is_prot_virt_host() && uv_info.max_sec_stor_addr)
@@ -92,4 +85,3 @@ void sanitize_prot_virt_host(void)
 {
 	prot_virt_host = is_prot_virt_host_capable();
 }
-#endif
diff --git a/arch/s390/boot/uv.h b/arch/s390/boot/uv.h
index 0f3070856f8d..da4a4a8d48e0 100644
--- a/arch/s390/boot/uv.h
+++ b/arch/s390/boot/uv.h
@@ -2,21 +2,8 @@
 #ifndef BOOT_UV_H
 #define BOOT_UV_H
 
-#if IS_ENABLED(CONFIG_KVM)
 unsigned long adjust_to_uv_max(unsigned long limit);
 void sanitize_prot_virt_host(void);
-#else
-static inline unsigned long adjust_to_uv_max(unsigned long limit)
-{
-	return limit;
-}
-static inline void sanitize_prot_virt_host(void) {}
-#endif
-
-#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
 void uv_query_info(void);
-#else
-static inline void uv_query_info(void) {}
-#endif
 
 #endif /* BOOT_UV_H */
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index f3602414a961..ea63a7342f5f 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -55,7 +55,6 @@ CONFIG_EXPOLINE_AUTO=y
 CONFIG_CHSC_SCH=y
 CONFIG_VFIO_CCW=m
 CONFIG_VFIO_AP=m
-CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_S390_HYPFS_FS=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index d0d8925fdf09..d8b28ff8ff45 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -53,7 +53,6 @@ CONFIG_EXPOLINE_AUTO=y
 CONFIG_CHSC_SCH=y
 CONFIG_VFIO_CCW=m
 CONFIG_VFIO_AP=m
-CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_S390_HYPFS_FS=y
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 5ec41ec3d761..06416b3f94f5 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -174,12 +174,10 @@ static inline int devmem_is_allowed(unsigned long pfn)
 #define HAVE_ARCH_FREE_PAGE
 #define HAVE_ARCH_ALLOC_PAGE
 
-#if IS_ENABLED(CONFIG_PGSTE)
 int arch_make_folio_accessible(struct folio *folio);
 #define HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
 int arch_make_page_accessible(struct page *page);
 #define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-#endif
 
 struct vm_layout {
 	unsigned long kaslr_offset;
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 0679445cac0b..0b5f8f3e84f1 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -414,7 +414,6 @@ static inline bool uv_has_feature(u8 feature_bit)
 	return test_bit_inv(feature_bit, &uv_info.uv_feature_indications);
 }
 
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 extern int prot_virt_guest;
 
 static inline int is_prot_virt_guest(void)
@@ -466,13 +465,6 @@ static inline int uv_remove_shared(unsigned long addr)
 	return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
 }
 
-#else
-#define is_prot_virt_guest() 0
-static inline int uv_set_shared(unsigned long addr) { return 0; }
-static inline int uv_remove_shared(unsigned long addr) { return 0; }
-#endif
-
-#if IS_ENABLED(CONFIG_KVM)
 extern int prot_virt_host;
 
 static inline int is_prot_virt_host(void)
@@ -489,29 +481,5 @@ int uv_convert_from_secure_pte(pte_t pte);
 int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
 
 void setup_uv(void);
-#else
-#define is_prot_virt_host() 0
-static inline void setup_uv(void) {}
-
-static inline int uv_pin_shared(unsigned long paddr)
-{
-	return 0;
-}
-
-static inline int uv_destroy_folio(struct folio *folio)
-{
-	return 0;
-}
-
-static inline int uv_destroy_pte(pte_t pte)
-{
-	return 0;
-}
-
-static inline int uv_convert_from_secure_pte(pte_t pte)
-{
-	return 0;
-}
-#endif
 
 #endif /* _ASM_S390_UV_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 7241fa194709..e47a4be54ff8 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -43,7 +43,7 @@ obj-y	+= sysinfo.o lgr.o os_info.o ctlreg.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y	+= smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o
+obj-y	+= smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o
 
 extra-y				+= vmlinux.lds
 
@@ -80,7 +80,6 @@ obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o perf_regs.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_pai_crypto.o perf_pai_ext.o
 
 obj-$(CONFIG_TRACEPOINTS)	+= trace.o
-obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
 
 # vdso
 obj-y				+= vdso64/
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index fa62fa0e369f..36db065c7cf7 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -18,11 +18,22 @@
 #include <asm/sections.h>
 #include <asm/uv.h>
 
+#if !IS_ENABLED(CONFIG_KVM)
+unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
+{
+	return 0;
+}
+
+int gmap_fault(struct gmap *gmap, unsigned long gaddr,
+	       unsigned int fault_flags)
+{
+	return 0;
+}
+#endif
+
 /* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
 int __bootdata_preserved(prot_virt_guest);
 EXPORT_SYMBOL(prot_virt_guest);
-#endif
 
 /*
  * uv_info contains both host and guest information but it's currently only
@@ -35,7 +46,6 @@ EXPORT_SYMBOL(prot_virt_guest);
 struct uv_info __bootdata_preserved(uv_info);
 EXPORT_SYMBOL(uv_info);
 
-#if IS_ENABLED(CONFIG_KVM)
 int __bootdata_preserved(prot_virt_host);
 EXPORT_SYMBOL(prot_virt_host);
 
@@ -543,9 +553,6 @@ int arch_make_page_accessible(struct page *page)
 	return arch_make_folio_accessible(page_folio(page));
 }
 EXPORT_SYMBOL_GPL(arch_make_page_accessible);
-#endif
-
-#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM)
 static ssize_t uv_query_facilities(struct kobject *kobj,
 				   struct kobj_attribute *attr, char *buf)
 {
@@ -721,24 +728,13 @@ static struct attribute_group uv_query_attr_group = {
 static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
 				     struct kobj_attribute *attr, char *buf)
 {
-	int val = 0;
-
-#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
-	val = prot_virt_guest;
-#endif
-	return sysfs_emit(buf, "%d\n", val);
+	return sysfs_emit(buf, "%d\n", prot_virt_guest);
 }
 
 static ssize_t uv_is_prot_virt_host(struct kobject *kobj,
 				    struct kobj_attribute *attr, char *buf)
 {
-	int val = 0;
-
-#if IS_ENABLED(CONFIG_KVM)
-	val = prot_virt_host;
-#endif
-
-	return sysfs_emit(buf, "%d\n", val);
+	return sysfs_emit(buf, "%d\n", prot_virt_host);
 }
 
 static struct kobj_attribute uv_prot_virt_guest =
@@ -790,4 +786,3 @@ static int __init uv_info_init(void)
 	return rc;
 }
 device_initcall(uv_info_init);
-#endif
diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig
index 8a03af5ee5b3..80c4e5101c97 100644
--- a/drivers/s390/char/Kconfig
+++ b/drivers/s390/char/Kconfig
@@ -96,7 +96,7 @@ config SCLP_OFB
 config S390_UV_UAPI
 	def_tristate m
 	prompt "Ultravisor userspace API"
-	depends on S390 && (KVM || PROTECTED_VIRTUALIZATION_GUEST)
+	depends on S390
 	help
 	  Selecting exposes parts of the UV interface to userspace
 	  by providing a misc character device at /dev/uv.

From 62e2397c2203d41c8ebe2cca0c728293ff2af313 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 16 Jul 2024 04:28:17 +0900
Subject: [PATCH 212/346] arm64: remove redundant 'if HAVE_ARCH_KASAN' in
 Kconfig

Since commit 0383808e4d99 ("arm64: kasan: Reduce minimum shadow
alignment and enable 5 level paging"), HAVE_ARCH_KASAN is always 'y'.

The condition 'if HAVE_ARCH_KASAN' is always met.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20240715192843.2201439-1-masahiroy@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 167e51067508..5776bf3acd69 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -168,9 +168,9 @@ config ARM64
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN
-	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
-	select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN
-	select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE)
+	select HAVE_ARCH_KASAN_VMALLOC
+	select HAVE_ARCH_KASAN_SW_TAGS
+	select HAVE_ARCH_KASAN_HW_TAGS if ARM64_MTE
 	# Some instrumentation may be unsound, hence EXPERT
 	select HAVE_ARCH_KCSAN if EXPERT
 	select HAVE_ARCH_KFENCE

From add6128fc7f0d24199e977d13ca724f011c03fc2 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 16 Jul 2024 10:39:15 +0530
Subject: [PATCH 213/346] arm64/Kconfig: Remove redundant 'if
 HAVE_FUNCTION_GRAPH_TRACER'

Since the commit 819e50e25d0c ("arm64: Add ftrace support"),
HAVE_FUNCTION_GRAPH_TRACER has always been enabled. Although a subsequent
commit 364697032246 ("arm64: ftrace: Enable HAVE_FUNCTION_GRAPH_RETVAL")
redundantly added check on HAVE_FUNCTION_GRAPH_TRACER, while enabling the
config HAVE_FUNCTION_GRAPH_RETVAL. Let's just drop this redundant check.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20240716050915.2657694-1-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5776bf3acd69..9823bf1ac018 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -211,8 +211,8 @@ config ARM64
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_ERROR_INJECTION
-	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_GRAPH_RETVAL
 	select HAVE_GCC_PLUGINS
 	select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && \
 		HW_PERF_EVENTS && HAVE_PERF_EVENTS_NMI

From 0c35e3bd412a6a2676adc35df950cfbb9f464b0c Mon Sep 17 00:00:00 2001
From: Remington Brasga <rbrasga@uci.edu>
Date: Fri, 12 Jul 2024 23:17:30 +0000
Subject: [PATCH 214/346] kselftest: missing arg in ptrace.c

The string passed to ksft_test_result_skip is missing the `type_name`

Signed-off-by: Remington Brasga <rbrasga@uci.edu>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20240712231730.2794-1-rbrasga@uci.edu
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/testing/selftests/arm64/abi/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/abi/ptrace.c b/tools/testing/selftests/arm64/abi/ptrace.c
index 4c941270d8de..e4fa507cbdd0 100644
--- a/tools/testing/selftests/arm64/abi/ptrace.c
+++ b/tools/testing/selftests/arm64/abi/ptrace.c
@@ -156,7 +156,7 @@ static void test_hw_debug(pid_t child, int type, const char *type_name)
 		/* Zero is not currently architecturally valid */
 		ksft_test_result(arch, "%s_arch_set\n", type_name);
 	} else {
-		ksft_test_result_skip("%s_arch_set\n");
+		ksft_test_result_skip("%s_arch_set\n", type_name);
 	}
 }
 

From 48f6430505c0b0498ee9020ce3cf9558b1caaaeb Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Thu, 18 Jul 2024 10:34:23 -0700
Subject: [PATCH 215/346] arm64/vdso: Remove --hash-style=sysv

glibc added support for .gnu.hash in 2006 and .hash has been obsoleted
for more than one decade in many Linux distributions.  Using
--hash-style=sysv might imply unaddressed issues and confuse readers.

Just drop the option and rely on the linker default, which is likely
"both", or "gnu" when the distribution really wants to eliminate sysv
hash overhead.

Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash").

Signed-off-by: Fangrui Song <maskray@google.com>
Link: https://lore.kernel.org/r/20240718173423.1574395-1-maskray@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/vdso/Makefile   | 2 +-
 arch/arm64/kernel/vdso32/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index d63930c82839..d11da6461278 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -21,7 +21,7 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti
 # potential future proofing if we end up with internal calls to the exported
 # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so
 # preparation in build-time C")).
-ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv	\
+ldflags-y := -shared -soname=linux-vdso.so.1 \
 	     -Bsymbolic --build-id=sha1 -n $(btildflags-y)
 
 ifdef CONFIG_LD_ORPHAN_WARN
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index cc4508c604b2..25a2cb6317f3 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -98,7 +98,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__
 # From arm vDSO Makefile
 VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1
 VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096
-VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1
+VDSO_LDFLAGS += -shared --build-id=sha1
 VDSO_LDFLAGS += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
 
 

From f3dfcd25455b1cbb3c7e2d19b0a06acc6c7472a5 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta <rananta@google.com>
Date: Thu, 18 Jul 2024 21:55:32 +0000
Subject: [PATCH 216/346] arm64/sysreg: Correct the values for GICv4.1

Currently, sysreg has value as 0b0010 for the presence of GICv4.1 in
ID_PFR1_EL1 and ID_AA64PFR0_EL1, instead of 0b0011 as per ARM ARM.
Hence, correct them to reflect ARM ARM.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20240718215532.616447-1-rananta@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/tools/sysreg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index a4c1dd4741a4..7ceaa1e0b4bc 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -149,7 +149,7 @@ Res0	63:32
 UnsignedEnum	31:28	GIC
 	0b0000	NI
 	0b0001	GICv3
-	0b0010	GICv4p1
+	0b0011	GICv4p1
 EndEnum
 UnsignedEnum	27:24	Virt_frac
 	0b0000	NI
@@ -903,7 +903,7 @@ EndEnum
 UnsignedEnum	27:24	GIC
 	0b0000	NI
 	0b0001	IMP
-	0b0010	V4P1
+	0b0011	V4P1
 EndEnum
 SignedEnum	23:20	AdvSIMD
 	0b0000	IMP

From 193cc89ea0ca1da311877d2b4bb5e9f03bcc82a2 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Sun, 21 Jul 2024 15:45:56 -0500
Subject: [PATCH 217/346] cifs: fix potential null pointer use in
 destroy_workqueue in init_cifs error path

Dan Carpenter reported a Smack static checker warning:
   fs/smb/client/cifsfs.c:1981 init_cifs()
   error: we previously assumed 'serverclose_wq' could be null (see line 1895)

The patch which introduced the serverclose workqueue used the wrong
oredering in error paths in init_cifs() for freeing it on errors.

Fixes: 173217bd7336 ("smb3: retrying on failed server close")
Cc: stable@vger.kernel.org
Cc: Ritvik Budhiraja <rbudhiraja@microsoft.com>
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: David Howells <dhowell@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/cifsfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index c92937bed133..2c4b357d85e2 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1894,12 +1894,12 @@ init_cifs(void)
 					   WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 	if (!serverclose_wq) {
 		rc = -ENOMEM;
-		goto out_destroy_serverclose_wq;
+		goto out_destroy_deferredclose_wq;
 	}
 
 	rc = cifs_init_inodecache();
 	if (rc)
-		goto out_destroy_deferredclose_wq;
+		goto out_destroy_serverclose_wq;
 
 	rc = cifs_init_netfs();
 	if (rc)
@@ -1967,6 +1967,8 @@ init_cifs(void)
 	cifs_destroy_netfs();
 out_destroy_inodecache:
 	cifs_destroy_inodecache();
+out_destroy_serverclose_wq:
+	destroy_workqueue(serverclose_wq);
 out_destroy_deferredclose_wq:
 	destroy_workqueue(deferredclose_wq);
 out_destroy_cifsoplockd_wq:
@@ -1977,8 +1979,6 @@ init_cifs(void)
 	destroy_workqueue(decrypt_wq);
 out_destroy_cifsiod_wq:
 	destroy_workqueue(cifsiod_wq);
-out_destroy_serverclose_wq:
-	destroy_workqueue(serverclose_wq);
 out_clean_proc:
 	cifs_proc_clean();
 	return rc;

From 630482ee0653decf9e2482ac6181897eb6cde5b8 Mon Sep 17 00:00:00 2001
From: Artem Chernyshev <artem.chernyshev@red-soft.ru>
Date: Tue, 16 Jul 2024 15:55:14 +0300
Subject: [PATCH 218/346] iommu: sprd: Avoid NULL deref in sprd_iommu_hw_en

In sprd_iommu_cleanup() before calling function sprd_iommu_hw_en()
dom->sdev is equal to NULL, which leads to null dereference.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: 9afea57384d4 ("iommu/sprd: Release dma buffer to avoid memory leak")
Signed-off-by: Artem Chernyshev <artem.chernyshev@red-soft.ru>
Reviewed-by: Chunyan Zhang <zhang.lyra@gmail.com>
Link: https://lore.kernel.org/r/20240716125522.3690358-1-artem.chernyshev@red-soft.ru
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/sprd-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c
index ba53571a8239..a2f4ffe6d949 100644
--- a/drivers/iommu/sprd-iommu.c
+++ b/drivers/iommu/sprd-iommu.c
@@ -232,8 +232,8 @@ static void sprd_iommu_cleanup(struct sprd_iommu_domain *dom)
 
 	pgt_size = sprd_iommu_pgt_size(&dom->domain);
 	dma_free_coherent(dom->sdev->dev, pgt_size, dom->pgt_va, dom->pgt_pa);
-	dom->sdev = NULL;
 	sprd_iommu_hw_en(dom->sdev, false);
+	dom->sdev = NULL;
 }
 
 static void sprd_iommu_domain_free(struct iommu_domain *domain)

From 86c5eac3c4c4a2ee124d202af9a141bd0457ee68 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Tue, 16 Jul 2024 15:25:45 +0800
Subject: [PATCH 219/346] iommu/amd: Convert comma to semicolon

Replace a comma between expression statements by a semicolon.

Fixes: c9b258c6be09 ("iommu/amd: Prepare for generic IO page table framework")
Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Link: https://lore.kernel.org/r/20240716072545.968690-1-nichen@iscas.ac.cn
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/amd/io_pgtable.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 9d9a7fde59e7..1074ee25064d 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -588,9 +588,9 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
 {
 	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);
 
-	cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES,
-	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE,
-	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE,
+	cfg->pgsize_bitmap  = AMD_IOMMU_PGSIZES;
+	cfg->ias            = IOMMU_IN_ADDR_BIT_SIZE;
+	cfg->oas            = IOMMU_OUT_ADDR_BIT_SIZE;
 	cfg->tlb            = &v1_flush_ops;
 
 	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;

From f99b3feb3b0e9fca2257c90fc8317be8ee44c19a Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Mon, 22 Jul 2024 08:33:09 +0200
Subject: [PATCH 220/346] clk: samsung: fix getting Exynos4 fin_pll rate from
 external clocks

Commit 0dc83ad8bfc9 ("clk: samsung: Don't register clkdev lookup for the
fixed rate clocks") claimed registering clkdev lookup is not necessary
anymore, but that was not entirely true: Exynos4210/4212/4412 clock code
still relied on it to get the clock rate of xxti or xusbxti external
clocks.

Drop that requirement by accessing already registered clk_hw when
looking up the xxti/xusbxti rate.

Reported-by: Artur Weber <aweber.kernel@gmail.com>
Closes: https://lore.kernel.org/all/6227c1fb-d769-462a-b79b-abcc15d3db8e@gmail.com/
Fixes: 0dc83ad8bfc9 ("clk: samsung: Don't register clkdev lookup for the fixed rate clocks")
Cc: <stable@vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240722063309.60054-1-krzysztof.kozlowski@linaro.org
Tested-by: Artur Weber <aweber.kernel@gmail.com> # Exynos4212
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/samsung/clk-exynos4.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c
index a026ccca7315..28945b6b0ee1 100644
--- a/drivers/clk/samsung/clk-exynos4.c
+++ b/drivers/clk/samsung/clk-exynos4.c
@@ -1040,19 +1040,20 @@ static unsigned long __init exynos4_get_xom(void)
 static void __init exynos4_clk_register_finpll(struct samsung_clk_provider *ctx)
 {
 	struct samsung_fixed_rate_clock fclk;
-	struct clk *clk;
-	unsigned long finpll_f = 24000000;
+	unsigned long finpll_f;
+	unsigned int parent;
 	char *parent_name;
 	unsigned int xom = exynos4_get_xom();
 
 	parent_name = xom & 1 ? "xusbxti" : "xxti";
-	clk = clk_get(NULL, parent_name);
-	if (IS_ERR(clk)) {
+	parent = xom & 1 ? CLK_XUSBXTI : CLK_XXTI;
+
+	finpll_f = clk_hw_get_rate(ctx->clk_data.hws[parent]);
+	if (!finpll_f) {
 		pr_err("%s: failed to lookup parent clock %s, assuming "
 			"fin_pll clock frequency is 24MHz\n", __func__,
 			parent_name);
-	} else {
-		finpll_f = clk_get_rate(clk);
+		finpll_f = 24000000;
 	}
 
 	fclk.id = CLK_FIN_PLL;

From a214384ce26b6111ea8c8d58fa82a1ca63996c38 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Mon, 22 Jul 2024 23:40:08 -0500
Subject: [PATCH 221/346] cifs: fix reconnect with SMB1 UNIX Extensions

When mounting with the SMB1 Unix Extensions (e.g. mounts
to Samba with vers=1.0), reconnects no longer reset the
Unix Extensions (SetFSInfo SET_FILE_UNIX_BASIC) after tcon so most
operations (e.g. stat, ls, open, statfs) will fail continuously
with:
        "Operation not supported"
if the connection ever resets (e.g. due to brief network disconnect)

Cc: stable@vger.kernel.org
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/connect.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 7a16e12f5da8..89d9f86cc29a 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -3686,6 +3686,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
 }
 #endif
 
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
 /*
  * Issue a TREE_CONNECT request.
  */
@@ -3807,11 +3808,25 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
 		else
 			tcon->Flags = 0;
 		cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags);
-	}
 
+		/*
+		 * reset_cifs_unix_caps calls QFSInfo which requires
+		 * need_reconnect to be false, but we would not need to call
+		 * reset_caps if this were not a reconnect case so must check
+		 * need_reconnect flag here.  The caller will also clear
+		 * need_reconnect when tcon was successful but needed to be
+		 * cleared earlier in the case of unix extensions reconnect
+		 */
+		if (tcon->need_reconnect && tcon->unix_ext) {
+			cifs_dbg(FYI, "resetting caps for %s\n", tcon->tree_name);
+			tcon->need_reconnect = false;
+			reset_cifs_unix_caps(xid, tcon, NULL, NULL);
+		}
+	}
 	cifs_buf_release(smb_buffer);
 	return rc;
 }
+#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
 
 static void delayed_free(struct rcu_head *p)
 {

From 0e314e452687ce0ec5874e42cdb993a34325d3d2 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Tue, 23 Jul 2024 00:44:48 -0500
Subject: [PATCH 222/346] cifs: mount with "unix" mount option for SMB1
 incorrectly handled

Although by default we negotiate CIFS Unix Extensions for SMB1 mounts to
Samba (and they work if the user does not specify "unix" or "posix" or
"linux" on mount), and we do properly handle when a user turns them off
with "nounix" mount parm.  But with the changes to the mount API we
broke cases where the user explicitly specifies the "unix" option (or
equivalently "linux" or "posix") on mount with vers=1.0 to Samba or other
servers which support the CIFS Unix Extensions.

 "mount error(95): Operation not supported"

and logged:

 "CIFS: VFS: Check vers= mount option. SMB3.11 disabled but required for POSIX extensions"

even though CIFS Unix Extensions are supported for vers=1.0  This patch fixes
the case where the user specifies both "unix" (or equivalently "posix" or
"linux") and "vers=1.0" on mount to a server which supports the
CIFS Unix Extensions.

Cc: stable@vger.kernel.org
Reviewed-by: David Howells <dhowell@redhat.com>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/connect.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 89d9f86cc29a..d2307162a2de 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -2614,6 +2614,13 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
 			cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n");
 			rc = -EOPNOTSUPP;
 			goto out_fail;
+		} else if (ses->server->vals->protocol_id == SMB10_PROT_ID)
+			if (cap_unix(ses))
+				cifs_dbg(FYI, "Unix Extensions requested on SMB1 mount\n");
+			else {
+				cifs_dbg(VFS, "SMB1 Unix Extensions not supported by server\n");
+				rc = -EOPNOTSUPP;
+				goto out_fail;
 		} else {
 			cifs_dbg(VFS,
 				"Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n");

From ba6c664081afd18da86ac49cb22ceb266f89a561 Mon Sep 17 00:00:00 2001
From: Petr Vorel <pvorel@suse.cz>
Date: Wed, 24 Jul 2024 10:46:55 +0200
Subject: [PATCH 223/346] kbuild: rpm-pkg: Fix C locale setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

semicolon separation in LC_ALL is wrong. Either variable needs to be
exported before as a separate commit or set as part of the commit in the
beginning. Used second variant.

This fixes broken build on user's locale setup which makes 'date' binary
to produce invalid characters in rpm changelog (e.g. cs_CZ.UTF-8 'čec'):

$ make binrpm-pkg
  GEN     rpmbuild/SPECS/kernel.spec
rpmbuild -bb rpmbuild/SPECS/kernel.spec --define='_topdirlinux/rpmbuild' \
    --target x86_64-linux --build-in-place --noprep --define='_smp_mflags \
    %{nil}' $(rpm -q rpm >/dev/null 2>&1 || echo --nodeps)
Building target platforms: x86_64-linux
Building for target x86_64-linux
error: bad date in %changelog: St čec 24 2024 user <user@somehost>
make[2]: *** [scripts/Makefile.package:71: binrpm-pkg] Error 1
make[1]: *** [linux/Makefile:1546: binrpm-pkg] Error 2
make: *** [Makefile:224: __sub-make] Error 2

Fixes: 301c10908e42 ("kbuild: rpm-pkg: introduce a simple changelog section for kernel.spec")
Signed-off-by: Petr Vorel <pvorel@suse.cz>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 scripts/package/mkspec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/package/mkspec b/scripts/package/mkspec
index ead54d67a024..4dc1466dfc81 100755
--- a/scripts/package/mkspec
+++ b/scripts/package/mkspec
@@ -50,6 +50,6 @@ fi
 cat << EOF
 
 %changelog
-* $(LC_ALL=C; date +'%a %b %d %Y') ${name} <${email}>
+* $(LC_ALL=C date +'%a %b %d %Y') ${name} <${email}>
 - Custom built Linux kernel.
 EOF

From ae67ed9010a7b52933ad1038d13df8a3aae34b83 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Wed, 24 Jul 2024 11:19:31 +0300
Subject: [PATCH 224/346] ASoC: SOF: ipc4-topology: Only handle dai_config with
 HW_PARAMS for ChainDMA

The DMA Link ID is only valid in snd_sof_dai_config_data when the
dai_config is called with HW_PARAMS.

The commit that this patch fixes is actually moved a code section without
changing it, the same bug exists in the original code, needing different
patch to kernel prior to 6.9 kernels.

Cc: stable@vger.kernel.org
Fixes: 3858464de57b ("ASoC: SOF: ipc4-topology: change chain_dma handling in dai_config")
Link: https://github.com/thesofproject/linux/issues/5116
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://patch.msgid.link/20240724081932.24542-2-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sof/ipc4-topology.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 90f6856ee80c..4a4234d5c941 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -3095,8 +3095,14 @@ static int sof_ipc4_dai_config(struct snd_sof_dev *sdev, struct snd_sof_widget *
 		return 0;
 
 	if (pipeline->use_chain_dma) {
-		pipeline->msg.primary &= ~SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK;
-		pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(data->dai_data);
+		/*
+		 * Only configure the DMA Link ID for ChainDMA when this op is
+		 * invoked with SOF_DAI_CONFIG_FLAGS_HW_PARAMS
+		 */
+		if (flags & SOF_DAI_CONFIG_FLAGS_HW_PARAMS) {
+			pipeline->msg.primary &= ~SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK;
+			pipeline->msg.primary |= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID(data->dai_data);
+		}
 		return 0;
 	}
 

From e6fc5fcaeffa04a3fa1db8dfccdfd4b6001c0446 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Wed, 24 Jul 2024 11:19:32 +0300
Subject: [PATCH 225/346] ASoC: SOF: ipc4-topology: Preserve the DMA Link ID
 for ChainDMA on unprepare

The DMA Link ID is set to the IPC message's primary during dai_config,
which is only during hw_params.
During xrun handling the hw_params is not called and the DMA Link ID
information will be lost.

All other fields in the message expected to be 0 for re-configuration, only
the DMA Link ID needs to be preserved and the in case of repeated
dai_config, it is correctly updated (masked and then set).

Cc: stable@vger.kernel.org
Fixes: ca5ce0caa67f ("ASoC: SOF: ipc4/intel: Add support for chained DMA")
Link: https://github.com/thesofproject/linux/issues/5116
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://patch.msgid.link/20240724081932.24542-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/sof/ipc4-topology.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sound/soc/sof/ipc4-topology.c b/sound/soc/sof/ipc4-topology.c
index 4a4234d5c941..87be7f16e8c2 100644
--- a/sound/soc/sof/ipc4-topology.c
+++ b/sound/soc/sof/ipc4-topology.c
@@ -1358,7 +1358,13 @@ static void sof_ipc4_unprepare_copier_module(struct snd_sof_widget *swidget)
 		ipc4_copier = dai->private;
 
 		if (pipeline->use_chain_dma) {
-			pipeline->msg.primary = 0;
+			/*
+			 * Preserve the DMA Link ID and clear other bits since
+			 * the DMA Link ID is only configured once during
+			 * dai_config, other fields are expected to be 0 for
+			 * re-configuration
+			 */
+			pipeline->msg.primary &= SOF_IPC4_GLB_CHAIN_DMA_LINK_ID_MASK;
 			pipeline->msg.extension = 0;
 		}
 

From f7c1b0e4ae47e67c6f9af84568a5f4a80638ccd8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 18 Jul 2024 21:01:14 +0200
Subject: [PATCH 226/346] thermal: core: Back off when polling thermal zones on
 errors

Commit a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone
temperature is invalid") introduced a polling mechanism by which the
thermal core attampts to get a valid temperature value for thermal zones
where the .get_temp() callback returns errors to start with (for
example, due to initialization ordering woes).  However, this polling is
carried out periodically ad infinitum and every iteration of it causes
a message to be printed to the kernel log which means a lot of log noise
on systems where there are thermal zones that never get ready for some
reason.  It is also not really useful to continuously poll thermal zones
that never respond.

To address this, modify the thermal core to increase the delay between
consecutive thermal zone temperature checks after every check that fails
until it reaches a certain maximum value.  At that point, the thermal
zone in question will be disabled, but user space will be able to
reenable it if it believes that the failure is transient.

Also change the code to print messages regarding failed temperature
checks to the kernel log only twice, once when the thermal zone's
.get_temp() callback returns an error for the first time and once when
disabling the given thermal zone.  In addition, a dev_crit() message
will be printed at that point if the given thermal zone contains a
critical trip point to notify the system operator about the situation.

Fixes: a8a261774466 ("thermal: core: Call monitor_thermal_zone() if zone temperature is invalid")
Link: https://lore.kernel.org/linux-acpi/CAGnHSE=RyPK++UG0-wAtVKgeJxe0uzFYgLxm+RUOKKoQquW=Ow@mail.gmail.com/
Reported-by: Tom Yan <tom.ty89@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://patch.msgid.link/2962033.e9J7NaK4W3@rjwysocki.net
---
 drivers/thermal/thermal_core.c | 58 +++++++++++++++++++++++++++++++---
 drivers/thermal/thermal_core.h | 10 ++++--
 2 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 48bf267d090d..95c399f94744 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -288,6 +288,28 @@ static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz,
 	return 0;
 }
 
+static void thermal_zone_broken_disable(struct thermal_zone_device *tz)
+{
+	struct thermal_trip_desc *td;
+
+	dev_err(&tz->device, "Unable to get temperature, disabling!\n");
+	/*
+	 * This function only runs for enabled thermal zones, so no need to
+	 * check for the current mode.
+	 */
+	__thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED);
+	thermal_notify_tz_disable(tz);
+
+	for_each_trip_desc(tz, td) {
+		if (td->trip.type == THERMAL_TRIP_CRITICAL &&
+		    td->trip.temperature > THERMAL_TEMP_INVALID) {
+			dev_crit(&tz->device,
+				 "Disabled thermal zone with critical trip point\n");
+			return;
+		}
+	}
+}
+
 /*
  * Zone update section: main control loop applied to each zone while monitoring
  * in polling mode. The monitoring is done using a workqueue.
@@ -308,6 +330,34 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
 		cancel_delayed_work(&tz->poll_queue);
 }
 
+static void thermal_zone_recheck(struct thermal_zone_device *tz, int error)
+{
+	if (error == -EAGAIN) {
+		thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY);
+		return;
+	}
+
+	/*
+	 * Print the message once to reduce log noise.  It will be followed by
+	 * another one if the temperature cannot be determined after multiple
+	 * attempts.
+	 */
+	if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY)
+		dev_info(&tz->device, "Temperature check failed (%d)\n", error);
+
+	thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies);
+
+	tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL);
+	if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) {
+		thermal_zone_broken_disable(tz);
+		/*
+		 * Restore the original recheck delay value to allow the thermal
+		 * zone to try to recover when it is reenabled by user space.
+		 */
+		tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
+	}
+}
+
 static void monitor_thermal_zone(struct thermal_zone_device *tz)
 {
 	if (tz->mode != THERMAL_DEVICE_ENABLED)
@@ -507,10 +557,7 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 
 	ret = __thermal_zone_get_temp(tz, &temp);
 	if (ret) {
-		if (ret != -EAGAIN)
-			dev_info(&tz->device, "Temperature check failed (%d)\n", ret);
-
-		thermal_zone_device_set_polling(tz, msecs_to_jiffies(THERMAL_RECHECK_DELAY_MS));
+		thermal_zone_recheck(tz, ret);
 		return;
 	} else if (temp <= THERMAL_TEMP_INVALID) {
 		/*
@@ -522,6 +569,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 		goto monitor;
 	}
 
+	tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
+
 	tz->last_temperature = tz->temperature;
 	tz->temperature = temp;
 
@@ -1462,6 +1511,7 @@ thermal_zone_device_register_with_trips(const char *type,
 
 	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
 	thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
+	tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
 
 	/* sys I/F */
 	/* Add nodes that are always present via .groups */
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index ba8e6fc807ca..4cf2b7230d04 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -67,6 +67,8 @@ struct thermal_governor {
  * @polling_delay_jiffies: number of jiffies to wait between polls when
  *			checking whether trip points have been crossed (0 for
  *			interrupt driven systems)
+ * @recheck_delay_jiffies: delay after a failed attempt to determine the zone
+ * 			temperature before trying again
  * @temperature:	current temperature.  This is only for core code,
  *			drivers should use thermal_zone_get_temp() to get the
  *			current temperature
@@ -108,6 +110,7 @@ struct thermal_zone_device {
 	int num_trips;
 	unsigned long passive_delay_jiffies;
 	unsigned long polling_delay_jiffies;
+	unsigned long recheck_delay_jiffies;
 	int temperature;
 	int last_temperature;
 	int emul_temperature;
@@ -137,10 +140,11 @@ struct thermal_zone_device {
 #define THERMAL_TEMP_INIT	INT_MIN
 
 /*
- * Default delay after a failing thermal zone temperature check before
- * attempting to check it again.
+ * Default and maximum delay after a failed thermal zone temperature check
+ * before attempting to check it again (in jiffies).
  */
-#define THERMAL_RECHECK_DELAY_MS	250
+#define THERMAL_RECHECK_DELAY		msecs_to_jiffies(250)
+#define THERMAL_MAX_RECHECK_DELAY	(120 * HZ)
 
 /* Default Thermal Governor */
 #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)

From 3308172276db5d070975d9a321283fcd423ce465 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie@huawei.com>
Date: Thu, 13 Jun 2024 19:13:47 +0800
Subject: [PATCH 227/346] trace: riscv: Remove deprecated kprobe on ftrace
 support

Since commit 7caa9765465f60 ("ftrace: riscv: move from REGS to ARGS"),
kprobe on ftrace is not supported by riscv, because riscv's support for
FTRACE_WITH_REGS has been replaced with support for FTRACE_WITH_ARGS, and
KPROBES_ON_FTRACE will be supplanted by FPROBES. So remove the deprecated
kprobe on ftrace support, which is misunderstood.

Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20240613111347.1745379-1-ruanjinjie@huawei.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig                |  1 -
 arch/riscv/kernel/probes/Makefile |  1 -
 arch/riscv/kernel/probes/ftrace.c | 65 -------------------------------
 3 files changed, 67 deletions(-)
 delete mode 100644 arch/riscv/kernel/probes/ftrace.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 45eafb3306e9..34ae39ecb898 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -154,7 +154,6 @@ config RISCV
 	select HAVE_KERNEL_UNCOMPRESSED if !XIP_KERNEL && !EFI_ZBOOT
 	select HAVE_KERNEL_ZSTD if !XIP_KERNEL && !EFI_ZBOOT
 	select HAVE_KPROBES if !XIP_KERNEL
-	select HAVE_KPROBES_ON_FTRACE if !XIP_KERNEL
 	select HAVE_KRETPROBES if !XIP_KERNEL
 	# https://github.com/ClangBuiltLinux/linux/issues/1881
 	select HAVE_LD_DEAD_CODE_DATA_ELIMINATION if !LD_IS_LLD
diff --git a/arch/riscv/kernel/probes/Makefile b/arch/riscv/kernel/probes/Makefile
index 8265ff497977..d2129f2c61b8 100644
--- a/arch/riscv/kernel/probes/Makefile
+++ b/arch/riscv/kernel/probes/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_KPROBES)		+= kprobes.o decode-insn.o simulate-insn.o
 obj-$(CONFIG_RETHOOK)		+= rethook.o rethook_trampoline.o
-obj-$(CONFIG_KPROBES_ON_FTRACE)	+= ftrace.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o decode-insn.o simulate-insn.o
 CFLAGS_REMOVE_simulate-insn.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_rethook.o = $(CC_FLAGS_FTRACE)
diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c
deleted file mode 100644
index a69dfa610aa8..000000000000
--- a/arch/riscv/kernel/probes/ftrace.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/kprobes.h>
-
-/* Ftrace callback handler for kprobes -- called under preepmt disabled */
-void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
-			   struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
-	struct kprobe *p;
-	struct pt_regs *regs;
-	struct kprobe_ctlblk *kcb;
-	int bit;
-
-	if (unlikely(kprobe_ftrace_disabled))
-		return;
-
-	bit = ftrace_test_recursion_trylock(ip, parent_ip);
-	if (bit < 0)
-		return;
-
-	p = get_kprobe((kprobe_opcode_t *)ip);
-	if (unlikely(!p) || kprobe_disabled(p))
-		goto out;
-
-	regs = ftrace_get_regs(fregs);
-	kcb = get_kprobe_ctlblk();
-	if (kprobe_running()) {
-		kprobes_inc_nmissed_count(p);
-	} else {
-		unsigned long orig_ip = instruction_pointer(regs);
-
-		instruction_pointer_set(regs, ip);
-
-		__this_cpu_write(current_kprobe, p);
-		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-		if (!p->pre_handler || !p->pre_handler(p, regs)) {
-			/*
-			 * Emulate singlestep (and also recover regs->pc)
-			 * as if there is a nop
-			 */
-			instruction_pointer_set(regs,
-				(unsigned long)p->addr + MCOUNT_INSN_SIZE);
-			if (unlikely(p->post_handler)) {
-				kcb->kprobe_status = KPROBE_HIT_SSDONE;
-				p->post_handler(p, regs, 0);
-			}
-			instruction_pointer_set(regs, orig_ip);
-		}
-
-		/*
-		 * If pre_handler returns !0, it changes regs->pc. We have to
-		 * skip emulating post_handler.
-		 */
-		__this_cpu_write(current_kprobe, NULL);
-	}
-out:
-	ftrace_test_recursion_unlock(bit);
-}
-NOKPROBE_SYMBOL(kprobe_ftrace_handler);
-
-int arch_prepare_kprobe_ftrace(struct kprobe *p)
-{
-	p->ainsn.api.insn = NULL;
-	return 0;
-}

From 8d22d0db5bbcec95c70e025dce3d00821da8be7d Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Thu, 13 Jun 2024 23:30:53 +0800
Subject: [PATCH 228/346] riscv: boot: remove duplicated targets line

The "targets:" is duplicated in another line, remove the one with less
targets.

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
Link: https://lore.kernel.org/r/20240613153053.3835-1-jszhang@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/boot/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile
index 869c0345b908..4e9e7a28bf9b 100644
--- a/arch/riscv/boot/Makefile
+++ b/arch/riscv/boot/Makefile
@@ -18,7 +18,6 @@ OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 OBJCOPYFLAGS_loader.bin :=-O binary
 OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 
-targets := Image Image.* loader loader.o loader.lds loader.bin
 targets := Image Image.* loader loader.o loader.lds loader.bin xipImage
 
 ifeq ($(CONFIG_XIP_KERNEL),y)

From f8b632e89a101dae349a7b212c1771d7925f441b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:16 +0100
Subject: [PATCH 229/346] io_uring: tighten task exit cancellations

io_uring_cancel_generic() should retry if any state changes like a
request is completed, however in case of a task exit it only goes for
another loop and avoids schedule() if any tracked (i.e. REQ_F_INFLIGHT)
request got completed.

Let's assume we have a non-tracked request executing in iowq and a
tracked request linked to it. Let's also assume
io_uring_cancel_generic() fails to find and cancel the request, i.e.
via io_run_local_work(), which may happen as io-wq has gaps.
Next, the request logically completes, io-wq still hold a ref but queues
it for completion via tw, which happens in
io_uring_try_cancel_requests(). After, right before prepare_to_wait()
io-wq puts the request, grabs the linked one and tries executes it, e.g.
arms polling. Finally the cancellation loop calls prepare_to_wait(),
there are no tw to run, no tracked request was completed, so the
tctx_inflight() check passes and the task is put to indefinite sleep.

Cc: stable@vger.kernel.org
Fixes: 3f48cf18f886c ("io_uring: unify files and task cancel")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/acac7311f4e02ce3c43293f8f1fda9c705d158f1.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8e6faa942a6f..10c409e56241 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3031,8 +3031,11 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 		bool loop = false;
 
 		io_uring_drop_tctx_refs(current);
+		if (!tctx_inflight(tctx, !cancel_all))
+			break;
+
 		/* read completions before cancelations */
-		inflight = tctx_inflight(tctx, !cancel_all);
+		inflight = tctx_inflight(tctx, false);
 		if (!inflight)
 			break;
 

From bd44d7e902c2b34c217d3b48874b079760ca7b6e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:17 +0100
Subject: [PATCH 230/346] io_uring: don't allow netpolling with SETUP_IOPOLL

IORING_SETUP_IOPOLL rings don't have any netpoll handling, let's fail
attempts to register netpolling in this case, there might be people who
will mix up IOPOLL and netpoll.

Cc: stable@vger.kernel.org
Fixes: ef1186c1a875b ("io_uring: add register/unregister napi function")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1e7553aee0a8ae4edec6742cd6dd0c1e6914fba8.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/napi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/napi.c b/io_uring/napi.c
index 762254a7ff3f..327e5f3a8abe 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -222,6 +222,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 	};
 	struct io_uring_napi napi;
 
+	if (ctx->flags & IORING_SETUP_IOPOLL)
+		return -EINVAL;
 	if (copy_from_user(&napi, arg, sizeof(napi)))
 		return -EFAULT;
 	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)

From e142e9cd8891b0c6f277ac2c2c254199a6aa56e3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:18 +0100
Subject: [PATCH 231/346] io_uring: fix io_match_task must_hold

The __must_hold annotation in io_match_task() uses a non existing
parameter "req", fix it.

Fixes: 6af3f48bf6156 ("io_uring: fix link traversal locking")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/3e65ee7709e96507cef3d93291746f2c489f2307.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/timeout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 1c9bf07499b1..9973876d91b0 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -639,7 +639,7 @@ void io_queue_linked_timeout(struct io_kiocb *req)
 
 static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
 			  bool cancel_all)
-	__must_hold(&req->ctx->timeout_lock)
+	__must_hold(&head->ctx->timeout_lock)
 {
 	struct io_kiocb *req;
 

From f1dcdfcadb0c8c13dddd931c1f4dc58e54fdc9c0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:19 +0100
Subject: [PATCH 232/346] io_uring: simplify io_uring_cmd return

We don't have to return error code from an op handler back to core
io_uring, so once io_uring_cmd() sets the results and handles errors we
can juts return IOU_OK and simplify the code.

Note, only valid with e0b23d9953b0c ("io_uring: optimise ltimeout for
inline execution"), there was a problem with iopoll before.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/8eae2be5b2a49236cd5f1dadbd1aa5730e9e2d4f.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/uring_cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index a54163a83968..8391c7c7c1ec 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -265,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 		req_set_fail(req);
 	io_req_uring_cleanup(req, issue_flags);
 	io_req_set_res(req, ret, 0);
-	return ret < 0 ? ret : IOU_OK;
+	return IOU_OK;
 }
 
 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,

From a2b72b81fb3ba18717fc000949ca9d45a3351130 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:20 +0100
Subject: [PATCH 233/346] io_uring: kill REQ_F_CANCEL_SEQ

We removed the reliance on the flag by the cancellation path and now
it's unused.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/e57afe566bbe4fefeb44daffb08900f2a4756577.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3bb6198d1523..e62aa9f0629f 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -461,7 +461,6 @@ enum {
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
 	REQ_F_POLL_NO_LAZY_BIT,
-	REQ_F_CANCEL_SEQ_BIT,
 	REQ_F_CAN_POLL_BIT,
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
@@ -536,8 +535,6 @@ enum {
 	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
 	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
-	/* cancel sequence is set and valid */
-	REQ_F_CANCEL_SEQ	= IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
 	/* file is pollable */
 	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
 	/* buffer list was empty after selection of buffer */

From 29d63b94036e561a016ec8878b44aad6650d23e2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 24 Jul 2024 12:16:21 +0100
Subject: [PATCH 234/346] io_uring: align iowq and task request error handling

There is a difference in how io_queue_sqe and io_wq_submit_work treat
error codes they get from io_issue_sqe. The first one fails anything
unknown but latter only fails when the code is negative.

It doesn't make sense to have this discrepancy, align them to the
io_queue_sqe behaviour.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/c550e152bf4a290187f91a4322ddcb5d6d1f2c73.1721819383.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 10c409e56241..2626424f5d73 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1849,7 +1849,7 @@ void io_wq_submit_work(struct io_wq_work *work)
 	} while (1);
 
 	/* avoid locking problems by failing it from a clean context */
-	if (ret < 0)
+	if (ret)
 		io_req_task_queue_fail(req, ret);
 }
 

From 38738947db38520b58b7dae64bd0eec513e83139 Mon Sep 17 00:00:00 2001
From: Sia Jee Heng <jeeheng.sia@starfivetech.com>
Date: Thu, 2 May 2024 00:37:51 -0700
Subject: [PATCH 235/346] RISC-V: ACPI: Enable SPCR table for console output on
 RISC-V

The ACPI SPCR code has been used to enable console output for ARM64 and
X86. The same code can be reused for RISC-V. Furthermore, SPCR table is
mandated for headless system as outlined in the RISC-V BRS
Specification, chapter 6.

Signed-off-by: Sia Jee Heng <jeeheng.sia@starfivetech.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Link: https://lore.kernel.org/r/20240502073751.102093-2-jeeheng.sia@starfivetech.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig       |  1 +
 arch/riscv/kernel/acpi.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b94176e25be1..287c17daa9d5 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -14,6 +14,7 @@ config RISCV
 	def_bool y
 	select ACPI_GENERIC_GSI if ACPI
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
+	select ACPI_SPCR_TABLE if ACPI
 	select ARCH_DMA_DEFAULT_COHERENT
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
 	select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c
index e619edc8b0cc..43a12c00ae8b 100644
--- a/arch/riscv/kernel/acpi.c
+++ b/arch/riscv/kernel/acpi.c
@@ -17,7 +17,9 @@
 #include <linux/efi.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/of_fdt.h>
 #include <linux/pci.h>
+#include <linux/serial_core.h>
 
 int acpi_noirq = 1;		/* skip ACPI IRQ initialization */
 int acpi_disabled = 1;
@@ -131,7 +133,7 @@ void __init acpi_boot_table_init(void)
 	if (param_acpi_off ||
 	    (!param_acpi_on && !param_acpi_force &&
 	     efi.acpi20 == EFI_INVALID_TABLE_ADDR))
-		return;
+		goto done;
 
 	/*
 	 * ACPI is disabled at this point. Enable it in order to parse
@@ -151,6 +153,14 @@ void __init acpi_boot_table_init(void)
 		if (!param_acpi_force)
 			disable_acpi();
 	}
+
+done:
+	if (acpi_disabled) {
+		if (earlycon_acpi_spcr_enable)
+			early_init_dt_scan_chosen_stdout();
+	} else {
+		acpi_parse_spcr(earlycon_acpi_spcr_enable, true);
+	}
 }
 
 static int acpi_parse_madt_rintc(union acpi_subtable_headers *header, const unsigned long end)

From ee3fab10cb1566562aa683f319066eaeecccf918 Mon Sep 17 00:00:00 2001
From: Yunhui Cui <cuiyunhui@bytedance.com>
Date: Mon, 17 Jun 2024 21:14:23 +0800
Subject: [PATCH 236/346] riscv: cacheinfo: remove the useless input parameter
 (node) of ci_leaf_init()

ci_leaf_init() is a declared static function. The implementation of the
function body and the caller do not use the parameter (struct device_node
*node) input parameter, so remove it.

Fixes: 6a24915145c9 ("Revert "riscv: Set more data to cacheinfo"")
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20240617131425.7526-1-cuiyunhui@bytedance.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/cacheinfo.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c
index 09e9b88110d1..30a6878287ad 100644
--- a/arch/riscv/kernel/cacheinfo.c
+++ b/arch/riscv/kernel/cacheinfo.c
@@ -64,7 +64,6 @@ uintptr_t get_cache_geometry(u32 level, enum cache_type type)
 }
 
 static void ci_leaf_init(struct cacheinfo *this_leaf,
-			 struct device_node *node,
 			 enum cache_type type, unsigned int level)
 {
 	this_leaf->level = level;
@@ -80,11 +79,11 @@ int populate_cache_leaves(unsigned int cpu)
 	int levels = 1, level = 1;
 
 	if (of_property_read_bool(np, "cache-size"))
-		ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level);
+		ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level);
 	if (of_property_read_bool(np, "i-cache-size"))
-		ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level);
+		ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level);
 	if (of_property_read_bool(np, "d-cache-size"))
-		ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level);
+		ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level);
 
 	prev = np;
 	while ((np = of_find_next_cache_node(np))) {
@@ -97,11 +96,11 @@ int populate_cache_leaves(unsigned int cpu)
 		if (level <= levels)
 			break;
 		if (of_property_read_bool(np, "cache-size"))
-			ci_leaf_init(this_leaf++, np, CACHE_TYPE_UNIFIED, level);
+			ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level);
 		if (of_property_read_bool(np, "i-cache-size"))
-			ci_leaf_init(this_leaf++, np, CACHE_TYPE_INST, level);
+			ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level);
 		if (of_property_read_bool(np, "d-cache-size"))
-			ci_leaf_init(this_leaf++, np, CACHE_TYPE_DATA, level);
+			ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level);
 		levels = level;
 	}
 	of_node_put(np);

From 604f32ea6909b0ebb8ab0bf1ab7dc66ee3dc8955 Mon Sep 17 00:00:00 2001
From: Yunhui Cui <cuiyunhui@bytedance.com>
Date: Mon, 17 Jun 2024 21:14:24 +0800
Subject: [PATCH 237/346] riscv: cacheinfo: initialize cacheinfo's level and
 type from ACPI PPTT

Before cacheinfo can be built correctly, we need to initialize level
and type. Since RISC-V currently does not have a register group that
describes cache-related attributes like ARM64, we cannot obtain them
directly, so now we obtain cache leaves from the ACPI PPTT table
(acpi_get_cache_info()) and set the cache type through split_levels.

Suggested-by: Jeremy Linton <jeremy.linton@arm.com>
Suggested-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Link: https://lore.kernel.org/r/20240617131425.7526-2-cuiyunhui@bytedance.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/cacheinfo.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c
index 30a6878287ad..d6c108c50cba 100644
--- a/arch/riscv/kernel/cacheinfo.c
+++ b/arch/riscv/kernel/cacheinfo.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2017 SiFive
  */
 
+#include <linux/acpi.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 #include <asm/cacheinfo.h>
@@ -78,6 +79,27 @@ int populate_cache_leaves(unsigned int cpu)
 	struct device_node *prev = NULL;
 	int levels = 1, level = 1;
 
+	if (!acpi_disabled) {
+		int ret, fw_levels, split_levels;
+
+		ret = acpi_get_cache_info(cpu, &fw_levels, &split_levels);
+		if (ret)
+			return ret;
+
+		BUG_ON((split_levels > fw_levels) ||
+		       (split_levels + fw_levels > this_cpu_ci->num_leaves));
+
+		for (; level <= this_cpu_ci->num_levels; level++) {
+			if (level <= split_levels) {
+				ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level);
+				ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level);
+			} else {
+				ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level);
+			}
+		}
+		return 0;
+	}
+
 	if (of_property_read_bool(np, "cache-size"))
 		ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level);
 	if (of_property_read_bool(np, "i-cache-size"))

From 66381d36771e4ffedbea0dd23da17cf3d38e9aae Mon Sep 17 00:00:00 2001
From: Yunhui Cui <cuiyunhui@bytedance.com>
Date: Mon, 17 Jun 2024 21:14:25 +0800
Subject: [PATCH 238/346] RISC-V: Select ACPI PPTT drivers

After adding ACPI support to populate_cache_leaves(), RISC-V can build
cacheinfo through the ACPI PPTT table, thus enabling the ACPI_PPTT
configuration.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Reviewed-by: Jeremy Linton <jeremy.linton@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Sunil V L <sunilvl@ventanamicro.com>
Link: https://lore.kernel.org/r/20240617131425.7526-3-cuiyunhui@bytedance.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b94176e25be1..15f062e1d5ee 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -13,6 +13,7 @@ config 32BIT
 config RISCV
 	def_bool y
 	select ACPI_GENERIC_GSI if ACPI
+	select ACPI_PPTT if ACPI
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
 	select ARCH_DMA_DEFAULT_COHERENT
 	select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION

From 39705a6c29f8a2b93cf5b99528a55366c50014d1 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 24 Jul 2024 14:49:01 +0200
Subject: [PATCH 239/346] landlock: Don't lose track of restrictions on
 cred_transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a process' cred struct is replaced, this _almost_ always invokes
the cred_prepare LSM hook; but in one special case (when
KEYCTL_SESSION_TO_PARENT updates the parent's credentials), the
cred_transfer LSM hook is used instead.  Landlock only implements the
cred_prepare hook, not cred_transfer, so KEYCTL_SESSION_TO_PARENT causes
all information on Landlock restrictions to be lost.

This basically means that a process with the ability to use the fork()
and keyctl() syscalls can get rid of all Landlock restrictions on
itself.

Fix it by adding a cred_transfer hook that does the same thing as the
existing cred_prepare hook. (Implemented by having hook_cred_prepare()
call hook_cred_transfer() so that the two functions are less likely to
accidentally diverge in the future.)

Cc: stable@kernel.org
Fixes: 385975dca53e ("landlock: Set up the security framework and manage credentials")
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/r/20240724-landlock-houdini-fix-v1-1-df89a4560ca3@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 security/landlock/cred.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/security/landlock/cred.c b/security/landlock/cred.c
index 786af18c4a1c..db9fe7d906ba 100644
--- a/security/landlock/cred.c
+++ b/security/landlock/cred.c
@@ -14,8 +14,8 @@
 #include "ruleset.h"
 #include "setup.h"
 
-static int hook_cred_prepare(struct cred *const new,
-			     const struct cred *const old, const gfp_t gfp)
+static void hook_cred_transfer(struct cred *const new,
+			       const struct cred *const old)
 {
 	struct landlock_ruleset *const old_dom = landlock_cred(old)->domain;
 
@@ -23,6 +23,12 @@ static int hook_cred_prepare(struct cred *const new,
 		landlock_get_ruleset(old_dom);
 		landlock_cred(new)->domain = old_dom;
 	}
+}
+
+static int hook_cred_prepare(struct cred *const new,
+			     const struct cred *const old, const gfp_t gfp)
+{
+	hook_cred_transfer(new, old);
 	return 0;
 }
 
@@ -36,6 +42,7 @@ static void hook_cred_free(struct cred *const cred)
 
 static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(cred_prepare, hook_cred_prepare),
+	LSM_HOOK_INIT(cred_transfer, hook_cred_transfer),
 	LSM_HOOK_INIT(cred_free, hook_cred_free),
 };
 

From cc374782b6ca0fd634482391da977542443d3368 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= <mic@digikod.net>
Date: Wed, 24 Jul 2024 16:54:26 +0200
Subject: [PATCH 240/346] selftests/landlock: Add cred_transfer test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check that keyctl(KEYCTL_SESSION_TO_PARENT) preserves the parent's
restrictions.

Fixes: e1199815b47b ("selftests/landlock: Add user space tests")
Co-developed-by: Jann Horn <jannh@google.com>
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://lore.kernel.org/r/20240724.Ood5aige9she@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 tools/testing/selftests/landlock/base_test.c | 74 ++++++++++++++++++++
 tools/testing/selftests/landlock/config      |  1 +
 2 files changed, 75 insertions(+)

diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index 3c1e9f35b531..3b26bf3cf5b9 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -9,6 +9,7 @@
 #define _GNU_SOURCE
 #include <errno.h>
 #include <fcntl.h>
+#include <linux/keyctl.h>
 #include <linux/landlock.h>
 #include <string.h>
 #include <sys/prctl.h>
@@ -326,4 +327,77 @@ TEST(ruleset_fd_transfer)
 	ASSERT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
 }
 
+TEST(cred_transfer)
+{
+	struct landlock_ruleset_attr ruleset_attr = {
+		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_DIR,
+	};
+	int ruleset_fd, dir_fd;
+	pid_t child;
+	int status;
+
+	drop_caps(_metadata);
+
+	dir_fd = open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+	EXPECT_LE(0, dir_fd);
+	EXPECT_EQ(0, close(dir_fd));
+
+	/* Denies opening directories. */
+	ruleset_fd =
+		landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
+	ASSERT_LE(0, ruleset_fd);
+	EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
+	ASSERT_EQ(0, landlock_restrict_self(ruleset_fd, 0));
+	EXPECT_EQ(0, close(ruleset_fd));
+
+	/* Checks ruleset enforcement. */
+	EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+	EXPECT_EQ(EACCES, errno);
+
+	/* Needed for KEYCTL_SESSION_TO_PARENT permission checks */
+	EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL, 0,
+			      0, 0))
+	{
+		TH_LOG("Failed to join session keyring: %s", strerror(errno));
+	}
+
+	child = fork();
+	ASSERT_LE(0, child);
+	if (child == 0) {
+		/* Checks ruleset enforcement. */
+		EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+		EXPECT_EQ(EACCES, errno);
+
+		/*
+		 * KEYCTL_SESSION_TO_PARENT is a no-op unless we have a
+		 * different session keyring in the child, so make that happen.
+		 */
+		EXPECT_NE(-1, syscall(__NR_keyctl, KEYCTL_JOIN_SESSION_KEYRING,
+				      NULL, 0, 0, 0));
+
+		/*
+		 * KEYCTL_SESSION_TO_PARENT installs credentials on the parent
+		 * that never go through the cred_prepare hook, this path uses
+		 * cred_transfer instead.
+		 */
+		EXPECT_EQ(0, syscall(__NR_keyctl, KEYCTL_SESSION_TO_PARENT, 0,
+				     0, 0, 0));
+
+		/* Re-checks ruleset enforcement. */
+		EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+		EXPECT_EQ(EACCES, errno);
+
+		_exit(_metadata->exit_code);
+		return;
+	}
+
+	EXPECT_EQ(child, waitpid(child, &status, 0));
+	EXPECT_EQ(1, WIFEXITED(status));
+	EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
+
+	/* Re-checks ruleset enforcement. */
+	EXPECT_EQ(-1, open("/", O_RDONLY | O_DIRECTORY | O_CLOEXEC));
+	EXPECT_EQ(EACCES, errno);
+}
+
 TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config
index 0086efaa7b68..29af19c4e9f9 100644
--- a/tools/testing/selftests/landlock/config
+++ b/tools/testing/selftests/landlock/config
@@ -2,6 +2,7 @@ CONFIG_CGROUPS=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_INET=y
 CONFIG_IPV6=y
+CONFIG_KEYS=y
 CONFIG_NET=y
 CONFIG_NET_NS=y
 CONFIG_OVERLAY_FS=y

From 7e04da2dc7013af50ed3a2beb698d5168d1e594b Mon Sep 17 00:00:00 2001
From: Yang Yang <yang.yang@vivo.com>
Date: Wed, 24 Jul 2024 15:04:12 +0800
Subject: [PATCH 241/346] block: fix deadlock between sd_remove & sd_release

Our test report the following hung task:

[ 2538.459400] INFO: task "kworker/0:0":7 blocked for more than 188 seconds.
[ 2538.459427] Call trace:
[ 2538.459430]  __switch_to+0x174/0x338
[ 2538.459436]  __schedule+0x628/0x9c4
[ 2538.459442]  schedule+0x7c/0xe8
[ 2538.459447]  schedule_preempt_disabled+0x24/0x40
[ 2538.459453]  __mutex_lock+0x3ec/0xf04
[ 2538.459456]  __mutex_lock_slowpath+0x14/0x24
[ 2538.459459]  mutex_lock+0x30/0xd8
[ 2538.459462]  del_gendisk+0xdc/0x350
[ 2538.459466]  sd_remove+0x30/0x60
[ 2538.459470]  device_release_driver_internal+0x1c4/0x2c4
[ 2538.459474]  device_release_driver+0x18/0x28
[ 2538.459478]  bus_remove_device+0x15c/0x174
[ 2538.459483]  device_del+0x1d0/0x358
[ 2538.459488]  __scsi_remove_device+0xa8/0x198
[ 2538.459493]  scsi_forget_host+0x50/0x70
[ 2538.459497]  scsi_remove_host+0x80/0x180
[ 2538.459502]  usb_stor_disconnect+0x68/0xf4
[ 2538.459506]  usb_unbind_interface+0xd4/0x280
[ 2538.459510]  device_release_driver_internal+0x1c4/0x2c4
[ 2538.459514]  device_release_driver+0x18/0x28
[ 2538.459518]  bus_remove_device+0x15c/0x174
[ 2538.459523]  device_del+0x1d0/0x358
[ 2538.459528]  usb_disable_device+0x84/0x194
[ 2538.459532]  usb_disconnect+0xec/0x300
[ 2538.459537]  hub_event+0xb80/0x1870
[ 2538.459541]  process_scheduled_works+0x248/0x4dc
[ 2538.459545]  worker_thread+0x244/0x334
[ 2538.459549]  kthread+0x114/0x1bc

[ 2538.461001] INFO: task "fsck.":15415 blocked for more than 188 seconds.
[ 2538.461014] Call trace:
[ 2538.461016]  __switch_to+0x174/0x338
[ 2538.461021]  __schedule+0x628/0x9c4
[ 2538.461025]  schedule+0x7c/0xe8
[ 2538.461030]  blk_queue_enter+0xc4/0x160
[ 2538.461034]  blk_mq_alloc_request+0x120/0x1d4
[ 2538.461037]  scsi_execute_cmd+0x7c/0x23c
[ 2538.461040]  ioctl_internal_command+0x5c/0x164
[ 2538.461046]  scsi_set_medium_removal+0x5c/0xb0
[ 2538.461051]  sd_release+0x50/0x94
[ 2538.461054]  blkdev_put+0x190/0x28c
[ 2538.461058]  blkdev_release+0x28/0x40
[ 2538.461063]  __fput+0xf8/0x2a8
[ 2538.461066]  __fput_sync+0x28/0x5c
[ 2538.461070]  __arm64_sys_close+0x84/0xe8
[ 2538.461073]  invoke_syscall+0x58/0x114
[ 2538.461078]  el0_svc_common+0xac/0xe0
[ 2538.461082]  do_el0_svc+0x1c/0x28
[ 2538.461087]  el0_svc+0x38/0x68
[ 2538.461090]  el0t_64_sync_handler+0x68/0xbc
[ 2538.461093]  el0t_64_sync+0x1a8/0x1ac

  T1:				T2:
  sd_remove
  del_gendisk
  __blk_mark_disk_dead
  blk_freeze_queue_start
  ++q->mq_freeze_depth
  				bdev_release
 				mutex_lock(&disk->open_mutex)
  				sd_release
 				scsi_execute_cmd
 				blk_queue_enter
 				wait_event(!q->mq_freeze_depth)
  mutex_lock(&disk->open_mutex)

SCSI does not set GD_OWNS_QUEUE, so QUEUE_FLAG_DYING is not set in
this scenario. This is a classic ABBA deadlock. To fix the deadlock,
make sure we don't try to acquire disk->open_mutex after freezing
the queue.

Cc: stable@vger.kernel.org
Fixes: eec1be4c30df ("block: delete partitions later in del_gendisk")
Signed-off-by: Yang Yang <yang.yang@vivo.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Fixes: and Cc: stable tags are missing. Otherwise this patch looks fine
Link: https://lore.kernel.org/r/20240724070412.22521-1-yang.yang@vivo.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index 4dc95a463505..1c05dd4c6980 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -663,12 +663,12 @@ void del_gendisk(struct gendisk *disk)
 	 */
 	if (!test_bit(GD_DEAD, &disk->state))
 		blk_report_disk_dead(disk, false);
-	__blk_mark_disk_dead(disk);
 
 	/*
 	 * Drop all partitions now that the disk is marked dead.
 	 */
 	mutex_lock(&disk->open_mutex);
+	__blk_mark_disk_dead(disk);
 	xa_for_each_start(&disk->part_tbl, idx, part, 1)
 		drop_partition(part);
 	mutex_unlock(&disk->open_mutex);

From 55fbb9a5d64e0e590cad5eacc16c99f2482a008f Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 24 Jul 2024 22:33:11 +0800
Subject: [PATCH 242/346] ublk: fix UBLK_CMD_DEL_DEV_ASYNC handling

In ublk_ctrl_uring_cmd(), ioctl command NR should be used for
matching _IOC_NR(cmd_op).

Fix it by adding one private macro, and this way is clean.

Fixes: 13fe8e6825e4 ("ublk: add UBLK_CMD_DEL_DEV_ASYNC")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20240724143311.2646330-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6fb799ec0d88..890c08792ba8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -48,6 +48,9 @@
 
 #define UBLK_MINORS		(1U << MINORBITS)
 
+/* private ioctl command mirror */
+#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+
 /* All UBLK_F_* have to be included into UBLK_F_ALL */
 #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
 		| UBLK_F_URING_CMD_COMP_IN_TASK \
@@ -2903,7 +2906,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_DEL_DEV:
 		ret = ublk_ctrl_del_dev(&ub, true);
 		break;
-	case UBLK_U_CMD_DEL_DEV_ASYNC:
+	case UBLK_CMD_DEL_DEV_ASYNC:
 		ret = ublk_ctrl_del_dev(&ub, false);
 		break;
 	case UBLK_CMD_GET_QUEUE_AFFINITY:

From e6e18021ddd0dc5af487fb86b6d7c964e062d692 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Tue, 23 Jul 2024 13:12:24 +1200
Subject: [PATCH 243/346] ALSA: hda/realtek: cs35l41: Fixup remaining asus
 strix models

Adjust quirks for 0x3a20, 0x3a30, 0x3a50 to match the 0x3a60. This
set has now been confirmed to work with this patch.

Signed-off-by: Luke D. Jones <luke@ljones.dev>
Fixes: 811dd426a9b1 ("ALSA: hda/realtek: Add quirks for Asus ROG 2024 laptops using CS35L41")
Cc: <stable@vger.kernel.org>
Link: https://patch.msgid.link/20240723011224.115579-1-luke@ljones.dev
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index c3a86a99f8c6..462194bf6954 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -10359,10 +10359,10 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1043, 0x1f62, "ASUS UX7602ZM", ALC245_FIXUP_CS35L41_SPI_2),
 	SND_PCI_QUIRK(0x1043, 0x1f92, "ASUS ROG Flow X16", ALC289_FIXUP_ASUS_GA401),
 	SND_PCI_QUIRK(0x1043, 0x3030, "ASUS ZN270IE", ALC256_FIXUP_ASUS_AIO_GPIO2),
-	SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC245_FIXUP_CS35L41_SPI_2),
-	SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC245_FIXUP_CS35L41_SPI_2),
+	SND_PCI_QUIRK(0x1043, 0x3a20, "ASUS G614JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS),
+	SND_PCI_QUIRK(0x1043, 0x3a30, "ASUS G814JVR/JIR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS),
 	SND_PCI_QUIRK(0x1043, 0x3a40, "ASUS G814JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS),
-	SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC245_FIXUP_CS35L41_SPI_2),
+	SND_PCI_QUIRK(0x1043, 0x3a50, "ASUS G834JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS),
 	SND_PCI_QUIRK(0x1043, 0x3a60, "ASUS G634JYR/JZR", ALC285_FIXUP_ASUS_SPI_REAR_SPEAKERS),
 	SND_PCI_QUIRK(0x1043, 0x831a, "ASUS P901", ALC269_FIXUP_STEREO_DMIC),
 	SND_PCI_QUIRK(0x1043, 0x834a, "ASUS S101", ALC269_FIXUP_STEREO_DMIC),

From 33be0cfa5ba522ba88ba25cb95e582932843409b Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Fri, 28 Jun 2024 17:37:12 +0200
Subject: [PATCH 244/346] apparmor: take nosymfollow flag into account

A "nosymfollow" flag was added in commit
dab741e0e02b ("Add a "nosymfollow" mount option.")

While we don't need to implement any special logic on
the AppArmor kernel side to handle it, we should provide
user with a correct list of mount flags in audit logs.

Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Georgia Garcia <georgia.garcia@canonical.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/mount.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/security/apparmor/mount.c b/security/apparmor/mount.c
index 49fe8da6fea4..bf8863253e07 100644
--- a/security/apparmor/mount.c
+++ b/security/apparmor/mount.c
@@ -44,6 +44,8 @@ static void audit_mnt_flags(struct audit_buffer *ab, unsigned long flags)
 		audit_log_format(ab, ", mand");
 	if (flags & MS_DIRSYNC)
 		audit_log_format(ab, ", dirsync");
+	if (flags & MS_NOSYMFOLLOW)
+		audit_log_format(ab, ", nosymfollow");
 	if (flags & MS_NOATIME)
 		audit_log_format(ab, ", noatime");
 	if (flags & MS_NODIRATIME)

From 4b954a025591a1c7d3a0c0111b6d4730596046b6 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Wed, 29 May 2024 18:21:39 -0700
Subject: [PATCH 245/346] apparmor: test: add MODULE_DESCRIPTION()

Fix the 'make W=1' warning:
WARNING: modpost: missing MODULE_DESCRIPTION() in security/apparmor/apparmor_policy_unpack_test.o

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/policy_unpack_test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/security/apparmor/policy_unpack_test.c b/security/apparmor/policy_unpack_test.c
index 5c9bde25e56d..874fcf97794e 100644
--- a/security/apparmor/policy_unpack_test.c
+++ b/security/apparmor/policy_unpack_test.c
@@ -604,4 +604,5 @@ static struct kunit_suite apparmor_policy_unpack_test_module = {
 
 kunit_test_suite(apparmor_policy_unpack_test_module);
 
+MODULE_DESCRIPTION("KUnit tests for AppArmor's policy unpack");
 MODULE_LICENSE("GPL");

From f4fee216df7d28b87d1c9cc60bcebfecb51c1a05 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 20 Jun 2024 19:15:27 +0200
Subject: [PATCH 246/346] apparmor: try to avoid refing the label in
 apparmor_file_open

If the label is not stale (which is the common case), the fact that the
passed file object holds a reference can be leverged to avoid the
ref/unref cycle. Doing so reduces performance impact of apparmor on
parallel open() invocations.

When benchmarking on a 24-core vm using will-it-scale's open1_process
("Separate file open"), the results are (ops/s):
before: 6092196
after:  8309726 (+36%)

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/include/cred.h | 20 ++++++++++++++++++++
 security/apparmor/lsm.c          |  5 +++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/security/apparmor/include/cred.h b/security/apparmor/include/cred.h
index 58fdc72af664..7265d2f81dd5 100644
--- a/security/apparmor/include/cred.h
+++ b/security/apparmor/include/cred.h
@@ -63,6 +63,26 @@ static inline struct aa_label *aa_get_newest_cred_label(const struct cred *cred)
 	return aa_get_newest_label(aa_cred_raw_label(cred));
 }
 
+static inline struct aa_label *aa_get_newest_cred_label_condref(const struct cred *cred,
+								bool *needput)
+{
+	struct aa_label *l = aa_cred_raw_label(cred);
+
+	if (unlikely(label_is_stale(l))) {
+		*needput = true;
+		return aa_get_newest_label(l);
+	}
+
+	*needput = false;
+	return l;
+}
+
+static inline void aa_put_label_condref(struct aa_label *l, bool needput)
+{
+	if (unlikely(needput))
+		aa_put_label(l);
+}
+
 /**
  * aa_current_raw_label - find the current tasks confining label
  *
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 717204fba938..242d4cf857a7 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -461,6 +461,7 @@ static int apparmor_file_open(struct file *file)
 	struct aa_file_ctx *fctx = file_ctx(file);
 	struct aa_label *label;
 	int error = 0;
+	bool needput;
 
 	if (!path_mediated_fs(file->f_path.dentry))
 		return 0;
@@ -477,7 +478,7 @@ static int apparmor_file_open(struct file *file)
 		return 0;
 	}
 
-	label = aa_get_newest_cred_label(file->f_cred);
+	label = aa_get_newest_cred_label_condref(file->f_cred, &needput);
 	if (!unconfined(label)) {
 		struct mnt_idmap *idmap = file_mnt_idmap(file);
 		struct inode *inode = file_inode(file);
@@ -494,7 +495,7 @@ static int apparmor_file_open(struct file *file)
 		/* todo cache full allowed permissions set and state */
 		fctx->allow = aa_map_file_to_perms(file);
 	}
-	aa_put_label(label);
+	aa_put_label_condref(label, needput);
 
 	return error;
 }

From e0ff0cff1f6cdce0aa596aac04129893201c4162 Mon Sep 17 00:00:00 2001
From: Georgia Garcia <georgia.garcia@canonical.com>
Date: Mon, 10 Jun 2024 09:51:48 -0300
Subject: [PATCH 247/346] apparmor: unpack transition table if dfa is not
 present

Due to a bug in earlier userspaces, a transition table may be present
even when the dfa is not. Commit 7572fea31e3e
("apparmor: convert fperm lookup to use accept as an index") made the
verification check more rigourous regressing old userspaces with
the bug. For compatibility reasons allow the orphaned transition table
during unpack and discard.

Fixes: 7572fea31e3e ("apparmor: convert fperm lookup to use accept as an index")
Signed-off-by: Georgia Garcia <georgia.garcia@canonical.com>
Signed-off-by: John Johansen <john.johansen@canonical.com>
---
 security/apparmor/policy_unpack.c | 42 ++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c
index 75452acd0e35..5a570235427d 100644
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -747,34 +747,42 @@ static int unpack_pdb(struct aa_ext *e, struct aa_policydb **policy,
 			*info = "missing required dfa";
 			goto fail;
 		}
-		goto out;
+	} else {
+		/*
+		 * only unpack the following if a dfa is present
+		 *
+		 * sadly start was given different names for file and policydb
+		 * but since it is optional we can try both
+		 */
+		if (!aa_unpack_u32(e, &pdb->start[0], "start"))
+			/* default start state */
+			pdb->start[0] = DFA_START;
+		if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) {
+			/* default start state for xmatch and file dfa */
+			pdb->start[AA_CLASS_FILE] = DFA_START;
+		}	/* setup class index */
+		for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) {
+			pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0],
+						    i);
+		}
 	}
 
 	/*
-	 * only unpack the following if a dfa is present
-	 *
-	 * sadly start was given different names for file and policydb
-	 * but since it is optional we can try both
+	 * Unfortunately due to a bug in earlier userspaces, a
+	 * transition table may be present even when the dfa is
+	 * not. For compatibility reasons unpack and discard.
 	 */
-	if (!aa_unpack_u32(e, &pdb->start[0], "start"))
-		/* default start state */
-		pdb->start[0] = DFA_START;
-	if (!aa_unpack_u32(e, &pdb->start[AA_CLASS_FILE], "dfa_start")) {
-		/* default start state for xmatch and file dfa */
-		pdb->start[AA_CLASS_FILE] = DFA_START;
-	}	/* setup class index */
-	for (i = AA_CLASS_FILE + 1; i <= AA_CLASS_LAST; i++) {
-		pdb->start[i] = aa_dfa_next(pdb->dfa, pdb->start[0],
-					       i);
-	}
 	if (!unpack_trans_table(e, &pdb->trans) && required_trans) {
 		*info = "failed to unpack profile transition table";
 		goto fail;
 	}
 
+	if (!pdb->dfa && pdb->trans.table)
+		aa_free_str_table(&pdb->trans);
+
 	/* TODO: move compat mapping here, requires dfa merging first */
 	/* TODO: move verify here, it has to be done after compat mappings */
-out:
+
 	*policy = pdb;
 	return 0;
 

From a03ebf116303e5d13ba9a2b65726b106cb1e96f6 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Tue, 9 Jul 2024 17:54:11 -0400
Subject: [PATCH 248/346] drm/amdgpu/sdma5.2: Update wptr registers as well as
 doorbell

We seem to have a case where SDMA will sometimes miss a doorbell
if GFX is entering the powergating state when the doorbell comes in.
To workaround this, we can update the wptr via MMIO, however,
this is only safe because we disallow gfxoff in begin_ring() for
SDMA 5.2 and then allow it again in end_ring().

Enable this workaround while we are root causing the issue with
the HW team.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/3440
Tested-by: Friedrich Vock <friedrich.vock@gmx.de>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
(cherry picked from commit f2ac52634963fc38e4935e11077b6f7854e5d700)
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index cc9e961f0078..af1e90159ce3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -176,6 +176,14 @@ static void sdma_v5_2_ring_set_wptr(struct amdgpu_ring *ring)
 		DRM_DEBUG("calling WDOORBELL64(0x%08x, 0x%016llx)\n",
 				ring->doorbell_index, ring->wptr << 2);
 		WDOORBELL64(ring->doorbell_index, ring->wptr << 2);
+		/* SDMA seems to miss doorbells sometimes when powergating kicks in.
+		 * Updating the wptr directly will wake it. This is only safe because
+		 * we disallow gfxoff in begin_use() and then allow it again in end_use().
+		 */
+		WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR),
+		       lower_32_bits(ring->wptr << 2));
+		WREG32(sdma_v5_2_get_reg_offset(adev, ring->me, mmSDMA0_GFX_RB_WPTR_HI),
+		       upper_32_bits(ring->wptr << 2));
 	} else {
 		DRM_DEBUG("Not using doorbell -- "
 				"mmSDMA%i_GFX_RB_WPTR == 0x%08x "
@@ -1647,6 +1655,10 @@ static void sdma_v5_2_ring_begin_use(struct amdgpu_ring *ring)
 	 * but it shouldn't hurt for other parts since
 	 * this GFXOFF will be disallowed anyway when SDMA is
 	 * active, this just makes it explicit.
+	 * sdma_v5_2_ring_set_wptr() takes advantage of this
+	 * to update the wptr because sometimes SDMA seems to miss
+	 * doorbells when entering PG.  If you remove this, update
+	 * sdma_v5_2_ring_set_wptr() as well!
 	 */
 	amdgpu_gfx_off_ctrl(adev, false);
 }

From 73048bda46c3085df5fd42840de09523386d3e54 Mon Sep 17 00:00:00 2001
From: David Belanger <david.belanger@amd.com>
Date: Mon, 10 Jun 2024 16:38:55 -0400
Subject: [PATCH 249/346] drm/amdgpu: Fix atomics on GFX12

If PCIe supports atomics, configure register to prevent DF from
breaking atomics in separate load/store operations.

Signed-off-by: David Belanger <david.belanger@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 666f14cab21b17ccc1bdfe1e82458aa429b3b7e0)
---
 drivers/gpu/drm/amd/amdgpu/Makefile           |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_df.h        |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  5 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_15.c         | 45 +++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/df_v4_15.h         | 30 +++++++++++++
 drivers/gpu/drm/amd/amdgpu/soc24.c            |  4 ++
 .../amd/include/asic_reg/df/df_4_15_offset.h  | 28 ++++++++++++
 .../amd/include/asic_reg/df/df_4_15_sh_mask.h | 28 ++++++++++++
 8 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_15.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_15.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile
index 9dd8294032ef..38408e4e158e 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -106,7 +106,8 @@ amdgpu-y += \
 	df_v1_7.o \
 	df_v3_6.o \
 	df_v4_3.o \
-	df_v4_6_2.o
+	df_v4_6_2.o \
+	df_v4_15.o
 
 # add GMC block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
index 1538b2dbfff1..eb605e79ae0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
@@ -33,6 +33,7 @@ struct amdgpu_df_hash_status {
 struct amdgpu_df_funcs {
 	void (*sw_init)(struct amdgpu_device *adev);
 	void (*sw_fini)(struct amdgpu_device *adev);
+	void (*hw_init)(struct amdgpu_device *adev);
 	void (*enable_broadcast_mode)(struct amdgpu_device *adev,
 				      bool enable);
 	u32 (*get_fb_channel_number)(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b241f61fe9c9..ac108fca64fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -37,6 +37,7 @@
 #include "df_v3_6.h"
 #include "df_v4_3.h"
 #include "df_v4_6_2.h"
+#include "df_v4_15.h"
 #include "nbio_v6_1.h"
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
@@ -2803,6 +2804,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
 	case IP_VERSION(4, 6, 2):
 		adev->df.funcs = &df_v4_6_2_funcs;
 		break;
+	case IP_VERSION(4, 15, 0):
+	case IP_VERSION(4, 15, 1):
+		adev->df.funcs = &df_v4_15_funcs;
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_15.c b/drivers/gpu/drm/amd/amdgpu/df_v4_15.c
new file mode 100644
index 000000000000..2a573e33908b
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_15.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "df_v4_15.h"
+
+#include "df/df_4_15_offset.h"
+#include "df/df_4_15_sh_mask.h"
+
+static void df_v4_15_hw_init(struct amdgpu_device *adev)
+{
+	if (adev->have_atomics_support) {
+		uint32_t tmp;
+		uint32_t dis_lcl_proc = (1 <<  1 |
+					1 <<  2 |
+					1 << 13);
+
+		tmp = RREG32_SOC15(DF, 0, regNCSConfigurationRegister1);
+		tmp |= (dis_lcl_proc << NCSConfigurationRegister1__DisIntAtomicsLclProcessing__SHIFT);
+		WREG32_SOC15(DF, 0, regNCSConfigurationRegister1, tmp);
+	}
+}
+
+const struct amdgpu_df_funcs df_v4_15_funcs = {
+	.hw_init = df_v4_15_hw_init
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_15.h b/drivers/gpu/drm/amd/amdgpu/df_v4_15.h
new file mode 100644
index 000000000000..dddf2422112a
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_15.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __DF_V4_15_H__
+#define __DF_V4_15_H__
+
+extern const struct amdgpu_df_funcs df_v4_15_funcs;
+
+#endif /* __DF_V4_15_H__ */
+
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index d27fb4ea6612..7d641d0dadba 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -484,6 +484,10 @@ static int soc24_common_hw_init(void *handle)
 	 */
 	if (adev->nbio.funcs->remap_hdp_registers)
 		adev->nbio.funcs->remap_hdp_registers(adev);
+
+	if (adev->df.funcs->hw_init)
+		adev->df.funcs->hw_init(adev);
+
 	/* enable the doorbell aperture */
 	soc24_enable_doorbell_aperture(adev, true);
 
diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h
new file mode 100644
index 000000000000..c2b009752f60
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_offset.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2024  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _df_4_15_OFFSET_HEADER
+#define _df_4_15_OFFSET_HEADER
+
+#define regNCSConfigurationRegister1               0x0901
+#define regNCSConfigurationRegister1_BASE_IDX           4
+
+#endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h
new file mode 100644
index 000000000000..9868a9c32795
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_15_sh_mask.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2024  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _df_4_15_SH_MASK_HEADER
+#define _df_4_15_SH_MASK_HEADER
+
+#define NCSConfigurationRegister1__DisIntAtomicsLclProcessing__SHIFT         0x3
+#define NCSConfigurationRegister1__DisIntAtomicsLclProcessing_MASK    0x0003FFF8L
+
+#endif

From df65aabef3c0327c23b840ab5520150df4db6b5f Mon Sep 17 00:00:00 2001
From: Ma Ke <make24@iscas.ac.cn>
Date: Thu, 18 Jul 2024 22:17:35 +0800
Subject: [PATCH 250/346] drm/amd/amdgpu: Fix uninitialized variable warnings

Return 0 to avoid returning an uninitialized variable r.

Cc: stable@vger.kernel.org
Fixes: 230dd6bb6117 ("drm/amd/amdgpu: implement mode2 reset on smu_v13_0_10")
Signed-off-by: Ma Ke <make24@iscas.ac.cn>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 6472de66c0aa18d50a4b5ca85f8272e88a737676)
---
 drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
index 04c797d54511..0af648931df5 100644
--- a/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/smu_v13_0_10.c
@@ -91,7 +91,7 @@ static int smu_v13_0_10_mode2_suspend_ip(struct amdgpu_device *adev)
 		adev->ip_blocks[i].status.hw = false;
 	}
 
-	return r;
+	return 0;
 }
 
 static int

From 23df34997d386f1442d26f2d8edf44a5b5fee79f Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Mon, 11 Dec 2023 10:45:38 +0530
Subject: [PATCH 251/346] drm/amdgpu: Add empty HDP flush function to JPEG
 v4.0.3

JPEG v4.0.3 doesn't support HDP flush when RRMT is enabled. Instead,
mmsch fw will do the flush.

This change is necessary for JPEG v4.0.3, no need for backward compatibility

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Jane Jian <Jane.Jian@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 585e3fdb36f59c5cfed0ae06c852dc1df22b1d60)
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 04d8966423de..30a143ab592d 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -621,6 +621,13 @@ static uint64_t jpeg_v4_0_3_dec_ring_get_wptr(struct amdgpu_ring *ring)
 			ring->pipe ? (0x40 * ring->pipe - 0xc80) : 0);
 }
 
+static void jpeg_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring)
+{
+	/* JPEG engine access for HDP flush doesn't work when RRMT is enabled.
+	 * This is a workaround to avoid any HDP flush through JPEG ring.
+	 */
+}
+
 /**
  * jpeg_v4_0_3_dec_ring_set_wptr - set write pointer
  *
@@ -1072,6 +1079,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = {
 	.emit_ib = jpeg_v4_0_3_dec_ring_emit_ib,
 	.emit_fence = jpeg_v4_0_3_dec_ring_emit_fence,
 	.emit_vm_flush = jpeg_v4_0_3_dec_ring_emit_vm_flush,
+	.emit_hdp_flush = jpeg_v4_0_3_ring_emit_hdp_flush,
 	.test_ring = amdgpu_jpeg_dec_ring_test_ring,
 	.test_ib = amdgpu_jpeg_dec_ring_test_ib,
 	.insert_nop = jpeg_v4_0_3_dec_ring_nop,

From 485432d090016aebf8fb407a44ce7a2f856d11eb Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar@amd.com>
Date: Mon, 11 Dec 2023 11:18:42 +0530
Subject: [PATCH 252/346] drm/amdgpu: Add empty HDP flush function to VCN
 v4.0.3

VCN 4.0.3 does not HDP flush with RRMT enabled. Instead, mmsch
will do the HDP flush.

This change is necessary for VCN v4.0.3, no need for backward compatibility

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Jane Jian <Jane.Jian@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 49cfaebe48e97500a68d5322a8194736b0a2c3cf)
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index f53054e39ebb..101b120f6fbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -1375,6 +1375,13 @@ static uint64_t vcn_v4_0_3_unified_ring_get_wptr(struct amdgpu_ring *ring)
 				    regUVD_RB_WPTR);
 }
 
+static void vcn_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring)
+{
+	/* VCN engine access for HDP flush doesn't work when RRMT is enabled.
+	 * This is a workaround to avoid any HDP flush through VCN ring.
+	 */
+}
+
 /**
  * vcn_v4_0_3_unified_ring_set_wptr - set enc write pointer
  *
@@ -1415,6 +1422,7 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = {
 	.emit_ib = vcn_v2_0_enc_ring_emit_ib,
 	.emit_fence = vcn_v2_0_enc_ring_emit_fence,
 	.emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush,
+	.emit_hdp_flush = vcn_v4_0_3_ring_emit_hdp_flush,
 	.test_ring = amdgpu_vcn_enc_ring_test_ring,
 	.test_ib = amdgpu_vcn_unified_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,

From 6728f55590a667c292cee10c009e16d90f48d63a Mon Sep 17 00:00:00 2001
From: Jane Jian <Jane.Jian@amd.com>
Date: Mon, 15 Jul 2024 18:48:31 +0800
Subject: [PATCH 253/346] drm/amdgpu/vcn: Use offsets local to VCN/JPEG in VF

For VCN/JPEG 4.0.3, use only the local addressing scheme.

- Mask bit higher than AID0 range

v2
remain the case for mmhub use master XCC

Signed-off-by: Jane Jian <Jane.Jian@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit caaf576292f8ccef5cdc0ac16e77b87dbf6e17ab)
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 19 ++++++++--
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c  | 46 ++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 30a143ab592d..ad524ddc9760 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -32,6 +32,9 @@
 #include "vcn/vcn_4_0_3_sh_mask.h"
 #include "ivsrcid/vcn/irqsrcs_vcn_4_0.h"
 
+#define NORMALIZE_JPEG_REG_OFFSET(offset) \
+		(offset & 0x1FFFF)
+
 enum jpeg_engin_status {
 	UVD_PGFSM_STATUS__UVDJ_PWR_ON  = 0,
 	UVD_PGFSM_STATUS__UVDJ_PWR_OFF = 2,
@@ -824,7 +827,13 @@ void jpeg_v4_0_3_dec_ring_emit_ib(struct amdgpu_ring *ring,
 void jpeg_v4_0_3_dec_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
 				uint32_t val, uint32_t mask)
 {
-	uint32_t reg_offset = (reg << 2);
+	uint32_t reg_offset;
+
+	/* For VF, only local offsets should be used */
+	if (amdgpu_sriov_vf(ring->adev))
+		reg = NORMALIZE_JPEG_REG_OFFSET(reg);
+
+	reg_offset = (reg << 2);
 
 	amdgpu_ring_write(ring, PACKETJ(regUVD_JRBC_RB_COND_RD_TIMER_INTERNAL_OFFSET,
 		0, 0, PACKETJ_TYPE0));
@@ -865,7 +874,13 @@ void jpeg_v4_0_3_dec_ring_emit_vm_flush(struct amdgpu_ring *ring,
 
 void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val)
 {
-	uint32_t reg_offset = (reg << 2);
+	uint32_t reg_offset;
+
+	/* For VF, only local offsets should be used */
+	if (amdgpu_sriov_vf(ring->adev))
+		reg = NORMALIZE_JPEG_REG_OFFSET(reg);
+
+	reg_offset = (reg << 2);
 
 	amdgpu_ring_write(ring,	PACKETJ(regUVD_JRBC_EXTERNAL_REG_INTERNAL_OFFSET,
 		0, 0, PACKETJ_TYPE0));
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index 101b120f6fbd..9bae95538b62 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -45,6 +45,9 @@
 #define VCN_VID_SOC_ADDRESS_2_0		0x1fb00
 #define VCN1_VID_SOC_ADDRESS_3_0	0x48300
 
+#define NORMALIZE_VCN_REG_OFFSET(offset) \
+		(offset & 0x1FFFF)
+
 static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev);
 static void vcn_v4_0_3_set_unified_ring_funcs(struct amdgpu_device *adev);
 static void vcn_v4_0_3_set_irq_funcs(struct amdgpu_device *adev);
@@ -1375,6 +1378,43 @@ static uint64_t vcn_v4_0_3_unified_ring_get_wptr(struct amdgpu_ring *ring)
 				    regUVD_RB_WPTR);
 }
 
+static void vcn_v4_0_3_enc_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
+				uint32_t val, uint32_t mask)
+{
+	/* For VF, only local offsets should be used */
+	if (amdgpu_sriov_vf(ring->adev))
+		reg = NORMALIZE_VCN_REG_OFFSET(reg);
+
+	amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WAIT);
+	amdgpu_ring_write(ring, reg << 2);
+	amdgpu_ring_write(ring, mask);
+	amdgpu_ring_write(ring, val);
+}
+
+static void vcn_v4_0_3_enc_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val)
+{
+	/* For VF, only local offsets should be used */
+	if (amdgpu_sriov_vf(ring->adev))
+		reg = NORMALIZE_VCN_REG_OFFSET(reg);
+
+	amdgpu_ring_write(ring, VCN_ENC_CMD_REG_WRITE);
+	amdgpu_ring_write(ring,	reg << 2);
+	amdgpu_ring_write(ring, val);
+}
+
+static void vcn_v4_0_3_enc_ring_emit_vm_flush(struct amdgpu_ring *ring,
+				unsigned int vmid, uint64_t pd_addr)
+{
+	struct amdgpu_vmhub *hub = &ring->adev->vmhub[ring->vm_hub];
+
+	pd_addr = amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
+
+	/* wait for reg writes */
+	vcn_v4_0_3_enc_ring_emit_reg_wait(ring, hub->ctx0_ptb_addr_lo32 +
+					vmid * hub->ctx_addr_distance,
+					lower_32_bits(pd_addr), 0xffffffff);
+}
+
 static void vcn_v4_0_3_ring_emit_hdp_flush(struct amdgpu_ring *ring)
 {
 	/* VCN engine access for HDP flush doesn't work when RRMT is enabled.
@@ -1421,7 +1461,7 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = {
 	.emit_ib_size = 5, /* vcn_v2_0_enc_ring_emit_ib */
 	.emit_ib = vcn_v2_0_enc_ring_emit_ib,
 	.emit_fence = vcn_v2_0_enc_ring_emit_fence,
-	.emit_vm_flush = vcn_v2_0_enc_ring_emit_vm_flush,
+	.emit_vm_flush = vcn_v4_0_3_enc_ring_emit_vm_flush,
 	.emit_hdp_flush = vcn_v4_0_3_ring_emit_hdp_flush,
 	.test_ring = amdgpu_vcn_enc_ring_test_ring,
 	.test_ib = amdgpu_vcn_unified_ring_test_ib,
@@ -1430,8 +1470,8 @@ static const struct amdgpu_ring_funcs vcn_v4_0_3_unified_ring_vm_funcs = {
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.begin_use = amdgpu_vcn_ring_begin_use,
 	.end_use = amdgpu_vcn_ring_end_use,
-	.emit_wreg = vcn_v2_0_enc_ring_emit_wreg,
-	.emit_reg_wait = vcn_v2_0_enc_ring_emit_reg_wait,
+	.emit_wreg = vcn_v4_0_3_enc_ring_emit_wreg,
+	.emit_reg_wait = vcn_v4_0_3_enc_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
 };
 

From 4ab68e168ae1695f7c04fae98930740aaf7c50fa Mon Sep 17 00:00:00 2001
From: Sung Joon Kim <sungjoon.kim@amd.com>
Date: Mon, 8 Jul 2024 19:29:49 -0400
Subject: [PATCH 254/346] drm/amd/display: Check for NULL pointer

[why & how]
Need to make sure plane_state is initialized
before accessing its members.

Cc: Mario Limonciello <mario.limonciello@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Reviewed-by: Xi (Alex) Liu <xi.liu@amd.com>
Signed-off-by: Sung Joon Kim <sungjoon.kim@amd.com>
Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 295d91cbc700651782a60572f83c24861607b648)
---
 drivers/gpu/drm/amd/display/dc/core/dc_surface.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_surface.c b/drivers/gpu/drm/amd/display/dc/core/dc_surface.c
index 067f6555cfdf..ccbb15f1638c 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_surface.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_surface.c
@@ -143,7 +143,8 @@ const struct dc_plane_status *dc_plane_get_status(
 		if (pipe_ctx->plane_state != plane_state)
 			continue;
 
-		pipe_ctx->plane_state->status.is_flip_pending = false;
+		if (pipe_ctx->plane_state)
+			pipe_ctx->plane_state->status.is_flip_pending = false;
 
 		break;
 	}

From 5302d1a06a2cd9855378122a07c9e0942f0f04a9 Mon Sep 17 00:00:00 2001
From: Rodrigo Siqueira <rodrigo.siqueira@amd.com>
Date: Thu, 4 Jul 2024 11:54:34 -0600
Subject: [PATCH 255/346] drm/amd/display: Remove ASSERT if significance is
 zero in math_ceil2

In the DML math_ceil2 function, there is one ASSERT if the significance
is equal to zero. However, significance might be equal to zero
sometimes, and this is not an issue for a ceil function, but the current
ASSERT will trigger warnings in those cases. This commit removes the
ASSERT if the significance is equal to zero to avoid unnecessary noise.

Cc: Mario Limonciello <mario.limonciello@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
Reviewed-by: Chaitanya Dhere <chaitanya.dhere@amd.com>
Signed-off-by: Rodrigo Siqueira <rodrigo.siqueira@amd.com>
Signed-off-by: Aurabindo Pillai <aurabindo.pillai@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 332315885d3ccc6d8fe99700f3c2e4c24aa65ab7)
---
 .../dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c
index defe13436a2c..e73579f1a88e 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_standalone_libraries/lib_float_math.c
@@ -64,8 +64,6 @@ double math_ceil(const double arg)
 
 double math_ceil2(const double arg, const double significance)
 {
-	ASSERT(significance != 0);
-
 	return ((int)(arg / significance + 0.99999)) * significance;
 }
 

From afac8c6554ccee54bfd1743755e10af005be3bcf Mon Sep 17 00:00:00 2001
From: YiPeng Chai <YiPeng.Chai@amd.com>
Date: Fri, 19 Jul 2024 20:43:04 +0800
Subject: [PATCH 256/346] drm/amdgpu: fix ras UE error injection failure issue

The ras command shared memory is allocated from
VRAM and the response status of the command
buffer will not be zero due to gpu being in
fatal error state after ras UE error injection.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 8284951a6e79c6806c675e5f68a4cd425dd56bc4)
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7cdff355cedb..189574d53ebd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1630,9 +1630,7 @@ static int psp_ras_send_cmd(struct psp_context *psp,
 
 	switch (cmd) {
 	case TA_RAS_COMMAND__TRIGGER_ERROR:
-		if (ret || psp->cmd_buf_mem->resp.status)
-			ret = -EINVAL;
-		else if (out)
+		if (!ret && out)
 			memcpy(out, &ras_cmd->ras_status, sizeof(ras_cmd->ras_status));
 		break;
 	case TA_RAS_COMMAND__QUERY_ADDRESS:

From 1a8825259a9ccc53faddcdec24cf94e0a36b32cc Mon Sep 17 00:00:00 2001
From: "Stanley.Yang" <Stanley.Yang@amd.com>
Date: Thu, 18 Jul 2024 10:58:04 +0800
Subject: [PATCH 257/346] drm/amdgpu: Fix eeprom max record count

The eeprom table is empty before initializing,
set eeprom table version first before initializing.

Changed from V1:
	Reuse amdgpu_ras_set_eeprom_table_version function

Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 015b8a2fdf39a4c288ff24e7b715b8d9198e56dc)
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index eae0a555df3c..aab8077e5098 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1011,6 +1011,9 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
 
 uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *control)
 {
+	/* get available eeprom table version first before eeprom table init */
+	amdgpu_ras_set_eeprom_table_version(control);
+
 	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
 		return RAS_MAX_RECORD_COUNT_V2_1;
 	else

From fab1ead0ae3a4757afb92ff6909b37d63db17e55 Mon Sep 17 00:00:00 2001
From: Tim Huang <tim.huang@amd.com>
Date: Tue, 23 Jul 2024 16:54:34 +0800
Subject: [PATCH 258/346] drm/amdgpu: add missed harvest check for VCN IP v4/v5

To prevent below probe failure, add a check for models with VCN
IP v4.0.6 where VCN1 may be harvested.

v2:
Apply the same check to VCN IP v4.0 and v5.0.

[   54.070117] RIP: 0010:vcn_v4_0_5_start_dpg_mode+0x9be/0x36b0 [amdgpu]
[   54.071055] Code: 80 fb ff 8d 82 00 80 fe ff 81 fe 00 06 00 00 0f 43
c2 49 69 d5 38 0d 00 00 48 8d 71 04 c1 e8 02 4c 01 f2 48 89 b2 50 f6 02
00 <89> 01 48 8b 82 50 f6 02 00 48 8d 48 04 48 89 8a 50 f6 02 00 c7 00
[   54.072408] RSP: 0018:ffffb17985f736f8 EFLAGS: 00010286
[   54.072793] RAX: 00000000000000d6 RBX: ffff99a82f680000 RCX:
0000000000000000
[   54.073315] RDX: ffff99a82f680000 RSI: 0000000000000004 RDI:
ffff99a82f680000
[   54.073835] RBP: ffffb17985f73730 R08: 0000000000000001 R09:
0000000000000000
[   54.074353] R10: 0000000000000008 R11: ffffb17983c05000 R12:
0000000000000000
[   54.074879] R13: 0000000000000000 R14: ffff99a82f680000 R15:
0000000000000001
[   54.075400] FS:  00007f8d9c79a000(0000) GS:ffff99ab2f140000(0000)
knlGS:0000000000000000
[   54.075988] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.076408] CR2: 0000000000000000 CR3: 0000000140c3a000 CR4:
0000000000750ef0
[   54.076927] PKRU: 55555554
[   54.077132] Call Trace:
[   54.077319]  <TASK>
[   54.077484]  ? show_regs+0x69/0x80
[   54.077747]  ? __die+0x28/0x70
[   54.077979]  ? page_fault_oops+0x180/0x4b0
[   54.078286]  ? do_user_addr_fault+0x2d2/0x680
[   54.078610]  ? exc_page_fault+0x84/0x190
[   54.078910]  ? asm_exc_page_fault+0x2b/0x30
[   54.079224]  ? vcn_v4_0_5_start_dpg_mode+0x9be/0x36b0 [amdgpu]
[   54.079941]  ? vcn_v4_0_5_start_dpg_mode+0xe6/0x36b0 [amdgpu]
[   54.080617]  vcn_v4_0_5_set_powergating_state+0x82/0x19b0 [amdgpu]
[   54.081316]  amdgpu_device_ip_set_powergating_state+0x64/0xc0
[amdgpu]
[   54.082057]  amdgpu_vcn_ring_begin_use+0x6f/0x1d0 [amdgpu]
[   54.082727]  amdgpu_ring_alloc+0x44/0x70 [amdgpu]
[   54.083351]  amdgpu_vcn_dec_sw_ring_test_ring+0x40/0x110 [amdgpu]
[   54.084054]  amdgpu_ring_test_helper+0x22/0x90 [amdgpu]
[   54.084698]  vcn_v4_0_5_hw_init+0x87/0xc0 [amdgpu]
[   54.085307]  amdgpu_device_init+0x1f96/0x2780 [amdgpu]
[   54.085951]  amdgpu_driver_load_kms+0x1e/0xc0 [amdgpu]
[   54.086591]  amdgpu_pci_probe+0x19f/0x550 [amdgpu]
[   54.087215]  local_pci_probe+0x48/0xa0
[   54.087509]  pci_device_probe+0xc9/0x250
[   54.087812]  really_probe+0x1a4/0x3f0
[   54.088101]  __driver_probe_device+0x7d/0x170
[   54.088443]  driver_probe_device+0x24/0xa0
[   54.088765]  __driver_attach+0xdd/0x1d0
[   54.089068]  ? __pfx___driver_attach+0x10/0x10
[   54.089417]  bus_for_each_dev+0x8e/0xe0
[   54.089718]  driver_attach+0x22/0x30
[   54.090000]  bus_add_driver+0x120/0x220
[   54.090303]  driver_register+0x62/0x120
[   54.090606]  ? __pfx_amdgpu_init+0x10/0x10 [amdgpu]
[   54.091255]  __pci_register_driver+0x62/0x70
[   54.091593]  amdgpu_init+0x67/0xff0 [amdgpu]
[   54.092190]  do_one_initcall+0x5f/0x330
[   54.092495]  do_init_module+0x68/0x240
[   54.092794]  load_module+0x201c/0x2110
[   54.093093]  init_module_from_file+0x97/0xd0
[   54.093428]  ? init_module_from_file+0x97/0xd0
[   54.093777]  idempotent_init_module+0x11c/0x2a0
[   54.094134]  __x64_sys_finit_module+0x64/0xc0
[   54.094476]  do_syscall_64+0x58/0x120
[   54.094767]  entry_SYSCALL_64_after_hwframe+0x6e/0x76

Signed-off-by: Tim Huang <tim.huang@amd.com>
Reviewed-by: Saleemkhan Jamadar <saleemkhan.jamadar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
(cherry picked from commit 0b071245ddd98539d4f7493bdd188417fcf2d629)
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c   | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 6 ++++++
 drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 6 ++++++
 3 files changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index f6d96a44d75f..776c539bfdda 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1045,6 +1045,9 @@ static int vcn_v4_0_start(struct amdgpu_device *adev)
 		amdgpu_dpm_enable_uvd(adev, true);
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 
 		if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) {
@@ -1498,6 +1501,9 @@ static int vcn_v4_0_stop(struct amdgpu_device *adev)
 	int i, r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 		fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index f45495de6875..8d75061f9f38 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -958,6 +958,9 @@ static int vcn_v4_0_5_start(struct amdgpu_device *adev)
 		amdgpu_dpm_enable_uvd(adev, true);
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 
 		if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) {
@@ -1162,6 +1165,9 @@ static int vcn_v4_0_5_stop(struct amdgpu_device *adev)
 	int i, r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 		fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
index 070b56610c7d..68c97fcd539b 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
@@ -721,6 +721,9 @@ static int vcn_v5_0_0_start(struct amdgpu_device *adev)
 		amdgpu_dpm_enable_uvd(adev, true);
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 
 		if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) {
@@ -898,6 +901,9 @@ static int vcn_v5_0_0_stop(struct amdgpu_device *adev)
 	int i, r = 0;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
 		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
 		fw_shared->sq.queue_mode |= FW_QUEUE_DPG_HOLD_OFF;
 

From 5659b0c93a1ea02c662a030b322093203f299185 Mon Sep 17 00:00:00 2001
From: ZhenGuo Yin <zhenguo.yin@amd.com>
Date: Fri, 19 Jul 2024 16:10:40 +0800
Subject: [PATCH 259/346] drm/amdgpu: reset vm state machine after gpu
 reset(vram lost)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Why]
Page table of compute VM in the VRAM will lost after gpu reset.
VRAM won't be restored since compute VM has no shadows.

[How]
Use higher 32-bit of vm->generation to record a vram_lost_counter.
Reset the VM state machine when vm->genertaion is not equal to
the new generation token.

v2: Check vm->generation instead of calling drm_sched_entity_error
in amdgpu_vm_validate.
v3: Use new generation token instead of vram_lost_counter for check.

Signed-off-by: ZhenGuo Yin <zhenguo.yin@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
(cherry picked from commit 47c0388b0589cb481c294dcb857d25a214c46eb3)
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..a060c28f0877 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -434,7 +434,7 @@ uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 	if (!vm)
 		return result;
 
-	result += vm->generation;
+	result += lower_32_bits(vm->generation);
 	/* Add one if the page tables will be re-generated on next CS */
 	if (drm_sched_entity_error(&vm->delayed))
 		++result;
@@ -463,13 +463,14 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		       int (*validate)(void *p, struct amdgpu_bo *bo),
 		       void *param)
 {
+	uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
 	struct amdgpu_vm_bo_base *bo_base;
 	struct amdgpu_bo *shadow;
 	struct amdgpu_bo *bo;
 	int r;
 
-	if (drm_sched_entity_error(&vm->delayed)) {
-		++vm->generation;
+	if (vm->generation != new_vm_generation) {
+		vm->generation = new_vm_generation;
 		amdgpu_vm_bo_reset_state_machine(vm);
 		amdgpu_vm_fini_entities(vm);
 		r = amdgpu_vm_init_entities(adev, vm);
@@ -2439,7 +2440,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	vm->last_update = dma_fence_get_stub();
 	vm->last_unlocked = dma_fence_get_stub();
 	vm->last_tlb_flush = dma_fence_get_stub();
-	vm->generation = 0;
+	vm->generation = amdgpu_vm_generation(adev, NULL);
 
 	mutex_init(&vm->eviction_lock);
 	vm->evicting = false;

From dcfed708742c7ab5e0c9cfcc673cb4e5fafb18ff Mon Sep 17 00:00:00 2001
From: Nick Weihs <nick.weihs@gmail.com>
Date: Wed, 24 Jul 2024 22:47:22 -0700
Subject: [PATCH 260/346] ALSA: hda/realtek: Implement sound init sequence for
 Samsung Galaxy Book3 Pro 360

Samsung Galaxy Book3 Pro 360 sends a large amount of data to the codec
through hda processing coefficients.  This data was captured using a
modified version of QEMU, but the actual content of the data remains
opaque to me.  Elliding any part of the data seems to cause sound to
not work.

Signed-off-by: Nick Weihs <nick.weihs@gmail.com>

Link: https://patch.msgid.link/20240725054722.42597-1-nick.weihs@gmail.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c  |   9 +
 sound/pci/hda/samsung_helper.c | 310 +++++++++++++++++++++++++++++++++
 2 files changed, 319 insertions(+)
 create mode 100644 sound/pci/hda/samsung_helper.c

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 462194bf6954..ba0ce8750ca4 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4800,6 +4800,8 @@ static void alc298_fixup_samsung_amp(struct hda_codec *codec,
 	}
 }
 
+#include "samsung_helper.c"
+
 #if IS_REACHABLE(CONFIG_INPUT)
 static void gpio2_mic_hotkey_event(struct hda_codec *codec,
 				   struct hda_jack_callback *event)
@@ -7429,6 +7431,7 @@ enum {
 	ALC236_FIXUP_HP_MUTE_LED,
 	ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF,
 	ALC298_FIXUP_SAMSUNG_AMP,
+	ALC298_FIXUP_SAMSUNG_AMP2,
 	ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET,
 	ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET,
 	ALC295_FIXUP_ASUS_MIC_NO_PRESENCE,
@@ -9055,6 +9058,10 @@ static const struct hda_fixup alc269_fixups[] = {
 		.chained = true,
 		.chain_id = ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET
 	},
+	[ALC298_FIXUP_SAMSUNG_AMP2] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = alc298_fixup_samsung_amp2
+	},
 	[ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET] = {
 		.type = HDA_FIXUP_VERBS,
 		.v.verbs = (const struct hda_verb[]) {
@@ -10406,6 +10413,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x144d, 0xc832, "Samsung Galaxy Book Flex Alpha (NP730QCJ)", ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET),
 	SND_PCI_QUIRK(0x144d, 0xca03, "Samsung Galaxy Book2 Pro 360 (NP930QED)", ALC298_FIXUP_SAMSUNG_AMP),
 	SND_PCI_QUIRK(0x144d, 0xc868, "Samsung Galaxy Book2 Pro (NP930XED)", ALC298_FIXUP_SAMSUNG_AMP),
+	SND_PCI_QUIRK(0x144d, 0xc1ca, "Samsung Galaxy Book3 Pro 360 (NP960QFG-KB1US)", ALC298_FIXUP_SAMSUNG_AMP2),
 	SND_PCI_QUIRK(0x1458, 0xfa53, "Gigabyte BXBT-2807", ALC283_FIXUP_HEADSET_MIC),
 	SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC),
 	SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC),
@@ -10843,6 +10851,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
 	{.id = ALC298_FIXUP_HUAWEI_MBX_STEREO, .name = "huawei-mbx-stereo"},
 	{.id = ALC256_FIXUP_MEDION_HEADSET_NO_PRESENCE, .name = "alc256-medion-headset"},
 	{.id = ALC298_FIXUP_SAMSUNG_AMP, .name = "alc298-samsung-amp"},
+	{.id = ALC298_FIXUP_SAMSUNG_AMP2, .name = "alc298-samsung-amp2"},
 	{.id = ALC256_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET, .name = "alc256-samsung-headphone"},
 	{.id = ALC255_FIXUP_XIAOMI_HEADSET_MIC, .name = "alc255-xiaomi-headset"},
 	{.id = ALC274_FIXUP_HP_MIC, .name = "alc274-hp-mic-detect"},
diff --git a/sound/pci/hda/samsung_helper.c b/sound/pci/hda/samsung_helper.c
new file mode 100644
index 000000000000..a40175b69015
--- /dev/null
+++ b/sound/pci/hda/samsung_helper.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Helper functions for Samsung Galaxy Book3 audio initialization */
+
+struct alc298_samsung_coeff_fixup_desc {
+	unsigned char coeff_idx;
+	unsigned short coeff_value;
+};
+
+struct alc298_samsung_coeff_seq_desc {
+	unsigned short coeff_0x23;
+	unsigned short coeff_0x24;
+	unsigned short coeff_0x25;
+	unsigned short coeff_0x26;
+};
+
+
+static inline void alc298_samsung_write_coef_pack2(struct hda_codec *codec,
+						   const struct alc298_samsung_coeff_seq_desc *seq)
+{
+	int i;
+
+	for (i = 0; i < 100; i++) {
+		if ((alc_read_coef_idx(codec, 0x26) & 0x0010) == 0)
+			break;
+
+		usleep_range(500, 1000);
+	}
+
+	alc_write_coef_idx(codec, 0x23, seq->coeff_0x23);
+	alc_write_coef_idx(codec, 0x24, seq->coeff_0x24);
+	alc_write_coef_idx(codec, 0x25, seq->coeff_0x25);
+	alc_write_coef_idx(codec, 0x26, seq->coeff_0x26);
+}
+
+static inline void alc298_samsung_write_coef_pack_seq(
+						struct hda_codec *codec,
+						unsigned char target,
+						const struct alc298_samsung_coeff_seq_desc seq[],
+						int count)
+{
+	alc_write_coef_idx(codec, 0x22, target);
+	for (int i = 0; i < count; i++)
+		alc298_samsung_write_coef_pack2(codec, &seq[i]);
+}
+
+static void alc298_fixup_samsung_amp2(struct hda_codec *codec,
+				      const struct hda_fixup *fix, int action)
+{
+	int i;
+	static const struct alc298_samsung_coeff_fixup_desc fixups1[] = {
+		{ 0x99, 0x8000 }, { 0x82, 0x4408 }, { 0x32, 0x3f00 }, { 0x0e, 0x6f80 },
+		{ 0x10, 0x0e21 }, { 0x55, 0x8000 }, { 0x08, 0x2fcf }, { 0x08, 0x2fcf },
+		{ 0x2d, 0xc020 }, { 0x19, 0x0017 }, { 0x50, 0x1000 }, { 0x0e, 0x6f80 },
+		{ 0x08, 0x2fcf }, { 0x80, 0x0011 }, { 0x2b, 0x0c10 }, { 0x2d, 0xc020 },
+		{ 0x03, 0x0042 }, { 0x0f, 0x0062 }, { 0x08, 0x2fcf },
+	};
+
+	static const struct alc298_samsung_coeff_seq_desc amp_0x38[] = {
+		{ 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 },
+		{ 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 },
+		{ 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 },
+		{ 0x201b, 0x0000, 0x0001, 0xb011 }, { 0x201d, 0x0000, 0x0001, 0xb011 },
+		{ 0x201f, 0x0000, 0x00fe, 0xb011 }, { 0x2021, 0x0000, 0x0000, 0xb011 },
+		{ 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 },
+		{ 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 },
+		{ 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 },
+		{ 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x2399, 0x0000, 0x0003, 0xb011 },
+		{ 0x23a4, 0x0000, 0x00b5, 0xb011 }, { 0x23a5, 0x0000, 0x0001, 0xb011 },
+		{ 0x23ba, 0x0000, 0x0094, 0xb011 }, { 0x2100, 0x00d0, 0x950e, 0xb017 },
+		{ 0x2104, 0x0061, 0xd4e2, 0xb017 }, { 0x2108, 0x00d0, 0x950e, 0xb017 },
+		{ 0x210c, 0x0075, 0xf4e2, 0xb017 }, { 0x2110, 0x00b4, 0x4b0d, 0xb017 },
+		{ 0x2114, 0x000a, 0x1000, 0xb017 }, { 0x2118, 0x0015, 0x2000, 0xb017 },
+		{ 0x211c, 0x000a, 0x1000, 0xb017 }, { 0x2120, 0x0075, 0xf4e2, 0xb017 },
+		{ 0x2124, 0x00b4, 0x4b0d, 0xb017 }, { 0x2128, 0x0000, 0x0010, 0xb017 },
+		{ 0x212c, 0x0000, 0x0000, 0xb017 }, { 0x2130, 0x0000, 0x0000, 0xb017 },
+		{ 0x2134, 0x0000, 0x0000, 0xb017 }, { 0x2138, 0x0000, 0x0000, 0xb017 },
+		{ 0x213c, 0x0000, 0x0010, 0xb017 }, { 0x2140, 0x0000, 0x0000, 0xb017 },
+		{ 0x2144, 0x0000, 0x0000, 0xb017 }, { 0x2148, 0x0000, 0x0000, 0xb017 },
+		{ 0x214c, 0x0000, 0x0000, 0xb017 }, { 0x2150, 0x0000, 0x0010, 0xb017 },
+		{ 0x2154, 0x0000, 0x0000, 0xb017 }, { 0x2158, 0x0000, 0x0000, 0xb017 },
+		{ 0x215c, 0x0000, 0x0000, 0xb017 }, { 0x2160, 0x0000, 0x0000, 0xb017 },
+		{ 0x2164, 0x0000, 0x0010, 0xb017 }, { 0x2168, 0x0000, 0x0000, 0xb017 },
+		{ 0x216c, 0x0000, 0x0000, 0xb017 }, { 0x2170, 0x0000, 0x0000, 0xb017 },
+		{ 0x2174, 0x0000, 0x0000, 0xb017 }, { 0x2178, 0x0000, 0x0010, 0xb017 },
+		{ 0x217c, 0x0000, 0x0000, 0xb017 }, { 0x2180, 0x0000, 0x0000, 0xb017 },
+		{ 0x2184, 0x0000, 0x0000, 0xb017 }, { 0x2188, 0x0000, 0x0000, 0xb017 },
+		{ 0x218c, 0x0064, 0x5800, 0xb017 }, { 0x2190, 0x00c8, 0xb000, 0xb017 },
+		{ 0x2194, 0x0064, 0x5800, 0xb017 }, { 0x2198, 0x003d, 0x5be7, 0xb017 },
+		{ 0x219c, 0x0054, 0x060a, 0xb017 }, { 0x21a0, 0x00c8, 0xa310, 0xb017 },
+		{ 0x21a4, 0x0029, 0x4de5, 0xb017 }, { 0x21a8, 0x0032, 0x420c, 0xb017 },
+		{ 0x21ac, 0x0029, 0x4de5, 0xb017 }, { 0x21b0, 0x00fa, 0xe50c, 0xb017 },
+		{ 0x21b4, 0x0000, 0x0010, 0xb017 }, { 0x21b8, 0x0000, 0x0000, 0xb017 },
+		{ 0x21bc, 0x0000, 0x0000, 0xb017 }, { 0x21c0, 0x0000, 0x0000, 0xb017 },
+		{ 0x21c4, 0x0000, 0x0000, 0xb017 }, { 0x21c8, 0x0056, 0xc50f, 0xb017 },
+		{ 0x21cc, 0x007b, 0xd7e1, 0xb017 }, { 0x21d0, 0x0077, 0xa70e, 0xb017 },
+		{ 0x21d4, 0x00e0, 0xbde1, 0xb017 }, { 0x21d8, 0x0032, 0x530e, 0xb017 },
+		{ 0x2204, 0x00fb, 0x7e0f, 0xb017 }, { 0x2208, 0x000b, 0x02e1, 0xb017 },
+		{ 0x220c, 0x00fb, 0x7e0f, 0xb017 }, { 0x2210, 0x00d5, 0x17e1, 0xb017 },
+		{ 0x2214, 0x00c0, 0x130f, 0xb017 }, { 0x2218, 0x00e5, 0x0a00, 0xb017 },
+		{ 0x221c, 0x00cb, 0x1500, 0xb017 }, { 0x2220, 0x00e5, 0x0a00, 0xb017 },
+		{ 0x2224, 0x00d5, 0x17e1, 0xb017 }, { 0x2228, 0x00c0, 0x130f, 0xb017 },
+		{ 0x222c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2230, 0x0017, 0x48e2, 0xb017 },
+		{ 0x2234, 0x00f5, 0xdb0e, 0xb017 }, { 0x2238, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x223c, 0x00c1, 0xcc0d, 0xb017 }, { 0x2240, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x2244, 0x0017, 0x48e2, 0xb017 }, { 0x2248, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x224c, 0x00ef, 0x5ce2, 0xb017 }, { 0x2250, 0x00c1, 0xcc0d, 0xb017 },
+		{ 0x2254, 0x00f5, 0xdb0e, 0xb017 }, { 0x2258, 0x0017, 0x48e2, 0xb017 },
+		{ 0x225c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2260, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x2264, 0x00c1, 0xcc0d, 0xb017 }, { 0x2268, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x226c, 0x0017, 0x48e2, 0xb017 }, { 0x2270, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x2274, 0x00ef, 0x5ce2, 0xb017 }, { 0x2278, 0x00c1, 0xcc0d, 0xb017 },
+		{ 0x227c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2280, 0x0017, 0x48e2, 0xb017 },
+		{ 0x2284, 0x00f5, 0xdb0e, 0xb017 }, { 0x2288, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x228c, 0x00c1, 0xcc0d, 0xb017 }, { 0x22cc, 0x00e8, 0x8d00, 0xb017 },
+		{ 0x22d0, 0x0000, 0x0000, 0xb017 }, { 0x22d4, 0x0018, 0x72ff, 0xb017 },
+		{ 0x22d8, 0x00ce, 0x25e1, 0xb017 }, { 0x22dc, 0x002f, 0xe40e, 0xb017 },
+		{ 0x238e, 0x0000, 0x0099, 0xb011 }, { 0x238f, 0x0000, 0x0011, 0xb011 },
+		{ 0x2390, 0x0000, 0x0056, 0xb011 }, { 0x2391, 0x0000, 0x0004, 0xb011 },
+		{ 0x2392, 0x0000, 0x00bb, 0xb011 }, { 0x2393, 0x0000, 0x006d, 0xb011 },
+		{ 0x2394, 0x0000, 0x0010, 0xb011 }, { 0x2395, 0x0000, 0x0064, 0xb011 },
+		{ 0x2396, 0x0000, 0x00b6, 0xb011 }, { 0x2397, 0x0000, 0x0028, 0xb011 },
+		{ 0x2398, 0x0000, 0x000b, 0xb011 }, { 0x239a, 0x0000, 0x0099, 0xb011 },
+		{ 0x239b, 0x0000, 0x000d, 0xb011 }, { 0x23a6, 0x0000, 0x0064, 0xb011 },
+		{ 0x23a7, 0x0000, 0x0078, 0xb011 }, { 0x23b9, 0x0000, 0x0000, 0xb011 },
+		{ 0x23e0, 0x0000, 0x0021, 0xb011 }, { 0x23e1, 0x0000, 0x0001, 0xb011 },
+	};
+
+	static const struct alc298_samsung_coeff_seq_desc amp_0x39[] = {
+		{ 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 },
+		{ 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 },
+		{ 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 },
+		{ 0x201b, 0x0000, 0x0002, 0xb011 }, { 0x201d, 0x0000, 0x0002, 0xb011 },
+		{ 0x201f, 0x0000, 0x00fd, 0xb011 }, { 0x2021, 0x0000, 0x0001, 0xb011 },
+		{ 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 },
+		{ 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 },
+		{ 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 },
+		{ 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x2399, 0x0000, 0x0003, 0xb011 },
+		{ 0x23a4, 0x0000, 0x00b5, 0xb011 }, { 0x23a5, 0x0000, 0x0001, 0xb011 },
+		{ 0x23ba, 0x0000, 0x0094, 0xb011 }, { 0x2100, 0x00d0, 0x950e, 0xb017 },
+		{ 0x2104, 0x0061, 0xd4e2, 0xb017 }, { 0x2108, 0x00d0, 0x950e, 0xb017 },
+		{ 0x210c, 0x0075, 0xf4e2, 0xb017 }, { 0x2110, 0x00b4, 0x4b0d, 0xb017 },
+		{ 0x2114, 0x000a, 0x1000, 0xb017 }, { 0x2118, 0x0015, 0x2000, 0xb017 },
+		{ 0x211c, 0x000a, 0x1000, 0xb017 }, { 0x2120, 0x0075, 0xf4e2, 0xb017 },
+		{ 0x2124, 0x00b4, 0x4b0d, 0xb017 }, { 0x2128, 0x0000, 0x0010, 0xb017 },
+		{ 0x212c, 0x0000, 0x0000, 0xb017 }, { 0x2130, 0x0000, 0x0000, 0xb017 },
+		{ 0x2134, 0x0000, 0x0000, 0xb017 }, { 0x2138, 0x0000, 0x0000, 0xb017 },
+		{ 0x213c, 0x0000, 0x0010, 0xb017 }, { 0x2140, 0x0000, 0x0000, 0xb017 },
+		{ 0x2144, 0x0000, 0x0000, 0xb017 }, { 0x2148, 0x0000, 0x0000, 0xb017 },
+		{ 0x214c, 0x0000, 0x0000, 0xb017 }, { 0x2150, 0x0000, 0x0010, 0xb017 },
+		{ 0x2154, 0x0000, 0x0000, 0xb017 }, { 0x2158, 0x0000, 0x0000, 0xb017 },
+		{ 0x215c, 0x0000, 0x0000, 0xb017 }, { 0x2160, 0x0000, 0x0000, 0xb017 },
+		{ 0x2164, 0x0000, 0x0010, 0xb017 }, { 0x2168, 0x0000, 0x0000, 0xb017 },
+		{ 0x216c, 0x0000, 0x0000, 0xb017 }, { 0x2170, 0x0000, 0x0000, 0xb017 },
+		{ 0x2174, 0x0000, 0x0000, 0xb017 }, { 0x2178, 0x0000, 0x0010, 0xb017 },
+		{ 0x217c, 0x0000, 0x0000, 0xb017 }, { 0x2180, 0x0000, 0x0000, 0xb017 },
+		{ 0x2184, 0x0000, 0x0000, 0xb017 }, { 0x2188, 0x0000, 0x0000, 0xb017 },
+		{ 0x218c, 0x0064, 0x5800, 0xb017 }, { 0x2190, 0x00c8, 0xb000, 0xb017 },
+		{ 0x2194, 0x0064, 0x5800, 0xb017 }, { 0x2198, 0x003d, 0x5be7, 0xb017 },
+		{ 0x219c, 0x0054, 0x060a, 0xb017 }, { 0x21a0, 0x00c8, 0xa310, 0xb017 },
+		{ 0x21a4, 0x0029, 0x4de5, 0xb017 }, { 0x21a8, 0x0032, 0x420c, 0xb017 },
+		{ 0x21ac, 0x0029, 0x4de5, 0xb017 }, { 0x21b0, 0x00fa, 0xe50c, 0xb017 },
+		{ 0x21b4, 0x0000, 0x0010, 0xb017 }, { 0x21b8, 0x0000, 0x0000, 0xb017 },
+		{ 0x21bc, 0x0000, 0x0000, 0xb017 }, { 0x21c0, 0x0000, 0x0000, 0xb017 },
+		{ 0x21c4, 0x0000, 0x0000, 0xb017 }, { 0x21c8, 0x0056, 0xc50f, 0xb017 },
+		{ 0x21cc, 0x007b, 0xd7e1, 0xb017 }, { 0x21d0, 0x0077, 0xa70e, 0xb017 },
+		{ 0x21d4, 0x00e0, 0xbde1, 0xb017 }, { 0x21d8, 0x0032, 0x530e, 0xb017 },
+		{ 0x2204, 0x00fb, 0x7e0f, 0xb017 }, { 0x2208, 0x000b, 0x02e1, 0xb017 },
+		{ 0x220c, 0x00fb, 0x7e0f, 0xb017 }, { 0x2210, 0x00d5, 0x17e1, 0xb017 },
+		{ 0x2214, 0x00c0, 0x130f, 0xb017 }, { 0x2218, 0x00e5, 0x0a00, 0xb017 },
+		{ 0x221c, 0x00cb, 0x1500, 0xb017 }, { 0x2220, 0x00e5, 0x0a00, 0xb017 },
+		{ 0x2224, 0x00d5, 0x17e1, 0xb017 }, { 0x2228, 0x00c0, 0x130f, 0xb017 },
+		{ 0x222c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2230, 0x0017, 0x48e2, 0xb017 },
+		{ 0x2234, 0x00f5, 0xdb0e, 0xb017 }, { 0x2238, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x223c, 0x00c1, 0xcc0d, 0xb017 }, { 0x2240, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x2244, 0x0017, 0x48e2, 0xb017 }, { 0x2248, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x224c, 0x00ef, 0x5ce2, 0xb017 }, { 0x2250, 0x00c1, 0xcc0d, 0xb017 },
+		{ 0x2254, 0x00f5, 0xdb0e, 0xb017 }, { 0x2258, 0x0017, 0x48e2, 0xb017 },
+		{ 0x225c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2260, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x2264, 0x00c1, 0xcc0d, 0xb017 }, { 0x2268, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x226c, 0x0017, 0x48e2, 0xb017 }, { 0x2270, 0x00f5, 0xdb0e, 0xb017 },
+		{ 0x2274, 0x00ef, 0x5ce2, 0xb017 }, { 0x2278, 0x00c1, 0xcc0d, 0xb017 },
+		{ 0x227c, 0x00f5, 0xdb0e, 0xb017 }, { 0x2280, 0x0017, 0x48e2, 0xb017 },
+		{ 0x2284, 0x00f5, 0xdb0e, 0xb017 }, { 0x2288, 0x00ef, 0x5ce2, 0xb017 },
+		{ 0x228c, 0x00c1, 0xcc0d, 0xb017 }, { 0x22cc, 0x00e8, 0x8d00, 0xb017 },
+		{ 0x22d0, 0x0000, 0x0000, 0xb017 }, { 0x22d4, 0x0018, 0x72ff, 0xb017 },
+		{ 0x22d8, 0x00ce, 0x25e1, 0xb017 }, { 0x22dc, 0x002f, 0xe40e, 0xb017 },
+		{ 0x238e, 0x0000, 0x0099, 0xb011 }, { 0x238f, 0x0000, 0x0011, 0xb011 },
+		{ 0x2390, 0x0000, 0x0056, 0xb011 }, { 0x2391, 0x0000, 0x0004, 0xb011 },
+		{ 0x2392, 0x0000, 0x00bb, 0xb011 }, { 0x2393, 0x0000, 0x006d, 0xb011 },
+		{ 0x2394, 0x0000, 0x0010, 0xb011 }, { 0x2395, 0x0000, 0x0064, 0xb011 },
+		{ 0x2396, 0x0000, 0x00b6, 0xb011 }, { 0x2397, 0x0000, 0x0028, 0xb011 },
+		{ 0x2398, 0x0000, 0x000b, 0xb011 }, { 0x239a, 0x0000, 0x0099, 0xb011 },
+		{ 0x239b, 0x0000, 0x000d, 0xb011 }, { 0x23a6, 0x0000, 0x0064, 0xb011 },
+		{ 0x23a7, 0x0000, 0x0078, 0xb011 }, { 0x23b9, 0x0000, 0x0000, 0xb011 },
+		{ 0x23e0, 0x0000, 0x0021, 0xb011 }, { 0x23e1, 0x0000, 0x0001, 0xb011 },
+	};
+
+	static const struct alc298_samsung_coeff_seq_desc amp_0x3c[] = {
+		{ 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 },
+		{ 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 },
+		{ 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 },
+		{ 0x201b, 0x0000, 0x0001, 0xb011 }, { 0x201d, 0x0000, 0x0001, 0xb011 },
+		{ 0x201f, 0x0000, 0x00fe, 0xb011 }, { 0x2021, 0x0000, 0x0000, 0xb011 },
+		{ 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 },
+		{ 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 },
+		{ 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 },
+		{ 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x23ba, 0x0000, 0x008d, 0xb011 },
+		{ 0x2128, 0x0005, 0x460d, 0xb017 }, { 0x212c, 0x00f6, 0x73e5, 0xb017 },
+		{ 0x2130, 0x0005, 0x460d, 0xb017 }, { 0x2134, 0x00c0, 0xe9e5, 0xb017 },
+		{ 0x2138, 0x00d5, 0x010b, 0xb017 }, { 0x213c, 0x009d, 0x7809, 0xb017 },
+		{ 0x2140, 0x00c5, 0x0eed, 0xb017 }, { 0x2144, 0x009d, 0x7809, 0xb017 },
+		{ 0x2148, 0x00c4, 0x4ef0, 0xb017 }, { 0x214c, 0x003a, 0x3106, 0xb017 },
+		{ 0x2150, 0x00af, 0x750e, 0xb017 }, { 0x2154, 0x008c, 0x1ff1, 0xb017 },
+		{ 0x2158, 0x009e, 0x360c, 0xb017 }, { 0x215c, 0x008c, 0x1ff1, 0xb017 },
+		{ 0x2160, 0x004d, 0xac0a, 0xb017 }, { 0x2164, 0x007d, 0xa00f, 0xb017 },
+		{ 0x2168, 0x00e1, 0x9ce3, 0xb017 }, { 0x216c, 0x00e8, 0x590e, 0xb017 },
+		{ 0x2170, 0x00e1, 0x9ce3, 0xb017 }, { 0x2174, 0x0066, 0xfa0d, 0xb017 },
+		{ 0x2178, 0x0000, 0x0010, 0xb017 }, { 0x217c, 0x0000, 0x0000, 0xb017 },
+		{ 0x2180, 0x0000, 0x0000, 0xb017 }, { 0x2184, 0x0000, 0x0000, 0xb017 },
+		{ 0x2188, 0x0000, 0x0000, 0xb017 }, { 0x218c, 0x0000, 0x0010, 0xb017 },
+		{ 0x2190, 0x0000, 0x0000, 0xb017 }, { 0x2194, 0x0000, 0x0000, 0xb017 },
+		{ 0x2198, 0x0000, 0x0000, 0xb017 }, { 0x219c, 0x0000, 0x0000, 0xb017 },
+		{ 0x21a0, 0x0000, 0x0010, 0xb017 }, { 0x21a4, 0x0000, 0x0000, 0xb017 },
+		{ 0x21a8, 0x0000, 0x0000, 0xb017 }, { 0x21ac, 0x0000, 0x0000, 0xb017 },
+		{ 0x21b0, 0x0000, 0x0000, 0xb017 }, { 0x21b4, 0x0000, 0x0010, 0xb017 },
+		{ 0x21b8, 0x0000, 0x0000, 0xb017 }, { 0x21bc, 0x0000, 0x0000, 0xb017 },
+		{ 0x21c0, 0x0000, 0x0000, 0xb017 }, { 0x21c4, 0x0000, 0x0000, 0xb017 },
+		{ 0x23b9, 0x0000, 0x0000, 0xb011 }, { 0x23e0, 0x0000, 0x0020, 0xb011 },
+		{ 0x23e1, 0x0000, 0x0001, 0xb011 },
+	};
+
+	static const struct alc298_samsung_coeff_seq_desc amp_0x3d[] = {
+		{ 0x2000, 0x0000, 0x0001, 0xb011 }, { 0x23ff, 0x0000, 0x0000, 0xb011 },
+		{ 0x203a, 0x0000, 0x0080, 0xb011 }, { 0x23e1, 0x0000, 0x0000, 0xb011 },
+		{ 0x2012, 0x0000, 0x006f, 0xb011 }, { 0x2014, 0x0000, 0x0000, 0xb011 },
+		{ 0x201b, 0x0000, 0x0002, 0xb011 }, { 0x201d, 0x0000, 0x0002, 0xb011 },
+		{ 0x201f, 0x0000, 0x00fd, 0xb011 }, { 0x2021, 0x0000, 0x0001, 0xb011 },
+		{ 0x2022, 0x0000, 0x0010, 0xb011 }, { 0x203d, 0x0000, 0x0005, 0xb011 },
+		{ 0x203f, 0x0000, 0x0003, 0xb011 }, { 0x2050, 0x0000, 0x002c, 0xb011 },
+		{ 0x2076, 0x0000, 0x000e, 0xb011 }, { 0x207c, 0x0000, 0x004a, 0xb011 },
+		{ 0x2081, 0x0000, 0x0003, 0xb011 }, { 0x23ba, 0x0000, 0x008d, 0xb011 },
+		{ 0x2128, 0x0005, 0x460d, 0xb017 }, { 0x212c, 0x00f6, 0x73e5, 0xb017 },
+		{ 0x2130, 0x0005, 0x460d, 0xb017 }, { 0x2134, 0x00c0, 0xe9e5, 0xb017 },
+		{ 0x2138, 0x00d5, 0x010b, 0xb017 }, { 0x213c, 0x009d, 0x7809, 0xb017 },
+		{ 0x2140, 0x00c5, 0x0eed, 0xb017 }, { 0x2144, 0x009d, 0x7809, 0xb017 },
+		{ 0x2148, 0x00c4, 0x4ef0, 0xb017 }, { 0x214c, 0x003a, 0x3106, 0xb017 },
+		{ 0x2150, 0x00af, 0x750e, 0xb017 }, { 0x2154, 0x008c, 0x1ff1, 0xb017 },
+		{ 0x2158, 0x009e, 0x360c, 0xb017 }, { 0x215c, 0x008c, 0x1ff1, 0xb017 },
+		{ 0x2160, 0x004d, 0xac0a, 0xb017 }, { 0x2164, 0x007d, 0xa00f, 0xb017 },
+		{ 0x2168, 0x00e1, 0x9ce3, 0xb017 }, { 0x216c, 0x00e8, 0x590e, 0xb017 },
+		{ 0x2170, 0x00e1, 0x9ce3, 0xb017 }, { 0x2174, 0x0066, 0xfa0d, 0xb017 },
+		{ 0x2178, 0x0000, 0x0010, 0xb017 }, { 0x217c, 0x0000, 0x0000, 0xb017 },
+		{ 0x2180, 0x0000, 0x0000, 0xb017 }, { 0x2184, 0x0000, 0x0000, 0xb017 },
+		{ 0x2188, 0x0000, 0x0000, 0xb017 }, { 0x218c, 0x0000, 0x0010, 0xb017 },
+		{ 0x2190, 0x0000, 0x0000, 0xb017 }, { 0x2194, 0x0000, 0x0000, 0xb017 },
+		{ 0x2198, 0x0000, 0x0000, 0xb017 }, { 0x219c, 0x0000, 0x0000, 0xb017 },
+		{ 0x21a0, 0x0000, 0x0010, 0xb017 }, { 0x21a4, 0x0000, 0x0000, 0xb017 },
+		{ 0x21a8, 0x0000, 0x0000, 0xb017 }, { 0x21ac, 0x0000, 0x0000, 0xb017 },
+		{ 0x21b0, 0x0000, 0x0000, 0xb017 }, { 0x21b4, 0x0000, 0x0010, 0xb017 },
+		{ 0x21b8, 0x0000, 0x0000, 0xb017 }, { 0x21bc, 0x0000, 0x0000, 0xb017 },
+		{ 0x21c0, 0x0000, 0x0000, 0xb017 }, { 0x21c4, 0x0000, 0x0000, 0xb017 },
+		{ 0x23b9, 0x0000, 0x0000, 0xb011 }, { 0x23e0, 0x0000, 0x0020, 0xb011 },
+		{ 0x23e1, 0x0000, 0x0001, 0xb011 },
+	};
+
+	static const struct alc298_samsung_coeff_seq_desc amp_seq1[] = {
+		{ 0x23ff, 0x0000, 0x0000, 0xb011 }, { 0x203a, 0x0000, 0x0080, 0xb011 },
+	};
+
+	static const struct alc298_samsung_coeff_fixup_desc fixups2[] = {
+		{ 0x4f, 0xb029 }, { 0x05, 0x2be0 }, { 0x30, 0x2421 },
+	};
+
+
+	static const struct alc298_samsung_coeff_seq_desc amp_seq2[] = {
+		{ 0x203a, 0x0000, 0x0081, 0xb011 }, { 0x23ff, 0x0000, 0x0001, 0xb011 },
+	};
+
+	if (action != HDA_FIXUP_ACT_INIT)
+		return;
+
+	// First set of fixups
+	for (i = 0; i < ARRAY_SIZE(fixups1); i++)
+		alc_write_coef_idx(codec, fixups1[i].coeff_idx, fixups1[i].coeff_value);
+
+	// First set of writes
+	alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_0x38, ARRAY_SIZE(amp_0x38));
+	alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_0x39, ARRAY_SIZE(amp_0x39));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_0x3c, ARRAY_SIZE(amp_0x3c));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_0x3d, ARRAY_SIZE(amp_0x3d));
+
+	// Second set of writes
+	alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_seq1, ARRAY_SIZE(amp_seq1));
+	alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_seq1, ARRAY_SIZE(amp_seq1));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_seq1, ARRAY_SIZE(amp_seq1));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_seq1, ARRAY_SIZE(amp_seq1));
+
+	// Second set of fixups
+	for (i = 0; i < ARRAY_SIZE(fixups2); i++)
+		alc_write_coef_idx(codec, fixups2[i].coeff_idx, fixups2[i].coeff_value);
+
+	// Third set of writes
+	alc298_samsung_write_coef_pack_seq(codec, 0x38, amp_seq2, ARRAY_SIZE(amp_seq2));
+	alc298_samsung_write_coef_pack_seq(codec, 0x39, amp_seq2, ARRAY_SIZE(amp_seq2));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3c, amp_seq2, ARRAY_SIZE(amp_seq2));
+	alc298_samsung_write_coef_pack_seq(codec, 0x3d, amp_seq2, ARRAY_SIZE(amp_seq2));
+
+	// Final fixup
+	alc_write_coef_idx(codec, 0x10, 0x0F21);
+}

From f5c466a0fdb2d9f3650d2e3911b0735f17ba00cf Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 23 Jul 2024 17:54:39 +0200
Subject: [PATCH 261/346] rbd: rename RBD_LOCK_STATE_RELEASING and
 releasing_wait

... to RBD_LOCK_STATE_QUIESCING and quiescing_wait to recognize that
this state and the associated completion are backing rbd_quiesce_lock(),
which isn't specific to releasing the lock.

While exclusive lock does get quiesced before it's released, it also
gets quiesced before an attempt to update the cookie is made and there
the lock is not released as long as ceph_cls_set_cookie() succeeds.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/block/rbd.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 26ff5cd2bf0a..c30d227753d7 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -362,7 +362,7 @@ enum rbd_watch_state {
 enum rbd_lock_state {
 	RBD_LOCK_STATE_UNLOCKED,
 	RBD_LOCK_STATE_LOCKED,
-	RBD_LOCK_STATE_RELEASING,
+	RBD_LOCK_STATE_QUIESCING,
 };
 
 /* WatchNotify::ClientId */
@@ -422,7 +422,7 @@ struct rbd_device {
 	struct list_head	running_list;
 	struct completion	acquire_wait;
 	int			acquire_err;
-	struct completion	releasing_wait;
+	struct completion	quiescing_wait;
 
 	spinlock_t		object_map_lock;
 	u8			*object_map;
@@ -525,7 +525,7 @@ static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
 	lockdep_assert_held(&rbd_dev->lock_rwsem);
 
 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
-	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
+	       rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING;
 }
 
 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
@@ -3458,12 +3458,12 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req)
 	spin_lock(&rbd_dev->lock_lists_lock);
 	if (!list_empty(&img_req->lock_item)) {
 		list_del_init(&img_req->lock_item);
-		need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
+		need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING &&
 			       list_empty(&rbd_dev->running_list));
 	}
 	spin_unlock(&rbd_dev->lock_lists_lock);
 	if (need_wakeup)
-		complete(&rbd_dev->releasing_wait);
+		complete(&rbd_dev->quiescing_wait);
 }
 
 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
@@ -4181,16 +4181,16 @@ static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
 	/*
 	 * Ensure that all in-flight IO is flushed.
 	 */
-	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
-	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
+	rbd_dev->lock_state = RBD_LOCK_STATE_QUIESCING;
+	rbd_assert(!completion_done(&rbd_dev->quiescing_wait));
 	if (list_empty(&rbd_dev->running_list))
 		return true;
 
 	up_write(&rbd_dev->lock_rwsem);
-	wait_for_completion(&rbd_dev->releasing_wait);
+	wait_for_completion(&rbd_dev->quiescing_wait);
 
 	down_write(&rbd_dev->lock_rwsem);
-	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
+	if (rbd_dev->lock_state != RBD_LOCK_STATE_QUIESCING)
 		return false;
 
 	rbd_assert(list_empty(&rbd_dev->running_list));
@@ -5383,7 +5383,7 @@ static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
 	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
 	INIT_LIST_HEAD(&rbd_dev->running_list);
 	init_completion(&rbd_dev->acquire_wait);
-	init_completion(&rbd_dev->releasing_wait);
+	init_completion(&rbd_dev->quiescing_wait);
 
 	spin_lock_init(&rbd_dev->object_map_lock);
 

From 2237ceb71f89837ac47c5dce2aaa2c2b3a337a3c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 23 Jul 2024 18:07:59 +0200
Subject: [PATCH 262/346] rbd: don't assume RBD_LOCK_STATE_LOCKED for exclusive
 mappings

Every time a watch is reestablished after getting lost, we need to
update the cookie which involves quiescing exclusive lock.  For this,
we transition from RBD_LOCK_STATE_LOCKED to RBD_LOCK_STATE_QUIESCING
roughly for the duration of rbd_reacquire_lock() call.  If the mapping
is exclusive and I/O happens to arrive in this time window, it's failed
with EROFS (later translated to EIO) based on the wrong assumption in
rbd_img_exclusive_lock() -- "lock got released?" check there stopped
making sense with commit a2b1da09793d ("rbd: lock should be quiesced on
reacquire").

To make it worse, any such I/O is added to the acquiring list before
EROFS is returned and this sets up for violating rbd_lock_del_request()
precondition that the request is either on the running list or not on
any list at all -- see commit ded080c86b3f ("rbd: don't move requests
to the running list on errors").  rbd_lock_del_request() ends up
processing these requests as if they were on the running list which
screws up quiescing_wait completion counter and ultimately leads to

    rbd_assert(!completion_done(&rbd_dev->quiescing_wait));

being triggered on the next watch error.

Cc: stable@vger.kernel.org # 06ef84c4e9c4: rbd: rename RBD_LOCK_STATE_RELEASING and releasing_wait
Cc: stable@vger.kernel.org
Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/block/rbd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c30d227753d7..ea6c592e015c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3457,6 +3457,7 @@ static void rbd_lock_del_request(struct rbd_img_request *img_req)
 	lockdep_assert_held(&rbd_dev->lock_rwsem);
 	spin_lock(&rbd_dev->lock_lists_lock);
 	if (!list_empty(&img_req->lock_item)) {
+		rbd_assert(!list_empty(&rbd_dev->running_list));
 		list_del_init(&img_req->lock_item);
 		need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_QUIESCING &&
 			       list_empty(&rbd_dev->running_list));
@@ -3476,11 +3477,6 @@ static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
 	if (rbd_lock_add_request(img_req))
 		return 1;
 
-	if (rbd_dev->opts->exclusive) {
-		WARN_ON(1); /* lock got released? */
-		return -EROFS;
-	}
-
 	/*
 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
 	 * and cancel_delayed_work() in wake_lock_waiters().
@@ -4601,6 +4597,10 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
 				 ret);
 
+		if (rbd_dev->opts->exclusive)
+			rbd_warn(rbd_dev,
+			     "temporarily releasing lock on exclusive mapping");
+
 		/*
 		 * Lock cookie cannot be updated on older OSDs, so do
 		 * a manual release and queue an acquire.

From 3ceccb14f5576e02b81cc8b105ab81f224bd87f6 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 23 Jul 2024 18:08:08 +0200
Subject: [PATCH 263/346] rbd: don't assume rbd_is_lock_owner() for exclusive
 mappings

Expanding on the previous commit, assuming that rbd_is_lock_owner()
always returns true (i.e. that we are either in RBD_LOCK_STATE_LOCKED
or RBD_LOCK_STATE_QUIESCING) if the mapping is exclusive is wrong too.
In case ceph_cls_set_cookie() fails, the lock would be temporarily
released even if the mapping is exclusive, meaning that we can end up
even in RBD_LOCK_STATE_UNLOCKED.

IOW, exclusive mappings are really "just" about disabling automatic
lock transitions (as documented in the man page), not about grabbing
the lock and holding on to it whatever it takes.

Cc: stable@vger.kernel.org
Fixes: 637cd060537d ("rbd: new exclusive lock wait/wake code")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/block/rbd.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ea6c592e015c..da22ce38c039 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -6589,11 +6589,6 @@ static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
 	if (ret)
 		return ret;
 
-	/*
-	 * The lock may have been released by now, unless automatic lock
-	 * transitions are disabled.
-	 */
-	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
 	return 0;
 }
 

From fc05ea89c9ab45e70cb73e70bc0b9cdd403e0ee1 Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Thu, 25 Jul 2024 09:31:14 +0200
Subject: [PATCH 264/346] x86/xen: move xen_reserve_extra_memory()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In preparation for making the function static.

No functional change.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Message-ID: <20240725073116.14626-2-roger.pau@citrix.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/enlighten_pvh.c | 96 ++++++++++++++++++------------------
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 27a2a02ef8fb..91c6db4ec054 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -27,54 +27,6 @@
 bool __ro_after_init xen_pvh;
 EXPORT_SYMBOL_GPL(xen_pvh);
 
-void __init xen_pvh_init(struct boot_params *boot_params)
-{
-	u32 msr;
-	u64 pfn;
-
-	xen_pvh = 1;
-	xen_domain_type = XEN_HVM_DOMAIN;
-	xen_start_flags = pvh_start_info.flags;
-
-	msr = cpuid_ebx(xen_cpuid_base() + 2);
-	pfn = __pa(hypercall_page);
-	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
-	if (xen_initial_domain())
-		x86_init.oem.arch_setup = xen_add_preferred_consoles;
-	x86_init.oem.banner = xen_banner;
-
-	xen_efi_init(boot_params);
-
-	if (xen_initial_domain()) {
-		struct xen_platform_op op = {
-			.cmd = XENPF_get_dom0_console,
-		};
-		int ret = HYPERVISOR_platform_op(&op);
-
-		if (ret > 0)
-			xen_init_vga(&op.u.dom0_console,
-				     min(ret * sizeof(char),
-					 sizeof(op.u.dom0_console)),
-				     &boot_params->screen_info);
-	}
-}
-
-void __init mem_map_via_hcall(struct boot_params *boot_params_p)
-{
-	struct xen_memory_map memmap;
-	int rc;
-
-	memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table);
-	set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table);
-	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
-	if (rc) {
-		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
-		BUG();
-	}
-	boot_params_p->e820_entries = memmap.nr_entries;
-}
-
 /*
  * Reserve e820 UNUSABLE regions to inflate the memory balloon.
  *
@@ -141,3 +93,51 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp)
 		xen_add_extra_mem(PFN_UP(e->addr), pages);
 	}
 }
+
+void __init xen_pvh_init(struct boot_params *boot_params)
+{
+	u32 msr;
+	u64 pfn;
+
+	xen_pvh = 1;
+	xen_domain_type = XEN_HVM_DOMAIN;
+	xen_start_flags = pvh_start_info.flags;
+
+	msr = cpuid_ebx(xen_cpuid_base() + 2);
+	pfn = __pa(hypercall_page);
+	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+	if (xen_initial_domain())
+		x86_init.oem.arch_setup = xen_add_preferred_consoles;
+	x86_init.oem.banner = xen_banner;
+
+	xen_efi_init(boot_params);
+
+	if (xen_initial_domain()) {
+		struct xen_platform_op op = {
+			.cmd = XENPF_get_dom0_console,
+		};
+		int ret = HYPERVISOR_platform_op(&op);
+
+		if (ret > 0)
+			xen_init_vga(&op.u.dom0_console,
+				     min(ret * sizeof(char),
+					 sizeof(op.u.dom0_console)),
+				     &boot_params->screen_info);
+	}
+}
+
+void __init mem_map_via_hcall(struct boot_params *boot_params_p)
+{
+	struct xen_memory_map memmap;
+	int rc;
+
+	memmap.nr_entries = ARRAY_SIZE(boot_params_p->e820_table);
+	set_xen_guest_handle(memmap.buffer, boot_params_p->e820_table);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc) {
+		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+		BUG();
+	}
+	boot_params_p->e820_entries = memmap.nr_entries;
+}

From 4c006734898a113a64a528027274a571b04af95a Mon Sep 17 00:00:00 2001
From: Roger Pau Monne <roger.pau@citrix.com>
Date: Thu, 25 Jul 2024 09:31:15 +0200
Subject: [PATCH 265/346] x86/xen: fix memblock_reserve() usage on PVH
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current usage of memblock_reserve() in init_pvh_bootparams() is done before
the .bss is zeroed, and that used to be fine when
memblock_reserved_init_regions implicitly ended up in the .meminit.data
section.  However after commit 73db3abdca58c memblock_reserved_init_regions
ends up in the .bss section, thus breaking it's usage before the .bss is
cleared.

Move and rename the call to xen_reserve_extra_memory() so it's done in the
x86_init.oem.arch_setup hook, which gets executed after the .bss has been
zeroed, but before calling e820__memory_setup().

Fixes: 73db3abdca58c ("init/modpost: conditionally check section mismatch to __meminit*")
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Message-ID: <20240725073116.14626-3-roger.pau@citrix.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/include/asm/xen/hypervisor.h |  5 -----
 arch/x86/platform/pvh/enlighten.c     |  3 ---
 arch/x86/xen/enlighten_pvh.c          | 15 ++++++++++++---
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 64fbd2dbc5b7..a9088250770f 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,11 +62,6 @@ void xen_arch_unregister_cpu(int num);
 #ifdef CONFIG_PVH
 void __init xen_pvh_init(struct boot_params *boot_params);
 void __init mem_map_via_hcall(struct boot_params *boot_params_p);
-#ifdef CONFIG_XEN_PVH
-void __init xen_reserve_extra_memory(struct boot_params *bootp);
-#else
-static inline void xen_reserve_extra_memory(struct boot_params *bootp) { }
-#endif
 #endif
 
 /* Lazy mode for batching updates / context switch */
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 8c2d4b8de25d..944e0290f2c0 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -75,9 +75,6 @@ static void __init init_pvh_bootparams(bool xen_guest)
 	} else
 		xen_raw_printk("Warning: Can fit ISA range into e820\n");
 
-	if (xen_guest)
-		xen_reserve_extra_memory(&pvh_bootparams);
-
 	pvh_bootparams.hdr.cmd_line_ptr =
 		pvh_start_info.cmdline_paddr;
 
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 91c6db4ec054..728a4366ca85 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -9,6 +9,7 @@
 #include <asm/io_apic.h>
 #include <asm/hypervisor.h>
 #include <asm/e820/api.h>
+#include <asm/setup.h>
 
 #include <xen/xen.h>
 #include <asm/xen/interface.h>
@@ -41,8 +42,9 @@ EXPORT_SYMBOL_GPL(xen_pvh);
  * hypervisor should notify us which memory ranges are suitable for creating
  * foreign mappings, but that's not yet implemented.
  */
-void __init xen_reserve_extra_memory(struct boot_params *bootp)
+static void __init pvh_reserve_extra_memory(void)
 {
+	struct boot_params *bootp = &boot_params;
 	unsigned int i, ram_pages = 0, extra_pages;
 
 	for (i = 0; i < bootp->e820_entries; i++) {
@@ -94,6 +96,14 @@ void __init xen_reserve_extra_memory(struct boot_params *bootp)
 	}
 }
 
+static void __init pvh_arch_setup(void)
+{
+	pvh_reserve_extra_memory();
+
+	if (xen_initial_domain())
+		xen_add_preferred_consoles();
+}
+
 void __init xen_pvh_init(struct boot_params *boot_params)
 {
 	u32 msr;
@@ -107,8 +117,7 @@ void __init xen_pvh_init(struct boot_params *boot_params)
 	pfn = __pa(hypercall_page);
 	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
 
-	if (xen_initial_domain())
-		x86_init.oem.arch_setup = xen_add_preferred_consoles;
+	x86_init.oem.arch_setup = pvh_arch_setup;
 	x86_init.oem.banner = xen_banner;
 
 	xen_efi_init(boot_params);

From 1d9ce4440414c92acb17eece3218fe5c92b141e3 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 25 Jul 2024 08:54:28 +0200
Subject: [PATCH 266/346] ASoC: amd: yc: Support mic on Lenovo Thinkpad E16 Gen
 2

Lenovo Thinkpad E16 Gen 2 AMD model (model 21M5) needs a corresponding
quirk entry for making the internal mic working.

Link: https://bugzilla.suse.com/show_bug.cgi?id=1228269
Cc: stable@vger.kernel.org
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20240725065442.9293-1-tiwai@suse.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/amd/yc/acp6x-mach.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c
index f54466ed8e3e..1769e07e83dc 100644
--- a/sound/soc/amd/yc/acp6x-mach.c
+++ b/sound/soc/amd/yc/acp6x-mach.c
@@ -220,6 +220,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "21J6"),
 		}
 	},
+	{
+		.driver_data = &acp6x_card,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "21M5"),
+		}
+	},
 	{
 		.driver_data = &acp6x_card,
 		.matches = {

From 7b52a9d9ee6c5505a51ebe03703a56a72c75f9de Mon Sep 17 00:00:00 2001
From: Huqiang Qin <huqiang.qin@amlogic.com>
Date: Tue, 9 Jul 2024 16:48:25 +0800
Subject: [PATCH 267/346] dt-bindings: watchdog: add support for Amlogic A4
 SoCs

Update dt-binding document for watchdog of Amlogic A4 SoCs.

Signed-off-by: Huqiang Qin <huqiang.qin@amlogic.com>
Signed-off-by: Xianwei Zhao <xianwei.zhao@amlogic.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20240709-a4-a5_watchdog-v1-1-2ae852e05ec2@amlogic.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 .../devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml b/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml
index 69845ec32e81..d0eff1ea52b4 100644
--- a/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml
+++ b/Documentation/devicetree/bindings/watchdog/amlogic,meson-gxbb-wdt.yaml
@@ -21,6 +21,7 @@ properties:
           - amlogic,t7-wdt
       - items:
           - enum:
+              - amlogic,a4-wdt
               - amlogic,c3-wdt
               - amlogic,s4-wdt
           - const: amlogic,t7-wdt

From 9722c3b66e21ff08aec570d02a97d331087fd70f Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca.ceresoli@bootlin.com>
Date: Wed, 24 Jul 2024 18:33:06 +0200
Subject: [PATCH 268/346] of: remove internal arguments from
 of_property_for_each_u32()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The of_property_for_each_u32() macro needs five parameters, two of which
are primarily meant as internal variables for the macro itself (in the
for() clause). Yet these two parameters are used by a few drivers, and this
can be considered misuse or at least bad practice.

Now that the kernel uses C11 to build, these two parameters can be avoided
by declaring them internally, thus changing this pattern:

  struct property *prop;
  const __be32 *p;
  u32 val;

  of_property_for_each_u32(np, "xyz", prop, p, val) { ... }

to this:

  u32 val;

  of_property_for_each_u32(np, "xyz", val) { ... }

However two variables cannot be declared in the for clause even with C11,
so declare one struct that contain the two variables we actually need. As
the variables inside this struct are not meant to be used by users of this
macro, give the struct instance the noticeable name "_it" so it is visible
during code reviews, helping to avoid new code to use it directly.

Most usages are trivially converted as they do not use those two
parameters, as expected. The non-trivial cases are:

 - drivers/clk/clk.c, of_clk_get_parent_name(): easily doable anyway
 - drivers/clk/clk-si5351.c, si5351_dt_parse(): this is more complex as the
   checks had to be replicated in a different way, making code more verbose
   and somewhat uglier, but I refrained from a full rework to keep as much
   of the original code untouched having no hardware to test my changes

All the changes have been build tested. The few for which I have the
hardware have been runtime-tested too.

Reviewed-by: Andre Przywara <andre.przywara@arm.com> # drivers/clk/sunxi/clk-simple-gates.c, drivers/clk/sunxi/clk-sun8i-bus-gates.c
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org> # drivers/gpio/gpio-brcmstb.c
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com> # drivers/irqchip/irq-atmel-aic-common.c
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> # drivers/iio/adc/ti_am335x_adc.c
Acked-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com> # drivers/pwm/pwm-samsung.c
Acked-by: Richard Leitner <richard.leitner@linux.dev> # drivers/usb/misc/usb251xb.c
Acked-by: Mark Brown <broonie@kernel.org> # sound/soc/codecs/arizona.c
Reviewed-by: Richard Fitzgerald <rf@opensource.cirrus.com> # sound/soc/codecs/arizona.c
Acked-by: Michael Ellerman <mpe@ellerman.id.au> # arch/powerpc/sysdev/xive/spapr.c
Acked-by: Stephen Boyd <sboyd@kernel.org> # clk
Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Acked-by: Lee Jones <lee@kernel.org>
Link: https://lore.kernel.org/r/20240724-of_property_for_each_u32-v3-1-bea82ce429e2@bootlin.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 arch/powerpc/sysdev/xive/native.c       |  4 +--
 arch/powerpc/sysdev/xive/spapr.c        |  3 +-
 drivers/bus/ti-sysc.c                   |  4 +--
 drivers/clk/clk-conf.c                  |  4 +--
 drivers/clk/clk-si5351.c                | 43 +++++++++++++++----------
 drivers/clk/clk.c                       | 12 +++----
 drivers/clk/qcom/common.c               |  4 +--
 drivers/clk/sunxi/clk-simple-gates.c    |  4 +--
 drivers/clk/sunxi/clk-sun8i-bus-gates.c |  4 +--
 drivers/clocksource/samsung_pwm_timer.c |  4 +--
 drivers/gpio/gpio-brcmstb.c             |  5 +--
 drivers/iio/adc/ti_am335x_adc.c         |  4 +--
 drivers/irqchip/irq-atmel-aic-common.c  |  4 +--
 drivers/irqchip/irq-pic32-evic.c        |  4 +--
 drivers/mfd/ti_am335x_tscadc.c          |  4 +--
 drivers/pinctrl/nxp/pinctrl-s32cc.c     |  4 +--
 drivers/pinctrl/pinctrl-k210.c          |  4 +--
 drivers/pwm/pwm-samsung.c               |  4 +--
 drivers/tty/sysrq.c                     |  4 +--
 drivers/usb/misc/usb251xb.c             |  4 +--
 include/linux/of.h                      | 15 ++++-----
 sound/soc/codecs/arizona.c              | 12 +++----
 22 files changed, 61 insertions(+), 93 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index 517b963e3e6a..a0934b516933 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -559,9 +559,7 @@ bool __init xive_native_init(void)
 	struct device_node *np;
 	struct resource r;
 	void __iomem *tima;
-	struct property *prop;
 	u8 max_prio = 7;
-	const __be32 *p;
 	u32 val, cpu;
 	s64 rc;
 
@@ -592,7 +590,7 @@ bool __init xive_native_init(void)
 		max_prio = val - 1;
 
 	/* Iterate the EQ sizes and pick one */
-	of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
+	of_property_for_each_u32(np, "ibm,xive-eq-sizes", val) {
 		xive_queue_shift = val;
 		if (val == PAGE_SHIFT)
 			break;
diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index e45419264391..f2fa985a2c77 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -814,7 +814,6 @@ bool __init xive_spapr_init(void)
 	struct device_node *np;
 	struct resource r;
 	void __iomem *tima;
-	struct property *prop;
 	u8 max_prio;
 	u32 val;
 	u32 len;
@@ -866,7 +865,7 @@ bool __init xive_spapr_init(void)
 	}
 
 	/* Iterate the EQ sizes and pick one */
-	of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, reg, val) {
+	of_property_for_each_u32(np, "ibm,xive-eq-sizes", val) {
 		xive_queue_shift = val;
 		if (val == PAGE_SHIFT)
 			break;
diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index 8767e04d6c89..2b59ef61dda2 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -2291,11 +2291,9 @@ static int sysc_init_idlemode(struct sysc *ddata, u8 *idlemodes,
 			      const char *name)
 {
 	struct device_node *np = ddata->dev->of_node;
-	struct property *prop;
-	const __be32 *p;
 	u32 val;
 
-	of_property_for_each_u32(np, name, prop, p, val) {
+	of_property_for_each_u32(np, name, val) {
 		if (val >= SYSC_NR_IDLEMODES) {
 			dev_err(ddata->dev, "invalid idlemode: %i\n", val);
 			return -EINVAL;
diff --git a/drivers/clk/clk-conf.c b/drivers/clk/clk-conf.c
index 1a4e6340f95c..058420562020 100644
--- a/drivers/clk/clk-conf.c
+++ b/drivers/clk/clk-conf.c
@@ -81,13 +81,11 @@ static int __set_clk_parents(struct device_node *node, bool clk_supplier)
 static int __set_clk_rates(struct device_node *node, bool clk_supplier)
 {
 	struct of_phandle_args clkspec;
-	struct property	*prop;
-	const __be32 *cur;
 	int rc, index = 0;
 	struct clk *clk;
 	u32 rate;
 
-	of_property_for_each_u32(node, "assigned-clock-rates", prop, cur, rate) {
+	of_property_for_each_u32(node, "assigned-clock-rates", rate) {
 		if (rate) {
 			rc = of_parse_phandle_with_args(node, "assigned-clocks",
 					"#clock-cells",	index, &clkspec);
diff --git a/drivers/clk/clk-si5351.c b/drivers/clk/clk-si5351.c
index 4ce83c5265b8..a4c92c5ef3ff 100644
--- a/drivers/clk/clk-si5351.c
+++ b/drivers/clk/clk-si5351.c
@@ -1175,8 +1175,8 @@ static int si5351_dt_parse(struct i2c_client *client,
 {
 	struct device_node *child, *np = client->dev.of_node;
 	struct si5351_platform_data *pdata;
-	struct property *prop;
-	const __be32 *p;
+	u32 array[4];
+	int sz, i;
 	int num = 0;
 	u32 val;
 
@@ -1191,20 +1191,24 @@ static int si5351_dt_parse(struct i2c_client *client,
 	 * property silabs,pll-source : <num src>, [<..>]
 	 * allow to selectively set pll source
 	 */
-	of_property_for_each_u32(np, "silabs,pll-source", prop, p, num) {
+	sz = of_property_read_variable_u32_array(np, "silabs,pll-source", array, 2, 4);
+	sz = (sz == -EINVAL) ? 0 : sz; /* Missing property is OK */
+	if (sz < 0)
+		return dev_err_probe(&client->dev, sz, "invalid pll-source\n");
+	if (sz % 2)
+		return dev_err_probe(&client->dev, -EINVAL,
+				     "missing pll-source for pll %d\n", array[sz - 1]);
+
+	for (i = 0; i < sz; i += 2) {
+		num = array[i];
+		val = array[i + 1];
+
 		if (num >= 2) {
 			dev_err(&client->dev,
 				"invalid pll %d on pll-source prop\n", num);
 			return -EINVAL;
 		}
 
-		p = of_prop_next_u32(prop, p, &val);
-		if (!p) {
-			dev_err(&client->dev,
-				"missing pll-source for pll %d\n", num);
-			return -EINVAL;
-		}
-
 		switch (val) {
 		case 0:
 			pdata->pll_src[num] = SI5351_PLL_SRC_XTAL;
@@ -1232,19 +1236,24 @@ static int si5351_dt_parse(struct i2c_client *client,
 	pdata->pll_reset[0] = true;
 	pdata->pll_reset[1] = true;
 
-	of_property_for_each_u32(np, "silabs,pll-reset-mode", prop, p, num) {
+	sz = of_property_read_variable_u32_array(np, "silabs,pll-reset-mode", array, 2, 4);
+	sz = (sz == -EINVAL) ? 0 : sz; /* Missing property is OK */
+	if (sz < 0)
+		return dev_err_probe(&client->dev, sz, "invalid pll-reset-mode\n");
+	if (sz % 2)
+		return dev_err_probe(&client->dev, -EINVAL,
+				     "missing pll-reset-mode for pll %d\n", array[sz - 1]);
+
+	for (i = 0; i < sz; i += 2) {
+		num = array[i];
+		val = array[i + 1];
+
 		if (num >= 2) {
 			dev_err(&client->dev,
 				"invalid pll %d on pll-reset-mode prop\n", num);
 			return -EINVAL;
 		}
 
-		p = of_prop_next_u32(prop, p, &val);
-		if (!p) {
-			dev_err(&client->dev,
-				"missing pll-reset-mode for pll %d\n", num);
-			return -EINVAL;
-		}
 
 		switch (val) {
 		case 0:
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 8cca52be993f..285ed1ad8a37 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -5364,9 +5364,8 @@ EXPORT_SYMBOL_GPL(of_clk_get_parent_count);
 const char *of_clk_get_parent_name(const struct device_node *np, int index)
 {
 	struct of_phandle_args clkspec;
-	struct property *prop;
 	const char *clk_name;
-	const __be32 *vp;
+	bool found = false;
 	u32 pv;
 	int rc;
 	int count;
@@ -5383,15 +5382,16 @@ const char *of_clk_get_parent_name(const struct device_node *np, int index)
 	/* if there is an indices property, use it to transfer the index
 	 * specified into an array offset for the clock-output-names property.
 	 */
-	of_property_for_each_u32(clkspec.np, "clock-indices", prop, vp, pv) {
+	of_property_for_each_u32(clkspec.np, "clock-indices", pv) {
 		if (index == pv) {
 			index = count;
+			found = true;
 			break;
 		}
 		count++;
 	}
 	/* We went off the end of 'clock-indices' without finding it */
-	if (prop && !vp)
+	if (of_property_present(clkspec.np, "clock-indices") && !found)
 		return NULL;
 
 	if (of_property_read_string_index(clkspec.np, "clock-output-names",
@@ -5504,14 +5504,12 @@ static int parent_ready(struct device_node *np)
 int of_clk_detect_critical(struct device_node *np, int index,
 			   unsigned long *flags)
 {
-	struct property *prop;
-	const __be32 *cur;
 	uint32_t idx;
 
 	if (!np || !flags)
 		return -EINVAL;
 
-	of_property_for_each_u32(np, "clock-critical", prop, cur, idx)
+	of_property_for_each_u32(np, "clock-critical", idx)
 		if (index == idx)
 			*flags |= CLK_IS_CRITICAL;
 
diff --git a/drivers/clk/qcom/common.c b/drivers/clk/qcom/common.c
index ea3788ba46f7..33cc1f73c69d 100644
--- a/drivers/clk/qcom/common.c
+++ b/drivers/clk/qcom/common.c
@@ -227,11 +227,9 @@ EXPORT_SYMBOL_GPL(qcom_cc_register_sleep_clk);
 static void qcom_cc_drop_protected(struct device *dev, struct qcom_cc *cc)
 {
 	struct device_node *np = dev->of_node;
-	struct property *prop;
-	const __be32 *p;
 	u32 i;
 
-	of_property_for_each_u32(np, "protected-clocks", prop, p, i) {
+	of_property_for_each_u32(np, "protected-clocks", i) {
 		if (i >= cc->num_rclks)
 			continue;
 
diff --git a/drivers/clk/sunxi/clk-simple-gates.c b/drivers/clk/sunxi/clk-simple-gates.c
index 0399627c226a..845efc1ec800 100644
--- a/drivers/clk/sunxi/clk-simple-gates.c
+++ b/drivers/clk/sunxi/clk-simple-gates.c
@@ -21,11 +21,9 @@ static void __init sunxi_simple_gates_setup(struct device_node *node,
 {
 	struct clk_onecell_data *clk_data;
 	const char *clk_parent, *clk_name;
-	struct property *prop;
 	struct resource res;
 	void __iomem *clk_reg;
 	void __iomem *reg;
-	const __be32 *p;
 	int number, i = 0, j;
 	u8 clk_bit;
 	u32 index;
@@ -47,7 +45,7 @@ static void __init sunxi_simple_gates_setup(struct device_node *node,
 	if (!clk_data->clks)
 		goto err_free_data;
 
-	of_property_for_each_u32(node, "clock-indices", prop, p, index) {
+	of_property_for_each_u32(node, "clock-indices", index) {
 		of_property_read_string_index(node, "clock-output-names",
 					      i, &clk_name);
 
diff --git a/drivers/clk/sunxi/clk-sun8i-bus-gates.c b/drivers/clk/sunxi/clk-sun8i-bus-gates.c
index b87f331f63c9..8482ac8e5898 100644
--- a/drivers/clk/sunxi/clk-sun8i-bus-gates.c
+++ b/drivers/clk/sunxi/clk-sun8i-bus-gates.c
@@ -24,11 +24,9 @@ static void __init sun8i_h3_bus_gates_init(struct device_node *node)
 	const char *parents[PARENT_MAX];
 	struct clk_onecell_data *clk_data;
 	const char *clk_name;
-	struct property *prop;
 	struct resource res;
 	void __iomem *clk_reg;
 	void __iomem *reg;
-	const __be32 *p;
 	int number, i;
 	u8 clk_bit;
 	int index;
@@ -58,7 +56,7 @@ static void __init sun8i_h3_bus_gates_init(struct device_node *node)
 		goto err_free_data;
 
 	i = 0;
-	of_property_for_each_u32(node, "clock-indices", prop, p, index) {
+	of_property_for_each_u32(node, "clock-indices", index) {
 		of_property_read_string_index(node, "clock-output-names",
 					      i, &clk_name);
 
diff --git a/drivers/clocksource/samsung_pwm_timer.c b/drivers/clocksource/samsung_pwm_timer.c
index 6e46781bc9ac..b9561e3f196c 100644
--- a/drivers/clocksource/samsung_pwm_timer.c
+++ b/drivers/clocksource/samsung_pwm_timer.c
@@ -418,8 +418,6 @@ void __init samsung_pwm_clocksource_init(void __iomem *base,
 static int __init samsung_pwm_alloc(struct device_node *np,
 				    const struct samsung_pwm_variant *variant)
 {
-	struct property *prop;
-	const __be32 *cur;
 	u32 val;
 	int i, ret;
 
@@ -427,7 +425,7 @@ static int __init samsung_pwm_alloc(struct device_node *np,
 	for (i = 0; i < SAMSUNG_PWM_NUM; ++i)
 		pwm.irq[i] = irq_of_parse_and_map(np, i);
 
-	of_property_for_each_u32(np, "samsung,pwm-outputs", prop, cur, val) {
+	of_property_for_each_u32(np, "samsung,pwm-outputs", val) {
 		if (val >= SAMSUNG_PWM_NUM) {
 			pr_warn("%s: invalid channel index in samsung,pwm-outputs property\n", __func__);
 			continue;
diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c
index 8dce78ea7139..5762e517338e 100644
--- a/drivers/gpio/gpio-brcmstb.c
+++ b/drivers/gpio/gpio-brcmstb.c
@@ -591,8 +591,6 @@ static int brcmstb_gpio_probe(struct platform_device *pdev)
 	void __iomem *reg_base;
 	struct brcmstb_gpio_priv *priv;
 	struct resource *res;
-	struct property *prop;
-	const __be32 *p;
 	u32 bank_width;
 	int num_banks = 0;
 	int num_gpios = 0;
@@ -636,8 +634,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev)
 	flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER;
 #endif
 
-	of_property_for_each_u32(np, "brcm,gpio-bank-widths", prop, p,
-			bank_width) {
+	of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) {
 		struct brcmstb_gpio_bank *bank;
 		struct gpio_chip *gc;
 
diff --git a/drivers/iio/adc/ti_am335x_adc.c b/drivers/iio/adc/ti_am335x_adc.c
index 95fa857e8aad..426e3c9f88a1 100644
--- a/drivers/iio/adc/ti_am335x_adc.c
+++ b/drivers/iio/adc/ti_am335x_adc.c
@@ -564,13 +564,11 @@ static int tiadc_parse_dt(struct platform_device *pdev,
 			  struct tiadc_device *adc_dev)
 {
 	struct device_node *node = pdev->dev.of_node;
-	struct property *prop;
-	const __be32 *cur;
 	int channels = 0;
 	u32 val;
 	int i;
 
-	of_property_for_each_u32(node, "ti,adc-channels", prop, cur, val) {
+	of_property_for_each_u32(node, "ti,adc-channels", val) {
 		adc_dev->channel_line[channels] = val;
 
 		/* Set Default values for optional DT parameters */
diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c
index 072bd227b6c6..4525366d16d6 100644
--- a/drivers/irqchip/irq-atmel-aic-common.c
+++ b/drivers/irqchip/irq-atmel-aic-common.c
@@ -111,8 +111,6 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain)
 	struct device_node *node = irq_domain_get_of_node(domain);
 	struct irq_chip_generic *gc;
 	struct aic_chip_data *aic;
-	struct property *prop;
-	const __be32 *p;
 	u32 hwirq;
 
 	gc = irq_get_domain_generic_chip(domain, 0);
@@ -120,7 +118,7 @@ static void __init aic_common_ext_irq_of_init(struct irq_domain *domain)
 	aic = gc->private;
 	aic->ext_irqs |= 1;
 
-	of_property_for_each_u32(node, "atmel,external-irqs", prop, p, hwirq) {
+	of_property_for_each_u32(node, "atmel,external-irqs", hwirq) {
 		gc = irq_get_domain_generic_chip(domain, hwirq);
 		if (!gc) {
 			pr_warn("AIC: external irq %d >= %d skip it\n",
diff --git a/drivers/irqchip/irq-pic32-evic.c b/drivers/irqchip/irq-pic32-evic.c
index 1d9bb28d13e5..5d6b8e025bb8 100644
--- a/drivers/irqchip/irq-pic32-evic.c
+++ b/drivers/irqchip/irq-pic32-evic.c
@@ -190,13 +190,11 @@ static void __init pic32_ext_irq_of_init(struct irq_domain *domain)
 {
 	struct device_node *node = irq_domain_get_of_node(domain);
 	struct evic_chip_data *priv = domain->host_data;
-	struct property *prop;
-	const __le32 *p;
 	u32 hwirq;
 	int i = 0;
 	const char *pname = "microchip,external-irqs";
 
-	of_property_for_each_u32(node, pname, prop, p, hwirq) {
+	of_property_for_each_u32(node, pname, hwirq) {
 		if (i >= ARRAY_SIZE(priv->ext_irqs)) {
 			pr_warn("More than %d external irq, skip rest\n",
 				ARRAY_SIZE(priv->ext_irqs));
diff --git a/drivers/mfd/ti_am335x_tscadc.c b/drivers/mfd/ti_am335x_tscadc.c
index 4bbd542d753e..0c1364d88469 100644
--- a/drivers/mfd/ti_am335x_tscadc.c
+++ b/drivers/mfd/ti_am335x_tscadc.c
@@ -119,8 +119,6 @@ static	int ti_tscadc_probe(struct platform_device *pdev)
 	struct clk *clk;
 	struct device_node *node;
 	struct mfd_cell *cell;
-	struct property *prop;
-	const __be32 *cur;
 	bool use_tsc = false, use_mag = false;
 	u32 val;
 	int err;
@@ -167,7 +165,7 @@ static	int ti_tscadc_probe(struct platform_device *pdev)
 	}
 
 	node = of_get_child_by_name(pdev->dev.of_node, "adc");
-	of_property_for_each_u32(node, "ti,adc-channels", prop, cur, val) {
+	of_property_for_each_u32(node, "ti,adc-channels", val) {
 		adc_channels++;
 		if (val > 7) {
 			dev_err(&pdev->dev, " PIN numbers are 0..7 (not %d)\n",
diff --git a/drivers/pinctrl/nxp/pinctrl-s32cc.c b/drivers/pinctrl/nxp/pinctrl-s32cc.c
index df3e5d82da4b..f2609a35c312 100644
--- a/drivers/pinctrl/nxp/pinctrl-s32cc.c
+++ b/drivers/pinctrl/nxp/pinctrl-s32cc.c
@@ -730,9 +730,7 @@ static int s32_pinctrl_parse_groups(struct device_node *np,
 				     struct s32_pin_group *grp,
 				     struct s32_pinctrl_soc_info *info)
 {
-	const __be32 *p;
 	struct device *dev;
-	struct property *prop;
 	unsigned int *pins, *sss;
 	int i, npins;
 	u32 pinmux;
@@ -763,7 +761,7 @@ static int s32_pinctrl_parse_groups(struct device_node *np,
 		return -ENOMEM;
 
 	i = 0;
-	of_property_for_each_u32(np, "pinmux", prop, p, pinmux) {
+	of_property_for_each_u32(np, "pinmux", pinmux) {
 		pins[i] = get_pin_no(pinmux);
 		sss[i] = get_pin_func(pinmux);
 
diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c
index 2753e14c3e38..a898e40451fe 100644
--- a/drivers/pinctrl/pinctrl-k210.c
+++ b/drivers/pinctrl/pinctrl-k210.c
@@ -763,8 +763,6 @@ static int k210_pinctrl_dt_subnode_to_map(struct pinctrl_dev *pctldev,
 					  unsigned int *reserved_maps,
 					  unsigned int *num_maps)
 {
-	struct property *prop;
-	const __be32 *p;
 	int ret, pinmux_groups;
 	u32 pinmux_group;
 	unsigned long *configs = NULL;
@@ -797,7 +795,7 @@ static int k210_pinctrl_dt_subnode_to_map(struct pinctrl_dev *pctldev,
 	if (ret < 0)
 		goto exit;
 
-	of_property_for_each_u32(np, "pinmux", prop, p, pinmux_group) {
+	of_property_for_each_u32(np, "pinmux", pinmux_group) {
 		const char *group_name, *func_name;
 		u32 pin = FIELD_GET(K210_PG_PIN, pinmux_group);
 		u32 func = FIELD_GET(K210_PG_FUNC, pinmux_group);
diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c
index 7adf4f2b1049..951b38ff5f8e 100644
--- a/drivers/pwm/pwm-samsung.c
+++ b/drivers/pwm/pwm-samsung.c
@@ -510,8 +510,6 @@ static int pwm_samsung_parse_dt(struct pwm_chip *chip)
 	struct samsung_pwm_chip *our_chip = to_samsung_pwm_chip(chip);
 	struct device_node *np = pwmchip_parent(chip)->of_node;
 	const struct of_device_id *match;
-	struct property *prop;
-	const __be32 *cur;
 	u32 val;
 
 	match = of_match_node(samsung_pwm_matches, np);
@@ -520,7 +518,7 @@ static int pwm_samsung_parse_dt(struct pwm_chip *chip)
 
 	memcpy(&our_chip->variant, match->data, sizeof(our_chip->variant));
 
-	of_property_for_each_u32(np, "samsung,pwm-outputs", prop, cur, val) {
+	of_property_for_each_u32(np, "samsung,pwm-outputs", val) {
 		if (val >= SAMSUNG_PWM_NUM) {
 			dev_err(pwmchip_parent(chip),
 				"%s: invalid channel index in samsung,pwm-outputs property\n",
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5974b8239c9..dab332681036 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -770,8 +770,6 @@ static void sysrq_of_get_keyreset_config(void)
 {
 	u32 key;
 	struct device_node *np;
-	struct property *prop;
-	const __be32 *p;
 
 	np = of_find_node_by_path("/chosen/linux,sysrq-reset-seq");
 	if (!np) {
@@ -782,7 +780,7 @@ static void sysrq_of_get_keyreset_config(void)
 	/* Reset in case a __weak definition was present */
 	sysrq_reset_seq_len = 0;
 
-	of_property_for_each_u32(np, "keyset", prop, p, key) {
+	of_property_for_each_u32(np, "keyset", key) {
 		if (key == KEY_RESERVED || key > KEY_MAX ||
 		    sysrq_reset_seq_len == SYSRQ_KEY_RESET_MAX)
 			break;
diff --git a/drivers/usb/misc/usb251xb.c b/drivers/usb/misc/usb251xb.c
index b98cda1cef73..e24cdb667307 100644
--- a/drivers/usb/misc/usb251xb.c
+++ b/drivers/usb/misc/usb251xb.c
@@ -382,11 +382,9 @@ static void usb251xb_get_ports_field(struct usb251xb *hub,
 				    bool ds_only, u8 *fld)
 {
 	struct device *dev = hub->dev;
-	struct property *prop;
-	const __be32 *p;
 	u32 port;
 
-	of_property_for_each_u32(dev->of_node, prop_name, prop, p, port) {
+	of_property_for_each_u32(dev->of_node, prop_name, port) {
 		if ((port >= ds_only ? 1 : 0) && (port <= port_cnt))
 			*fld |= BIT(port);
 		else
diff --git a/include/linux/of.h b/include/linux/of.h
index 13cf7a43b473..85b60ac9eec5 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -430,11 +430,9 @@ extern int of_detach_node(struct device_node *);
 #define of_match_ptr(_ptr)	(_ptr)
 
 /*
- * struct property *prop;
- * const __be32 *p;
  * u32 u;
  *
- * of_property_for_each_u32(np, "propname", prop, p, u)
+ * of_property_for_each_u32(np, "propname", u)
  *         printk("U32 value: %x\n", u);
  */
 const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur,
@@ -1431,11 +1429,12 @@ static inline int of_property_read_s32(const struct device_node *np,
 	     err == 0;							\
 	     err = of_phandle_iterator_next(it))
 
-#define of_property_for_each_u32(np, propname, prop, p, u)	\
-	for (prop = of_find_property(np, propname, NULL),	\
-		p = of_prop_next_u32(prop, NULL, &u);		\
-		p;						\
-		p = of_prop_next_u32(prop, p, &u))
+#define of_property_for_each_u32(np, propname, u)			\
+	for (struct {struct property *prop; const __be32 *item; } _it =	\
+		{of_find_property(np, propname, NULL),			\
+		 of_prop_next_u32(_it.prop, NULL, &u)};			\
+	     _it.item;							\
+	     _it.item = of_prop_next_u32(_it.prop, _it.item, &u))
 
 #define of_property_for_each_string(np, propname, prop, s)	\
 	for (prop = of_find_property(np, propname, NULL),	\
diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c
index 7434aeeda292..402b9a2ff024 100644
--- a/sound/soc/codecs/arizona.c
+++ b/sound/soc/codecs/arizona.c
@@ -2786,15 +2786,13 @@ int arizona_of_get_audio_pdata(struct arizona *arizona)
 {
 	struct arizona_pdata *pdata = &arizona->pdata;
 	struct device_node *np = arizona->dev->of_node;
-	struct property *prop;
-	const __be32 *cur;
 	u32 val;
 	u32 pdm_val[ARIZONA_MAX_PDM_SPK];
 	int ret;
 	int count = 0;
 
 	count = 0;
-	of_property_for_each_u32(np, "wlf,inmode", prop, cur, val) {
+	of_property_for_each_u32(np, "wlf,inmode", val) {
 		if (count == ARRAY_SIZE(pdata->inmode))
 			break;
 
@@ -2803,7 +2801,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona)
 	}
 
 	count = 0;
-	of_property_for_each_u32(np, "wlf,dmic-ref", prop, cur, val) {
+	of_property_for_each_u32(np, "wlf,dmic-ref", val) {
 		if (count == ARRAY_SIZE(pdata->dmic_ref))
 			break;
 
@@ -2812,7 +2810,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona)
 	}
 
 	count = 0;
-	of_property_for_each_u32(np, "wlf,out-mono", prop, cur, val) {
+	of_property_for_each_u32(np, "wlf,out-mono", val) {
 		if (count == ARRAY_SIZE(pdata->out_mono))
 			break;
 
@@ -2821,7 +2819,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona)
 	}
 
 	count = 0;
-	of_property_for_each_u32(np, "wlf,max-channels-clocked", prop, cur, val) {
+	of_property_for_each_u32(np, "wlf,max-channels-clocked", val) {
 		if (count == ARRAY_SIZE(pdata->max_channels_clocked))
 			break;
 
@@ -2830,7 +2828,7 @@ int arizona_of_get_audio_pdata(struct arizona *arizona)
 	}
 
 	count = 0;
-	of_property_for_each_u32(np, "wlf,out-volume-limit", prop, cur, val) {
+	of_property_for_each_u32(np, "wlf,out-volume-limit", val) {
 		if (count == ARRAY_SIZE(pdata->out_vol_limit))
 			break;
 

From 726d4f528dbc98a84d9ce3c749dfdada3dcdd5ca Mon Sep 17 00:00:00 2001
From: Ashish Mhetre <amhetre@nvidia.com>
Date: Wed, 24 Jul 2024 17:31:32 +0000
Subject: [PATCH 269/346] iommu: arm-smmu: Fix Tegra workaround for PAGE_SIZE
 mappings

PAGE_SIZE can be 16KB for Tegra which is not supported by MMU-500 on
both Tegra194 and Tegra234. Retain only valid granularities from
pgsize_bitmap which would either be 4KB or 64KB.

Signed-off-by: Ashish Mhetre <amhetre@nvidia.com>
Link: https://lore.kernel.org/r/20240724173132.219978-1-amhetre@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
index 4b2994b6126d..2fce4f6d4e1b 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c
@@ -277,7 +277,7 @@ static int nvidia_smmu_init_context(struct arm_smmu_domain *smmu_domain,
 	 */
 	if (of_device_is_compatible(np, "nvidia,tegra234-smmu") ||
 	    of_device_is_compatible(np, "nvidia,tegra194-smmu")) {
-		smmu->pgsize_bitmap = PAGE_SIZE;
+		smmu->pgsize_bitmap &= GENMASK(PAGE_SHIFT, 0);
 		pgtbl_cfg->pgsize_bitmap = smmu->pgsize_bitmap;
 	}
 

From 36639013b3462c06ff8e3400a427f775b4fc97f5 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 25 Jul 2024 10:03:45 +0100
Subject: [PATCH 270/346] arm64: mm: Fix lockless walks with static and dynamic
 page-table folding

Lina reports random oopsen originating from the fast GUP code when
16K pages are used with 4-level page-tables, the fourth level being
folded at runtime due to lack of LPA2.

In this configuration, the generic implementation of
p4d_offset_lockless() will return a 'p4d_t *' corresponding to the
'pgd_t' allocated on the stack of the caller, gup_fast_pgd_range().
This is normally fine, but when the fourth level of page-table is folded
at runtime, pud_offset_lockless() will offset from the address of the
'p4d_t' to calculate the address of the PUD in the same page-table page.
This results in a stray stack read when the 'p4d_t' has been allocated
on the stack and can send the walker into the weeds.

Fix the problem by providing our own definition of p4d_offset_lockless()
when CONFIG_PGTABLE_LEVELS <= 4 which returns the real page-table
pointer rather than the address of the local stack variable.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/50360968-13fb-4e6f-8f52-1725b3177215@asahilina.net
Fixes: 0dd4f60a2c76 ("arm64: mm: Add support for folding PUDs at runtime")
Reported-by: Asahi Lina <lina@asahilina.net>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20240725090345.28461-1-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/pgtable.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index f8efbc128446..7a4f5604be3f 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1065,6 +1065,28 @@ static inline bool pgtable_l5_enabled(void) { return false; }
 
 #define p4d_offset_kimg(dir,addr)	((p4d_t *)dir)
 
+static inline
+p4d_t *p4d_offset_lockless_folded(pgd_t *pgdp, pgd_t pgd, unsigned long addr)
+{
+	/*
+	 * With runtime folding of the pud, pud_offset_lockless() passes
+	 * the 'pgd_t *' we return here to p4d_to_folded_pud(), which
+	 * will offset the pointer assuming that it points into
+	 * a page-table page. However, the fast GUP path passes us a
+	 * pgd_t allocated on the stack and so we must use the original
+	 * pointer in 'pgdp' to construct the p4d pointer instead of
+	 * using the generic p4d_offset_lockless() implementation.
+	 *
+	 * Note: reusing the original pointer means that we may
+	 * dereference the same (live) page-table entry multiple times.
+	 * This is safe because it is still only loaded once in the
+	 * context of each level and the CPU guarantees same-address
+	 * read-after-read ordering.
+	 */
+	return p4d_offset(pgdp, addr);
+}
+#define p4d_offset_lockless p4d_offset_lockless_folded
+
 #endif  /* CONFIG_PGTABLE_LEVELS > 4 */
 
 #define pgd_ERROR(e)	\

From ab53dfdcdd1ec8df8729890aefa5b0e3c900afbb Mon Sep 17 00:00:00 2001
From: Shengjiu Wang <shengjiu.wang@nxp.com>
Date: Thu, 25 Jul 2024 11:22:53 +0800
Subject: [PATCH 271/346] ASoC: fsl-asoc-card: Dynamically allocate memory for
 snd_soc_dai_link_components

The static snd_soc_dai_link_components cause conflict for multiple
instances of this generic driver. For example, when there is
wm8962 and SPDIF case enabled together, the contaminated
snd_soc_dai_link_components will cause another device probe fail.

Fixes: 6d174cc4f224 ("ASoC: fsl-asoc-card: merge spdif support from imx-spdif.c")
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Link: https://patch.msgid.link/1721877773-5229-1-git-send-email-shengjiu.wang@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 sound/soc/fsl/fsl-asoc-card.c | 46 ++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c
index 82df887b3af5..f6c3aeff0d8e 100644
--- a/sound/soc/fsl/fsl-asoc-card.c
+++ b/sound/soc/fsl/fsl-asoc-card.c
@@ -306,27 +306,12 @@ static int be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd,
 	return 0;
 }
 
-SND_SOC_DAILINK_DEFS(hifi,
-	DAILINK_COMP_ARRAY(COMP_EMPTY()),
-	DAILINK_COMP_ARRAY(COMP_EMPTY(), COMP_EMPTY()),
-	DAILINK_COMP_ARRAY(COMP_EMPTY()));
-
-SND_SOC_DAILINK_DEFS(hifi_fe,
-	DAILINK_COMP_ARRAY(COMP_EMPTY()),
-	DAILINK_COMP_ARRAY(COMP_DUMMY()),
-	DAILINK_COMP_ARRAY(COMP_EMPTY()));
-
-SND_SOC_DAILINK_DEFS(hifi_be,
-	DAILINK_COMP_ARRAY(COMP_EMPTY()),
-	DAILINK_COMP_ARRAY(COMP_EMPTY(), COMP_EMPTY()));
-
 static const struct snd_soc_dai_link fsl_asoc_card_dai[] = {
 	/* Default ASoC DAI Link*/
 	{
 		.name = "HiFi",
 		.stream_name = "HiFi",
 		.ops = &fsl_asoc_card_ops,
-		SND_SOC_DAILINK_REG(hifi),
 	},
 	/* DPCM Link between Front-End and Back-End (Optional) */
 	{
@@ -335,7 +320,6 @@ static const struct snd_soc_dai_link fsl_asoc_card_dai[] = {
 		.dpcm_playback = 1,
 		.dpcm_capture = 1,
 		.dynamic = 1,
-		SND_SOC_DAILINK_REG(hifi_fe),
 	},
 	{
 		.name = "HiFi-ASRC-BE",
@@ -345,7 +329,6 @@ static const struct snd_soc_dai_link fsl_asoc_card_dai[] = {
 		.dpcm_playback = 1,
 		.dpcm_capture = 1,
 		.no_pcm = 1,
-		SND_SOC_DAILINK_REG(hifi_be),
 	},
 };
 
@@ -637,6 +620,7 @@ static int fsl_asoc_card_probe(struct platform_device *pdev)
 	struct platform_device *cpu_pdev;
 	struct fsl_asoc_card_priv *priv;
 	struct device *codec_dev[2] = { NULL, NULL };
+	struct snd_soc_dai_link_component *dlc;
 	const char *codec_dai_name[2];
 	const char *codec_dev_name[2];
 	u32 asrc_fmt = 0;
@@ -717,7 +701,35 @@ static int fsl_asoc_card_probe(struct platform_device *pdev)
 
 	memcpy(priv->dai_link, fsl_asoc_card_dai,
 	       sizeof(struct snd_soc_dai_link) * ARRAY_SIZE(priv->dai_link));
+	/*
+	 * "Default ASoC DAI Link": 1 cpus, 2 codecs, 1 platforms
+	 * "DPCM Link Front-End":  1 cpus, 1 codecs (dummy), 1 platforms
+	 * "DPCM Link Back-End": 1 cpus, 2 codecs
+	 * totally 10 components
+	 */
+	dlc = devm_kcalloc(&pdev->dev, 10, sizeof(*dlc), GFP_KERNEL);
+	if (!dlc) {
+		ret = -ENOMEM;
+		goto asrc_fail;
+	}
+
+	priv->dai_link[0].cpus = &dlc[0];
+	priv->dai_link[0].num_cpus = 1;
+	priv->dai_link[0].codecs = &dlc[1];
 	priv->dai_link[0].num_codecs = 1;
+	priv->dai_link[0].platforms = &dlc[3];
+	priv->dai_link[0].num_platforms = 1;
+
+	priv->dai_link[1].cpus = &dlc[4];
+	priv->dai_link[1].num_cpus = 1;
+	priv->dai_link[1].codecs = &dlc[5];
+	priv->dai_link[1].num_codecs = 0; /* dummy */
+	priv->dai_link[1].platforms = &dlc[6];
+	priv->dai_link[1].num_platforms = 1;
+
+	priv->dai_link[2].cpus = &dlc[7];
+	priv->dai_link[2].num_cpus = 1;
+	priv->dai_link[2].codecs = &dlc[8];
 	priv->dai_link[2].num_codecs = 1;
 
 	priv->card.dapm_routes = audio_map;

From c31fad1470389666ac7169fe43aa65bf5b7e2cfd Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 24 Jul 2024 13:31:14 +0300
Subject: [PATCH 272/346] nvme-pci: add missing condition check for existence
 of mapped data

nvme_map_data() is called when request has physical segments, hence
the nvme_unmap_data() should have same condition to avoid dereference.

Fixes: 4aedb705437f ("nvme-pci: split metadata handling from nvme_map_data / nvme_unmap_data")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8087b054da66..6cd9395ba9ec 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -863,7 +863,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 	nvme_start_request(req);
 	return BLK_STS_OK;
 out_unmap_data:
-	nvme_unmap_data(dev, req);
+	if (blk_rq_nr_phys_segments(req))
+		nvme_unmap_data(dev, req);
 out_free_cmd:
 	nvme_cleanup_cmd(req);
 	return ret;

From 0db4618e8fabfcc404af4dda23799bba726785a5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 25 Jul 2024 08:41:35 -0600
Subject: [PATCH 273/346] io_uring/msg_ring: fix uninitialized use of
 target_req->flags

syzbot reports that KMSAN complains that 'nr_tw' is an uninit-value
with the following report:

BUG: KMSAN: uninit-value in io_req_local_work_add io_uring/io_uring.c:1192 [inline]
BUG: KMSAN: uninit-value in io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240
 io_req_local_work_add io_uring/io_uring.c:1192 [inline]
 io_req_task_work_add_remote+0x588/0x5d0 io_uring/io_uring.c:1240
 io_msg_remote_post io_uring/msg_ring.c:102 [inline]
 io_msg_data_remote io_uring/msg_ring.c:133 [inline]
 io_msg_ring_data io_uring/msg_ring.c:152 [inline]
 io_msg_ring+0x1c38/0x1ef0 io_uring/msg_ring.c:305
 io_issue_sqe+0x383/0x22c0 io_uring/io_uring.c:1710
 io_queue_sqe io_uring/io_uring.c:1924 [inline]
 io_submit_sqe io_uring/io_uring.c:2180 [inline]
 io_submit_sqes+0x1259/0x2f20 io_uring/io_uring.c:2295
 __do_sys_io_uring_enter io_uring/io_uring.c:3205 [inline]
 __se_sys_io_uring_enter+0x40c/0x3ca0 io_uring/io_uring.c:3142
 __x64_sys_io_uring_enter+0x11f/0x1a0 io_uring/io_uring.c:3142
 x64_sys_call+0x2d82/0x3c10 arch/x86/include/generated/asm/syscalls_64.h:427
 do_syscall_x64 arch/x86/entry/common.c:52 [inline]
 do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

which is the following check:

if (nr_tw < nr_wait)
	return;

in io_req_local_work_add(). While nr_tw itself cannot be uninitialized,
it does depend on req->flags, which off the msg ring issue path can
indeed be uninitialized.

Fix this by always clearing the allocated 'req' fully if we can't grab
one from the cache itself.

Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries")
Reported-by: syzbot+82609b8937a4458106ca@syzkaller.appspotmail.com
Link: https://lore.kernel.org/io-uring/000000000000fd3d8d061dfc0e4a@google.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/msg_ring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 29fa9285a33d..7fd9badcfaf8 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -110,10 +110,10 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
 	if (spin_trylock(&ctx->msg_lock)) {
 		req = io_alloc_cache_get(&ctx->msg_cache);
 		spin_unlock(&ctx->msg_lock);
+		if (req)
+			return req;
 	}
-	if (req)
-		return req;
-	return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN);
+	return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
 }
 
 static int io_msg_data_remote(struct io_kiocb *req)

From 9c421ef3f6b30ab912eaaa3c3d20cfb921fd8c8f Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Thu, 18 Jul 2024 16:32:43 +0800
Subject: [PATCH 274/346] erofs: support STATX_DIOALIGN

Add support for STATX_DIOALIGN to EROFS, so that direct I/O
alignment restrictions are exposed to userspace in a generic
way.

[Before]
```
./statx_test /mnt/erofs/testfile
statx(/mnt/erofs/testfile) = 0
dio mem align:0
dio offset align:0
```

[After]
```
./statx_test /mnt/erofs/testfile
statx(/mnt/erofs/testfile) = 0
dio mem align:512
dio offset align:512
```

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240718083243.2485437-1-hsiangkao@linux.alibaba.com
---
 fs/erofs/inode.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 5f6439a63af7..43c09aae2afc 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -334,14 +334,29 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
 		  unsigned int query_flags)
 {
 	struct inode *const inode = d_inode(path->dentry);
+	bool compressed =
+		erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
 
-	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
+	if (compressed)
 		stat->attributes |= STATX_ATTR_COMPRESSED;
-
 	stat->attributes |= STATX_ATTR_IMMUTABLE;
 	stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
 				  STATX_ATTR_IMMUTABLE);
 
+	/*
+	 * Return the DIO alignment restrictions if requested.
+	 *
+	 * In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
+	 * compressed files, so in these cases we report no DIO support.
+	 */
+	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+		stat->result_mask |= STATX_DIOALIGN;
+		if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
+			stat->dio_mem_align =
+				bdev_logical_block_size(inode->i_sb->s_bdev);
+			stat->dio_offset_align = stat->dio_mem_align;
+		}
+	}
 	generic_fillattr(idmap, request_mask, inode, stat);
 	return 0;
 }

From 7dc5537c3f8be87e005f0844a7626c987914f8fd Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Mon, 22 Jul 2024 11:51:10 +0800
Subject: [PATCH 275/346] erofs: fix race in z_erofs_get_gbuf()

In z_erofs_get_gbuf(), the current task may be migrated to another
CPU between `z_erofs_gbuf_id()` and `spin_lock(&gbuf->lock)`.

Therefore, z_erofs_put_gbuf() will trigger the following issue
which was found by stress test:

<2>[772156.434168] kernel BUG at fs/erofs/zutil.c:58!
..
<4>[772156.435007]
<4>[772156.439237] CPU: 0 PID: 3078 Comm: stress Kdump: loaded Tainted: G            E      6.10.0-rc7+ #2
<4>[772156.439239] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 1.0.0 01/01/2017
<4>[772156.439241] pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
<4>[772156.439243] pc : z_erofs_put_gbuf+0x64/0x70 [erofs]
<4>[772156.439252] lr : z_erofs_lz4_decompress+0x600/0x6a0 [erofs]
..
<6>[772156.445958] stress (3127): drop_caches: 1
<4>[772156.446120] Call trace:
<4>[772156.446121]  z_erofs_put_gbuf+0x64/0x70 [erofs]
<4>[772156.446761]  z_erofs_lz4_decompress+0x600/0x6a0 [erofs]
<4>[772156.446897]  z_erofs_decompress_queue+0x740/0xa10 [erofs]
<4>[772156.447036]  z_erofs_runqueue+0x428/0x8c0 [erofs]
<4>[772156.447160]  z_erofs_readahead+0x224/0x390 [erofs]
..

Fixes: f36f3010f676 ("erofs: rename per-CPU buffers to global buffer pool and make it configurable")
Cc: <stable@vger.kernel.org> # 6.10+
Reviewed-by: Chunhai Guo <guochunhai@vivo.com>
Reviewed-by: Sandeep Dhavale <dhavale@google.com>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240722035110.3456740-1-hsiangkao@linux.alibaba.com
---
 fs/erofs/zutil.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c
index b80f612867c2..9b53883e5caf 100644
--- a/fs/erofs/zutil.c
+++ b/fs/erofs/zutil.c
@@ -38,11 +38,13 @@ void *z_erofs_get_gbuf(unsigned int requiredpages)
 {
 	struct z_erofs_gbuf *gbuf;
 
+	migrate_disable();
 	gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
 	spin_lock(&gbuf->lock);
 	/* check if the buffer is too small */
 	if (requiredpages > gbuf->nrpages) {
 		spin_unlock(&gbuf->lock);
+		migrate_enable();
 		/* (for sparse checker) pretend gbuf->lock is still taken */
 		__acquire(gbuf->lock);
 		return NULL;
@@ -57,6 +59,7 @@ void z_erofs_put_gbuf(void *ptr) __releases(gbuf->lock)
 	gbuf = &z_erofs_gbufpool[z_erofs_gbuf_id()];
 	DBG_BUGON(gbuf->ptr != ptr);
 	spin_unlock(&gbuf->lock);
+	migrate_enable();
 }
 
 int z_erofs_gbuf_growsize(unsigned int nrpages)

From 684b290abc774202ff88897648f24520f40c916b Mon Sep 17 00:00:00 2001
From: Huang Xiaojia <huangxiaojia2@huawei.com>
Date: Sat, 20 Jul 2024 16:23:35 +0800
Subject: [PATCH 276/346] erofs: add support for FS_IOC_GETFSSYSFSPATH

FS_IOC_GETFSSYSFSPATH ioctl exposes /sys/fs path of a given filesystem,
potentially standarizing sysfs reporting. This patch add support for
FS_IOC_GETFSSYSFSPATH for erofs, "erofs/<dev>" will be outputted for bdev
cases, "erofs/[domain_id,]<fs_id>" will be outputted for fscache cases.

Signed-off-by: Huang Xiaojia <huangxiaojia2@huawei.com>
Link: https://lore.kernel.org/r/20240720082335.441563-1-huangxiaojia2@huawei.com
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/super.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 35268263aaed..32ce5b35e1df 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -576,6 +576,21 @@ static const struct export_operations erofs_export_ops = {
 	.get_parent = erofs_get_parent,
 };
 
+static void erofs_set_sysfs_name(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	if (erofs_is_fscache_mode(sb)) {
+		if (sbi->domain_id)
+			super_set_sysfs_name_generic(sb, "%s,%s",sbi->domain_id,
+						     sbi->fsid);
+		else
+			super_set_sysfs_name_generic(sb, "%s", sbi->fsid);
+		return;
+	}
+	super_set_sysfs_name_id(sb);
+}
+
 static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode;
@@ -643,6 +658,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		sb->s_flags |= SB_POSIXACL;
 	else
 		sb->s_flags &= ~SB_POSIXACL;
+	erofs_set_sysfs_name(sb);
 
 #ifdef CONFIG_EROFS_FS_ZIP
 	xa_init(&sbi->managed_pslots);

From 5d3bb77e5fce1d224b94da31abae0a7afed54735 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Tue, 23 Jul 2024 15:30:24 +0800
Subject: [PATCH 277/346] erofs: support multi-page folios for erofs_bread()

If the requested page is part of the previous multi-page folio, there
is no need to call read_mapping_folio() again.

Also, get rid of the remaining one of page->index [1] in our codebase.

[1] https://lore.kernel.org/r/Zp8fgUSIBGQ1TN0D@casper.infradead.org

Cc: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240723073024.875290-1-hsiangkao@linux.alibaba.com
---
 fs/erofs/data.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8be60797ea2f..1b7eba38ba1e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -21,38 +21,32 @@ void erofs_put_metabuf(struct erofs_buf *buf)
 	if (!buf->page)
 		return;
 	erofs_unmap_metabuf(buf);
-	put_page(buf->page);
+	folio_put(page_folio(buf->page));
 	buf->page = NULL;
 }
 
-/*
- * Derive the block size from inode->i_blkbits to make compatible with
- * anonymous inode in fscache mode.
- */
 void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
 		  enum erofs_kmap_type type)
 {
 	pgoff_t index = offset >> PAGE_SHIFT;
-	struct page *page = buf->page;
-	struct folio *folio;
-	unsigned int nofs_flag;
+	struct folio *folio = NULL;
 
-	if (!page || page->index != index) {
+	if (buf->page) {
+		folio = page_folio(buf->page);
+		if (folio_file_page(folio, index) != buf->page)
+			erofs_unmap_metabuf(buf);
+	}
+	if (!folio || !folio_contains(folio, index)) {
 		erofs_put_metabuf(buf);
-
-		nofs_flag = memalloc_nofs_save();
-		folio = read_cache_folio(buf->mapping, index, NULL, NULL);
-		memalloc_nofs_restore(nofs_flag);
+		folio = read_mapping_folio(buf->mapping, index, NULL);
 		if (IS_ERR(folio))
 			return folio;
-
-		/* should already be PageUptodate, no need to lock page */
-		page = folio_file_page(folio, index);
-		buf->page = page;
 	}
+	buf->page = folio_file_page(folio, index);
+
 	if (buf->kmap_type == EROFS_NO_KMAP) {
 		if (type == EROFS_KMAP)
-			buf->base = kmap_local_page(page);
+			buf->base = kmap_local_page(buf->page);
 		buf->kmap_type = type;
 	} else if (buf->kmap_type != type) {
 		DBG_BUGON(1);

From 14e9283fb22d0d259820a5f05c6059678bab9ac5 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Wed, 24 Jul 2024 10:07:21 +0800
Subject: [PATCH 278/346] erofs: convert comma to semicolon

Replace a comma between expression statements by a semicolon.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Link: https://lore.kernel.org/r/20240724020721.2389738-1-nichen@iscas.ac.cn
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/decompressor_lzma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 06a722b85a45..40666815046f 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -188,7 +188,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
 			       !rq->partial_decoding);
 	buf.in_size = min(rq->inputsize, PAGE_SIZE - rq->pageofs_in);
 	rq->inputsize -= buf.in_size;
-	buf.in = dctx.kin + rq->pageofs_in,
+	buf.in = dctx.kin + rq->pageofs_in;
 	dctx.bounce = strm->bounce;
 	do {
 		dctx.avail_out = buf.out_size - buf.out_pos;

From 9bc7501b0b90f4d0c34b97c14ff1f708ce7ad8f3 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Mon, 3 Jun 2024 11:15:27 -0400
Subject: [PATCH 279/346] i3c: master: svc: resend target address when get NACK

According to I3C Spec 1.1.1, 11-Jun-2021, section: 5.1.2.2.3:

If the Controller chooses to start an I3C Message with an I3C Dynamic
Address, then special provisions shall be made because that same I3C Target
may be initiating an IBI or a Controller Role Request. So, one of three
things may happen: (skip 1, 2)

3. The Addresses match and the RnW bits also match, and so neither
Controller nor Target will ACK since both are expecting the other side to
provide ACK. As a result, each side might think it had "won" arbitration,
but neither side would continue, as each would subsequently see that the
other did not provide ACK.
...
For either value of RnW: Due to the NACK, the Controller shall defer the
Private Write or Private Read, and should typically transmit the Target
						    ^^^^^^^^^^^^^^^^^^^
Address again after a Repeated START (i.e., the next one or any one prior
^^^^^^^^^^^^^
to a STOP in the Frame). Since the Address Header following a Repeated
START is not arbitrated, the Controller will always win (see Section
5.1.2.2.4).

Resend target address again if address is not 7E and controller get NACK.

Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/svc-i3c-master.c | 58 ++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index bb299ce02ccc..f0362509319e 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -1052,29 +1052,59 @@ static int svc_i3c_master_xfer(struct svc_i3c_master *master,
 			       u8 *in, const u8 *out, unsigned int xfer_len,
 			       unsigned int *actual_len, bool continued)
 {
+	int retry = 2;
 	u32 reg;
 	int ret;
 
 	/* clean SVC_I3C_MINT_IBIWON w1c bits */
 	writel(SVC_I3C_MINT_IBIWON, master->regs + SVC_I3C_MSTATUS);
 
-	writel(SVC_I3C_MCTRL_REQUEST_START_ADDR |
-	       xfer_type |
-	       SVC_I3C_MCTRL_IBIRESP_NACK |
-	       SVC_I3C_MCTRL_DIR(rnw) |
-	       SVC_I3C_MCTRL_ADDR(addr) |
-	       SVC_I3C_MCTRL_RDTERM(*actual_len),
-	       master->regs + SVC_I3C_MCTRL);
 
-	ret = readl_poll_timeout(master->regs + SVC_I3C_MSTATUS, reg,
+	while (retry--) {
+		writel(SVC_I3C_MCTRL_REQUEST_START_ADDR |
+		       xfer_type |
+		       SVC_I3C_MCTRL_IBIRESP_NACK |
+		       SVC_I3C_MCTRL_DIR(rnw) |
+		       SVC_I3C_MCTRL_ADDR(addr) |
+		       SVC_I3C_MCTRL_RDTERM(*actual_len),
+		       master->regs + SVC_I3C_MCTRL);
+
+		ret = readl_poll_timeout(master->regs + SVC_I3C_MSTATUS, reg,
 				 SVC_I3C_MSTATUS_MCTRLDONE(reg), 0, 1000);
-	if (ret)
-		goto emit_stop;
+		if (ret)
+			goto emit_stop;
 
-	if (readl(master->regs + SVC_I3C_MERRWARN) & SVC_I3C_MERRWARN_NACK) {
-		ret = -ENXIO;
-		*actual_len = 0;
-		goto emit_stop;
+		if (readl(master->regs + SVC_I3C_MERRWARN) & SVC_I3C_MERRWARN_NACK) {
+			/*
+			 * According to I3C Spec 1.1.1, 11-Jun-2021, section: 5.1.2.2.3.
+			 * If the Controller chooses to start an I3C Message with an I3C Dynamic
+			 * Address, then special provisions shall be made because that same I3C
+			 * Target may be initiating an IBI or a Controller Role Request. So, one of
+			 * three things may happen: (skip 1, 2)
+			 *
+			 * 3. The Addresses match and the RnW bits also match, and so neither
+			 * Controller nor Target will ACK since both are expecting the other side to
+			 * provide ACK. As a result, each side might think it had "won" arbitration,
+			 * but neither side would continue, as each would subsequently see that the
+			 * other did not provide ACK.
+			 * ...
+			 * For either value of RnW: Due to the NACK, the Controller shall defer the
+			 * Private Write or Private Read, and should typically transmit the Target
+			 * Address again after a Repeated START (i.e., the next one or any one prior
+			 * to a STOP in the Frame). Since the Address Header following a Repeated
+			 * START is not arbitrated, the Controller will always win (see Section
+			 * 5.1.2.2.4).
+			 */
+			if (retry && addr != 0x7e) {
+				writel(SVC_I3C_MERRWARN_NACK, master->regs + SVC_I3C_MERRWARN);
+			} else {
+				ret = -ENXIO;
+				*actual_len = 0;
+				goto emit_stop;
+			}
+		} else {
+			break;
+		}
 	}
 
 	/*

From be90ae1ba14a83962b33c4d4c854ef081186b0e4 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 14 Jun 2024 17:02:08 +0300
Subject: [PATCH 280/346] i3c: mipi-i3c-hci: Fix number of DAT/DCT entries for
 HCI versions < 1.1

I was wrong about the TABLE_SIZE field description in the
commit 0676bfebf576 ("i3c: mipi-i3c-hci: Fix DAT/DCT entry sizes").

For the MIPI I3C HCI versions 1.0 and earlier the TABLE_SIZE field in
the registers DAT_SECTION_OFFSET and DCT_SECTION_OFFSET is indeed defined
in DWORDs and not number of entries like it is defined in later versions.

Where above fix allowed driver initialization to continue the wrongly
interpreted TABLE_SIZE field leads variables DAT_entries being twice and
DCT_entries four times as big as they really are.

That in turn leads clearing the DAT table over the boundary in the
dat_v1.c: hci_dat_v1_init().

So interprete the TABLE_SIZE field in DWORDs for HCI versions < 1.1 and
fix number of DAT/DCT entries accordingly.

Fixes: 0676bfebf576 ("i3c: mipi-i3c-hci: Fix DAT/DCT entry sizes")
Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/mipi-i3c-hci/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/i3c/master/mipi-i3c-hci/core.c b/drivers/i3c/master/mipi-i3c-hci/core.c
index d7e966a25583..4e7d6a43ee9b 100644
--- a/drivers/i3c/master/mipi-i3c-hci/core.c
+++ b/drivers/i3c/master/mipi-i3c-hci/core.c
@@ -631,6 +631,7 @@ static irqreturn_t i3c_hci_irq_handler(int irq, void *dev_id)
 static int i3c_hci_init(struct i3c_hci *hci)
 {
 	u32 regval, offset;
+	bool size_in_dwords;
 	int ret;
 
 	/* Validate HCI hardware version */
@@ -654,11 +655,16 @@ static int i3c_hci_init(struct i3c_hci *hci)
 	hci->caps = reg_read(HC_CAPABILITIES);
 	DBG("caps = %#x", hci->caps);
 
+	size_in_dwords = hci->version_major < 1 ||
+			 (hci->version_major == 1 && hci->version_minor < 1);
+
 	regval = reg_read(DAT_SECTION);
 	offset = FIELD_GET(DAT_TABLE_OFFSET, regval);
 	hci->DAT_regs = offset ? hci->base_regs + offset : NULL;
 	hci->DAT_entries = FIELD_GET(DAT_TABLE_SIZE, regval);
 	hci->DAT_entry_size = FIELD_GET(DAT_ENTRY_SIZE, regval) ? 0 : 8;
+	if (size_in_dwords)
+		hci->DAT_entries = 4 * hci->DAT_entries / hci->DAT_entry_size;
 	dev_info(&hci->master.dev, "DAT: %u %u-bytes entries at offset %#x\n",
 		 hci->DAT_entries, hci->DAT_entry_size, offset);
 
@@ -667,6 +673,8 @@ static int i3c_hci_init(struct i3c_hci *hci)
 	hci->DCT_regs = offset ? hci->base_regs + offset : NULL;
 	hci->DCT_entries = FIELD_GET(DCT_TABLE_SIZE, regval);
 	hci->DCT_entry_size = FIELD_GET(DCT_ENTRY_SIZE, regval) ? 0 : 16;
+	if (size_in_dwords)
+		hci->DCT_entries = 4 * hci->DCT_entries / hci->DCT_entry_size;
 	dev_info(&hci->master.dev, "DCT: %u %u-bytes entries at offset %#x\n",
 		 hci->DCT_entries, hci->DCT_entry_size, offset);
 

From 8f2cb0327938c786cede21a542fb538ff243a03a Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Tue, 11 Jun 2024 11:26:50 +0000
Subject: [PATCH 281/346] i3c: dw: Fix clearing queue thld

QUEUE_THLD_CTRL_IBI_STAT_MASK is repeated twice.
Replace with QUEUE_THLD_CTRL_IBI_DATA_MASK.

Signed-off-by: Aniket <aniketmaurya@google.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 0ec00e644bd4..0ba4960aa77b 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -631,7 +631,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 	thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL);
 	thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK |
 		       QUEUE_THLD_CTRL_IBI_STAT_MASK |
-		       QUEUE_THLD_CTRL_IBI_STAT_MASK);
+		       QUEUE_THLD_CTRL_IBI_DATA_MASK);
 	thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) |
 		QUEUE_THLD_CTRL_IBI_DATA(31);
 	writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL);

From 64bf1459080670f05b8bb9c0cd140befd6780b5e Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Fri, 7 Jun 2024 07:20:30 +0000
Subject: [PATCH 282/346] i3c: dw: Fix IBI intr programming

IBI_SIR_REQ_REJECT register is not present if the IP has
IC_HAS_IBI_DATA = 1 set. So don't rely on doing read-
modify-write op on this register.
Instead maintain a variable to store the sir reject mask
and use it to set IBI_SIR_REQ_REJECT.

Signed-off-by: Aniket <aniketmaurya@google.com>
Reviewed-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 15 ++++++++-------
 drivers/i3c/master/dw-i3c-master.h |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 0ba4960aa77b..77a2a1c3fd1d 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -658,7 +658,9 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 	if (ret)
 		return ret;
 
-	writel(IBI_REQ_REJECT_ALL, master->regs + IBI_SIR_REQ_REJECT);
+	master->sir_rej_mask = IBI_REQ_REJECT_ALL;
+	writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT);
+
 	writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT);
 
 	/* For now don't support Hot-Join */
@@ -1175,17 +1177,16 @@ static void dw_i3c_master_set_sir_enabled(struct dw_i3c_master *master,
 	master->platform_ops->set_dat_ibi(master, dev, enable, &reg);
 	writel(reg, master->regs + dat_entry);
 
-	reg = readl(master->regs + IBI_SIR_REQ_REJECT);
 	if (enable) {
-		global = reg == 0xffffffff;
-		reg &= ~BIT(idx);
+		global = (master->sir_rej_mask == IBI_REQ_REJECT_ALL);
+		master->sir_rej_mask &= ~BIT(idx);
 	} else {
 		bool hj_rejected = !!(readl(master->regs + DEVICE_CTRL) & DEV_CTRL_HOT_JOIN_NACK);
 
-		reg |= BIT(idx);
-		global = (reg == 0xffffffff) && hj_rejected;
+		master->sir_rej_mask |= BIT(idx);
+		global = (master->sir_rej_mask == IBI_REQ_REJECT_ALL) && hj_rejected;
 	}
-	writel(reg, master->regs + IBI_SIR_REQ_REJECT);
+	writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT);
 
 	if (global)
 		dw_i3c_master_enable_sir_signal(master, enable);
diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h
index 4ab94aa72252..8cb617b8147e 100644
--- a/drivers/i3c/master/dw-i3c-master.h
+++ b/drivers/i3c/master/dw-i3c-master.h
@@ -39,7 +39,7 @@ struct dw_i3c_master {
 	char version[5];
 	char type[5];
 	bool ibi_capable;
-
+	u32 sir_rej_mask;
 	/*
 	 * Per-device hardware data, used to manage the device address table
 	 * (DAT)

From d9deb28f700cbe15800f1835da7994a5c6f03928 Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Thu, 27 Jun 2024 03:41:19 +0000
Subject: [PATCH 283/346] i3c: dw: Remove ibi_capable property

Since DW I3C IP master role always supports IBI, we don't need
to keep two variants of master ops and select one using this
property. Hence remove the code.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240627034119.3938050-1-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/ast2600-i3c-master.c |  1 -
 drivers/i3c/master/dw-i3c-master.c      | 23 ++---------------------
 drivers/i3c/master/dw-i3c-master.h      |  1 -
 3 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/drivers/i3c/master/ast2600-i3c-master.c b/drivers/i3c/master/ast2600-i3c-master.c
index 01a47d3dd499..84942dbb6f80 100644
--- a/drivers/i3c/master/ast2600-i3c-master.c
+++ b/drivers/i3c/master/ast2600-i3c-master.c
@@ -156,7 +156,6 @@ static int ast2600_i3c_probe(struct platform_device *pdev)
 			i3c->sda_pullup);
 
 	i3c->dw.platform_ops = &ast2600_i3c_ops;
-	i3c->dw.ibi_capable = true;
 	return dw_i3c_common_probe(&i3c->dw, pdev);
 }
 
diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 77a2a1c3fd1d..9aae5c8dba8d 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -1404,21 +1404,6 @@ static const struct i3c_master_controller_ops dw_mipi_i3c_ops = {
 	.attach_i2c_dev = dw_i3c_master_attach_i2c_dev,
 	.detach_i2c_dev = dw_i3c_master_detach_i2c_dev,
 	.i2c_xfers = dw_i3c_master_i2c_xfers,
-};
-
-static const struct i3c_master_controller_ops dw_mipi_i3c_ibi_ops = {
-	.bus_init = dw_i3c_master_bus_init,
-	.bus_cleanup = dw_i3c_master_bus_cleanup,
-	.attach_i3c_dev = dw_i3c_master_attach_i3c_dev,
-	.reattach_i3c_dev = dw_i3c_master_reattach_i3c_dev,
-	.detach_i3c_dev = dw_i3c_master_detach_i3c_dev,
-	.do_daa = dw_i3c_master_daa,
-	.supports_ccc_cmd = dw_i3c_master_supports_ccc_cmd,
-	.send_ccc_cmd = dw_i3c_master_send_ccc_cmd,
-	.priv_xfers = dw_i3c_master_priv_xfers,
-	.attach_i2c_dev = dw_i3c_master_attach_i2c_dev,
-	.detach_i2c_dev = dw_i3c_master_detach_i2c_dev,
-	.i2c_xfers = dw_i3c_master_i2c_xfers,
 	.request_ibi = dw_i3c_master_request_ibi,
 	.free_ibi = dw_i3c_master_free_ibi,
 	.enable_ibi = dw_i3c_master_enable_ibi,
@@ -1456,7 +1441,6 @@ static void dw_i3c_hj_work(struct work_struct *work)
 int dw_i3c_common_probe(struct dw_i3c_master *master,
 			struct platform_device *pdev)
 {
-	const struct i3c_master_controller_ops *ops;
 	int ret, irq;
 
 	if (!master->platform_ops)
@@ -1506,12 +1490,9 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	master->maxdevs = ret >> 16;
 	master->free_pos = GENMASK(master->maxdevs - 1, 0);
 
-	ops = &dw_mipi_i3c_ops;
-	if (master->ibi_capable)
-		ops = &dw_mipi_i3c_ibi_ops;
-
 	INIT_WORK(&master->hj_work, dw_i3c_hj_work);
-	ret = i3c_master_register(&master->base, &pdev->dev, ops, false);
+	ret = i3c_master_register(&master->base, &pdev->dev,
+				  &dw_mipi_i3c_ops, false);
 	if (ret)
 		goto err_assert_rst;
 
diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h
index 8cb617b8147e..7e76ca381d9f 100644
--- a/drivers/i3c/master/dw-i3c-master.h
+++ b/drivers/i3c/master/dw-i3c-master.h
@@ -38,7 +38,6 @@ struct dw_i3c_master {
 	struct clk *core_clk;
 	char version[5];
 	char type[5];
-	bool ibi_capable;
 	u32 sir_rej_mask;
 	/*
 	 * Per-device hardware data, used to manage the device address table

From 74e931f090a1d1a71d460284e87bd668667b98db Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 28 Jun 2024 16:15:56 +0300
Subject: [PATCH 284/346] i3c: mipi-i3c-hci: Switch to
 lower_32_bits()/upper_32_bits() helpers

Rather than having own lo32()/hi32() helpers for dealing with 32-bit and
64-bit build targets switch to generic lower_32_bits()/upper_32_bits()
helpers.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Link: https://lore.kernel.org/r/20240628131559.502822-1-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index 4e01a95cc4d0..6c5964e727b3 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -147,21 +147,6 @@ struct hci_dma_dev_ibi_data {
 	unsigned int max_len;
 };
 
-static inline u32 lo32(dma_addr_t physaddr)
-{
-	return physaddr;
-}
-
-static inline u32 hi32(dma_addr_t physaddr)
-{
-	/* trickery to avoid compiler warnings on 32-bit build targets */
-	if (sizeof(dma_addr_t) > 4) {
-		u64 hi = physaddr;
-		return hi >> 32;
-	}
-	return 0;
-}
-
 static void hci_dma_cleanup(struct i3c_hci *hci)
 {
 	struct hci_rings_data *rings = hci->io_data;
@@ -265,10 +250,10 @@ static int hci_dma_init(struct i3c_hci *hci)
 		if (!rh->xfer || !rh->resp || !rh->src_xfers)
 			goto err_out;
 
-		rh_reg_write(CMD_RING_BASE_LO, lo32(rh->xfer_dma));
-		rh_reg_write(CMD_RING_BASE_HI, hi32(rh->xfer_dma));
-		rh_reg_write(RESP_RING_BASE_LO, lo32(rh->resp_dma));
-		rh_reg_write(RESP_RING_BASE_HI, hi32(rh->resp_dma));
+		rh_reg_write(CMD_RING_BASE_LO, lower_32_bits(rh->xfer_dma));
+		rh_reg_write(CMD_RING_BASE_HI, upper_32_bits(rh->xfer_dma));
+		rh_reg_write(RESP_RING_BASE_LO, lower_32_bits(rh->resp_dma));
+		rh_reg_write(RESP_RING_BASE_HI, upper_32_bits(rh->resp_dma));
 
 		regval = FIELD_PREP(CR_RING_SIZE, rh->xfer_entries);
 		rh_reg_write(CR_SETUP, regval);
@@ -404,8 +389,8 @@ static int hci_dma_queue_xfer(struct i3c_hci *hci,
 				hci_dma_unmap_xfer(hci, xfer_list, i);
 				return -ENOMEM;
 			}
-			*ring_data++ = lo32(xfer->data_dma);
-			*ring_data++ = hi32(xfer->data_dma);
+			*ring_data++ = lower_32_bits(xfer->data_dma);
+			*ring_data++ = upper_32_bits(xfer->data_dma);
 		} else {
 			*ring_data++ = 0;
 			*ring_data++ = 0;

From 2df1de813a5ccbc89c7c831365e9e2c479aea0c3 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 28 Jun 2024 16:15:57 +0300
Subject: [PATCH 285/346] i3c: mipi-i3c-hci: Set IBI Status and Data Ring base
 addresses

IBI Status and Data Ring base address registers are not set so HW
obviously cannot update those rings after In-Band Interrupt.

Set them to already allocated and mapped ring addresses.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Link: https://lore.kernel.org/r/20240628131559.502822-2-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index 6c5964e727b3..7a56ae4a5ddf 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -300,6 +300,11 @@ static int hci_dma_init(struct i3c_hci *hci)
 			goto err_out;
 		}
 
+		rh_reg_write(IBI_STATUS_RING_BASE_LO, lower_32_bits(rh->ibi_status_dma));
+		rh_reg_write(IBI_STATUS_RING_BASE_HI, upper_32_bits(rh->ibi_status_dma));
+		rh_reg_write(IBI_DATA_RING_BASE_LO, lower_32_bits(rh->ibi_data_dma));
+		rh_reg_write(IBI_DATA_RING_BASE_HI, upper_32_bits(rh->ibi_data_dma));
+
 		regval = FIELD_PREP(IBI_STATUS_RING_SIZE,
 				    rh->ibi_status_entries) |
 			 FIELD_PREP(IBI_DATA_CHUNK_SIZE,

From 8a2be2f1db268ec735419e53ef04ca039fc027dc Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 28 Jun 2024 16:15:58 +0300
Subject: [PATCH 286/346] i3c: mipi-i3c-hci: Error out instead on BUG_ON() in
 IBI DMA setup

Definitely condition dma_get_cache_alignment * defined value > 256
during driver initialization is not reason to BUG_ON(). Turn that to
graceful error out with -EINVAL.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Link: https://lore.kernel.org/r/20240628131559.502822-3-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index 7a56ae4a5ddf..9bdfe40bc1e1 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -279,7 +279,10 @@ static int hci_dma_init(struct i3c_hci *hci)
 
 		rh->ibi_chunk_sz = dma_get_cache_alignment();
 		rh->ibi_chunk_sz *= IBI_CHUNK_CACHELINES;
-		BUG_ON(rh->ibi_chunk_sz > 256);
+		if (rh->ibi_chunk_sz > 256) {
+			ret = -EINVAL;
+			goto err_out;
+		}
 
 		ibi_status_ring_sz = rh->ibi_status_sz * rh->ibi_status_entries;
 		ibi_data_ring_sz = rh->ibi_chunk_sz * rh->ibi_chunks_total;

From 4642f7eddb75efb5944ff21e5340ccce66928ff1 Mon Sep 17 00:00:00 2001
From: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Date: Fri, 28 Jun 2024 16:15:59 +0300
Subject: [PATCH 287/346] i3c: mipi-i3c-hci: Round IBI data chunk size to HW
 supported value

The dma.c: hci_dma_init() sets the CHUNK_SIZE field in the IBI_SETUP
register incorrectly if the calculated ibi_chunk_sz is not exactly
2^(n+2) bytes, where n is 0..6.

Fix this by rounding the chunk size up to nearest 2^(n+2) bytes.

Signed-off-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Link: https://lore.kernel.org/r/20240628131559.502822-4-jarkko.nikula@linux.intel.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/mipi-i3c-hci/dma.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/i3c/master/mipi-i3c-hci/dma.c b/drivers/i3c/master/mipi-i3c-hci/dma.c
index 9bdfe40bc1e1..a918e96b21fd 100644
--- a/drivers/i3c/master/mipi-i3c-hci/dma.c
+++ b/drivers/i3c/master/mipi-i3c-hci/dma.c
@@ -279,6 +279,13 @@ static int hci_dma_init(struct i3c_hci *hci)
 
 		rh->ibi_chunk_sz = dma_get_cache_alignment();
 		rh->ibi_chunk_sz *= IBI_CHUNK_CACHELINES;
+		/*
+		 * Round IBI data chunk size to number of bytes supported by
+		 * the HW. Chunk size can be 2^n number of DWORDs which is the
+		 * same as 2^(n+2) bytes, where n is 0..6.
+		 */
+		rh->ibi_chunk_sz = umax(4, rh->ibi_chunk_sz);
+		rh->ibi_chunk_sz = roundup_pow_of_two(rh->ibi_chunk_sz);
 		if (rh->ibi_chunk_sz > 256) {
 			ret = -EINVAL;
 			goto err_out;

From 48a6dcdafdbe7b9596ac6ecbecc347c2e2413f81 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Tue, 2 Jul 2024 10:47:58 +0800
Subject: [PATCH 288/346] i3c: master: svc: Convert comma to semicolon

Replace a comma between expression statements by a semicolon.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Link: https://lore.kernel.org/r/20240702024758.1411569-1-nichen@iscas.ac.cn
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/svc-i3c-master.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index f0362509319e..eb42280f465a 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -1351,7 +1351,7 @@ static int svc_i3c_master_send_direct_ccc_cmd(struct svc_i3c_master *master,
 	cmd->addr = ccc->dests[0].addr;
 	cmd->rnw = ccc->rnw;
 	cmd->in = ccc->rnw ? ccc->dests[0].payload.data : NULL;
-	cmd->out = ccc->rnw ? NULL : ccc->dests[0].payload.data,
+	cmd->out = ccc->rnw ? NULL : ccc->dests[0].payload.data;
 	cmd->len = xfer_len;
 	cmd->actual_len = actual_len;
 	cmd->continued = false;

From 71134c13ac59fcadbfe39b8f9022a37266c4ed5a Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Fri, 28 Jun 2024 15:45:26 +0000
Subject: [PATCH 289/346] dt-bindings: i3c: dw: Add apb clock binding

Add dt binding for optional apb clock. Core clock is mandatory.

Signed-off-by: Aniket <aniketmaurya@google.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240628154526.324068-1-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 .../devicetree/bindings/i3c/snps,dw-i3c-master.yaml   | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml
index c0e805e531be..4fc13e3c0f75 100644
--- a/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml
+++ b/Documentation/devicetree/bindings/i3c/snps,dw-i3c-master.yaml
@@ -20,7 +20,16 @@ properties:
     maxItems: 1
 
   clocks:
-    maxItems: 1
+    minItems: 1
+    items:
+      - description: Core clock
+      - description: APB clock
+
+  clock-names:
+    minItems: 1
+    items:
+      - const: core
+      - const: apb
 
   interrupts:
     maxItems: 1

From 54f5079e0dfcce2886178382ee9f03f35cc28325 Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Fri, 28 Jun 2024 15:46:03 +0000
Subject: [PATCH 290/346] i3c: dw: Use new *_enabled clk API

Move to "enabled" variant of clk_get API. It takes care
of enable and disable calls during the probe and remove.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240628154603.326075-1-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 9aae5c8dba8d..4e3335641dcd 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -1450,7 +1450,7 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	if (IS_ERR(master->regs))
 		return PTR_ERR(master->regs);
 
-	master->core_clk = devm_clk_get(&pdev->dev, NULL);
+	master->core_clk = devm_clk_get_enabled(&pdev->dev, NULL);
 	if (IS_ERR(master->core_clk))
 		return PTR_ERR(master->core_clk);
 
@@ -1459,10 +1459,6 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	if (IS_ERR(master->core_rst))
 		return PTR_ERR(master->core_rst);
 
-	ret = clk_prepare_enable(master->core_clk);
-	if (ret)
-		goto err_disable_core_clk;
-
 	reset_control_deassert(master->core_rst);
 
 	spin_lock_init(&master->xferqueue.lock);
@@ -1501,9 +1497,6 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 err_assert_rst:
 	reset_control_assert(master->core_rst);
 
-err_disable_core_clk:
-	clk_disable_unprepare(master->core_clk);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dw_i3c_common_probe);
@@ -1513,8 +1506,6 @@ void dw_i3c_common_remove(struct dw_i3c_master *master)
 	i3c_master_unregister(&master->base);
 
 	reset_control_assert(master->core_rst);
-
-	clk_disable_unprepare(master->core_clk);
 }
 EXPORT_SYMBOL_GPL(dw_i3c_common_remove);
 

From a0d48ebf39cee74f3832d42126b6ddf4497fc07c Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Fri, 28 Jun 2024 15:46:18 +0000
Subject: [PATCH 291/346] i3c: dw: Add optional apb clock

Besides the core clock, IP also has an apb interface clock.
Add an optional hook for the same.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240628154618.327151-1-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 4 ++++
 drivers/i3c/master/dw-i3c-master.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 4e3335641dcd..0ca41782f3a6 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -1454,6 +1454,10 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	if (IS_ERR(master->core_clk))
 		return PTR_ERR(master->core_clk);
 
+	master->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk");
+	if (IS_ERR(master->pclk))
+		return PTR_ERR(master->pclk);
+
 	master->core_rst = devm_reset_control_get_optional_exclusive(&pdev->dev,
 								    "core_rst");
 	if (IS_ERR(master->core_rst))
diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h
index 7e76ca381d9f..fb7121c6c687 100644
--- a/drivers/i3c/master/dw-i3c-master.h
+++ b/drivers/i3c/master/dw-i3c-master.h
@@ -36,6 +36,7 @@ struct dw_i3c_master {
 	void __iomem *regs;
 	struct reset_control *core_rst;
 	struct clk *core_clk;
+	struct clk *pclk;
 	char version[5];
 	char type[5];
 	u32 sir_rej_mask;

From 915d0741e0c8e5fe406f8a8369745a73e2c681ef Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Tue, 2 Jul 2024 18:31:07 -0400
Subject: [PATCH 292/346] i3c: master: svc: Improve DAA STOP handle code logic

The REQUEST_PROC_DAA command behaves differently from other commands.
Sometimes the hardware can auto emit STOP, but in other conditions, it
cannot.

Improves the code logic to better handle these situations.

Hardware can auto emit STOP only when the following conditions are met:
- The previous I3C device correctly returns a PID and ACKs an I3C address.
- A NACK is received when emitting 7E to try to get the next I3C device's
PID.

In all other cases, a manual STOP emission is needed.

The code is changed to emit STOP when break the while loop and 'return 0'
only when the hardware can auto emit STOP.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20240702223107.403057-1-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/svc-i3c-master.c | 61 ++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index eb42280f465a..e80c002991f7 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -790,7 +790,20 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 	int ret, i;
 
 	while (true) {
-		/* Enter/proceed with DAA */
+		/* SVC_I3C_MCTRL_REQUEST_PROC_DAA have two mode, ENTER DAA or PROCESS DAA.
+		 *
+		 * ENTER DAA:
+		 *   1 will issue START, 7E, ENTDAA, and then emits 7E/R to process first target.
+		 *   2 Stops just before the new Dynamic Address (DA) is to be emitted.
+		 *
+		 * PROCESS DAA:
+		 *   1 The DA is written using MWDATAB or ADDR bits 6:0.
+		 *   2 ProcessDAA is requested again to write the new address, and then starts the
+		 *     next (START, 7E, ENTDAA)  unless marked to STOP; an MSTATUS indicating NACK
+		 *     means DA was not accepted (e.g. parity error). If PROCESSDAA is NACKed on the
+		 *     7E/R, which means no more Slaves need a DA, then a COMPLETE will be signaled
+		 *     (along with DONE), and a STOP issued automatically.
+		 */
 		writel(SVC_I3C_MCTRL_REQUEST_PROC_DAA |
 		       SVC_I3C_MCTRL_TYPE_I3C |
 		       SVC_I3C_MCTRL_IBIRESP_NACK |
@@ -807,7 +820,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 						SVC_I3C_MSTATUS_MCTRLDONE(reg),
 						1, 1000);
 		if (ret)
-			return ret;
+			break;
 
 		if (SVC_I3C_MSTATUS_RXPEND(reg)) {
 			u8 data[6];
@@ -819,7 +832,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 			 */
 			ret = svc_i3c_master_readb(master, data, 6);
 			if (ret)
-				return ret;
+				break;
 
 			for (i = 0; i < 6; i++)
 				prov_id[dev_nb] |= (u64)(data[i]) << (8 * (5 - i));
@@ -827,7 +840,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 			/* We do not care about the BCR and DCR yet */
 			ret = svc_i3c_master_readb(master, data, 2);
 			if (ret)
-				return ret;
+				break;
 		} else if (SVC_I3C_MSTATUS_MCTRLDONE(reg)) {
 			if (SVC_I3C_MSTATUS_STATE_IDLE(reg) &&
 			    SVC_I3C_MSTATUS_COMPLETE(reg)) {
@@ -835,12 +848,23 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 				 * All devices received and acked they dynamic
 				 * address, this is the natural end of the DAA
 				 * procedure.
+				 *
+				 * Hardware will auto emit STOP at this case.
 				 */
-				break;
+				*count = dev_nb;
+				return 0;
+
 			} else if (SVC_I3C_MSTATUS_NACKED(reg)) {
 				/* No I3C devices attached */
-				if (dev_nb == 0)
+				if (dev_nb == 0) {
+					/*
+					 * Hardware can't treat first NACK for ENTAA as normal
+					 * COMPLETE. So need manual emit STOP.
+					 */
+					ret = 0;
+					*count = 0;
 					break;
+				}
 
 				/*
 				 * A slave device nacked the address, this is
@@ -849,8 +873,10 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 				 * answer again immediately and shall ack the
 				 * address this time.
 				 */
-				if (prov_id[dev_nb] == nacking_prov_id)
-					return -EIO;
+				if (prov_id[dev_nb] == nacking_prov_id) {
+					ret = EIO;
+					break;
+				}
 
 				dev_nb--;
 				nacking_prov_id = prov_id[dev_nb];
@@ -858,7 +884,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 
 				continue;
 			} else {
-				return -EIO;
+				break;
 			}
 		}
 
@@ -870,12 +896,12 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 						SVC_I3C_MSTATUS_BETWEEN(reg),
 						0, 1000);
 		if (ret)
-			return ret;
+			break;
 
 		/* Give the slave device a suitable dynamic address */
 		ret = i3c_master_get_free_addr(&master->base, last_addr + 1);
 		if (ret < 0)
-			return ret;
+			break;
 
 		addrs[dev_nb] = ret;
 		dev_dbg(master->dev, "DAA: device %d assigned to 0x%02x\n",
@@ -885,9 +911,9 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 		last_addr = addrs[dev_nb++];
 	}
 
-	*count = dev_nb;
-
-	return 0;
+	/* Need manual issue STOP except for Complete condition */
+	svc_i3c_master_emit_stop(master);
+	return ret;
 }
 
 static int svc_i3c_update_ibirules(struct svc_i3c_master *master)
@@ -961,11 +987,10 @@ static int svc_i3c_master_do_daa(struct i3c_master_controller *m)
 	spin_lock_irqsave(&master->xferqueue.lock, flags);
 	ret = svc_i3c_master_do_daa_locked(master, addrs, &dev_nb);
 	spin_unlock_irqrestore(&master->xferqueue.lock, flags);
-	if (ret) {
-		svc_i3c_master_emit_stop(master);
-		svc_i3c_master_clear_merrwarn(master);
+
+	svc_i3c_master_clear_merrwarn(master);
+	if (ret)
 		goto rpm_out;
-	}
 
 	/* Register all devices who participated to the core */
 	for (i = 0; i < dev_nb; i++) {

From 4e89bc48e66a9aef46a705943d3f81bffb3a2288 Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Mon, 8 Jul 2024 06:21:01 +0000
Subject: [PATCH 293/346] i3c: dw: Save timing registers and other values

Add variables to store timing registers and other values.
These variables would be later used to restore registers
during resume without recomputation.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240708062103.3296587-2-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 15 +++++++++++++--
 drivers/i3c/master/dw-i3c-master.h |  9 +++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index 0ca41782f3a6..fcfa37f55d86 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -543,18 +543,22 @@ static int dw_i3c_clk_cfg(struct dw_i3c_master *master)
 
 	scl_timing = SCL_I3C_TIMING_HCNT(hcnt) | SCL_I3C_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I3C_PP_TIMING);
+	master->i3c_pp_timing = scl_timing;
 
 	/*
 	 * In pure i3c mode, MST_FREE represents tCAS. In shared mode, this
 	 * will be set up by dw_i2c_clk_cfg as tLOW.
 	 */
-	if (master->base.bus.mode == I3C_BUS_MODE_PURE)
+	if (master->base.bus.mode == I3C_BUS_MODE_PURE) {
 		writel(BUS_I3C_MST_FREE(lcnt), master->regs + BUS_FREE_TIMING);
+		master->bus_free_timing = BUS_I3C_MST_FREE(lcnt);
+	}
 
 	lcnt = max_t(u8,
 		     DIV_ROUND_UP(I3C_BUS_TLOW_OD_MIN_NS, core_period), lcnt);
 	scl_timing = SCL_I3C_TIMING_HCNT(hcnt) | SCL_I3C_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I3C_OD_TIMING);
+	master->i3c_od_timing = scl_timing;
 
 	lcnt = DIV_ROUND_UP(core_rate, I3C_BUS_SDR1_SCL_RATE) - hcnt;
 	scl_timing = SCL_EXT_LCNT_1(lcnt);
@@ -565,6 +569,7 @@ static int dw_i3c_clk_cfg(struct dw_i3c_master *master)
 	lcnt = DIV_ROUND_UP(core_rate, I3C_BUS_SDR4_SCL_RATE) - hcnt;
 	scl_timing |= SCL_EXT_LCNT_4(lcnt);
 	writel(scl_timing, master->regs + SCL_EXT_LCNT_TIMING);
+	master->ext_lcnt_timing = scl_timing;
 
 	return 0;
 }
@@ -586,16 +591,21 @@ static int dw_i2c_clk_cfg(struct dw_i3c_master *master)
 	scl_timing = SCL_I2C_FMP_TIMING_HCNT(hcnt) |
 		     SCL_I2C_FMP_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I2C_FMP_TIMING);
+	master->i2c_fmp_timing = scl_timing;
 
 	lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FM_TLOW_MIN_NS, core_period);
 	hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_RATE) - lcnt;
 	scl_timing = SCL_I2C_FM_TIMING_HCNT(hcnt) |
 		     SCL_I2C_FM_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I2C_FM_TIMING);
+	master->i2c_fm_timing = scl_timing;
 
 	writel(BUS_I3C_MST_FREE(lcnt), master->regs + BUS_FREE_TIMING);
+	master->bus_free_timing = BUS_I3C_MST_FREE(lcnt);
+
 	writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_I2C_SLAVE_PRESENT,
 	       master->regs + DEVICE_CTRL);
+	master->i2c_slv_prsnt = true;
 
 	return 0;
 }
@@ -650,7 +660,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 
 	writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(ret),
 	       master->regs + DEVICE_ADDR);
-
+	master->dev_addr = ret;
 	memset(&info, 0, sizeof(info));
 	info.dyn_addr = ret;
 
@@ -1077,6 +1087,7 @@ static int dw_i3c_master_attach_i2c_dev(struct i2c_dev_desc *dev)
 
 	data->index = pos;
 	master->devs[pos].addr = dev->addr;
+	master->devs[pos].is_i2c_addr = true;
 	master->free_pos &= ~BIT(pos);
 	i2c_dev_set_master_data(dev, data);
 
diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h
index fb7121c6c687..f23e9d5aca86 100644
--- a/drivers/i3c/master/dw-i3c-master.h
+++ b/drivers/i3c/master/dw-i3c-master.h
@@ -19,6 +19,7 @@ struct dw_i3c_master_caps {
 
 struct dw_i3c_dat_entry {
 	u8 addr;
+	bool is_i2c_addr;
 	struct i3c_dev_desc *ibi_dev;
 };
 
@@ -40,6 +41,14 @@ struct dw_i3c_master {
 	char version[5];
 	char type[5];
 	u32 sir_rej_mask;
+	bool i2c_slv_prsnt;
+	u32 dev_addr;
+	u32 i3c_pp_timing;
+	u32 i3c_od_timing;
+	u32 ext_lcnt_timing;
+	u32 bus_free_timing;
+	u32 i2c_fm_timing;
+	u32 i2c_fmp_timing;
 	/*
 	 * Per-device hardware data, used to manage the device address table
 	 * (DAT)

From effd21743c39a6535f204812006054fa172e4313 Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Mon, 8 Jul 2024 06:21:02 +0000
Subject: [PATCH 294/346] i3c: dw: Add some functions for reusability

Separate logic for setting intr/thld registers in a func.
Also modify enable function to take care of setting all fields in DEVICE_CTRL.
These functions can be reused later for power management.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240708062103.3296587-3-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 62 +++++++++++++++++-------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index fcfa37f55d86..fadb58ac8b68 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -300,7 +300,14 @@ static void dw_i3c_master_disable(struct dw_i3c_master *master)
 
 static void dw_i3c_master_enable(struct dw_i3c_master *master)
 {
-	writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_ENABLE,
+	u32 dev_ctrl;
+
+	dev_ctrl = readl(master->regs + DEVICE_CTRL);
+	/* For now don't support Hot-Join */
+	dev_ctrl |= DEV_CTRL_HOT_JOIN_NACK;
+	if (master->i2c_slv_prsnt)
+		dev_ctrl |= DEV_CTRL_I2C_SLAVE_PRESENT;
+	writel(dev_ctrl | DEV_CTRL_ENABLE,
 	       master->regs + DEVICE_CTRL);
 }
 
@@ -521,6 +528,32 @@ static void dw_i3c_master_end_xfer_locked(struct dw_i3c_master *master, u32 isr)
 	dw_i3c_master_start_xfer_locked(master);
 }
 
+static void dw_i3c_master_set_intr_regs(struct dw_i3c_master *master)
+{
+	u32 thld_ctrl;
+
+	thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL);
+	thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK |
+		       QUEUE_THLD_CTRL_IBI_STAT_MASK |
+		       QUEUE_THLD_CTRL_IBI_DATA_MASK);
+	thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) |
+		QUEUE_THLD_CTRL_IBI_DATA(31);
+	writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL);
+
+	thld_ctrl = readl(master->regs + DATA_BUFFER_THLD_CTRL);
+	thld_ctrl &= ~DATA_BUFFER_THLD_CTRL_RX_BUF;
+	writel(thld_ctrl, master->regs + DATA_BUFFER_THLD_CTRL);
+
+	writel(INTR_ALL, master->regs + INTR_STATUS);
+	writel(INTR_MASTER_MASK, master->regs + INTR_STATUS_EN);
+	writel(INTR_MASTER_MASK, master->regs + INTR_SIGNAL_EN);
+
+	master->sir_rej_mask = IBI_REQ_REJECT_ALL;
+	writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT);
+
+	writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT);
+}
+
 static int dw_i3c_clk_cfg(struct dw_i3c_master *master)
 {
 	unsigned long core_rate, core_period;
@@ -615,7 +648,6 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 	struct dw_i3c_master *master = to_dw_i3c_master(m);
 	struct i3c_bus *bus = i3c_master_get_bus(m);
 	struct i3c_device_info info = { };
-	u32 thld_ctrl;
 	int ret;
 
 	ret = master->platform_ops->init(master);
@@ -638,22 +670,6 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 		return -EINVAL;
 	}
 
-	thld_ctrl = readl(master->regs + QUEUE_THLD_CTRL);
-	thld_ctrl &= ~(QUEUE_THLD_CTRL_RESP_BUF_MASK |
-		       QUEUE_THLD_CTRL_IBI_STAT_MASK |
-		       QUEUE_THLD_CTRL_IBI_DATA_MASK);
-	thld_ctrl |= QUEUE_THLD_CTRL_IBI_STAT(1) |
-		QUEUE_THLD_CTRL_IBI_DATA(31);
-	writel(thld_ctrl, master->regs + QUEUE_THLD_CTRL);
-
-	thld_ctrl = readl(master->regs + DATA_BUFFER_THLD_CTRL);
-	thld_ctrl &= ~DATA_BUFFER_THLD_CTRL_RX_BUF;
-	writel(thld_ctrl, master->regs + DATA_BUFFER_THLD_CTRL);
-
-	writel(INTR_ALL, master->regs + INTR_STATUS);
-	writel(INTR_MASTER_MASK, master->regs + INTR_STATUS_EN);
-	writel(INTR_MASTER_MASK, master->regs + INTR_SIGNAL_EN);
-
 	ret = i3c_master_get_free_addr(m, 0);
 	if (ret < 0)
 		return ret;
@@ -668,15 +684,7 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 	if (ret)
 		return ret;
 
-	master->sir_rej_mask = IBI_REQ_REJECT_ALL;
-	writel(master->sir_rej_mask, master->regs + IBI_SIR_REQ_REJECT);
-
-	writel(IBI_REQ_REJECT_ALL, master->regs + IBI_MR_REQ_REJECT);
-
-	/* For now don't support Hot-Join */
-	writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_HOT_JOIN_NACK,
-	       master->regs + DEVICE_CTRL);
-
+	dw_i3c_master_set_intr_regs(master);
 	dw_i3c_master_enable(master);
 
 	return 0;

From 62fe9d06f5709caa1fa68055873ff279e05ade37 Mon Sep 17 00:00:00 2001
From: Aniket <aniketmaurya@google.com>
Date: Mon, 8 Jul 2024 06:21:03 +0000
Subject: [PATCH 295/346] i3c: dw: Add power management support

Add support for runtime and system power management.
Handle clocks, resets, pads as part of suspend and resume.
Restore controller registers that could be lost due to suspend.
Finally add get and put calls appropriately in functions which
access controller : bus_init, do_daa, send_ccc_cmd, priv_xfers,
i2c_xfers, ibi and hot-join.

Signed-off-by: Aniket <aniketmaurya@google.com>
Link: https://lore.kernel.org/r/20240708062103.3296587-4-aniketmaurya@google.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/dw-i3c-master.c | 225 ++++++++++++++++++++++++++---
 drivers/i3c/master/dw-i3c-master.h |   1 +
 2 files changed, 209 insertions(+), 17 deletions(-)

diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index fadb58ac8b68..8d694672c110 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -17,7 +17,9 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/pinctrl/consumer.h>
 #include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/reset.h>
 #include <linux/slab.h>
 
@@ -217,7 +219,7 @@
 #define I3C_BUS_THIGH_MAX_NS		41
 
 #define XFER_TIMEOUT (msecs_to_jiffies(1000))
-
+#define RPM_AUTOSUSPEND_TIMEOUT 1000 /* ms */
 struct dw_i3c_cmd {
 	u32 cmd_lo;
 	u32 cmd_hi;
@@ -650,29 +652,38 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 	struct i3c_device_info info = { };
 	int ret;
 
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
 	ret = master->platform_ops->init(master);
 	if (ret)
-		return ret;
+		goto rpm_out;
 
 	switch (bus->mode) {
 	case I3C_BUS_MODE_MIXED_FAST:
 	case I3C_BUS_MODE_MIXED_LIMITED:
 		ret = dw_i2c_clk_cfg(master);
 		if (ret)
-			return ret;
+			goto rpm_out;
 		fallthrough;
 	case I3C_BUS_MODE_PURE:
 		ret = dw_i3c_clk_cfg(master);
 		if (ret)
-			return ret;
+			goto rpm_out;
 		break;
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
+		goto rpm_out;
 	}
 
 	ret = i3c_master_get_free_addr(m, 0);
 	if (ret < 0)
-		return ret;
+		goto rpm_out;
 
 	writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(ret),
 	       master->regs + DEVICE_ADDR);
@@ -682,12 +693,15 @@ static int dw_i3c_master_bus_init(struct i3c_master_controller *m)
 
 	ret = i3c_master_set_info(&master->base, &info);
 	if (ret)
-		return ret;
+		goto rpm_out;
 
 	dw_i3c_master_set_intr_regs(master);
 	dw_i3c_master_enable(master);
 
-	return 0;
+rpm_out:
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
+	return ret;
 }
 
 static void dw_i3c_master_bus_cleanup(struct i3c_master_controller *m)
@@ -789,11 +803,21 @@ static int dw_i3c_master_send_ccc_cmd(struct i3c_master_controller *m,
 	if (ccc->id == I3C_CCC_ENTDAA)
 		return -EINVAL;
 
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
 	if (ccc->rnw)
 		ret = dw_i3c_ccc_get(master, ccc);
 	else
 		ret = dw_i3c_ccc_set(master, ccc);
 
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
 	return ret;
 }
 
@@ -806,6 +830,14 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m)
 	u8 p, last_addr = 0;
 	int ret, pos;
 
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
 	olddevs = ~(master->free_pos);
 
 	/* Prepare DAT before launching DAA. */
@@ -814,8 +846,10 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m)
 			continue;
 
 		ret = i3c_master_get_free_addr(m, last_addr + 1);
-		if (ret < 0)
-			return -ENOSPC;
+		if (ret < 0) {
+			ret = -ENOSPC;
+			goto rpm_out;
+		}
 
 		master->devs[pos].addr = ret;
 		p = even_parity(ret);
@@ -825,16 +859,21 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m)
 		writel(DEV_ADDR_TABLE_DYNAMIC_ADDR(ret),
 		       master->regs +
 		       DEV_ADDR_TABLE_LOC(master->datstartaddr, pos));
+
+		ret = 0;
 	}
 
 	xfer = dw_i3c_master_alloc_xfer(master, 1);
-	if (!xfer)
-		return -ENOMEM;
+	if (!xfer) {
+		ret = -ENOMEM;
+		goto rpm_out;
+	}
 
 	pos = dw_i3c_master_get_free_pos(master);
 	if (pos < 0) {
 		dw_i3c_master_free_xfer(xfer);
-		return pos;
+		ret = pos;
+		goto rpm_out;
 	}
 	cmd = &xfer->cmds[0];
 	cmd->cmd_hi = 0x1;
@@ -859,7 +898,10 @@ static int dw_i3c_master_daa(struct i3c_master_controller *m)
 
 	dw_i3c_master_free_xfer(xfer);
 
-	return 0;
+rpm_out:
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
+	return ret;
 }
 
 static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev,
@@ -894,6 +936,14 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev,
 	if (!xfer)
 		return -ENOMEM;
 
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
 	for (i = 0; i < i3c_nxfers; i++) {
 		struct dw_i3c_cmd *cmd = &xfer->cmds[i];
 
@@ -935,6 +985,8 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev,
 	ret = xfer->ret;
 	dw_i3c_master_free_xfer(xfer);
 
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
 	return ret;
 }
 
@@ -1045,6 +1097,14 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev,
 	if (!xfer)
 		return -ENOMEM;
 
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
+
 	for (i = 0; i < i2c_nxfers; i++) {
 		struct dw_i3c_cmd *cmd = &xfer->cmds[i];
 
@@ -1075,6 +1135,8 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev,
 	ret = xfer->ret;
 	dw_i3c_master_free_xfer(xfer);
 
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
 	return ret;
 }
 
@@ -1217,6 +1279,15 @@ static void dw_i3c_master_set_sir_enabled(struct dw_i3c_master *master,
 static int dw_i3c_master_enable_hotjoin(struct i3c_master_controller *m)
 {
 	struct dw_i3c_master *master = to_dw_i3c_master(m);
+	int ret;
+
+	ret = pm_runtime_resume_and_get(master->dev);
+	if (ret < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, ret);
+		return ret;
+	}
 
 	dw_i3c_master_enable_sir_signal(master, true);
 	writel(readl(master->regs + DEVICE_CTRL) & ~DEV_CTRL_HOT_JOIN_NACK,
@@ -1232,6 +1303,8 @@ static int dw_i3c_master_disable_hotjoin(struct i3c_master_controller *m)
 	writel(readl(master->regs + DEVICE_CTRL) | DEV_CTRL_HOT_JOIN_NACK,
 	       master->regs + DEVICE_CTRL);
 
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
 	return 0;
 }
 
@@ -1242,12 +1315,23 @@ static int dw_i3c_master_enable_ibi(struct i3c_dev_desc *dev)
 	struct dw_i3c_master *master = to_dw_i3c_master(m);
 	int rc;
 
+	rc = pm_runtime_resume_and_get(master->dev);
+	if (rc < 0) {
+		dev_err(master->dev,
+			"<%s> cannot resume i3c bus master, err: %d\n",
+			__func__, rc);
+		return rc;
+	}
+
 	dw_i3c_master_set_sir_enabled(master, dev, data->index, true);
 
 	rc = i3c_master_enec_locked(m, dev->info.dyn_addr, I3C_CCC_EVENT_SIR);
 
-	if (rc)
+	if (rc) {
 		dw_i3c_master_set_sir_enabled(master, dev, data->index, false);
+		pm_runtime_mark_last_busy(master->dev);
+		pm_runtime_put_autosuspend(master->dev);
+	}
 
 	return rc;
 }
@@ -1265,6 +1349,8 @@ static int dw_i3c_master_disable_ibi(struct i3c_dev_desc *dev)
 
 	dw_i3c_master_set_sir_enabled(master, dev, data->index, false);
 
+	pm_runtime_mark_last_busy(master->dev);
+	pm_runtime_put_autosuspend(master->dev);
 	return 0;
 }
 
@@ -1465,6 +1551,8 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	if (!master->platform_ops)
 		master->platform_ops = &dw_i3c_platform_ops_default;
 
+	master->dev = &pdev->dev;
+
 	master->regs = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(master->regs))
 		return PTR_ERR(master->regs);
@@ -1497,6 +1585,11 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 
 	platform_set_drvdata(pdev, master);
 
+	pm_runtime_set_autosuspend_delay(&pdev->dev, RPM_AUTOSUSPEND_TIMEOUT);
+	pm_runtime_use_autosuspend(&pdev->dev);
+	pm_runtime_set_active(&pdev->dev);
+	pm_runtime_enable(&pdev->dev);
+
 	/* Information regarding the FIFOs/QUEUEs depth */
 	ret = readl(master->regs + QUEUE_STATUS_LEVEL);
 	master->caps.cmdfifodepth = QUEUE_STATUS_LEVEL_CMD(ret);
@@ -1513,10 +1606,15 @@ int dw_i3c_common_probe(struct dw_i3c_master *master,
 	ret = i3c_master_register(&master->base, &pdev->dev,
 				  &dw_mipi_i3c_ops, false);
 	if (ret)
-		goto err_assert_rst;
+		goto err_disable_pm;
 
 	return 0;
 
+err_disable_pm:
+	pm_runtime_disable(&pdev->dev);
+	pm_runtime_set_suspended(&pdev->dev);
+	pm_runtime_dont_use_autosuspend(&pdev->dev);
+
 err_assert_rst:
 	reset_control_assert(master->core_rst);
 
@@ -1528,7 +1626,9 @@ void dw_i3c_common_remove(struct dw_i3c_master *master)
 {
 	i3c_master_unregister(&master->base);
 
-	reset_control_assert(master->core_rst);
+	pm_runtime_disable(master->dev);
+	pm_runtime_set_suspended(master->dev);
+	pm_runtime_dont_use_autosuspend(master->dev);
 }
 EXPORT_SYMBOL_GPL(dw_i3c_common_remove);
 
@@ -1552,6 +1652,96 @@ static void dw_i3c_remove(struct platform_device *pdev)
 	dw_i3c_common_remove(master);
 }
 
+static void dw_i3c_master_restore_addrs(struct dw_i3c_master *master)
+{
+	u32 pos, reg_val;
+
+	writel(DEV_ADDR_DYNAMIC_ADDR_VALID | DEV_ADDR_DYNAMIC(master->dev_addr),
+	       master->regs + DEVICE_ADDR);
+
+	for (pos = 0; pos < master->maxdevs; pos++) {
+		if (master->free_pos & BIT(pos))
+			continue;
+
+		if (master->devs[pos].is_i2c_addr)
+			reg_val = DEV_ADDR_TABLE_LEGACY_I2C_DEV |
+			       DEV_ADDR_TABLE_STATIC_ADDR(master->devs[pos].addr);
+		else
+			reg_val = DEV_ADDR_TABLE_DYNAMIC_ADDR(master->devs[pos].addr);
+
+		writel(reg_val, master->regs + DEV_ADDR_TABLE_LOC(master->datstartaddr, pos));
+	}
+}
+
+static void dw_i3c_master_restore_timing_regs(struct dw_i3c_master *master)
+{
+	writel(master->i3c_pp_timing, master->regs + SCL_I3C_PP_TIMING);
+	writel(master->bus_free_timing, master->regs + BUS_FREE_TIMING);
+	writel(master->i3c_od_timing, master->regs + SCL_I3C_OD_TIMING);
+	writel(master->ext_lcnt_timing, master->regs + SCL_EXT_LCNT_TIMING);
+
+	if (master->i2c_slv_prsnt) {
+		writel(master->i2c_fmp_timing, master->regs + SCL_I2C_FMP_TIMING);
+		writel(master->i2c_fm_timing, master->regs + SCL_I2C_FM_TIMING);
+	}
+}
+
+static int dw_i3c_master_enable_clks(struct dw_i3c_master *master)
+{
+	int ret = 0;
+
+	ret = clk_prepare_enable(master->core_clk);
+	if (ret)
+		return ret;
+
+	ret = clk_prepare_enable(master->pclk);
+	if (ret) {
+		clk_disable_unprepare(master->core_clk);
+		return ret;
+	}
+
+	return 0;
+}
+
+static inline void dw_i3c_master_disable_clks(struct dw_i3c_master *master)
+{
+	clk_disable_unprepare(master->pclk);
+	clk_disable_unprepare(master->core_clk);
+}
+
+static int __maybe_unused dw_i3c_master_runtime_suspend(struct device *dev)
+{
+	struct dw_i3c_master *master = dev_get_drvdata(dev);
+
+	dw_i3c_master_disable(master);
+
+	reset_control_assert(master->core_rst);
+	dw_i3c_master_disable_clks(master);
+	pinctrl_pm_select_sleep_state(dev);
+	return 0;
+}
+
+static int __maybe_unused dw_i3c_master_runtime_resume(struct device *dev)
+{
+	struct dw_i3c_master *master = dev_get_drvdata(dev);
+
+	pinctrl_pm_select_default_state(dev);
+	dw_i3c_master_enable_clks(master);
+	reset_control_deassert(master->core_rst);
+
+	dw_i3c_master_set_intr_regs(master);
+	dw_i3c_master_restore_timing_regs(master);
+	dw_i3c_master_restore_addrs(master);
+
+	dw_i3c_master_enable(master);
+	return 0;
+}
+
+static const struct dev_pm_ops dw_i3c_pm_ops = {
+	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend, pm_runtime_force_resume)
+	SET_RUNTIME_PM_OPS(dw_i3c_master_runtime_suspend, dw_i3c_master_runtime_resume, NULL)
+};
+
 static const struct of_device_id dw_i3c_master_of_match[] = {
 	{ .compatible = "snps,dw-i3c-master-1.00a", },
 	{},
@@ -1564,6 +1754,7 @@ static struct platform_driver dw_i3c_driver = {
 	.driver = {
 		.name = "dw-i3c-master",
 		.of_match_table = dw_i3c_master_of_match,
+		.pm = &dw_i3c_pm_ops,
 	},
 };
 module_platform_driver(dw_i3c_driver);
diff --git a/drivers/i3c/master/dw-i3c-master.h b/drivers/i3c/master/dw-i3c-master.h
index f23e9d5aca86..219ff815d3a7 100644
--- a/drivers/i3c/master/dw-i3c-master.h
+++ b/drivers/i3c/master/dw-i3c-master.h
@@ -25,6 +25,7 @@ struct dw_i3c_dat_entry {
 
 struct dw_i3c_master {
 	struct i3c_master_controller base;
+	struct device *dev;
 	u16 maxdevs;
 	u16 datstartaddr;
 	u32 free_pos;

From 63c33ca0969cf4d4574103b69fb46f58a19a182b Mon Sep 17 00:00:00 2001
From: Bhoomik Gupta <bhoomik.gupta@nxp.com>
Date: Mon, 8 Jul 2024 11:08:35 +0530
Subject: [PATCH 296/346] i3c: master: Enhance i3c_bus_type visibility for
 device searching & event monitoring

Improve the visibility of i3c_bus_type to facilitate searching for
i3c devices attached to the i3c bus. Enable other drivers to use
bus_register_notifier to monitor i3c bus device events.

Signed-off-by: Bhoomik Gupta <bhoomik.gupta@nxp.com>
Link: https://lore.kernel.org/r/20240708053835.3003986-1-bhoomik.gupta@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/internals.h    | 2 --
 drivers/i3c/master.c       | 1 +
 include/linux/i3c/master.h | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/i3c/internals.h b/drivers/i3c/internals.h
index 4d99a3524171..433f6088b7ce 100644
--- a/drivers/i3c/internals.h
+++ b/drivers/i3c/internals.h
@@ -10,8 +10,6 @@
 
 #include <linux/i3c/master.h>
 
-extern const struct bus_type i3c_bus_type;
-
 void i3c_bus_normaluse_lock(struct i3c_bus *bus);
 void i3c_bus_normaluse_unlock(struct i3c_bus *bus);
 
diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index 3b4d6a8edca3..a211dc4d25bb 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -342,6 +342,7 @@ const struct bus_type i3c_bus_type = {
 	.probe = i3c_device_probe,
 	.remove = i3c_device_remove,
 };
+EXPORT_SYMBOL_GPL(i3c_bus_type);
 
 static enum i3c_addr_slot_status
 i3c_bus_get_addr_slot_status(struct i3c_bus *bus, u16 addr)
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 0ca27dd86956..074f632868d9 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -33,6 +33,7 @@ enum {
 struct i3c_master_controller;
 struct i3c_bus;
 struct i3c_device;
+extern const struct bus_type i3c_bus_type;
 
 /**
  * struct i3c_i2c_dev_desc - Common part of the I3C/I2C device descriptor

From b73c983491e0a61331611d2dab1dfccd4464b060 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Thu, 18 Jul 2024 13:30:22 -0500
Subject: [PATCH 297/346] i3c: master: svc: Fix error code in
 svc_i3c_master_do_daa_locked()

This code has a typo so it returns positive EIO instead of negative -EIO.  Fix
it!

Fixes: a7809cb368b9 ("i3c: master: svc: Improve DAA STOP handle code logic")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/e017edfc-da64-496b-8516-958bec27cd9a@stanley.mountain
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master/svc-i3c-master.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c
index e80c002991f7..0a68fd1b81d4 100644
--- a/drivers/i3c/master/svc-i3c-master.c
+++ b/drivers/i3c/master/svc-i3c-master.c
@@ -874,7 +874,7 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master,
 				 * address this time.
 				 */
 				if (prov_id[dev_nb] == nacking_prov_id) {
-					ret = EIO;
+					ret = -EIO;
 					break;
 				}
 

From 24168c5e6dfbdd5b414f048f47f75d64533296ca Mon Sep 17 00:00:00 2001
From: Carlos Song <carlos.song@nxp.com>
Date: Mon, 15 Jul 2024 18:53:51 -0400
Subject: [PATCH 298/346] dt-bindings: i3c: add header for generic I3C flags

Add header file for generic I3C flags to avoid hard code in dts file.

Signed-off-by: Carlos Song <carlos.song@nxp.com>
Reviewed-by: Frank Li <frank.li@nxp.com>
Acked-by: Jason Liu <jason.hui.liu@nxp.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240715225351.3237284-1-Frank.Li@nxp.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 Documentation/devicetree/bindings/i3c/i3c.yaml |  5 ++++-
 MAINTAINERS                                    |  1 +
 include/dt-bindings/i3c/i3c.h                  | 16 ++++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/i3c/i3c.h

diff --git a/Documentation/devicetree/bindings/i3c/i3c.yaml b/Documentation/devicetree/bindings/i3c/i3c.yaml
index 113957ebe9f1..e25fa72fd785 100644
--- a/Documentation/devicetree/bindings/i3c/i3c.yaml
+++ b/Documentation/devicetree/bindings/i3c/i3c.yaml
@@ -91,6 +91,7 @@ patternProperties:
               - const: 0
               - description: |
                   Shall encode the I3C LVR (Legacy Virtual Register):
+                  See include/dt-bindings/i3c/i3c.h
                     bit[31:8]: unused/ignored
                     bit[7:5]: I2C device index. Possible values:
                       * 0: I2C device has a 50 ns spike filter
@@ -153,6 +154,8 @@ additionalProperties: true
 
 examples:
   - |
+    #include <dt-bindings/i3c/i3c.h>
+
     i3c@d040000 {
         compatible = "cdns,i3c-master";
         clocks = <&coreclock>, <&i3csysclock>;
@@ -166,7 +169,7 @@ examples:
         /* I2C device. */
         eeprom@57 {
             compatible = "atmel,24c01";
-            reg = <0x57 0x0 0x10>;
+            reg = <0x57 0x0 (I2C_FM | I2C_FILTER)>;
             pagesize = <0x8>;
         };
 
diff --git a/MAINTAINERS b/MAINTAINERS
index d6c90161c7bf..8b27d7752b9b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10479,6 +10479,7 @@ F:	Documentation/ABI/testing/sysfs-bus-i3c
 F:	Documentation/devicetree/bindings/i3c/
 F:	Documentation/driver-api/i3c
 F:	drivers/i3c/
+F:	include/dt-bindings/i3c/
 F:	include/linux/i3c/
 
 IBM Operation Panel Input Driver
diff --git a/include/dt-bindings/i3c/i3c.h b/include/dt-bindings/i3c/i3c.h
new file mode 100644
index 000000000000..373439218bba
--- /dev/null
+++ b/include/dt-bindings/i3c/i3c.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
+/*
+ * Copyright 2024 NXP
+ */
+
+#ifndef _DT_BINDINGS_I3C_I3C_H
+#define _DT_BINDINGS_I3C_I3C_H
+
+#define I2C_FM      (1 << 4)
+#define I2C_FM_PLUS (0 << 4)
+
+#define I2C_FILTER  (0 << 5)
+#define I2C_NO_FILTER_HIGH_FREQUENCY    (1 << 5)
+#define I2C_NO_FILTER_LOW_FREQUENCY     (2 << 5)
+
+#endif

From 5d5fc33ce58e81e8738816f5ee59f8e85fd3b404 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <antonb@tenstorrent.com>
Date: Thu, 6 Jun 2024 23:13:35 -0700
Subject: [PATCH 299/346] riscv: Improve exception and system call latency

Many CPUs implement return address branch prediction as a stack. The
RISCV architecture refers to this as a return address stack (RAS). If
this gets corrupted then the CPU will mispredict at least one but
potentally many function returns.

There are two issues with the current RISCV exception code:

- We are using the alternate link stack (x5/t0) for the indirect branch
  which makes the hardware think this is a function return. This will
  corrupt the RAS.

- We modify the return address of handle_exception to point to
  ret_from_exception. This will also corrupt the RAS.

Testing the null system call latency before and after the patch:

Visionfive2 (StarFive JH7110 / U74)
baseline: 189.87 ns
patched:  176.76 ns

Lichee pi 4a (T-Head TH1520 / C910)
baseline: 666.58 ns
patched:  636.90 ns

Just over 7% on the U74 and just over 4% on the C910.

Signed-off-by: Anton Blanchard <antonb@tenstorrent.com>
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Tested-by: Jisheng Zhang <jszhang@kernel.org>
Reviewed-by: Jisheng Zhang <jszhang@kernel.org>
Link: https://lore.kernel.org/r/20240607061335.2197383-1-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/entry.S      | 17 ++++++++++-------
 arch/riscv/kernel/stacktrace.c |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 68a24cf9481a..c933460ed3e9 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -88,7 +88,6 @@ SYM_CODE_START(handle_exception)
 	call riscv_v_context_nesting_start
 #endif
 	move a0, sp /* pt_regs */
-	la ra, ret_from_exception
 
 	/*
 	 * MSB of cause differentiates between
@@ -97,7 +96,8 @@ SYM_CODE_START(handle_exception)
 	bge s4, zero, 1f
 
 	/* Handle interrupts */
-	tail do_irq
+	call do_irq
+	j ret_from_exception
 1:
 	/* Handle other exceptions */
 	slli t0, s4, RISCV_LGPTR
@@ -105,11 +105,14 @@ SYM_CODE_START(handle_exception)
 	la t2, excp_vect_table_end
 	add t0, t1, t0
 	/* Check if exception code lies within bounds */
-	bgeu t0, t2, 1f
-	REG_L t0, 0(t0)
-	jr t0
-1:
-	tail do_trap_unknown
+	bgeu t0, t2, 3f
+	REG_L t1, 0(t0)
+2:	jalr t1
+	j ret_from_exception
+3:
+
+	la t1, do_trap_unknown
+	j 2b
 SYM_CODE_END(handle_exception)
 ASM_NOKPROBE(handle_exception)
 
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 10e311b2759d..c6d5de22463f 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_FRAME_POINTER
 
-extern asmlinkage void ret_from_exception(void);
+extern asmlinkage void handle_exception(void);
 
 static inline int fp_is_valid(unsigned long fp, unsigned long sp)
 {
@@ -71,7 +71,7 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs,
 			fp = frame->fp;
 			pc = ftrace_graph_ret_addr(current, &graph_idx, frame->ra,
 						   &frame->ra);
-			if (pc == (unsigned long)ret_from_exception) {
+			if (pc == (unsigned long)handle_exception) {
 				if (unlikely(!__kernel_text_address(pc) || !fn(arg, pc)))
 					break;
 

From 1d20e5d437cfebefb5e6f4d652c3a1561fc23fc7 Mon Sep 17 00:00:00 2001
From: Zhongqiu Han <quic_zhonhan@quicinc.com>
Date: Thu, 20 Jun 2024 11:34:34 +0800
Subject: [PATCH 300/346] riscv: signal: Remove unlikely() from WARN_ON()
 condition

"WARN_ON(unlikely(x))" is excessive. WARN_ON() already uses unlikely()
internally.

Signed-off-by: Zhongqiu Han <quic_zhonhan@quicinc.com>
Reviewed-by: Bjorn Andersson <quic_bjorande@quicinc.com>
Reviewed-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20240620033434.3778156-1-quic_zhonhan@quicinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 5a2edd7f027e..dcd282419456 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -84,7 +84,7 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec)
 	datap = state + 1;
 
 	/* datap is designed to be 16 byte aligned for better performance */
-	WARN_ON(unlikely(!IS_ALIGNED((unsigned long)datap, 16)));
+	WARN_ON(!IS_ALIGNED((unsigned long)datap, 16));
 
 	get_cpu_vector_context();
 	riscv_v_vstate_save(&current->thread.vstate, regs);

From b5db73fb18257cd5d9cb59bc4b779fffa629566a Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Mon, 24 Jun 2024 07:53:16 +0800
Subject: [PATCH 301/346] riscv: enable HAVE_ARCH_STACKLEAK

Add support for the stackleak feature. Whenever the kernel returns to user
space the kernel stack is filled with a poison value.

At the same time, disables the plugin in EFI stub code because EFI stub
is out of scope for the protection.

Tested on qemu and milkv duo:
/ # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT
[   38.675575] lkdtm: Performing direct entry STACKLEAK_ERASING
[   38.678448] lkdtm: stackleak stack usage:
[   38.678448]   high offset: 288 bytes
[   38.678448]   current:     496 bytes
[   38.678448]   lowest:      1328 bytes
[   38.678448]   tracked:     1328 bytes
[   38.678448]   untracked:   448 bytes
[   38.678448]   poisoned:    14312 bytes
[   38.678448]   low offset:  8 bytes
[   38.689887] lkdtm: OK: the rest of the thread stack is properly erased

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240623235316.2010-1-jszhang@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig                    | 1 +
 arch/riscv/include/asm/thread_info.h  | 1 +
 arch/riscv/kernel/entry.S             | 4 ++++
 drivers/firmware/efi/libstub/Makefile | 3 ++-
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 34ae39ecb898..19ca402e6f26 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -123,6 +123,7 @@ config RISCV
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 5d473343634b..fca5c6be2b81 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -10,6 +10,7 @@
 
 #include <asm/page.h>
 #include <linux/const.h>
+#include <linux/sizes.h>
 
 /* thread information allocation */
 #define THREAD_SIZE_ORDER	CONFIG_THREAD_SIZE_ORDER
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index c933460ed3e9..ac2e908d4418 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -133,6 +133,10 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
 #endif
 	bnez s0, 1f
 
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	call	stackleak_erase_on_task_stack
+#endif
+
 	/* Save unwound kernel stack pointer in thread_info */
 	addi s0, sp, PT_SIZE_ON_STACK
 	REG_S s0, TASK_TI_KERNEL_SP(tp)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 1f32d6cf98d6..f23ba62ce127 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -28,7 +28,8 @@ cflags-$(CONFIG_ARM)		+= -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
 				   -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
 				   -DEFI_HAVE_STRCMP -fno-builtin -fpic \
 				   $(call cc-option,-mno-single-pic-base)
-cflags-$(CONFIG_RISCV)		+= -fpic -DNO_ALTERNATIVE -mno-relax
+cflags-$(CONFIG_RISCV)		+= -fpic -DNO_ALTERNATIVE -mno-relax \
+				   $(DISABLE_STACKLEAK_PLUGIN)
 cflags-$(CONFIG_LOONGARCH)	+= -fpie
 
 cflags-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= -I$(srctree)/scripts/dtc/libfdt

From 5c8405d763dc2b125b39166bc70be1b8dcc80582 Mon Sep 17 00:00:00 2001
From: Stuart Menefy <stuart.menefy@codasip.com>
Date: Sun, 30 Jun 2024 12:05:49 +0100
Subject: [PATCH 302/346] riscv: Extend sv39 linear mapping max size to 128G

This harmonizes all virtual addressing modes which can now all map
(PGDIR_SIZE * PTRS_PER_PGD) / 4 of physical memory.

The RISCV implementation of KASAN requires that the boundary between
shallow mappings are aligned on an 8G boundary. In this case we need
VMALLOC_START to be 8G aligned. So although we only need to move the
start of the linear mapping down by 4GiB to allow 128GiB to be mapped,
we actually move it down by 8GiB (creating a 4GiB hole between the
linear mapping and KASAN shadow space) to maintain the alignment
requirement.

Signed-off-by: Stuart Menefy <stuart.menefy@codasip.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20240630110550.1731929-1-stuart.menefy@codasip.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 Documentation/arch/riscv/vm-layout.rst | 11 ++++++-----
 arch/riscv/include/asm/page.h          |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/Documentation/arch/riscv/vm-layout.rst b/Documentation/arch/riscv/vm-layout.rst
index e476b4386bd9..077b968dcc81 100644
--- a/Documentation/arch/riscv/vm-layout.rst
+++ b/Documentation/arch/riscv/vm-layout.rst
@@ -47,11 +47,12 @@ RISC-V Linux Kernel SV39
                                                               | Kernel-space virtual memory, shared between all processes:
   ____________________________________________________________|___________________________________________________________
                     |            |                  |         |
-   ffffffc6fea00000 | -228    GB | ffffffc6feffffff |    6 MB | fixmap
-   ffffffc6ff000000 | -228    GB | ffffffc6ffffffff |   16 MB | PCI io
-   ffffffc700000000 | -228    GB | ffffffc7ffffffff |    4 GB | vmemmap
-   ffffffc800000000 | -224    GB | ffffffd7ffffffff |   64 GB | vmalloc/ioremap space
-   ffffffd800000000 | -160    GB | fffffff6ffffffff |  124 GB | direct mapping of all physical memory
+   ffffffc4fea00000 | -236    GB | ffffffc4feffffff |    6 MB | fixmap
+   ffffffc4ff000000 | -236    GB | ffffffc4ffffffff |   16 MB | PCI io
+   ffffffc500000000 | -236    GB | ffffffc5ffffffff |    4 GB | vmemmap
+   ffffffc600000000 | -232    GB | ffffffd5ffffffff |   64 GB | vmalloc/ioremap space
+   ffffffd600000000 | -168    GB | fffffff5ffffffff |  128 GB | direct mapping of all physical memory
+                    |            |                  |         |
    fffffff700000000 |  -36    GB | fffffffeffffffff |   32 GB | kasan
   __________________|____________|__________________|_________|____________________________________________________________
                                                               |
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 235fd45d998d..7ede2111c591 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -37,7 +37,7 @@
  * define the PAGE_OFFSET value for SV48 and SV39.
  */
 #define PAGE_OFFSET_L4		_AC(0xffffaf8000000000, UL)
-#define PAGE_OFFSET_L3		_AC(0xffffffd800000000, UL)
+#define PAGE_OFFSET_L3		_AC(0xffffffd600000000, UL)
 #else
 #define PAGE_OFFSET		_AC(CONFIG_PAGE_OFFSET, UL)
 #endif /* CONFIG_64BIT */

From 52420e483d3e1562f11a208d3c540b27b5e5dbf4 Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@rivosinc.com>
Date: Tue, 2 Jul 2024 11:37:31 +0800
Subject: [PATCH 303/346] RISC-V: Provide the frequency of time CSR via hwprobe

The RISC-V architecture makes a real time counter CSR (via RDTIME
instruction) available for applications in U-mode but there is no
architected mechanism for an application to discover the frequency
the counter is running at. Some applications (e.g., DPDK) use the
time counter for basic performance analysis as well as fine grained
time-keeping.

Add support to the hwprobe system call to export the time CSR
frequency to code running in U-mode.

Signed-off-by: Yunhui Cui <cuiyunhui@bytedance.com>
Reviewed-by: Evan Green <evan@rivosinc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Acked-by: Punit Agrawal <punit.agrawal@bytedance.com>
Link: https://lore.kernel.org/r/20240702033731.71955-2-cuiyunhui@bytedance.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 Documentation/arch/riscv/hwprobe.rst  | 2 ++
 arch/riscv/include/asm/hwprobe.h      | 2 +-
 arch/riscv/include/uapi/asm/hwprobe.h | 1 +
 arch/riscv/kernel/sys_hwprobe.c       | 5 +++++
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/Documentation/arch/riscv/hwprobe.rst b/Documentation/arch/riscv/hwprobe.rst
index 02eb4d98b7de..3db60a0911df 100644
--- a/Documentation/arch/riscv/hwprobe.rst
+++ b/Documentation/arch/riscv/hwprobe.rst
@@ -264,3 +264,5 @@ The following keys are defined:
 
 * :c:macro:`RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS`: An unsigned long which
   represent the highest userspace virtual address usable.
+
+* :c:macro:`RISCV_HWPROBE_KEY_TIME_CSR_FREQ`: Frequency (in Hz) of `time CSR`.
diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 150a9877b0af..ef01c182af2b 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@
 
 #include <uapi/asm/hwprobe.h>
 
-#define RISCV_HWPROBE_MAX_KEY 7
+#define RISCV_HWPROBE_MAX_KEY 8
 
 static inline bool riscv_hwprobe_key_is_valid(__s64 key)
 {
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index 8b8f6ac0eae2..b706c8e47b02 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -81,6 +81,7 @@ struct riscv_hwprobe {
 #define		RISCV_HWPROBE_MISALIGNED_MASK		(7 << 0)
 #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE	6
 #define RISCV_HWPROBE_KEY_HIGHEST_VIRT_ADDRESS	7
+#define RISCV_HWPROBE_KEY_TIME_CSR_FREQ	8
 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
 
 /* Flags */
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 8a1c9ce170e8..8d1b5c35d2a7 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -9,6 +9,7 @@
 #include <asm/cpufeature.h>
 #include <asm/hwprobe.h>
 #include <asm/processor.h>
+#include <asm/delay.h>
 #include <asm/sbi.h>
 #include <asm/switch_to.h>
 #include <asm/uaccess.h>
@@ -236,6 +237,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 		pair->value = user_max_virt_addr();
 		break;
 
+	case RISCV_HWPROBE_KEY_TIME_CSR_FREQ:
+		pair->value = riscv_timebase;
+		break;
+
 	/*
 	 * For forward compatibility, unknown keys don't fail the whole
 	 * call, but get their element key set to -1 and value set to 0

From e975d955c07cbc2cd6a83a5d8235d8373441fdb9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 26 Jul 2024 14:49:41 +0100
Subject: [PATCH 304/346] regulator: Further restrict RZG2L USB VBCTRL
 regulator dependencies

Since the regulator can't be used without the USB controller also
tighten the dependency to match, as well as the default.

Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://patch.msgid.link/20240726-regulator-restrict-rzg2l-v1-1-640e508896e2@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 5be44606ac15..5560dc4db986 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -1636,7 +1636,7 @@ config REGULATOR_UNIPHIER
 
 config REGULATOR_RZG2L_VBCTRL
 	tristate "Renesas RZ/G2L USB VBUS regulator driver"
-	depends on ARCH_RZG2L || COMPILE_TEST
+	depends on RESET_RZG2L_USBPHY_CTRL || COMPILE_TEST
 	depends on OF
 	select REGMAP_MMIO
 	default RESET_RZG2L_USBPHY_CTRL

From 342b2e395d5f34c9f111a818556e617939f83a8c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 26 Jul 2024 15:24:30 +0100
Subject: [PATCH 305/346] io_uring/napi: use ktime in busy polling

It's more natural to use ktime/ns instead of keeping around usec,
especially since we're comparing it against user provided timers,
so convert napi busy poll internal handling to ktime. It's also nicer
since the type (ktime_t vs unsigned long) now tells the unit of measure.

Keep everything as ktime, which we convert to/from micro seconds for
IORING_[UN]REGISTER_NAPI. The net/ busy polling works seems to work with
usec, however it's not real usec as shift by 10 is used to get it from
nsecs, see busy_loop_current_time(), so it's easy to get truncated nsec
back and we get back better precision.

Note, we can further improve it later by removing the truncation and
maybe convincing net/ to use ktime/ns instead.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/95e7ec8d095069a3ed5d40a4bc6f8b586698bc7e.1722003776.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.h            |  2 +-
 io_uring/napi.c                | 48 +++++++++++++++++++---------------
 io_uring/napi.h                |  2 +-
 4 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e62aa9f0629f..3315005df117 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -404,7 +404,7 @@ struct io_ring_ctx {
 	spinlock_t		napi_lock;	/* napi_list lock */
 
 	/* napi busy poll default timeout */
-	unsigned int		napi_busy_poll_to;
+	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
 	bool			napi_enabled;
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e1ce908f0679..c2acf6180845 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -43,7 +43,7 @@ struct io_wait_queue {
 	ktime_t timeout;
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int napi_busy_poll_to;
+	ktime_t napi_busy_poll_dt;
 	bool napi_prefer_busy_poll;
 #endif
 };
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 327e5f3a8abe..6bdb267e9c33 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -33,6 +33,12 @@ static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
 	return NULL;
 }
 
+static inline ktime_t net_to_ktime(unsigned long t)
+{
+	/* napi approximating usecs, reverse busy_loop_current_time */
+	return ns_to_ktime(t << 10);
+}
+
 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
 {
 	struct hlist_head *hash_list;
@@ -102,14 +108,14 @@ static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
 		__io_napi_remove_stale(ctx);
 }
 
-static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
-					     unsigned long bp_usec)
+static inline bool io_napi_busy_loop_timeout(ktime_t start_time,
+					     ktime_t bp)
 {
-	if (bp_usec) {
-		unsigned long end_time = start_time + bp_usec;
-		unsigned long now = busy_loop_current_time();
+	if (bp) {
+		ktime_t end_time = ktime_add(start_time, bp);
+		ktime_t now = net_to_ktime(busy_loop_current_time());
 
-		return time_after(now, end_time);
+		return ktime_after(now, end_time);
 	}
 
 	return true;
@@ -124,7 +130,8 @@ static bool io_napi_busy_loop_should_end(void *data,
 		return true;
 	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
 		return true;
-	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
+	if (io_napi_busy_loop_timeout(net_to_ktime(start_time),
+				      iowq->napi_busy_poll_dt))
 		return true;
 
 	return false;
@@ -181,10 +188,12 @@ static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
  */
 void io_napi_init(struct io_ring_ctx *ctx)
 {
+	u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC;
+
 	INIT_LIST_HEAD(&ctx->napi_list);
 	spin_lock_init(&ctx->napi_lock);
 	ctx->napi_prefer_busy_poll = false;
-	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
 }
 
 /*
@@ -217,7 +226,7 @@ void io_napi_free(struct io_ring_ctx *ctx)
 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
-		.busy_poll_to 	  = ctx->napi_busy_poll_to,
+		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
 	};
 	struct io_uring_napi napi;
@@ -232,7 +241,7 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 	if (copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
+	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
 	WRITE_ONCE(ctx->napi_enabled, true);
 	return 0;
@@ -249,14 +258,14 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
-		.busy_poll_to 	  = ctx->napi_busy_poll_to,
+		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
 		.prefer_busy_poll = ctx->napi_prefer_busy_poll
 	};
 
 	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_to, 0);
+	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
 	WRITE_ONCE(ctx->napi_enabled, false);
 	return 0;
@@ -275,23 +284,20 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
 			      struct timespec64 *ts)
 {
-	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
+	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
 
 	if (ts) {
 		struct timespec64 poll_to_ts;
 
-		poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
+		poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt));
 		if (timespec64_compare(ts, &poll_to_ts) < 0) {
 			s64 poll_to_ns = timespec64_to_ns(ts);
-			if (poll_to_ns > 0) {
-				u64 val = poll_to_ns + 999;
-				do_div(val, 1000);
-				poll_to = val;
-			}
+			if (poll_to_ns > 0)
+				poll_dt = ns_to_ktime(poll_to_ns);
 		}
 	}
 
-	iowq->napi_busy_poll_to = poll_to;
+	iowq->napi_busy_poll_dt = poll_dt;
 }
 
 /*
@@ -320,7 +326,7 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
 	LIST_HEAD(napi_list);
 	bool is_stale = false;
 
-	if (!READ_ONCE(ctx->napi_busy_poll_to))
+	if (!READ_ONCE(ctx->napi_busy_poll_dt))
 		return 0;
 	if (list_empty_careful(&ctx->napi_list))
 		return 0;
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 6fc0393d0dbe..babbee36cd3e 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -55,7 +55,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
 
-	if (!READ_ONCE(ctx->napi_busy_poll_to))
+	if (!READ_ONCE(ctx->napi_busy_poll_dt))
 		return;
 
 	sock = sock_from_file(req->file);

From 358169617602f6f71b31e5c9532a09b95a34b043 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 26 Jul 2024 15:24:31 +0100
Subject: [PATCH 306/346] io_uring/napi: pass ktime to io_napi_adjust_timeout

Pass the waiting time for __io_napi_adjust_timeout as ktime and get rid
of all timespec64 conversions. It's especially simpler since the caller
already have a ktime.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/4f5b8e8eed4f53a1879e031a6712b25381adc23d.1722003776.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c |  6 ++++--
 io_uring/napi.c     | 14 +++-----------
 io_uring/napi.h     |  8 ++++----
 3 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2626424f5d73..3942db160f18 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2416,12 +2416,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 	if (uts) {
 		struct timespec64 ts;
+		ktime_t dt;
 
 		if (get_timespec64(&ts, uts))
 			return -EFAULT;
 
-		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
-		io_napi_adjust_timeout(ctx, &iowq, &ts);
+		dt = timespec64_to_ktime(ts);
+		iowq.timeout = ktime_add(dt, ktime_get());
+		io_napi_adjust_timeout(ctx, &iowq, dt);
 	}
 
 	if (sig) {
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 6bdb267e9c33..4fd6bb331e1e 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -282,20 +282,12 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
  * the NAPI timeout accordingly.
  */
 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
-			      struct timespec64 *ts)
+			      ktime_t to_wait)
 {
 	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
 
-	if (ts) {
-		struct timespec64 poll_to_ts;
-
-		poll_to_ts = ns_to_timespec64(ktime_to_ns(poll_dt));
-		if (timespec64_compare(ts, &poll_to_ts) < 0) {
-			s64 poll_to_ns = timespec64_to_ns(ts);
-			if (poll_to_ns > 0)
-				poll_dt = ns_to_ktime(poll_to_ns);
-		}
-	}
+	if (to_wait)
+		poll_dt = min(poll_dt, to_wait);
 
 	iowq->napi_busy_poll_dt = poll_dt;
 }
diff --git a/io_uring/napi.h b/io_uring/napi.h
index babbee36cd3e..88f1c21d5548 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -18,7 +18,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
 
 void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
-		struct io_wait_queue *iowq, struct timespec64 *ts);
+		struct io_wait_queue *iowq, ktime_t to_wait);
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
 
@@ -29,11 +29,11 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
 
 static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq,
-					  struct timespec64 *ts)
+					  ktime_t to_wait)
 {
 	if (!io_napi(ctx))
 		return;
-	__io_napi_adjust_timeout(ctx, iowq, ts);
+	__io_napi_adjust_timeout(ctx, iowq, to_wait);
 }
 
 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
@@ -88,7 +88,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 }
 static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq,
-					  struct timespec64 *ts)
+					  ktime_t to_wait)
 {
 }
 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,

From 4c7be57f2772c8ce4e1e43c6a79b8f8d401a4795 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 23 Jul 2024 13:21:02 -0700
Subject: [PATCH 307/346] arm64: allow installing compressed image by default

On arm64 we build compressed images, but "make install" by default will
install the old non-compressed one.  To actually get the compressed
image install, you need to use "make zinstall", which is not the usual
way to install a kernel.

Which may not sound like much of an issue, but when you deal with
multiple architectures (and years of your fingers knowing the regular
"make install" incantation), this inconsistency is pretty annoying.

But as Will Deacon says:
 "Sadly, bootloaders being as top quality as you might expect, I don't
  think we're in a position to rely on decompressor support across the
  board. Our Image.gz is literally just that -- we don't have a built-in
  decompressor (nor do I think we want to rush into that again after the
  fun we had on arm32) and the recent EFI zboot support solves that
  problem for platforms using EFI.

  Changing the default 'install' target terrifies me. There are bound to
  be folks with embedded boards who've scripted this and we could really
  ruin their day if we quietly give them a compressed kernel that their
  bootloader doesn't know how to handle :/"

So make this conditional on a new "COMPRESSED_INSTALL" option.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/Kconfig  | 11 +++++++++++
 arch/arm64/Makefile | 10 ++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7fd70be0463f..d84c5f905b1a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2337,6 +2337,17 @@ config EFI
 	  allow the kernel to be booted as an EFI application. This
 	  is only useful on systems that have UEFI firmware.
 
+config COMPRESSED_INSTALL
+	bool "Install compressed image by default"
+	help
+	  This makes the regular "make install" install the compressed
+	  image we built, not the legacy uncompressed one.
+
+	  You can check that a compressed image works for you by doing
+	  "make zinstall" first, and verifying that everything is fine
+	  in your environment before making "make install" do this for
+	  you.
+
 config DMI
 	bool "Enable support for SMBIOS (DMI) tables"
 	depends on EFI
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 3f0f35fd5bb7..f6bc3da1ef11 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -182,7 +182,13 @@ $(BOOT_TARGETS): vmlinux
 Image.%: Image
 	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
 
-install: KBUILD_IMAGE := $(boot)/Image
+ifeq ($(CONFIG_COMPRESSED_INSTALL),y)
+ DEFAULT_KBUILD_IMAGE = $(KBUILD_IMAGE)
+else
+ DEFAULT_KBUILD_IMAGE = $(boot)/Image
+endif
+
+install: KBUILD_IMAGE := $(DEFAULT_KBUILD_IMAGE)
 install zinstall:
 	$(call cmd,install)
 
@@ -229,7 +235,7 @@ define archhelp
   echo  '* Image.gz      - Compressed kernel image (arch/$(ARCH)/boot/Image.gz)'
   echo  '  Image         - Uncompressed kernel image (arch/$(ARCH)/boot/Image)'
   echo  '  image.fit     - Flat Image Tree (arch/$(ARCH)/boot/image.fit)'
-  echo  '  install       - Install uncompressed kernel'
+  echo  '  install       - Install kernel (compressed if COMPRESSED_INSTALL set)'
   echo  '  zinstall      - Install compressed kernel'
   echo  '                  Install using (your) ~/bin/installkernel or'
   echo  '                  (distribution) /sbin/installkernel or'

From 5779d398dbcd74c30a641c209946b8498e668a53 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Tue, 23 Jul 2024 18:12:40 -0500
Subject: [PATCH 308/346] smb3: add dynamic tracepoint for reflink errors

There are cases where debugging clone_range ("smb2_duplicate_extents"
function) and in the future copy_range ("smb2_copychunk_range") can
be helpful. Add dynamic trace points for any errors in clone, and
a followon patch will add them for copychunk.

  "trace-cmd record -e smb3_clone_err"

Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/smb2ops.c |  5 ++++
 fs/smb/client/trace.h   | 57 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 7fe59235f090..1539463285ee 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -2075,6 +2075,11 @@ smb2_duplicate_extents(const unsigned int xid,
 		cifs_dbg(FYI, "Non-zero response length in duplicate extents\n");
 
 duplicate_extents_out:
+	if (rc)
+		trace_smb3_clone_err(xid, srcfile->fid.volatile_fid,
+				     trgtfile->fid.volatile_fid,
+				     tcon->tid, tcon->ses->Suid, src_off,
+				     dest_off, len, rc);
 	return rc;
 }
 
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 36d5295c2a6f..b027282d6db7 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -206,6 +206,63 @@ DEFINE_SMB3_OTHER_ERR_EVENT(query_dir_err);
 DEFINE_SMB3_OTHER_ERR_EVENT(zero_err);
 DEFINE_SMB3_OTHER_ERR_EVENT(falloc_err);
 
+/*
+ * For logging errors in reflink and copy_range ops e.g. smb2_copychunk_range
+ * and smb2_duplicate_extents
+ */
+DECLARE_EVENT_CLASS(smb3_copy_range_err_class,
+	TP_PROTO(unsigned int xid,
+		__u64	src_fid,
+		__u64   target_fid,
+		__u32	tid,
+		__u64	sesid,
+		__u64	src_offset,
+		__u64   target_offset,
+		__u32	len,
+		int	rc),
+	TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, src_fid)
+		__field(__u64, target_fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, src_offset)
+		__field(__u64, target_offset)
+		__field(__u32, len)
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->src_fid = src_fid;
+		__entry->target_fid = target_fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->src_offset = src_offset;
+		__entry->target_offset = target_offset;
+		__entry->len = len;
+		__entry->rc = rc;
+	),
+	TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
+		__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc)
+)
+
+#define DEFINE_SMB3_COPY_RANGE_ERR_EVENT(name)	\
+DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \
+	TP_PROTO(unsigned int xid,		\
+		__u64	src_fid,		\
+		__u64   target_fid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	src_offset,		\
+		__u64	target_offset,		\
+		__u32	len,			\
+		int	rc),			\
+	TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc))
+
+DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err);
+/* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */
 
 /* For logging successful read or write */
 DECLARE_EVENT_CLASS(smb3_rw_done_class,

From 6629f87b97e0740431b7b29b8dfdfa9d842c4bc5 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Wed, 24 Jul 2024 11:57:18 -0500
Subject: [PATCH 309/346] smb3: add four dynamic tracepoints for
 copy_file_range and reflink

Add more dynamic tracepoints to help debug copy_file_range (copychunk)
and clone_range ("duplicate extents").  These are tracepoints for
entering the function and completing without error. For example:

  "trace-cmd record -e smb3_copychunk_enter -e smb3_copychunk_done"

or

  "trace-cmd record -e smb3_clone_enter -e smb3_clone_done"

Here is sample output:

       TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
         | |         |   |||||     |         |
       cp-5964    [005] .....  2176.168977: smb3_clone_enter:
         xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15
         source offset=0x0 target fid=0x1ed02e15 target offset=0x0
         len=0xa0000
       cp-5964    [005] .....  2176.170668: smb3_clone_done:
         xid=17 sid=0xeb275be4 tid=0x7ffa7cdb source fid=0x1ed02e15
         source offset=0x0 target fid=0x1ed02e15 target offset=0x0
         len=0xa0000

Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/smb2ops.c | 15 +++++++++++-
 fs/smb/client/trace.h   | 53 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 1539463285ee..322cabc69c6f 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -1812,6 +1812,10 @@ smb2_copychunk_range(const unsigned int xid,
 
 	tcon = tlink_tcon(trgtfile->tlink);
 
+	trace_smb3_copychunk_enter(xid, srcfile->fid.volatile_fid,
+				   trgtfile->fid.volatile_fid, tcon->tid,
+				   tcon->ses->Suid, src_off, dest_off, len);
+
 	while (len > 0) {
 		pcchunk->SourceOffset = cpu_to_le64(src_off);
 		pcchunk->TargetOffset = cpu_to_le64(dest_off);
@@ -1863,6 +1867,9 @@ smb2_copychunk_range(const unsigned int xid,
 				le32_to_cpu(retbuf->ChunksWritten),
 				le32_to_cpu(retbuf->ChunkBytesWritten),
 				bytes_written);
+			trace_smb3_copychunk_done(xid, srcfile->fid.volatile_fid,
+				trgtfile->fid.volatile_fid, tcon->tid,
+				tcon->ses->Suid, src_off, dest_off, len);
 		} else if (rc == -EINVAL) {
 			if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
 				goto cchunk_out;
@@ -2046,7 +2053,9 @@ smb2_duplicate_extents(const unsigned int xid,
 	dup_ext_buf.ByteCount = cpu_to_le64(len);
 	cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n",
 		src_off, dest_off, len);
-
+	trace_smb3_clone_enter(xid, srcfile->fid.volatile_fid,
+			       trgtfile->fid.volatile_fid, tcon->tid,
+			       tcon->ses->Suid, src_off, dest_off, len);
 	inode = d_inode(trgtfile->dentry);
 	if (inode->i_size < dest_off + len) {
 		rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false);
@@ -2080,6 +2089,10 @@ smb2_duplicate_extents(const unsigned int xid,
 				     trgtfile->fid.volatile_fid,
 				     tcon->tid, tcon->ses->Suid, src_off,
 				     dest_off, len, rc);
+	else
+		trace_smb3_clone_done(xid, srcfile->fid.volatile_fid,
+				      trgtfile->fid.volatile_fid, tcon->tid,
+				      tcon->ses->Suid, src_off, dest_off, len);
 	return rc;
 }
 
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index b027282d6db7..bc4f8b3ad6ff 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -264,6 +264,59 @@ DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \
 DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err);
 /* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */
 
+DECLARE_EVENT_CLASS(smb3_copy_range_done_class,
+	TP_PROTO(unsigned int xid,
+		__u64	src_fid,
+		__u64   target_fid,
+		__u32	tid,
+		__u64	sesid,
+		__u64	src_offset,
+		__u64   target_offset,
+		__u32	len),
+	TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, src_fid)
+		__field(__u64, target_fid)
+		__field(__u32, tid)
+		__field(__u64, sesid)
+		__field(__u64, src_offset)
+		__field(__u64, target_offset)
+		__field(__u32, len)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->src_fid = src_fid;
+		__entry->target_fid = target_fid;
+		__entry->tid = tid;
+		__entry->sesid = sesid;
+		__entry->src_offset = src_offset;
+		__entry->target_offset = target_offset;
+		__entry->len = len;
+	),
+	TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x",
+		__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
+		__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len)
+)
+
+#define DEFINE_SMB3_COPY_RANGE_DONE_EVENT(name)	\
+DEFINE_EVENT(smb3_copy_range_done_class, smb3_##name, \
+	TP_PROTO(unsigned int xid,		\
+		__u64	src_fid,		\
+		__u64   target_fid,		\
+		__u32	tid,			\
+		__u64	sesid,			\
+		__u64	src_offset,		\
+		__u64	target_offset,		\
+		__u32	len),			\
+	TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len))
+
+DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_enter);
+DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_enter);
+DEFINE_SMB3_COPY_RANGE_DONE_EVENT(copychunk_done);
+DEFINE_SMB3_COPY_RANGE_DONE_EVENT(clone_done);
+
+
 /* For logging successful read or write */
 DECLARE_EVENT_CLASS(smb3_rw_done_class,
 	TP_PROTO(unsigned int rreq_debug_id,

From b6f6a7aa689f1c255e06fee3ca13c9f9e5c12780 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 26 Jul 2024 01:06:20 -0500
Subject: [PATCH 310/346] smb3: add dynamic trace point for session setup key
 expired failures

There are cases where services need to remount (or change their
credentials files) when keys have expired, but it can be helpful
to have a dynamic trace point to make it easier to notify the
service to refresh the storage account key.

Here is sample output, one from mount with bad password, one
from a reconnect where the password has been changed or expired
and reconnect fails (requiring remount with new storage account key)

       TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
          | |         |   |||||     |         |

  mount.cifs-11362  [000] .....  6000.241620: smb3_key_expired:
    rc=-13 user=testpassu conn_id=0x2 server=localhost addr=127.0.0.1:445
  kworker/4:0-8458  [004] .....  6044.892283: smb3_key_expired:
    rc=-13 user=testpassu conn_id=0x3 server=localhost addr=127.0.0.1:445

Reviewed-by: Shyam Prasad N <sprasad@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/smb/client/smb2pdu.c |  8 +++++++-
 fs/smb/client/trace.h   | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 9fc5b11c0b6c..9a06b5594669 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1562,8 +1562,14 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
 	cifs_small_buf_release(sess_data->iov[0].iov_base);
 	if (rc == 0)
 		sess_data->ses->expired_pwd = false;
-	else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED))
+	else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+		if (sess_data->ses->expired_pwd == false)
+			trace_smb3_key_expired(sess_data->server->hostname,
+					       sess_data->ses->user_name,
+					       sess_data->server->conn_id,
+					       &sess_data->server->dstaddr, rc);
 		sess_data->ses->expired_pwd = true;
+	}
 
 	memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
 
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index bc4f8b3ad6ff..6b3bdfb97211 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -1281,6 +1281,46 @@ DEFINE_EVENT(smb3_connect_err_class, smb3_##name,  \
 
 DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err);
 
+DECLARE_EVENT_CLASS(smb3_sess_setup_err_class,
+	TP_PROTO(char *hostname, char *username, __u64 conn_id,
+		const struct __kernel_sockaddr_storage *dst_addr, int rc),
+	TP_ARGS(hostname, username, conn_id, dst_addr, rc),
+	TP_STRUCT__entry(
+		__string(hostname, hostname)
+		__string(username, username)
+		__field(__u64, conn_id)
+		__array(__u8, dst_addr, sizeof(struct sockaddr_storage))
+		__field(int, rc)
+	),
+	TP_fast_assign(
+		struct sockaddr_storage *pss = NULL;
+
+		__entry->conn_id = conn_id;
+		__entry->rc = rc;
+		pss = (struct sockaddr_storage *)__entry->dst_addr;
+		*pss = *dst_addr;
+		__assign_str(hostname);
+		__assign_str(username);
+	),
+	TP_printk("rc=%d user=%s conn_id=0x%llx server=%s addr=%pISpsfc",
+		__entry->rc,
+		__get_str(username),
+		__entry->conn_id,
+		__get_str(hostname),
+		__entry->dst_addr)
+)
+
+#define DEFINE_SMB3_SES_SETUP_ERR_EVENT(name)        \
+DEFINE_EVENT(smb3_sess_setup_err_class, smb3_##name,  \
+	TP_PROTO(char *hostname,		\
+		char *username,			\
+		__u64 conn_id,			\
+		const struct __kernel_sockaddr_storage *addr,	\
+		int rc),			\
+	TP_ARGS(hostname, username, conn_id, addr, rc))
+
+DEFINE_SMB3_SES_SETUP_ERR_EVENT(key_expired);
+
 DECLARE_EVENT_CLASS(smb3_reconnect_class,
 	TP_PROTO(__u64	currmid,
 		__u64 conn_id,

From 52e130764ab6bdc439bcf124ac3e15f52ca0c8e5 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Wed, 10 Jul 2024 11:34:38 +0200
Subject: [PATCH 311/346] tools/power turbostat: Move debug prints from stdout
 to stderr

This leaves the stdout cleaner, having only counter data. It makes it
easier for programs to parse the output of turbostat, for example
selftests.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index be345a4bbe96..cccd9f4311a0 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1984,6 +1984,7 @@ void help(void)
 		"  -c, --cpu	cpu-set	limit output to summary plus cpu-set:\n"
 		"		  {core | package | j,k,l..m,n-p }\n"
 		"  -d, --debug	displays usec, Time_Of_Day_Seconds and more debugging\n"
+		"		debug messages are printed to stderr\n"
 		"  -D, --Dump	displays the raw counter values\n"
 		"  -e, --enable	[all | column]\n"
 		"		shows all or the specified disabled column\n"
@@ -8262,7 +8263,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 			pinfo->scale = perf_scale;
 
 			if (debug)
-				printf("Add perf/%s/%s cpu%d: %d\n",
+				fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n",
 				       pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
 		}
 
@@ -8422,7 +8423,7 @@ struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name)
 
 	for (mp = head; mp; mp = mp->next) {
 		if (debug)
-			printf("%s: %s %s\n", __func__, name, mp->name);
+			fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name);
 		if (!strncmp(name, mp->name, strlen(mp->name)))
 			return mp;
 	}
@@ -8439,8 +8440,8 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 		errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);
 
 	if (debug)
-		printf("%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n", __func__, msr_num,
-		       path, name, width, scope, type, format, flags, id);
+		fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n",
+			__func__, msr_num, path, name, width, scope, type, format, flags, id);
 
 	switch (scope) {
 
@@ -8448,7 +8449,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 		msrp = find_msrp_by_name(sys.tp, name);
 		if (msrp) {
 			if (debug)
-				printf("%s: %s FOUND\n", __func__, name);
+				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
 			break;
 		}
 		if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) {
@@ -8460,7 +8461,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 		msrp = find_msrp_by_name(sys.cp, name);
 		if (msrp) {
 			if (debug)
-				printf("%s: %s FOUND\n", __func__, name);
+				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
 			break;
 		}
 		if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) {
@@ -8472,7 +8473,7 @@ int add_counter(unsigned int msr_num, char *path, char *name,
 		msrp = find_msrp_by_name(sys.pp, name);
 		if (msrp) {
 			if (debug)
-				printf("%s: %s FOUND\n", __func__, name);
+				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
 			break;
 		}
 		if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) {
@@ -8617,7 +8618,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char
 
 	// FIXME: we might not have debug here yet
 	if (debug)
-		printf("%s: %s/%s, name: %s, scope%d\n",
+		fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n",
 		       __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
 
 	return 0;

From b2e4a5dfafcce87ed2b9aac0b2887f421cd2930a Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Wed, 10 Jul 2024 14:28:03 +0200
Subject: [PATCH 312/346] tools/power turbostat: Move verbose counter messages
 to level 2

Printing information about the source and value during initialization and
reading of the counter for each cpu, while useful when debugging,
results in too verbose output.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index cccd9f4311a0..86bbea7d8e57 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -3797,7 +3797,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
 	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
 	struct rapl_counter_info_t *rci;
 
-	if (debug)
+	if (debug >= 2)
 		fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);
 
 	assert(rapl_counter_info_perdomain);
@@ -3827,7 +3827,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
 			assert(pi < ARRAY_SIZE(perf_data));
 			assert(rci->fd_perf != -1);
 
-			if (debug)
+			if (debug >= 2)
 				fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
 					i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);
 
@@ -3837,7 +3837,7 @@ int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct
 			break;
 
 		case COUNTER_SOURCE_MSR:
-			if (debug)
+			if (debug >= 2)
 				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);
 
 			assert(!no_msr);
@@ -3895,7 +3895,7 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 
 	struct cstate_counter_info_t *cci;
 
-	if (debug)
+	if (debug >= 2)
 		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
 
 	assert(ccstate_counter_info);
@@ -3965,9 +3965,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 			assert(pi < ARRAY_SIZE(perf_data));
 			assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);
 
-			if (debug) {
+			if (debug >= 2)
 				fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]);
-			}
 
 			cci->data[i] = perf_data[pi];
 
@@ -3979,9 +3978,8 @@ int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_dat
 			if (get_msr(cpu, cci->msr[i], &cci->data[i]))
 				return -13 - i;
 
-			if (debug) {
+			if (debug >= 2)
 				fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]);
-			}
 
 			break;
 		}
@@ -4036,7 +4034,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
 
 	struct msr_counter_info_t *mci;
 
-	if (debug)
+	if (debug >= 2)
 		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);
 
 	assert(msr_counter_info);
@@ -4066,7 +4064,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
 			assert(pi < ARRAY_SIZE(perf_data));
 			assert(mci->fd_perf != -1);
 
-			if (debug)
+			if (debug >= 2)
 				fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]);
 
 			mci->data[i] = perf_data[pi];
@@ -4082,7 +4080,7 @@ int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
 
 			mci->data[i] &= mci->msr_mask[i];
 
-			if (debug)
+			if (debug >= 2)
 				fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]);
 
 			break;
@@ -7125,7 +7123,7 @@ int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct
 {
 	int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
 
-	if (debug)
+	if (debug >= 2)
 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
 
 	return ret;
@@ -7284,7 +7282,7 @@ int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const st
 {
 	int ret = add_cstate_perf_counter_(cpu, cci, cai);
 
-	if (debug)
+	if (debug >= 2)
 		fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
 
 	return ret;

From 1f8add13e6c86653da2ec599dfc94e7a8865cdcb Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Wed, 3 Jul 2024 23:14:14 +0200
Subject: [PATCH 313/346] tools/power turbostat: Add selftests for SMI, APERF
 and MPERF counters

The test requests BICs that are dependent on SMI, APERF and MPERF
counters and checks if the columns show up in the output and the
turbostat doesn't crash. Read the counters in both --no-msr
and --no-perf mode.

The test skips counters that are not present or user does not have
permissions to read. It is not an error, but the test may not be as
exhaustive.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 .../selftests/turbostat/smi_aperf_mperf.py    | 157 ++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100755 tools/testing/selftests/turbostat/smi_aperf_mperf.py

diff --git a/tools/testing/selftests/turbostat/smi_aperf_mperf.py b/tools/testing/selftests/turbostat/smi_aperf_mperf.py
new file mode 100755
index 000000000000..6289cc47d5f0
--- /dev/null
+++ b/tools/testing/selftests/turbostat/smi_aperf_mperf.py
@@ -0,0 +1,157 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+from shutil import which
+from os import pread
+
+# CDLL calls dlopen underneath.
+# Calling it with None (null), we get handle to the our own image (python interpreter).
+# We hope to find sched_getcpu() inside ;]
+# This is a bit ugly, but helps shipping working software, so..
+try:
+	import ctypes
+
+	this_image = ctypes.CDLL(None)
+	BASE_CPU = this_image.sched_getcpu()
+except:
+	BASE_CPU = 0 # If we fail, set to 0 and pray it's not offline.
+
+MSR_IA32_MPERF = 0x000000e7
+MSR_IA32_APERF = 0x000000e8
+
+def check_perf_access():
+	perf = which('perf')
+	if perf is None:
+		print('SKIP: Could not find perf binary, thus could not determine perf access.')
+		return False
+
+	def has_perf_counter_access(counter_name):
+		proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'],
+					    capture_output = True)
+
+		if proc_perf.returncode != 0:
+			print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.')
+			return False
+
+		if b'<not supported>' in proc_perf.stderr:
+			print(f'SKIP: Could not read {counter_name} perf counter, assuming no access.')
+			return False
+
+		return True
+
+	if not has_perf_counter_access('msr/mperf/'):
+		return False
+	if not has_perf_counter_access('msr/aperf/'):
+		return False
+	if not has_perf_counter_access('msr/smi/'):
+		return False
+
+	return True
+
+def check_msr_access():
+	try:
+		file_msr = open(f'/dev/cpu/{BASE_CPU}/msr', 'rb')
+	except:
+		return False
+
+	if len(pread(file_msr.fileno(), 8, MSR_IA32_MPERF)) != 8:
+		return False
+
+	if len(pread(file_msr.fileno(), 8, MSR_IA32_APERF)) != 8:
+		return False
+
+	return True
+
+has_perf_access = check_perf_access()
+has_msr_access = check_msr_access()
+
+turbostat_counter_source_opts = ['']
+
+if has_msr_access:
+	turbostat_counter_source_opts.append('--no-perf')
+else:
+	print('SKIP: doesn\'t have MSR access, skipping run with --no-perf')
+
+if has_perf_access:
+	turbostat_counter_source_opts.append('--no-msr')
+else:
+	print('SKIP: doesn\'t have perf access, skipping run with --no-msr')
+
+if not has_msr_access and not has_perf_access:
+	print('SKIP: No MSR nor perf access detected. Skipping the tests entirely')
+	exit(0)
+
+turbostat = which('turbostat')
+if turbostat is None:
+	print('Could not find turbostat binary')
+	exit(1)
+
+timeout = which('timeout')
+if timeout is None:
+	print('Could not find timeout binary')
+	exit(1)
+
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+
+EXPECTED_COLUMNS_DEBUG_DEFAULT = b'usec\tTime_Of_Day_Seconds\tAPIC\tX2APIC'
+
+SMI_APERF_MPERF_DEPENDENT_BICS = [
+	'SMI',
+	'Avg_MHz',
+	'Busy%',
+	'Bzy_MHz',
+]
+if has_perf_access:
+	SMI_APERF_MPERF_DEPENDENT_BICS.append('IPC')
+
+for bic in SMI_APERF_MPERF_DEPENDENT_BICS:
+	for counter_source_opt in turbostat_counter_source_opts:
+
+		# Ugly special case, but it is what it is..
+		if counter_source_opt == '--no-perf' and bic == 'IPC':
+			continue
+
+		expected_columns = bic.encode()
+		expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + f'\t{bic}'.encode()
+
+		#
+		# Run turbostat for some time and send SIGINT
+		#
+		timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s']
+		turbostat_argv = [turbostat, '-i', '0.50', '--show', bic]
+
+		if counter_source_opt:
+			turbostat_argv.append(counter_source_opt)
+
+		print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+		proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+		if proc_turbostat.returncode != 0:
+			print(f'turbostat failed with {proc_turbostat.returncode}')
+			exit(1)
+
+		actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+		if expected_columns != actual_columns:
+			print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}')
+			exit(1)
+		print('OK')
+
+		#
+		# Same, but with --debug
+		#
+		turbostat_argv.append('--debug')
+
+		print(f'Running turbostat with {turbostat_argv=}... ', end = '', flush = True)
+		proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+		if proc_turbostat.returncode != 0:
+			print(f'turbostat failed with {proc_turbostat.returncode}')
+			exit(1)
+
+		actual_columns = proc_turbostat.stdout.split(b'\n')[0]
+		if expected_columns_debug != actual_columns:
+			print(f'turbostat column check failed\n{expected_columns_debug=}\n{actual_columns=}')
+			exit(1)
+		print('OK')

From f3065f9c3917fa9279992623a2f3282f1fd43515 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Thu, 4 Jul 2024 20:11:33 +0200
Subject: [PATCH 314/346] tools/power turbostat: Add selftests for added perf
 counters

Test adds several perf counters from msr, cstate_core and cstate_pkg
groups and checks if the columns for those counters show up.
The test skips the counters that are not present. It is not an error,
but the test may not be as exhaustive.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 .../turbostat/added_perf_counters.py          | 178 ++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100755 tools/testing/selftests/turbostat/added_perf_counters.py

diff --git a/tools/testing/selftests/turbostat/added_perf_counters.py b/tools/testing/selftests/turbostat/added_perf_counters.py
new file mode 100755
index 000000000000..9ab4aaf45fb8
--- /dev/null
+++ b/tools/testing/selftests/turbostat/added_perf_counters.py
@@ -0,0 +1,178 @@
+#!/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+from shutil import which
+from os import pread
+
+class PerfCounterInfo:
+	def __init__(self, subsys, event):
+		self.subsys = subsys
+		self.event = event
+
+	def get_perf_event_name(self):
+		return f'{self.subsys}/{self.event}/'
+
+	def get_turbostat_perf_id(self, counter_scope, counter_type, column_name):
+		return f'perf/{self.subsys}/{self.event},{counter_scope},{counter_type},{column_name}'
+
+PERF_COUNTERS_CANDIDATES = [
+	PerfCounterInfo('msr', 'mperf'),
+	PerfCounterInfo('msr', 'aperf'),
+	PerfCounterInfo('msr', 'tsc'),
+	PerfCounterInfo('cstate_core', 'c1-residency'),
+	PerfCounterInfo('cstate_core', 'c6-residency'),
+	PerfCounterInfo('cstate_core', 'c7-residency'),
+	PerfCounterInfo('cstate_pkg', 'c2-residency'),
+	PerfCounterInfo('cstate_pkg', 'c3-residency'),
+	PerfCounterInfo('cstate_pkg', 'c6-residency'),
+	PerfCounterInfo('cstate_pkg', 'c7-residency'),
+	PerfCounterInfo('cstate_pkg', 'c8-residency'),
+	PerfCounterInfo('cstate_pkg', 'c9-residency'),
+	PerfCounterInfo('cstate_pkg', 'c10-residency'),
+]
+present_perf_counters = []
+
+def check_perf_access():
+	perf = which('perf')
+	if perf is None:
+		print('SKIP: Could not find perf binary, thus could not determine perf access.')
+		return False
+
+	def has_perf_counter_access(counter_name):
+		proc_perf = subprocess.run([perf, 'stat', '-e', counter_name, '--timeout', '10'],
+							 capture_output = True)
+
+		if proc_perf.returncode != 0:
+			print(f'SKIP: Could not read {counter_name} perf counter.')
+			return False
+
+		if b'<not supported>' in proc_perf.stderr:
+			print(f'SKIP: Could not read {counter_name} perf counter.')
+			return False
+
+		return True
+
+	for counter in PERF_COUNTERS_CANDIDATES:
+		if has_perf_counter_access(counter.get_perf_event_name()):
+			present_perf_counters.append(counter)
+
+	if len(present_perf_counters) == 0:
+		print('SKIP: Could not read any perf counter.')
+		return False
+
+	if len(present_perf_counters) != len(PERF_COUNTERS_CANDIDATES):
+		print(f'WARN: Could not access all of the counters - some will be left untested')
+
+	return True
+
+if not check_perf_access():
+	exit(0)
+
+turbostat_counter_source_opts = ['']
+
+turbostat = which('turbostat')
+if turbostat is None:
+	print('Could not find turbostat binary')
+	exit(1)
+
+timeout = which('timeout')
+if timeout is None:
+	print('Could not find timeout binary')
+	exit(1)
+
+proc_turbostat = subprocess.run([turbostat, '--list'], capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+
+EXPECTED_COLUMNS_DEBUG_DEFAULT = [b'usec', b'Time_Of_Day_Seconds', b'APIC', b'X2APIC']
+
+expected_columns = [b'CPU']
+counters_argv = []
+for counter in present_perf_counters:
+	if counter.subsys == 'cstate_core':
+		counter_scope = 'core'
+	elif counter.subsys == 'cstate_pkg':
+		counter_scope = 'package'
+	else:
+		counter_scope = 'cpu'
+
+	counter_type = 'delta'
+	column_name = counter.event
+
+	cparams = counter.get_turbostat_perf_id(
+		counter_scope = counter_scope,
+		counter_type = counter_type,
+		column_name = column_name
+	)
+	expected_columns.append(column_name.encode())
+	counters_argv.extend(['--add', cparams])
+
+expected_columns_debug = EXPECTED_COLUMNS_DEBUG_DEFAULT + expected_columns
+
+def gen_user_friendly_cmdline(argv_):
+	argv = argv_[:]
+	ret = ''
+
+	while len(argv) != 0:
+		arg = argv.pop(0)
+		arg_next = ''
+
+		if arg in ('-i', '--show', '--add'):
+			arg_next = argv.pop(0) if len(argv) > 0 else ''
+
+		ret += f'{arg} {arg_next} \\\n\t'
+
+	# Remove the last separator and return
+	return ret[:-4]
+
+#
+# Run turbostat for some time and send SIGINT
+#
+timeout_argv = [timeout, '--preserve-status', '-s', 'SIGINT', '-k', '3', '0.2s']
+turbostat_argv = [turbostat, '-i', '0.50', '--show', 'CPU'] + counters_argv
+
+def check_columns_or_fail(expected_columns: list, actual_columns: list):
+	if len(actual_columns) != len(expected_columns):
+		print(f'turbostat column check failed\n{expected_columns=}\n{actual_columns=}')
+		exit(1)
+
+	failed = False
+	for expected_column in expected_columns:
+		if expected_column not in actual_columns:
+			print(f'turbostat column check failed: missing column {expected_column.decode()}')
+			failed = True
+
+	if failed:
+		exit(1)
+
+cmdline = gen_user_friendly_cmdline(turbostat_argv)
+print(f'Running turbostat with:\n\t{cmdline}\n... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+
+actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t')
+check_columns_or_fail(expected_columns, actual_columns)
+print('OK')
+
+#
+# Same, but with --debug
+#
+# We explicitly specify '--show CPU' to make sure turbostat
+# don't show a bunch of default counters instead.
+#
+turbostat_argv.append('--debug')
+
+cmdline = gen_user_friendly_cmdline(turbostat_argv)
+print(f'Running turbostat (in debug mode) with:\n\t{cmdline}\n... ', end = '', flush = True)
+proc_turbostat = subprocess.run(timeout_argv + turbostat_argv, capture_output = True)
+if proc_turbostat.returncode != 0:
+	print(f'turbostat failed with {proc_turbostat.returncode}')
+	exit(1)
+
+actual_columns = proc_turbostat.stdout.split(b'\n')[0].split(b'\t')
+check_columns_or_fail(expected_columns_debug, actual_columns)
+print('OK')

From f0e4ed752fda6997b41917c94a5478b340178001 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Mon, 22 Jul 2024 22:11:03 +0200
Subject: [PATCH 315/346] tools/power turbostat: Add early support for PMT
 counters

Allows users to read Intel PMT (Platform Monitoring Technology)
counters, providing interface similar to one used to add MSR and perf
counters. Because PMT is exposed as a raw MMIO range, without metadata,
user has to supply the necessary information to find and correctly
display the requested counter.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 768 +++++++++++++++++++++++++-
 1 file changed, 766 insertions(+), 2 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 86bbea7d8e57..c9fdfb074ce9 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -44,6 +44,7 @@
 #include <sys/stat.h>
 #include <sys/select.h>
 #include <sys/resource.h>
+#include <sys/mman.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <sys/time.h>
@@ -1082,6 +1083,9 @@ size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affi
 #define MAX_ADDED_THREAD_COUNTERS 24
 #define MAX_ADDED_CORE_COUNTERS 8
 #define MAX_ADDED_PACKAGE_COUNTERS 16
+#define PMT_MAX_ADDED_THREAD_COUNTERS 24
+#define PMT_MAX_ADDED_CORE_COUNTERS 8
+#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16
 #define BITMASK_SIZE 32
 
 #define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr))
@@ -1466,6 +1470,100 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = {
 	},
 };
 
+/* Can be redefined when compiling, useful for testing. */
+#ifndef SYSFS_TELEM_PATH
+#define SYSFS_TELEM_PATH "/sys/class/intel_pmt"
+#endif
+
+#define PMT_COUNTER_NAME_SIZE_BYTES      16
+#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32
+
+struct pmt_mmio {
+	struct pmt_mmio *next;
+
+	unsigned int guid;
+	unsigned int size;
+
+	/* Base pointer to the mmaped memory. */
+	void *mmio_base;
+
+	/*
+	 * Offset to be applied to the mmio_base
+	 * to get the beginning of the PMT counters for given GUID.
+	 */
+	unsigned long pmt_offset;
+} *pmt_mmios;
+
+enum pmt_datatype {
+	PMT_TYPE_RAW,
+};
+
+struct pmt_domain_info {
+	/*
+	 * Pointer to the MMIO obtained by applying a counter offset
+	 * to the mmio_base of the mmaped region for the given GUID.
+	 *
+	 * This is where to read the raw value of the counter from.
+	 */
+	unsigned long *pcounter;
+};
+
+struct pmt_counter {
+	struct pmt_counter *next;
+
+	/* PMT metadata */
+	char name[PMT_COUNTER_NAME_SIZE_BYTES];
+	enum pmt_datatype type;
+	enum counter_scope scope;
+	unsigned int lsb;
+	unsigned int msb;
+
+	/* BIC-like metadata */
+	enum counter_format format;
+
+	unsigned int num_domains;
+	struct pmt_domain_info *domains;
+};
+
+unsigned int pmt_counter_get_width(const struct pmt_counter *p)
+{
+	return (p->msb - p->lsb)+1;
+}
+
+void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size)
+{
+	struct pmt_domain_info *new_mem;
+
+	new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
+	if (!new_mem) {
+		fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__);
+		exit(1);
+	}
+
+	/* Zero initialize just allocated memory. */
+	const size_t num_new_domains = new_size - pcounter->num_domains;
+
+	memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains));
+
+	pcounter->num_domains = new_size;
+	pcounter->domains = new_mem;
+}
+
+void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size)
+{
+	/*
+	 * Allocate more memory ahead of time.
+	 *
+	 * Always allocate space for at least 8 elements
+	 * and double the size when growing.
+	 */
+	if (new_size < 8)
+		new_size = 8;
+	new_size = MAX(new_size, pcounter->num_domains*2);
+
+	pmt_counter_resize_(pcounter, new_size);
+}
+
 struct thread_data {
 	struct timeval tv_begin;
 	struct timeval tv_end;
@@ -1484,6 +1582,7 @@ struct thread_data {
 	bool is_atom;
 	unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];
+	unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS];
 } *thread_even, *thread_odd;
 
 struct core_data {
@@ -1498,6 +1597,7 @@ struct core_data {
 	unsigned long long core_throt_cnt;
 	unsigned long long counter[MAX_ADDED_CORE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];
+	unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS];
 } *core_even, *core_odd;
 
 struct pkg_data {
@@ -1532,6 +1632,7 @@ struct pkg_data {
 	unsigned int uncore_mhz;
 	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
+	unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS];
 } *package_even, *package_odd;
 
 #define ODD_COUNTERS thread_odd, core_odd, package_odd
@@ -1681,6 +1782,10 @@ struct sys_counters {
 	struct perf_counter_info *perf_tp;
 	struct perf_counter_info *perf_cp;
 	struct perf_counter_info *perf_pp;
+
+	struct pmt_counter *pmt_tp;
+	struct pmt_counter *pmt_cp;
+	struct pmt_counter *pmt_pp;
 } sys;
 
 static size_t free_msr_counters_(struct msr_counter **pp)
@@ -1981,6 +2086,7 @@ void help(void)
 		"  -a, --add	add a counter\n"
 		"		  eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
 		"		  eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n"
+		"		  eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n"
 		"  -c, --cpu	cpu-set	limit output to summary plus cpu-set:\n"
 		"		  {core | package | j,k,l..m,n-p }\n"
 		"  -d, --debug	displays usec, Time_Of_Day_Seconds and more debugging\n"
@@ -2092,6 +2198,7 @@ void print_header(char *delim)
 {
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 	int printed = 0;
 
 	if (DO_BIC(BIC_USEC))
@@ -2164,6 +2271,21 @@ void print_header(char *delim)
 		}
 	}
 
+	ppmt = sys.pmt_tp;
+	while (ppmt) {
+		switch(ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+			else
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+			break;
+		}
+
+		ppmt = ppmt->next;
+	}
+
 	if (DO_BIC(BIC_CPU_c1))
 		outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_CPU_c3))
@@ -2219,6 +2341,21 @@ void print_header(char *delim)
 		}
 	}
 
+	ppmt = sys.pmt_cp;
+	while (ppmt) {
+		switch(ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+			else
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+			break;
+		}
+
+		ppmt = ppmt->next;
+	}
+
 	if (DO_BIC(BIC_PkgTmp))
 		outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
 
@@ -2329,6 +2466,21 @@ void print_header(char *delim)
 		}
 	}
 
+	ppmt = sys.pmt_pp;
+	while (ppmt) {
+		switch(ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
+			else
+				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
+
+			break;
+		}
+
+		ppmt = ppmt->next;
+	}
+
 	outp += sprintf(outp, "\n");
 }
 
@@ -2450,6 +2602,7 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 	char *delim = "\t";
 	int printed = 0;
 
@@ -2612,6 +2765,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 	}
 
+	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+		switch (ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+						(unsigned int)t->pmt_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]);
+
+			break;
+		}
+	}
+
 	/* C1 */
 	if (DO_BIC(BIC_CPU_c1))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
@@ -2673,6 +2839,19 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 	}
 
+	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+		switch (ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+						(unsigned int)c->pmt_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]);
+
+			break;
+		}
+	}
+
 	fmt8 = "%s%.2f";
 
 	if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
@@ -2842,9 +3021,23 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 				outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]);
 		} else if (pp->format == FORMAT_PERCENT) {
 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc);
-		} else if (pp->type == COUNTER_K2M)
+		} else if (pp->type == COUNTER_K2M) {
 			outp +=
 			    sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000);
+		}
+	}
+
+	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+		switch (ppmt->type) {
+		case PMT_TYPE_RAW:
+			if (pmt_counter_get_width(ppmt) <= 32)
+				outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
+						(unsigned int)p->pmt_counter[i]);
+			else
+				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]);
+
+			break;
+		}
 	}
 
 done:
@@ -2901,6 +3094,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 
 	if (DO_BIC(BIC_Totl_c0))
 		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
@@ -2970,6 +3164,13 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
 	}
 
+	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+		if (ppmt->format == FORMAT_RAW)
+			old->pmt_counter[i] = new->pmt_counter[i];
+		else
+			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+	}
+
 	return 0;
 }
 
@@ -2978,6 +3179,7 @@ void delta_core(struct core_data *new, struct core_data *old)
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 
 	old->c3 = new->c3 - old->c3;
 	old->c6 = new->c6 - old->c6;
@@ -3001,6 +3203,13 @@ void delta_core(struct core_data *new, struct core_data *old)
 		else
 			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
 	}
+
+	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+		if (ppmt->format == FORMAT_RAW)
+			old->pmt_counter[i] = new->pmt_counter[i];
+		else
+			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+	}
 }
 
 int soft_c1_residency_display(int bic)
@@ -3019,6 +3228,7 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 
 	/* we run cpuid just the 1st time, copy the results */
 	if (DO_BIC(BIC_APIC))
@@ -3105,6 +3315,13 @@ int delta_thread(struct thread_data *new, struct thread_data *old, struct core_d
 			old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
 	}
 
+	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+		if (ppmt->format == FORMAT_RAW)
+			old->pmt_counter[i] = new->pmt_counter[i];
+		else
+			old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
+	}
+
 	return 0;
 }
 
@@ -3211,6 +3428,10 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	memset(&t->perf_counter[0], 0, sizeof(t->perf_counter));
 	memset(&c->perf_counter[0], 0, sizeof(c->perf_counter));
 	memset(&p->perf_counter[0], 0, sizeof(p->perf_counter));
+
+	memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter));
+	memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter));
+	memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter));
 }
 
 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
@@ -3232,6 +3453,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 
 	/* copy un-changing apic_id's */
 	if (DO_BIC(BIC_APIC))
@@ -3268,6 +3490,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		average.threads.perf_counter[i] += t->perf_counter[i];
 	}
 
+	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+		average.threads.pmt_counter[i] += t->pmt_counter[i];
+	}
+
 	/* sum per-core values only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c, p))
 		return 0;
@@ -3294,6 +3520,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 		average.cores.perf_counter[i] += c->perf_counter[i];
 	}
 
+	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+		average.cores.pmt_counter[i] += c->pmt_counter[i];
+	}
+
 	/* sum per-pkg values only for 1st core in pkg */
 	if (!is_cpu_first_core_in_package(t, c, p))
 		return 0;
@@ -3353,6 +3583,10 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 			average.packages.perf_counter[i] += p->perf_counter[i];
 	}
 
+	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+		average.packages.pmt_counter[i] += p->pmt_counter[i];
+	}
+
 	return 0;
 }
 
@@ -3365,6 +3599,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 	int i;
 	struct msr_counter *mp;
 	struct perf_counter_info *pp;
+	struct pmt_counter *ppmt;
 
 	clear_counters(&average.threads, &average.cores, &average.packages);
 
@@ -3465,6 +3700,16 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 		}
 		average.packages.perf_counter[i] /= topo.allowed_packages;
 	}
+
+	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
+		average.threads.pmt_counter[i] /= topo.allowed_cpus;
+	}
+	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+		average.cores.pmt_counter[i] /= topo.allowed_cores;
+	}
+	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+		average.packages.pmt_counter[i] /= topo.allowed_packages;
+	}
 }
 
 static unsigned long long rdtsc(void)
@@ -4120,6 +4365,32 @@ int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigne
 	return 0;
 }
 
+unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb)
+{
+	unsigned long mask;
+
+	if (msb == 63)
+		mask = 0xffffffffffffffff;
+	else
+		mask = ((1<<(msb+1))-1);
+
+	mask -= (1<<lsb)-1;
+
+	return mask;
+}
+
+unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
+{
+	assert(domain_id < ppmt->num_domains);
+
+	const unsigned long *pmmio = ppmt->domains[domain_id].pcounter;
+	const unsigned long value = pmmio ? *pmmio : 0;
+	const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb);
+	const unsigned long value_shift = ppmt->lsb;
+
+	return (value & value_mask) >> value_shift;
+}
+
 /*
  * get_counters(...)
  * migrate to cpu
@@ -4130,6 +4401,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	int cpu = t->cpu_id;
 	unsigned long long msr;
 	struct msr_counter *mp;
+	struct pmt_counter *pp;
 	int i;
 	int status;
 
@@ -4164,6 +4436,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS))
 		return -10;
 
+	for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next)
+		t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id);
+
 	/* collect core counters only for 1st thread in core */
 	if (!is_cpu_first_thread_in_core(t, c, p))
 		goto done;
@@ -4205,6 +4480,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS))
 		return -10;
 
+	for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next)
+		c->pmt_counter[i] = pmt_read_counter(pp, c->core_id);
+
 	/* collect package counters only for 1st core in package */
 	if (!is_cpu_first_core_in_package(t, c, p))
 		goto done;
@@ -4281,6 +4559,9 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS))
 		return -10;
 
+	for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next)
+		p->pmt_counter[i] = pmt_read_counter(pp, p->package_id);
+
 done:
 	gettimeofday(&t->tv_end, (struct timezone *)NULL);
 
@@ -8285,6 +8566,304 @@ void added_perf_counters_init(void)
 		errx(1, "%s: %s", __func__, "package");
 }
 
+int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output)
+{
+	int fd_telem_info;
+	FILE *file_telem_info;
+	unsigned long value;
+
+	fd_telem_info = openat(fd_dir, info_filename, O_RDONLY);
+	if (fd_telem_info == -1)
+		return -1;
+
+	file_telem_info = fdopen(fd_telem_info, "r");
+	if (file_telem_info == NULL) {
+		close(fd_telem_info);
+		return -1;
+	}
+
+	if (fscanf(file_telem_info, format, &value) != 1) {
+		fclose(file_telem_info);
+		return -1;
+	}
+
+	fclose(file_telem_info);
+
+	*output = value;
+
+	return 0;
+}
+
+struct pmt_mmio* pmt_mmio_open(unsigned int target_guid)
+{
+	DIR *dirp;
+	struct dirent *entry;
+	struct stat st;
+	unsigned int telem_idx;
+	int fd_telem_dir, fd_pmt;
+	unsigned long guid, size, offset;
+	size_t mmap_size;
+	void *mmio;
+	struct pmt_mmio *ret = NULL;
+
+	if (stat(SYSFS_TELEM_PATH, &st) == -1)
+		return NULL;
+
+	dirp = opendir(SYSFS_TELEM_PATH);
+	if (dirp == NULL)
+		return NULL;
+
+	for (;;) {
+		entry = readdir(dirp);
+
+		if (entry == NULL)
+			break;
+
+		if (strcmp(entry->d_name, ".") == 0)
+			continue;
+
+		if (strcmp(entry->d_name, "..") == 0)
+			continue;
+
+		if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1)
+			continue;
+
+		if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) {
+			break;
+		}
+
+		if (!S_ISDIR(st.st_mode))
+			continue;
+
+		fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY);
+		if (fd_telem_dir == -1) {
+			break;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		if (guid != target_guid) {
+			close(fd_telem_dir);
+			continue;
+		}
+
+		if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) {
+			close(fd_telem_dir);
+			break;
+		}
+
+		assert(offset == 0);
+
+		fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY);
+		if (fd_pmt == -1)
+			goto loop_cleanup_and_break;
+
+		mmap_size = (size + 0x1000UL) & (~0x1000UL);
+		mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0);
+		if (mmio != MAP_FAILED) {
+
+			if (debug)
+				fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio);
+
+			ret = calloc(1, sizeof(*ret));
+
+			if (!ret) {
+				fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__);
+				exit(1);
+			}
+
+			ret->guid = guid;
+			ret->mmio_base = mmio;
+			ret->pmt_offset = offset;
+			ret->size = size;
+
+			ret->next = pmt_mmios;
+			pmt_mmios = ret;
+		}
+
+loop_cleanup_and_break:
+		close(fd_pmt);
+		close(fd_telem_dir);
+		break;
+	}
+
+	closedir(dirp);
+
+	return ret;
+}
+
+struct pmt_mmio* pmt_mmio_find(unsigned int guid)
+{
+	struct pmt_mmio *pmmio = pmt_mmios;
+
+	while (pmmio) {
+		if (pmmio->guid == guid)
+			return pmmio;
+
+		pmmio = pmmio->next;
+	 }
+
+	return NULL;
+}
+
+void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
+{
+	char *ret;
+
+	/* Get base of mmaped PMT file. */
+	ret = (char*)pmmio->mmio_base;
+
+	/*
+	 * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data.
+	 * It's not guaranteed that the mmaped memory begins with the telemetry data
+	 * 	- we might have to apply the offset first.
+	 */
+	ret += pmmio->pmt_offset;
+
+	/* Apply the counter offset to get the address to the mmaped counter. */
+	ret += counter_offset;
+
+	return ret;
+}
+
+struct pmt_mmio* pmt_add_guid(unsigned int guid)
+{
+	struct pmt_mmio *ret;
+
+	ret = pmt_mmio_find(guid);
+	if (!ret)
+		ret = pmt_mmio_open(guid);
+
+	return ret;
+}
+
+enum pmt_open_mode {
+	PMT_OPEN_TRY,		/* Open failure is not an error. */
+	PMT_OPEN_REQUIRED,	/* Open failure is a fatal error. */
+};
+
+struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name)
+{
+	while (pcounter) {
+		if (strcmp(pcounter->name, name) == 0)
+			break;
+
+		pcounter = pcounter->next;
+	}
+
+	return pcounter;
+}
+
+struct pmt_counter** pmt_get_scope_root(enum counter_scope scope)
+{
+	switch(scope) {
+	case SCOPE_CPU:
+		return &sys.pmt_tp;
+	case SCOPE_CORE:
+		return &sys.pmt_cp;
+	case SCOPE_PACKAGE:
+		return &sys.pmt_pp;
+	}
+
+	__builtin_unreachable();
+}
+
+void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id)
+{
+	/* Make sure the new domain fits. */
+	if (domain_id >= pcounter->num_domains)
+		pmt_counter_resize(pcounter, domain_id+1);
+
+	assert(pcounter->domains);
+	assert(domain_id < pcounter->num_domains);
+
+	pcounter->domains[domain_id].pcounter = pmmio;
+}
+
+int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
+	unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
+	enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
+{
+	struct pmt_mmio *mmio;
+	struct pmt_counter *pcounter;
+	struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope);
+	bool new_counter = false;
+	int conflict = 0;
+
+	if (lsb > msb) {
+		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name);
+		exit(1);
+	}
+
+	if (msb >= 64) {
+		fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name);
+		exit(1);
+	}
+
+	mmio = pmt_add_guid(guid);
+	if (!mmio) {
+		if (mode != PMT_OPEN_TRY) {
+			fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid);
+			exit(1);
+		}
+
+		return 1;
+	}
+
+	if (offset >= mmio->size) {
+		if (mode != PMT_OPEN_TRY) {
+			fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size);
+			exit(1);
+		}
+
+		return 1;
+	}
+
+	pcounter = pmt_find_counter(*pmt_root, name);
+	if (!pcounter) {
+		pcounter = calloc(1, sizeof(*pcounter));
+		new_counter = true;
+	}
+
+	if (new_counter) {
+		strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1);
+		pcounter->type = type;
+		pcounter->scope = scope;
+		pcounter->lsb = lsb;
+		pcounter->msb = msb;
+		pcounter->format = format;
+	} else {
+		conflict += pcounter->type != type;
+		conflict += pcounter->scope != scope;
+		conflict += pcounter->lsb != lsb;
+		conflict += pcounter->msb != msb;
+		conflict += pcounter->format != format;
+	}
+
+	if (conflict) {
+		fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n",
+			__func__, name);
+		exit(1);
+	}
+
+	pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id);
+
+	if (new_counter) {
+		pcounter->next = *pmt_root;
+		*pmt_root = pcounter;
+	}
+
+	return 0;
+}
+
 void turbostat_init()
 {
 	setup_all_buffers(true);
@@ -8622,7 +9201,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char
 	return 0;
 }
 
-void parse_add_command(char *add_command)
+void parse_add_command_msr(char *add_command)
 {
 	int msr_num = 0;
 	char *path = NULL;
@@ -8747,6 +9326,191 @@ void parse_add_command(char *add_command)
 	}
 }
 
+bool starts_with(const char *str, const char *prefix)
+{
+	return strncmp(prefix, str, strlen(prefix)) == 0;
+}
+
+void parse_add_command_pmt(char *add_command)
+{
+	char *name = NULL;
+	char *type_name = NULL;
+	char *format_name = NULL;
+	unsigned int offset;
+	unsigned int lsb;
+	unsigned int msb;
+	unsigned int guid;
+	unsigned int domain_id;
+	enum counter_scope scope = 0;
+	enum pmt_datatype type = PMT_TYPE_RAW;
+	enum counter_format format = FORMAT_RAW;
+	bool has_offset = false;
+	bool has_lsb = false;
+	bool has_msb = false;
+	bool has_format = true; /* Format has a default value. */
+	bool has_guid = false;
+	bool has_scope = false;
+	bool has_type = true; /* Type has a default value. */
+
+	/* Consume the "pmt," prefix. */
+	add_command = strchr(add_command, ',');
+	if (!add_command) {
+		help();
+		exit(1);
+	}
+	++add_command;
+
+	while (add_command) {
+		if (starts_with(add_command, "name=")) {
+			name = add_command + strlen("name=");
+			goto next;
+		}
+
+		if (starts_with(add_command, "type=")) {
+			type_name = add_command + strlen("type=");
+			goto next;
+		}
+
+		if (starts_with(add_command, "domain=")) {
+			const size_t prefix_len = strlen("domain=");
+
+			if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) {
+				scope = SCOPE_CPU;
+				has_scope = true;
+			} else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) {
+				scope = SCOPE_CORE;
+				has_scope = true;
+			} else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) {
+				scope = SCOPE_PACKAGE;
+				has_scope = true;
+			}
+
+			if (!has_scope) {
+				printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n",
+					__func__);
+				exit(1);
+			}
+
+			goto next;
+		}
+
+		if (starts_with(add_command, "format=")) {
+			format_name = add_command + strlen("format=");
+			goto next;
+		}
+
+		if (sscanf(add_command, "offset=%u", &offset) == 1) {
+			has_offset = true;
+			goto next;
+		}
+
+		if (sscanf(add_command, "lsb=%u", &lsb) == 1) {
+			has_lsb = true;
+			goto next;
+		}
+
+		if (sscanf(add_command, "msb=%u", &msb) == 1) {
+			has_msb = true;
+			goto next;
+		}
+
+		if (sscanf(add_command, "guid=%x", &guid) == 1) {
+			has_guid = true;
+			goto next;
+		}
+
+next:
+		add_command = strchr(add_command, ',');
+		if (add_command) {
+			*add_command = '\0';
+			add_command++;
+		}
+	}
+
+	if (!name) {
+		printf("%s: missing %s\n", __func__, "name");
+		exit(1);
+	}
+
+	if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) {
+		printf("%s: name has to be at most %d characters long\n",
+			__func__, PMT_COUNTER_NAME_SIZE_BYTES);
+		exit(1);
+	}
+
+	if (format_name) {
+		has_format = false;
+
+		if (strcmp("raw", format_name) == 0) {
+			format = FORMAT_RAW;
+			has_format = true;
+		}
+
+		if (strcmp("delta", format_name) == 0) {
+			format = FORMAT_DELTA;
+			has_format = true;
+		}
+
+		if (!has_format) {
+			fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name);
+			exit(1);
+		}
+	}
+
+	if (type_name) {
+		has_type = false;
+
+		if (strcmp("raw", type_name) == 0) {
+			type = PMT_TYPE_RAW;
+			has_type = true;
+		}
+
+		if (!has_type) {
+			printf("%s: invalid %s: %s\n", __func__, "type", type_name);
+			exit(1);
+		}
+	}
+
+	if (!has_offset) {
+		printf("%s : missing %s\n", __func__, "offset");
+		exit(1);
+	}
+
+	if (!has_lsb) {
+		printf("%s: missing %s\n", __func__, "lsb");
+		exit(1);
+	}
+
+	if (!has_msb) {
+		printf("%s: missing %s\n", __func__, "msb");
+		exit(1);
+	}
+
+	if (!has_guid) {
+		printf("%s: missing %s\n", __func__, "guid");
+		exit(1);
+	}
+
+	if (!has_scope) {
+		printf("%s: missing %s\n", __func__, "scope");
+		exit(1);
+	}
+
+	if (lsb > msb) {
+		printf("%s: lsb > msb doesn't make sense\n", __func__);
+		exit(1);
+	}
+
+	pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED);
+}
+
+void parse_add_command(char *add_command)
+{
+	if (strncmp(add_command, "pmt", strlen("pmt")) == 0)
+		return parse_add_command_pmt(add_command);
+	return parse_add_command_msr(add_command);
+}
+
 int is_deferred_add(char *name)
 {
 	int i;

From 640540beb88363a825524295664acfdb0f5d5fc2 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Mon, 22 Jul 2024 22:12:22 +0200
Subject: [PATCH 316/346] tools/power turbostat: Add MTL's PMT DC6 builtin
 counter

Provide a definition for metadata that allows reading DC6 residency
counter via PMT and exposes it as a builtin counter.

Note that this residency counter is updated and read via
entirely different mechanisms vs the MSR-based residency counters.
On MTL processors, there are times when Die%c6 will report above 100%.
This is still useful, but don't expect 3 digits of precision...

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 70 ++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index c9fdfb074ce9..f769e8beabd3 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -193,6 +193,7 @@ struct msr_counter bic[] = {
 	{ 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 },
 	{ 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 },
+	{ 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 },
 };
 
 #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
@@ -254,11 +255,12 @@ struct msr_counter bic[] = {
 #define	BIC_SAM_mc6		(1ULL << 55)
 #define	BIC_SAMMHz		(1ULL << 56)
 #define	BIC_SAMACTMHz		(1ULL << 57)
+#define	BIC_Diec6		(1ULL << 58)
 
 #define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
 #define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
 #define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
-#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6)
+#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
 #define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)
 
 #define BIC_DISABLED_BY_DEFAULT	(BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)
@@ -1475,6 +1477,11 @@ static struct msr_counter_arch_info msr_counter_arch_infos[] = {
 #define SYSFS_TELEM_PATH "/sys/class/intel_pmt"
 #endif
 
+#define PMT_COUNTER_MTL_DC6_OFFSET 120
+#define PMT_COUNTER_MTL_DC6_LSB    0
+#define PMT_COUNTER_MTL_DC6_MSB    63
+#define PMT_MTL_DC6_GUID           0x1a067102
+
 #define PMT_COUNTER_NAME_SIZE_BYTES      16
 #define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32
 
@@ -1496,6 +1503,7 @@ struct pmt_mmio {
 
 enum pmt_datatype {
 	PMT_TYPE_RAW,
+	PMT_TYPE_XTAL_TIME,
 };
 
 struct pmt_domain_info {
@@ -1630,6 +1638,7 @@ struct pkg_data {
 	struct rapl_counter rapl_dram_perf_status;	/* MSR_DRAM_PERF_STATUS */
 	unsigned int pkg_temp_c;
 	unsigned int uncore_mhz;
+	unsigned long long die_c6;
 	unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS];
 	unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];
 	unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS];
@@ -2281,6 +2290,10 @@ void print_header(char *delim)
 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			break;
 		}
 
 		ppmt = ppmt->next;
@@ -2351,6 +2364,10 @@ void print_header(char *delim)
 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			break;
 		}
 
 		ppmt = ppmt->next;
@@ -2400,6 +2417,8 @@ void print_header(char *delim)
 		outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_Pkgpc10))
 		outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
+	if (DO_BIC(BIC_Diec6))
+		outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_CPU_LPI))
 		outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
 	if (DO_BIC(BIC_SYS_LPI))
@@ -2476,6 +2495,10 @@ void print_header(char *delim)
 				outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			break;
 		}
 
 		ppmt = ppmt->next;
@@ -2775,6 +2798,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			const unsigned long value_raw = t->pmt_counter[i];
+			const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+			break;
 		}
 	}
 
@@ -2849,6 +2879,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			const unsigned long value_raw = c->pmt_counter[i];
+			const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+			break;
 		}
 	}
 
@@ -2931,6 +2968,9 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	if (DO_BIC(BIC_Pkgpc10))
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
 
+	if (DO_BIC(BIC_Diec6))
+		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
+
 	if (DO_BIC(BIC_CPU_LPI)) {
 		if (p->cpu_lpi >= 0)
 			outp +=
@@ -3037,6 +3077,13 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 				outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]);
 
 			break;
+
+		case PMT_TYPE_XTAL_TIME:
+			const unsigned long value_raw = p->pmt_counter[i];
+			const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
+
+			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
+			break;
 		}
 	}
 
@@ -3115,6 +3162,7 @@ int delta_package(struct pkg_data *new, struct pkg_data *old)
 	old->pc8 = new->pc8 - old->pc8;
 	old->pc9 = new->pc9 - old->pc9;
 	old->pc10 = new->pc10 - old->pc10;
+	old->die_c6 = new->die_c6 - old->die_c6;
 	old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
 	old->sys_lpi = new->sys_lpi - old->sys_lpi;
 	old->pkg_temp_c = new->pkg_temp_c;
@@ -3398,6 +3446,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	p->pc8 = 0;
 	p->pc9 = 0;
 	p->pc10 = 0;
+	p->die_c6 = 0;
 	p->cpu_lpi = 0;
 	p->sys_lpi = 0;
 
@@ -3547,6 +3596,7 @@ int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	average.packages.pc8 += p->pc8;
 	average.packages.pc9 += p->pc9;
 	average.packages.pc10 += p->pc10;
+	average.packages.die_c6 += p->die_c6;
 
 	average.packages.cpu_lpi = p->cpu_lpi;
 	average.packages.sys_lpi = p->sys_lpi;
@@ -3642,6 +3692,7 @@ void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data
 	average.packages.pc8 /= topo.allowed_packages;
 	average.packages.pc9 /= topo.allowed_packages;
 	average.packages.pc10 /= topo.allowed_packages;
+	average.packages.die_c6 /= topo.allowed_packages;
 
 	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
 		if (mp->format == FORMAT_RAW)
@@ -5505,6 +5556,7 @@ void msr_perf_init(void);
 void rapl_perf_init(void);
 void cstate_perf_init(void);
 void added_perf_counters_init(void);
+void pmt_init(void);
 
 void re_initialize(void)
 {
@@ -5515,6 +5567,7 @@ void re_initialize(void)
 	rapl_perf_init();
 	cstate_perf_init();
 	added_perf_counters_init();
+	pmt_init();
 	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
 		topo.allowed_cpus);
 }
@@ -8864,6 +8917,15 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
 	return 0;
 }
 
+void pmt_init(void)
+{
+	if (BIC_IS_ENABLED(BIC_Diec6)) {
+		pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB,
+				PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA,
+				0, PMT_OPEN_TRY);
+	}
+}
+
 void turbostat_init()
 {
 	setup_all_buffers(true);
@@ -8878,6 +8940,7 @@ void turbostat_init()
 	rapl_perf_init();
 	cstate_perf_init();
 	added_perf_counters_init();
+	pmt_init();
 
 	for_all_cpus(get_cpu_type, ODD_COUNTERS);
 	for_all_cpus(get_cpu_type, EVEN_COUNTERS);
@@ -9465,6 +9528,11 @@ void parse_add_command_pmt(char *add_command)
 			has_type = true;
 		}
 
+		if (strcmp("txtal_time", type_name) == 0) {
+			type = PMT_TYPE_XTAL_TIME;
+			has_type = true;
+		}
+
 		if (!has_type) {
 			printf("%s: invalid %s: %s\n", __func__, "type", type_name);
 			exit(1);

From 944264a2a99cc4dd4b10eaa9798b6aab80adf4be Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Tue, 23 Jul 2024 20:12:32 +0200
Subject: [PATCH 317/346] tools/power turbostat: Document PMT in turbostat.8

Add a general description of the user interface for adding PMT
counters with the new --add pmt,... option.

Provide a complete example for requesting two counters.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.8 | 65 +++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 44cbc7cb382d..067717bce1d4 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -55,6 +55,39 @@ name as necessary to disambiguate it from others is necessary.  Note that option
 		as the column header.
 .fi
 .PP
+\fB--add pmt,[attr_name=attr_value, ...]\fP add column with a PMT (Intel Platform Monitoring Technology) counter in a similar way to --add option above, but require PMT metadata to be supplied to correctly read and display the counter. The metadata can be found in the Intel PMT XML files, hosted at https://github.com/intel/Intel-PMT. For a complete example see "ADD PMT COUNTER EXAMPLE".
+.nf
+	name="name_string"
+		For column header.
+
+	type={\fBraw\fP}
+		'raw' shows the counter contents in hex.
+		default: raw
+
+	format={\fBraw\fP | \fBdelta\fP}
+		'raw' shows the counter contents in hex.
+		'delta' shows the difference in values during the measurement interval.
+		default: raw
+
+	domain={\fBcpu%u\fP | \fBcore%u\fP | \fBpackage%u\fP}
+		'cpu' per cpu/thread counter.
+		'core' per core counter.
+		'package' per package counter.
+		'%u' denotes id of the domain that the counter is associated with. For example core4 would mean that the counter is associated with core number 4.
+
+	offset=\fB%u\fP
+		'%u' offset within the PMT MMIO region.
+
+	lsb=\fB%u\fP
+		'%u' least significant bit within the 64 bit value read from 'offset'. Together with 'msb', used to form a read mask.
+
+	msb=\fB%u\fP
+		'%u' most significant bit within the 64 bit value read from 'offset'. Together with 'lsb', used to form a read mask.
+
+	guid=\fB%x\fP
+		'%x' hex identifier of the PMT MMIO region.
+.fi
+.PP
 \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set.  If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed.  Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed.  Otherwise, the system summary plus the specified set of CPUs are printed.  The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44
 .PP
 \fB--hide column\fP do not show the specified built-in columns.  May be invoked multiple times, or with a comma-separated list of column names.
@@ -354,6 +387,38 @@ CPU     pCPU%c1 CPU%c1
 
 .fi
 
+.SH ADD PMT COUNTER EXAMPLE
+Here we limit turbostat to showing just the CPU number 0.
+We add two counters, showing crystal clock count and the DC6 residency.
+All the parameters passed are based on the metadata found in the PMT XML files.
+
+For the crystal clock count, we
+label it with the column header, "XTAL",
+we set the type to 'raw', to read the number of clock ticks in hex,
+we set the format to 'delta', to display the difference in ticks during the measurement interval,
+we set the domain to 'package0', to collect it and associate it with the whole package number 0,
+we set the offset to '0', which is a offset of the counter within the PMT MMIO region,
+we set the lsb and msb to cover all 64 bits of the read 64 bit value,
+and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value.
+
+For the DC6 residency counter, we
+label it with the column header, "Die%c6",
+we set the type to 'txtal_time', to obtain the percent residency value
+we set the format to 'delta', to display the difference in ticks during the measurement interval,
+we set the domain to 'package0', to collect it and associate it with the whole package number 0,
+we set the offset to '0', which is a offset of the counter within the PMT MMIO region,
+we set the lsb and msb to cover all 64 bits of the read 64 bit value,
+and finally we set the guid to '0x1a067102', that identifies the PMT MMIO region to which the 'offset' is applied to read the counter value.
+
+.nf
+sudo ./turbostat --quiet --cpu 0 --show CPU --add pmt,name=XTAL,type=raw,format=delta,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102 --add pmt,name=Die%c6,type=txtal_time,format=delta,domain=package0,offset=120,lsb=0,msb=63,guid=0x1a067102
+0.104352 sec
+CPU                   XTAL      Die%c6
+-       0x0000006d4d957ca7      0.00
+0       0x0000006d4d957ca7      0.00
+0.102448 sec
+.fi
+
 .SH INPUT
 
 For interval-mode, turbostat will immediately end the current interval

From 19d076903b95896ce55c7cc3679f795731591ac6 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Date: Wed, 24 Jul 2024 13:17:30 +0200
Subject: [PATCH 318/346] tools/power turbostat: Include umask=%x in perf
 counter's config

Some counters, like cpu/cache-misses/, expose and require umask=%x
parameter alongside event=%x in the sysfs perf counter's event file.
This change make sure we parse and use it when opening user added
counters.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 60 ++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 10 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index f769e8beabd3..ed5f032c6b3f 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -4017,15 +4017,55 @@ static unsigned int read_perf_type(const char *subsys)
 	return read_perf_counter_info_n(path, format);
 }
 
-static unsigned int read_rapl_config(const char *subsys, const char *event_name)
+static unsigned int read_perf_config(const char *subsys, const char *event_name)
 {
 	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s";
-	const char *const format = "event=%x";
+	FILE *fconfig = NULL;
 	char path[128];
+	char config_str[64];
+	unsigned int config;
+	unsigned int umask;
+	bool has_config = false;
+	bool has_umask = false;
+	unsigned int ret = -1;
 
 	snprintf(path, sizeof(path), path_format, subsys, event_name);
 
-	return read_perf_counter_info_n(path, format);
+	fconfig = fopen(path, "r");
+	if (!fconfig)
+		return -1;
+
+	if (fgets(config_str, ARRAY_SIZE(config_str), fconfig) != config_str)
+		goto cleanup_and_exit;
+
+	for (char *pconfig_str = &config_str[0]; pconfig_str;) {
+		if (sscanf(pconfig_str, "event=%x", &config) == 1) {
+			has_config = true;
+			goto next;
+		}
+
+		if (sscanf(pconfig_str, "umask=%x", &umask) == 1) {
+			has_umask = true;
+			goto next;
+		}
+
+	next:
+		pconfig_str = strchr(pconfig_str, ',');
+		if (pconfig_str) {
+			*pconfig_str = '\0';
+			++pconfig_str;
+		}
+	}
+
+	if (!has_umask)
+		umask = 0;
+
+	if (has_config)
+		ret = (umask << 8) | config;
+
+cleanup_and_exit:
+	fclose(fconfig);
+	return ret;
 }
 
 static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
@@ -4044,7 +4084,7 @@ static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_na
 	return RAPL_UNIT_INVALID;
 }
 
-static double read_perf_rapl_scale(const char *subsys, const char *event_name)
+static double read_perf_scale(const char *subsys, const char *event_name)
 {
 	const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.scale";
 	const char *const format = "%lf";
@@ -7425,7 +7465,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc
 	if (no_perf)
 		return -1;
 
-	const double scale = read_perf_rapl_scale(cai->perf_subsys, cai->perf_name);
+	const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);
 
 	if (scale == 0.0)
 		return -1;
@@ -7436,7 +7476,7 @@ int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struc
 		return -1;
 
 	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
-	const unsigned int rapl_energy_pkg_config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+	const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
 	const int fd_counter =
 	    open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
@@ -7598,7 +7638,7 @@ int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const s
 		return -1;
 
 	const unsigned int type = read_perf_type(cai->perf_subsys);
-	const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
 	const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
 
@@ -7628,7 +7668,7 @@ int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct
 		return -1;
 
 	const unsigned int type = read_perf_type(cai->perf_subsys);
-	const unsigned int config = read_rapl_config(cai->perf_subsys, cai->perf_name);
+	const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
 
 	const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
 
@@ -8571,7 +8611,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 				continue;
 			}
 
-			perf_config = read_rapl_config(pinfo->device, pinfo->event);
+			perf_config = read_perf_config(pinfo->device, pinfo->event);
 			if (perf_config == (unsigned int)-1) {
 				warnx("%s: perf/%s/%s: failed to read %s",
 				      __func__, pinfo->device, pinfo->event, "config");
@@ -8579,7 +8619,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 			}
 
 			/* Scale is not required, some counters just don't have it. */
-			perf_scale = read_perf_rapl_scale(pinfo->device, pinfo->event);
+			perf_scale = read_perf_scale(pinfo->device, pinfo->event);
 			if (perf_scale == 0.0)
 				perf_scale = 1.0;
 

From 866d2d36b81d7d0e6d91423b6dd9b1bcfd0510dd Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 26 Jul 2024 14:16:28 -0400
Subject: [PATCH 319/346] tools/power turbostat: version 2024.07.26

Release 2024.07.26:

Enable turbostat extensions to add both perf and PMT
(Intel Platform Monitoring Technology) counters from the cmdline.

Demonstrate PMT access with built-in support for Meteor Lake's Die%c6 counter.

This commit:

Clean up white-space nits introduced since version 2024.05.10

Signed-off-by: Len Brown <len.brown@intel.com>
---
 tools/power/x86/turbostat/turbostat.c | 105 +++++++++++++-------------
 1 file changed, 52 insertions(+), 53 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index ed5f032c6b3f..089220aaa5c9 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -1057,8 +1057,7 @@ void probe_platform_features(unsigned int family, unsigned int model)
 		return;
 
 	for (i = 0; turbostat_pdata[i].features; i++) {
-		if (VFM_FAMILY(turbostat_pdata[i].vfm) == family &&
-		    VFM_MODEL(turbostat_pdata[i].vfm) == model) {
+		if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) {
 			platform = turbostat_pdata[i].features;
 			return;
 		}
@@ -1448,28 +1447,28 @@ enum msr_arch_info_index {
 
 static struct msr_counter_arch_info msr_counter_arch_infos[] = {
 	[MSR_ARCH_INFO_APERF_INDEX] = {
-		.perf_subsys = "msr",
-		.perf_name = "aperf",
-		.msr = MSR_IA32_APERF,
-		.msr_mask = 0xFFFFFFFFFFFFFFFF,
-		.rci_index = MSR_RCI_INDEX_APERF,
-	},
+				       .perf_subsys = "msr",
+				       .perf_name = "aperf",
+				       .msr = MSR_IA32_APERF,
+				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
+				       .rci_index = MSR_RCI_INDEX_APERF,
+				        },
 
 	[MSR_ARCH_INFO_MPERF_INDEX] = {
-		.perf_subsys = "msr",
-		.perf_name = "mperf",
-		.msr = MSR_IA32_MPERF,
-		.msr_mask = 0xFFFFFFFFFFFFFFFF,
-		.rci_index = MSR_RCI_INDEX_MPERF,
-	},
+				       .perf_subsys = "msr",
+				       .perf_name = "mperf",
+				       .msr = MSR_IA32_MPERF,
+				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
+				       .rci_index = MSR_RCI_INDEX_MPERF,
+				        },
 
 	[MSR_ARCH_INFO_SMI_INDEX] = {
-		.perf_subsys = "msr",
-		.perf_name = "smi",
-		.msr = MSR_SMI_COUNT,
-		.msr_mask = 0xFFFFFFFF,
-		.rci_index = MSR_RCI_INDEX_SMI,
-	},
+				     .perf_subsys = "msr",
+				     .perf_name = "smi",
+				     .msr = MSR_SMI_COUNT,
+				     .msr_mask = 0xFFFFFFFF,
+				     .rci_index = MSR_RCI_INDEX_SMI,
+				      },
 };
 
 /* Can be redefined when compiling, useful for testing. */
@@ -1535,14 +1534,14 @@ struct pmt_counter {
 
 unsigned int pmt_counter_get_width(const struct pmt_counter *p)
 {
-	return (p->msb - p->lsb)+1;
+	return (p->msb - p->lsb) + 1;
 }
 
 void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size)
 {
 	struct pmt_domain_info *new_mem;
 
-	new_mem = (struct pmt_domain_info*) reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
+	new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
 	if (!new_mem) {
 		fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__);
 		exit(1);
@@ -1567,7 +1566,7 @@ void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size)
 	 */
 	if (new_size < 8)
 		new_size = 8;
-	new_size = MAX(new_size, pcounter->num_domains*2);
+	new_size = MAX(new_size, pcounter->num_domains * 2);
 
 	pmt_counter_resize_(pcounter, new_size);
 }
@@ -2282,7 +2281,7 @@ void print_header(char *delim)
 
 	ppmt = sys.pmt_tp;
 	while (ppmt) {
-		switch(ppmt->type) {
+		switch (ppmt->type) {
 		case PMT_TYPE_RAW:
 			if (pmt_counter_get_width(ppmt) <= 32)
 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
@@ -2356,7 +2355,7 @@ void print_header(char *delim)
 
 	ppmt = sys.pmt_cp;
 	while (ppmt) {
-		switch(ppmt->type) {
+		switch (ppmt->type) {
 		case PMT_TYPE_RAW:
 			if (pmt_counter_get_width(ppmt) <= 32)
 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
@@ -2487,7 +2486,7 @@ void print_header(char *delim)
 
 	ppmt = sys.pmt_pp;
 	while (ppmt) {
-		switch(ppmt->type) {
+		switch (ppmt->type) {
 		case PMT_TYPE_RAW:
 			if (pmt_counter_get_width(ppmt) <= 32)
 				outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
@@ -2969,7 +2968,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
 
 	if (DO_BIC(BIC_Diec6))
-		outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
+		outp +=
+		    sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
 
 	if (DO_BIC(BIC_CPU_LPI)) {
 		if (p->cpu_lpi >= 0)
@@ -4049,7 +4049,7 @@ static unsigned int read_perf_config(const char *subsys, const char *event_name)
 			goto next;
 		}
 
-	next:
+next:
 		pconfig_str = strchr(pconfig_str, ',');
 		if (pconfig_str) {
 			*pconfig_str = '\0';
@@ -4463,9 +4463,9 @@ unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb)
 	if (msb == 63)
 		mask = 0xffffffffffffffff;
 	else
-		mask = ((1<<(msb+1))-1);
+		mask = ((1 << (msb + 1)) - 1);
 
-	mask -= (1<<lsb)-1;
+	mask -= (1 << lsb) - 1;
 
 	return mask;
 }
@@ -8636,7 +8636,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 
 			if (debug)
 				fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n",
-				       pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
+					pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
 		}
 
 		pinfo = pinfo->next;
@@ -8687,7 +8687,7 @@ int parse_telem_info_file(int fd_dir, const char *info_filename, const char *for
 	return 0;
 }
 
-struct pmt_mmio* pmt_mmio_open(unsigned int target_guid)
+struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
 {
 	DIR *dirp;
 	struct dirent *entry;
@@ -8793,7 +8793,7 @@ struct pmt_mmio* pmt_mmio_open(unsigned int target_guid)
 	return ret;
 }
 
-struct pmt_mmio* pmt_mmio_find(unsigned int guid)
+struct pmt_mmio *pmt_mmio_find(unsigned int guid)
 {
 	struct pmt_mmio *pmmio = pmt_mmios;
 
@@ -8802,22 +8802,22 @@ struct pmt_mmio* pmt_mmio_find(unsigned int guid)
 			return pmmio;
 
 		pmmio = pmmio->next;
-	 }
+	}
 
 	return NULL;
 }
 
-void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
+void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
 {
 	char *ret;
 
 	/* Get base of mmaped PMT file. */
-	ret = (char*)pmmio->mmio_base;
+	ret = (char *)pmmio->mmio_base;
 
 	/*
 	 * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data.
 	 * It's not guaranteed that the mmaped memory begins with the telemetry data
-	 * 	- we might have to apply the offset first.
+	 *      - we might have to apply the offset first.
 	 */
 	ret += pmmio->pmt_offset;
 
@@ -8827,7 +8827,7 @@ void* pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offs
 	return ret;
 }
 
-struct pmt_mmio* pmt_add_guid(unsigned int guid)
+struct pmt_mmio *pmt_add_guid(unsigned int guid)
 {
 	struct pmt_mmio *ret;
 
@@ -8843,7 +8843,7 @@ enum pmt_open_mode {
 	PMT_OPEN_REQUIRED,	/* Open failure is a fatal error. */
 };
 
-struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *name)
+struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name)
 {
 	while (pcounter) {
 		if (strcmp(pcounter->name, name) == 0)
@@ -8855,9 +8855,9 @@ struct pmt_counter* pmt_find_counter(struct pmt_counter *pcounter, const char *n
 	return pcounter;
 }
 
-struct pmt_counter** pmt_get_scope_root(enum counter_scope scope)
+struct pmt_counter **pmt_get_scope_root(enum counter_scope scope)
 {
-	switch(scope) {
+	switch (scope) {
 	case SCOPE_CPU:
 		return &sys.pmt_tp;
 	case SCOPE_CORE:
@@ -8873,7 +8873,7 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio,
 {
 	/* Make sure the new domain fits. */
 	if (domain_id >= pcounter->num_domains)
-		pmt_counter_resize(pcounter, domain_id+1);
+		pmt_counter_resize(pcounter, domain_id + 1);
 
 	assert(pcounter->domains);
 	assert(domain_id < pcounter->num_domains);
@@ -8882,12 +8882,12 @@ void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio,
 }
 
 int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
-	unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
-	enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
+		    unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
+		    enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
 {
 	struct pmt_mmio *mmio;
 	struct pmt_counter *pcounter;
-	struct pmt_counter ** const pmt_root = pmt_get_scope_root(scope);
+	struct pmt_counter **const pmt_root = pmt_get_scope_root(scope);
 	bool new_counter = false;
 	int conflict = 0;
 
@@ -8927,7 +8927,7 @@ int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
 	}
 
 	if (new_counter) {
-		strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name)-1);
+		strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1);
 		pcounter->type = type;
 		pcounter->scope = scope;
 		pcounter->lsb = lsb;
@@ -9071,7 +9071,7 @@ int get_and_dump_counters(void)
 
 void print_version()
 {
-	fprintf(outf, "turbostat version 2024.05.10 - Len Brown <lenb@kernel.org>\n");
+	fprintf(outf, "turbostat version 2024.07.26 - Len Brown <lenb@kernel.org>\n");
 }
 
 #define COMMAND_LINE_SIZE 2048
@@ -9299,7 +9299,7 @@ int add_perf_counter(const char *perf_device, const char *perf_event, const char
 	// FIXME: we might not have debug here yet
 	if (debug)
 		fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n",
-		       __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
+			__func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
 
 	return 0;
 }
@@ -9450,10 +9450,10 @@ void parse_add_command_pmt(char *add_command)
 	bool has_offset = false;
 	bool has_lsb = false;
 	bool has_msb = false;
-	bool has_format = true; /* Format has a default value. */
+	bool has_format = true;	/* Format has a default value. */
 	bool has_guid = false;
 	bool has_scope = false;
-	bool has_type = true; /* Type has a default value. */
+	bool has_type = true;	/* Type has a default value. */
 
 	/* Consume the "pmt," prefix. */
 	add_command = strchr(add_command, ',');
@@ -9490,7 +9490,7 @@ void parse_add_command_pmt(char *add_command)
 
 			if (!has_scope) {
 				printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n",
-					__func__);
+				       __func__);
 				exit(1);
 			}
 
@@ -9536,8 +9536,7 @@ void parse_add_command_pmt(char *add_command)
 	}
 
 	if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) {
-		printf("%s: name has to be at most %d characters long\n",
-			__func__, PMT_COUNTER_NAME_SIZE_BYTES);
+		printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES);
 		exit(1);
 	}
 

From 6dc55268f64b780eb8774de3705f791b689853bb Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 23 Jul 2024 18:09:03 -0500
Subject: [PATCH 320/346] dt-bindings: iio: adc: ad7192: Fix 'single-channel'
 constraints

The 'single-channel' property is an uint32, not an array, so 'items' is
an incorrect constraint. This didn't matter until dtschema recently
changed how properties are decoded. This results in this warning:

Documentation/devicetree/bindings/iio/adc/adi,ad7192.example.dtb: adc@0: \
  channel@1:single-channel: 1 is not of type 'array'

Fixes: caf7b7632b8d ("dt-bindings: iio: adc: ad7192: Add AD7194 support")
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240723230904.1299744-1-robh@kernel.org
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml
index a03da9489ed9..190889c7b62a 100644
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7192.yaml
@@ -120,9 +120,8 @@ patternProperties:
         description:
           Positive input can be connected to pins AIN1 to AIN16 by choosing the
           appropriate value from 1 to 16. Negative input is connected to AINCOM.
-        items:
-          minimum: 1
-          maximum: 16
+        minimum: 1
+        maximum: 16
 
     oneOf:
       - required:

From 5bf6f3c595d38e3e92130373ae4c0799da5026ee Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 9 Jul 2024 11:25:10 +0100
Subject: [PATCH 321/346] MAINTAINERS: mailmap: update James Clark's email
 address

My new address is james.clark@linaro.org

Link: https://lkml.kernel.org/r/20240709102512.31212-2-james.clark@linaro.org
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Bjorn Andersson <quic_bjorande@quicinc.com>
Cc: Conor Dooley <conor+dt@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Geliang Tang <geliang@kernel.org>
Cc: Hao Zhang <quic_hazha@quicinc.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Krzysztof Kozlowski <krzk+dt@kernel.org>
Cc: Mao Jinlong <quic_jinlmao@quicinc.com>
Cc: Matthieu Baerts <matttbe@kernel.org>
Cc: Matt Ranostay <matt@ranostay.sg>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Oleksij Rempel <o.rempel@pengutronix.de>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .mailmap    | 1 +
 MAINTAINERS | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.mailmap b/.mailmap
index cbdf5d63c381..e51d76df75c2 100644
--- a/.mailmap
+++ b/.mailmap
@@ -260,6 +260,7 @@ Jaegeuk Kim <jaegeuk@kernel.org> <jaegeuk@motorola.com>
 Jakub Kicinski <kuba@kernel.org> <jakub.kicinski@netronome.com>
 James Bottomley <jejb@mulgrave.(none)>
 James Bottomley <jejb@titanic.il.steeleye.com>
+James Clark <james.clark@linaro.org> <james.clark@arm.com>
 James E Wilson <wilson@specifix.com>
 James Hogan <jhogan@kernel.org> <james@albanarts.com>
 James Hogan <jhogan@kernel.org> <james.hogan@imgtec.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index c0a3d9e93689..5bbcfea5e3fa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2196,7 +2196,7 @@ N:	digicolor
 ARM/CORESIGHT FRAMEWORK AND DRIVERS
 M:	Suzuki K Poulose <suzuki.poulose@arm.com>
 R:	Mike Leach <mike.leach@linaro.org>
-R:	James Clark <james.clark@arm.com>
+R:	James Clark <james.clark@linaro.org>
 L:	coresight@lists.linaro.org (moderated for non-subscribers)
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
@@ -17894,7 +17894,7 @@ F:	tools/perf/
 PERFORMANCE EVENTS TOOLING ARM64
 R:	John Garry <john.g.garry@oracle.com>
 R:	Will Deacon <will@kernel.org>
-R:	James Clark <james.clark@arm.com>
+R:	James Clark <james.clark@linaro.org>
 R:	Mike Leach <mike.leach@linaro.org>
 R:	Leo Yan <leo.yan@linux.dev>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)

From 34e526f6182e12b71f6076d43760dc6e0ae175a3 Mon Sep 17 00:00:00 2001
From: James Clark <james.clark@linaro.org>
Date: Tue, 9 Jul 2024 11:25:11 +0100
Subject: [PATCH 322/346] dt-bindings: arm: update James Clark's email address

My new address is james.clark@linaro.org

Link: https://lkml.kernel.org/r/20240709102512.31212-3-james.clark@linaro.org
Signed-off-by: James Clark <james.clark@linaro.org>
Cc: Bjorn Andersson <quic_bjorande@quicinc.com>
Cc: Conor Dooley <conor+dt@kernel.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Geliang Tang <geliang@kernel.org>
Cc: Hao Zhang <quic_hazha@quicinc.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Krzysztof Kozlowski <krzk+dt@kernel.org>
Cc: Mao Jinlong <quic_jinlmao@quicinc.com>
Cc: Matthieu Baerts <matttbe@kernel.org>
Cc: Matt Ranostay <matt@ranostay.sg>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Oleksij Rempel <o.rempel@pengutronix.de>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 .../devicetree/bindings/arm/arm,coresight-dummy-sink.yaml       | 2 +-
 .../devicetree/bindings/arm/arm,coresight-dummy-source.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml
index c960c8e0a9a5..08b89b62c505 100644
--- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-sink.yaml
@@ -30,7 +30,7 @@ description: |
 maintainers:
   - Mike Leach <mike.leach@linaro.org>
   - Suzuki K Poulose <suzuki.poulose@arm.com>
-  - James Clark <james.clark@arm.com>
+  - James Clark <james.clark@linaro.org>
   - Mao Jinlong <quic_jinlmao@quicinc.com>
   - Hao Zhang <quic_hazha@quicinc.com>
 
diff --git a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml
index 6745b4cc8f1c..d50a60368e27 100644
--- a/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml
+++ b/Documentation/devicetree/bindings/arm/arm,coresight-dummy-source.yaml
@@ -29,7 +29,7 @@ description: |
 maintainers:
   - Mike Leach <mike.leach@linaro.org>
   - Suzuki K Poulose <suzuki.poulose@arm.com>
-  - James Clark <james.clark@arm.com>
+  - James Clark <james.clark@linaro.org>
   - Mao Jinlong <quic_jinlmao@quicinc.com>
   - Hao Zhang <quic_hazha@quicinc.com>
 

From 4cd7ba16a0afb36550eed7690e73d3e7a743fa96 Mon Sep 17 00:00:00 2001
From: Ram Tummala <rtummala@nvidia.com>
Date: Tue, 9 Jul 2024 18:45:39 -0700
Subject: [PATCH 323/346] mm: fix old/young bit handling in the faulting path

Commit 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()")
replaced do_set_pte() with set_pte_range() and that introduced a
regression in the following faulting path of non-anonymous vmas which
caused the PTE for the faulting address to be marked as old instead of
young.

handle_pte_fault()
  do_pte_missing()
    do_fault()
      do_read_fault() || do_cow_fault() || do_shared_fault()
        finish_fault()
          set_pte_range()

The polarity of prefault calculation is incorrect.  This leads to prefault
being incorrectly set for the faulting address.  The following check will
incorrectly mark the PTE old rather than young.  On some architectures
this will cause a double fault to mark it young when the access is
retried.

    if (prefault && arch_wants_old_prefaulted_pte())
        entry = pte_mkold(entry);

On a subsequent fault on the same address, the faulting path will see a
non NULL vmf->pte and instead of reaching the do_pte_missing() path, PTE
will then be correctly marked young in handle_pte_fault() itself.

Due to this bug, performance degradation in the fault handling path will
be observed due to unnecessary double faulting.

Link: https://lkml.kernel.org/r/20240710014539.746200-1-rtummala@nvidia.com
Fixes: 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()")
Signed-off-by: Ram Tummala <rtummala@nvidia.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index 1ff7b6f51ec1..34f8402d2046 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4780,7 +4780,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
-	bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE);
+	bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE);
 	pte_t entry;
 
 	flush_icache_pages(vma, page, nr);

From d9592025000b3cf26c742f3505da7b83aedc26d5 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang@os.amperecomputing.com>
Date: Fri, 12 Jul 2024 08:58:55 -0700
Subject: [PATCH 324/346] mm: huge_memory: use !CONFIG_64BIT to relax huge page
 alignment on 32 bit machines

Yves-Alexis Perez reported commit 4ef9ad19e176 ("mm: huge_memory: don't
force huge page alignment on 32 bit") didn't work for x86_32 [1].  It is
because x86_32 uses CONFIG_X86_32 instead of CONFIG_32BIT.

!CONFIG_64BIT should cover all 32 bit machines.

[1] https://lore.kernel.org/linux-mm/CAHbLzkr1LwH3pcTgM+aGQ31ip2bKqiqEQ8=FQB+t2c3dhNKNHA@mail.gmail.com/

Link: https://lkml.kernel.org/r/20240712155855.1130330-1-yang@os.amperecomputing.com
Fixes: 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment on 32 bit")
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Reported-by: Yves-Alexis Perez <corsac@debian.org>
Tested-by: Yves-Alexis Perez <corsac@debian.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Salvatore Bonaccorso <carnil@debian.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>	[6.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/huge_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f9696c94e211..04b72912035d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -877,7 +877,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	loff_t off_align = round_up(off, size);
 	unsigned long len_pad, ret, off_sub;
 
-	if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
+	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
 		return 0;
 
 	if (off_end <= off_align || (off_end - off_align) < size)

From d659b715e94ac039803d7601505d3473393fc0be Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Mon, 15 Jul 2024 10:04:23 +1000
Subject: [PATCH 325/346] mm/huge_memory: avoid PMD-size page cache if needed

xarray can't support arbitrary page cache size.  the largest and supported
page cache size is defined as MAX_PAGECACHE_ORDER by commit 099d90642a71
("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray").  However,
it's possible to have 512MB page cache in the huge memory's collapsing
path on ARM64 system whose base page size is 64KB.  512MB page cache is
breaking the limitation and a warning is raised when the xarray entry is
split as shown in the following example.

[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize:       64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
   :
int main(int argc, char **argv)
{
	const char *filename = TEST_XFS_FILENAME;
	int fd = 0;
	void *buf = (void *)-1, *p;
	int pgsize = getpagesize();
	int ret = 0;

	if (pgsize != 0x10000) {
		fprintf(stdout, "System with 64KB base page size is required!\n");
		return -EPERM;
	}

	system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
	system("echo 1 > /proc/sys/vm/drop_caches");

	/* Open the xfs file */
	fd = open(filename, O_RDONLY);
	assert(fd > 0);

	/* Create VMA */
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	assert(buf != (void *)-1);
	fprintf(stdout, "mapped buffer at 0x%p\n", buf);

	/* Populate VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
	assert(ret == 0);

	/* Collapse VMA */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
	if (ret) {
		fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
		goto out;
	}

	/* Split xarray entry. Write permission is needed */
	munmap(buf, TEST_MEM_SIZE);
	buf = (void *)-1;
	close(fd);
	fd = open(filename, O_RDWR);
	assert(fd > 0);
	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
 		  TEST_MEM_SIZE - pgsize, pgsize);
out:
	if (buf != (void *)-1)
		munmap(buf, TEST_MEM_SIZE);
	if (fd > 0)
		close(fd);

	return ret;
}

[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
 ------------[ cut here ]------------
 WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib    \
 nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct      \
 nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4      \
 ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse   \
 xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net  \
 sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
 CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
 pc : xas_split_alloc+0xf8/0x128
 lr : split_huge_page_to_list_to_order+0x1c4/0x780
 sp : ffff8000ac32f660
 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
 x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
 x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
 Call trace:
  xas_split_alloc+0xf8/0x128
  split_huge_page_to_list_to_order+0x1c4/0x780
  truncate_inode_partial_folio+0xdc/0x160
  truncate_inode_pages_range+0x1b4/0x4a8
  truncate_pagecache_range+0x84/0xa0
  xfs_flush_unmap_range+0x70/0x90 [xfs]
  xfs_file_fallocate+0xfc/0x4d8 [xfs]
  vfs_fallocate+0x124/0x2f0
  ksys_fallocate+0x4c/0xa0
  __arm64_sys_fallocate+0x24/0x38
  invoke_syscall.constprop.0+0x7c/0xd8
  do_el0_svc+0xb4/0xd0
  el0_svc+0x44/0x1d8
  el0t_64_sync_handler+0x134/0x150
  el0t_64_sync+0x17c/0x180

Fix it by correcting the supported page cache orders, different sets for
DAX and other files.  With it corrected, 512MB page cache becomes
disallowed on all non-DAX files on ARM64 system where the base page size
is 64KB.  After this patch is applied, the test program fails with error
-EINVAL returned from __thp_vma_allowable_orders() and the madvise()
system call to collapse the page caches.

Link: https://lkml.kernel.org/r/20240715000423.316491-1-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Don Dutile <ddutile@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: <stable@vger.kernel.org>	[5.17+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 12 +++++++++---
 mm/huge_memory.c        | 12 ++++++++++--
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index cff002be83eb..e25d9ebfdf89 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -74,14 +74,20 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
 #define THP_ORDERS_ALL_ANON	((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
 
 /*
- * Mask of all large folio orders supported for file THP.
+ * Mask of all large folio orders supported for file THP. Folios in a DAX
+ * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
+ * it.
  */
-#define THP_ORDERS_ALL_FILE	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DAX		\
+	(BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DEFAULT	\
+	((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
 
 /*
  * Mask of all large folio orders supported for THP.
  */
-#define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+#define THP_ORDERS_ALL	\
+	(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
 
 #define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
 #define TVA_IN_PF		(1 << 1)	/* Page fault handler */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 04b72912035d..f4be468e06a4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,9 +89,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	bool smaps = tva_flags & TVA_SMAPS;
 	bool in_pf = tva_flags & TVA_IN_PF;
 	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+	unsigned long supported_orders;
+
 	/* Check the intersection of requested and supported orders. */
-	orders &= vma_is_anonymous(vma) ?
-			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
+	if (vma_is_anonymous(vma))
+		supported_orders = THP_ORDERS_ALL_ANON;
+	else if (vma_is_dax(vma))
+		supported_orders = THP_ORDERS_ALL_FILE_DAX;
+	else
+		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
+
+	orders &= supported_orders;
 	if (!orders)
 		return 0;
 

From bf6acd5d16057d7accbbb1bf7dc6d8c56eeb4ecc Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Wed, 17 Jul 2024 17:20:16 +0100
Subject: [PATCH 326/346] decompress_bunzip2: fix rare decompression failure

The decompression code parses a huffman tree and counts the number of
symbols for a given bit length.  In rare cases, there may be >= 256
symbols with a given bit length, causing the unsigned char to overflow.
This causes a decompression failure later when the code tries and fails to
find the bit length for a given symbol.

Since the maximum number of symbols is 258, use unsigned short instead.

Link: https://lkml.kernel.org/r/20240717162016.1514077-1-ross.lagerwall@citrix.com
Fixes: bc22c17e12c1 ("bzip2/lzma: library support for gzip, bzip2 and lzma decompression")
Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Cc: Alain Knaff <alain@knaff.lu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 lib/decompress_bunzip2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index 3518e7394eca..ca736166f100 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -232,7 +232,8 @@ static int INIT get_next_block(struct bunzip_data *bd)
 	   RUNB) */
 	symCount = symTotal+2;
 	for (j = 0; j < groupCount; j++) {
-		unsigned char length[MAX_SYMBOLS], temp[MAX_HUFCODE_BITS+1];
+		unsigned char length[MAX_SYMBOLS];
+		unsigned short temp[MAX_HUFCODE_BITS+1];
 		int	minLen,	maxLen, pp;
 		/* Read Huffman code lengths for each symbol.  They're
 		   stored in a way similar to mtf; record a starting

From b3bebe44306e23827397d0d774d206e3fa374041 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 17 Jul 2024 14:28:44 -0700
Subject: [PATCH 327/346] alloc_tag: outline and export free_reserved_page()

Outline and export free_reserved_page() because modules use it and it in
turn uses page_ext_{get|put} which should not be exported.  The same
result could be obtained by outlining {get|put}_page_tag_ref() but that
would have higher performance impact as these functions are used in more
performance critical paths.

Link: https://lkml.kernel.org/r/20240717212844.2749975-1-surenb@google.com
Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202407080044.DWMC9N9I-lkp@intel.com/
Suggested-by: Christoph Hellwig <hch@infradead.org>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Sourav Panda <souravpanda@google.com>
Cc: <stable@vger.kernel.org>	[6.10]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 16 +---------------
 mm/page_alloc.c    | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4563b560ced7..c4b238a20b76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3137,21 +3137,7 @@ extern void reserve_bootmem_region(phys_addr_t start,
 				   phys_addr_t end, int nid);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
-static inline void free_reserved_page(struct page *page)
-{
-	if (mem_alloc_profiling_enabled()) {
-		union codetag_ref *ref = get_page_tag_ref(page);
-
-		if (ref) {
-			set_codetag_empty(ref);
-			put_page_tag_ref(ref);
-		}
-	}
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
-	adjust_managed_page_count(page, 1);
-}
+void free_reserved_page(struct page *page);
 #define free_highmem_page(page) free_reserved_page(page)
 
 static inline void mark_page_reserved(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8337926b89d4..7ac8d61148fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5815,6 +5815,23 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
 	return pages;
 }
 
+void free_reserved_page(struct page *page)
+{
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			set_codetag_empty(ref);
+			put_page_tag_ref(ref);
+		}
+	}
+	ClearPageReserved(page);
+	init_page_count(page);
+	__free_page(page);
+	adjust_managed_page_count(page, 1);
+}
+EXPORT_SYMBOL(free_reserved_page);
+
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
 	struct zone *zone;

From f59adcf5933271ab7247c4c8938c67be8905b725 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <roman.gushchin@linux.dev>
Date: Tue, 23 Jul 2024 17:12:44 +0000
Subject: [PATCH 328/346] mm: memcg: add cacheline padding after lruvec in
 mem_cgroup_per_node

Oliver Sand reported a performance regression caused by commit
98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct
mem_cgroup_per_node"), which puts some fields of the mem_cgroup_per_node
structure under the CONFIG_MEMCG_V1 config option.  Apparently it causes a
false cache sharing between lruvec and lru_zone_size members of the
structure.  Fix it by adding an explicit padding after the lruvec member.

Even though the padding is not required with CONFIG_MEMCG_V1 set, it seems
like the introduced memory overhead is not significant enough to warrant
another divergence in the mem_cgroup_per_node layout, so the padding is
added unconditionally.

Link: https://lkml.kernel.org/r/20240723171244.747521-1-roman.gushchin@linux.dev
Fixes: 98c9daf5ae6b ("mm: memcg: guard memcg1-specific members of struct mem_cgroup_per_node")
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202407121335.31a10cb6-oliver.sang@intel.com
Tested-by: Oliver Sang <oliver.sang@intel.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7e2eb091049a..0e5bf25d324f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -109,6 +109,7 @@ struct mem_cgroup_per_node {
 
 	/* Fields which get updated often at the end. */
 	struct lruvec		lruvec;
+	CACHELINE_PADDING(_pad2_);
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 	struct mem_cgroup_reclaim_iter	iter;
 };

From 66eca1021a42856d6af2a9802c99e160278aed91 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Tue, 23 Jul 2024 14:44:28 +0800
Subject: [PATCH 329/346] mm/page_alloc: fix pcp->count race between
 drain_pages_zone() vs __rmqueue_pcplist()

It's expected that no page should be left in pcp_list after calling
zone_pcp_disable() in offline_pages().  Previously, it's observed that
offline_pages() gets stuck [1] due to some pages remaining in pcp_list.

Cause:
There is a race condition between drain_pages_zone() and __rmqueue_pcplist()
involving the pcp->count variable. See below scenario:

         CPU0                              CPU1
    ----------------                    ---------------
                                      spin_lock(&pcp->lock);
                                      __rmqueue_pcplist() {
zone_pcp_disable() {
                                        /* list is empty */
                                        if (list_empty(list)) {
                                          /* add pages to pcp_list */
                                          alloced = rmqueue_bulk()
  mutex_lock(&pcp_batch_high_lock)
  ...
  __drain_all_pages() {
    drain_pages_zone() {
      /* read pcp->count, it's 0 here */
      count = READ_ONCE(pcp->count)
      /* 0 means nothing to drain */
                                          /* update pcp->count */
                                          pcp->count += alloced << order;
      ...
                                      ...
                                      spin_unlock(&pcp->lock);

In this case, after calling zone_pcp_disable() though, there are still some
pages in pcp_list. And these pages in pcp_list are neither movable nor
isolated, offline_pages() gets stuck as a result.

Solution:
Expand the scope of the pcp->lock to also protect pcp->count in
drain_pages_zone(), to ensure no pages are left in the pcp list after
zone_pcp_disable()

[1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujitsu.com/

Link: https://lkml.kernel.org/r/20240723064428.1179519-1-lizhijian@fujitsu.com
Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock")
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Reported-by: Yao Xingtao <yaoxt.fnst@fujitsu.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/page_alloc.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ac8d61148fe..28f80daf5c04 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2343,16 +2343,20 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
 	struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	int count = READ_ONCE(pcp->count);
-
-	while (count) {
-		int to_drain = min(count, pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
-		count -= to_drain;
+	int count;
 
+	do {
 		spin_lock(&pcp->lock);
-		free_pcppages_bulk(zone, to_drain, pcp, 0);
+		count = pcp->count;
+		if (count) {
+			int to_drain = min(count,
+				pcp->batch << CONFIG_PCP_BATCH_SCALE_MAX);
+
+			free_pcppages_bulk(zone, to_drain, pcp, 0);
+			count -= to_drain;
+		}
 		spin_unlock(&pcp->lock);
-	}
+	} while (count);
 }
 
 /*

From f556acc2facdd240de2c7416cd1da48a1dae70ec Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Thu, 18 Jul 2024 10:55:04 +0530
Subject: [PATCH 330/346] selftests/mm: skip test for non-LPA2 and non-LVA
 systems

Post my improvement of the test in e4a4ba415419 ("selftests/mm:
va_high_addr_switch: dynamically initialize testcases to enable LPA2
testing"):

The test begins to fail on 4k and 16k pages, on non-LPA2 systems.  To
reduce noise in the CI systems, let us skip the test when higher address
space is not implemented.

Link: https://lkml.kernel.org/r/20240718052504.356517-1-dev.jain@arm.com
Fixes: e4a4ba415419 ("selftests/mm: va_high_addr_switch: dynamically initialize testcases to enable LPA2 testing")
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 tools/testing/selftests/mm/va_high_addr_switch.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c
index fa7eabfaf841..896b3f73fc53 100644
--- a/tools/testing/selftests/mm/va_high_addr_switch.c
+++ b/tools/testing/selftests/mm/va_high_addr_switch.c
@@ -293,6 +293,20 @@ static int run_test(struct testcase *test, int count)
 	return ret;
 }
 
+#ifdef __aarch64__
+/* Check if userspace VA > 48 bits */
+static int high_address_present(void)
+{
+	void *ptr = mmap((void *)(1UL << 50), 1, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+	if (ptr == MAP_FAILED)
+		return 0;
+
+	munmap(ptr, 1);
+	return 1;
+}
+#endif
+
 static int supported_arch(void)
 {
 #if defined(__powerpc64__)
@@ -300,7 +314,7 @@ static int supported_arch(void)
 #elif defined(__x86_64__)
 	return 1;
 #elif defined(__aarch64__)
-	return 1;
+	return high_address_present();
 #else
 	return 0;
 #endif

From 4811f7af6090e8f5a398fbdd766f903ef6c0d787 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Date: Thu, 25 Jul 2024 14:20:07 +0900
Subject: [PATCH 331/346] nilfs2: handle inconsistent state in
 nilfs_btnode_create_block()

Syzbot reported that a buffer state inconsistency was detected in
nilfs_btnode_create_block(), triggering a kernel bug.

It is not appropriate to treat this inconsistency as a bug; it can occur
if the argument block address (the buffer index of the newly created
block) is a virtual block number and has been reallocated due to
corruption of the bitmap used to manage its allocation state.

So, modify nilfs_btnode_create_block() and its callers to treat it as a
possible filesystem error, rather than triggering a kernel bug.

Link: https://lkml.kernel.org/r/20240725052007.4562-1-konishi.ryusuke@gmail.com
Fixes: a60be987d45d ("nilfs2: B-tree node cache")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+89cc4f2324ed37988b60@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=89cc4f2324ed37988b60
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nilfs2/btnode.c | 25 ++++++++++++++++++++-----
 fs/nilfs2/btree.c  |  4 ++--
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 0131d83b912d..c034080c334b 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -51,12 +51,21 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 
 	bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
 	if (unlikely(!bh))
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
 		     buffer_dirty(bh))) {
-		brelse(bh);
-		BUG();
+		/*
+		 * The block buffer at the specified new address was already
+		 * in use.  This can happen if it is a virtual block number
+		 * and has been reallocated due to corruption of the bitmap
+		 * used to manage its allocation state (if not, the buffer
+		 * clearing of an abandoned b-tree node is missing somewhere).
+		 */
+		nilfs_error(inode->i_sb,
+			    "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)",
+			    (unsigned long long)blocknr, inode->i_ino);
+		goto failed;
 	}
 	memset(bh->b_data, 0, i_blocksize(inode));
 	bh->b_bdev = inode->i_sb->s_bdev;
@@ -67,6 +76,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 	folio_unlock(bh->b_folio);
 	folio_put(bh->b_folio);
 	return bh;
+
+failed:
+	folio_unlock(bh->b_folio);
+	folio_put(bh->b_folio);
+	brelse(bh);
+	return ERR_PTR(-EIO);
 }
 
 int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
@@ -217,8 +232,8 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 	}
 
 	nbh = nilfs_btnode_create_block(btnc, newkey);
-	if (!nbh)
-		return -ENOMEM;
+	if (IS_ERR(nbh))
+		return PTR_ERR(nbh);
 
 	BUG_ON(nbh == obh);
 	ctxt->newbh = nbh;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index a139970e4804..862bdf23120e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -63,8 +63,8 @@ static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
 	struct buffer_head *bh;
 
 	bh = nilfs_btnode_create_block(btnc, ptr);
-	if (!bh)
-		return -ENOMEM;
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	set_buffer_nilfs_volatile(bh);
 	*bhp = bh;

From e8432ac802a028eaee6b1e86383d7cd8e9fb8431 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 26 Jul 2024 15:09:07 -0700
Subject: [PATCH 332/346] minmax: avoid overly complex min()/max() macro
 arguments in xen

We have some very fancy min/max macros that have tons of sanity checking
to warn about mixed signedness etc.

This is all things that a sane compiler should warn about, but there are
no sane compiler interfaces for this, and '-Wsign-compare' is broken [1]
and not useful.

So then we compensate (some would say over-compensate) by doing the
checks manually with some truly horrid macro games.

And no, we can't just use __builtin_types_compatible_p(), because the
whole question of "does it make sense to compare these two values" is a
lot more complicated than that.

For example, it makes a ton of sense to compare unsigned values with
simple constants like "5", even if that is indeed a signed type.  So we
have these very strange macros to try to make sensible type checking
decisions on the arguments to 'min()' and 'max()'.

But that can cause enormous code expansion if the min()/max() macros are
used with complicated expressions, and particularly if you nest these
things so that you get the first big expansion then expanded again.

The xen setup.c file ended up ballooning to over 50MB of preprocessed
noise that takes 15s to compile (obviously depending on the build host),
largely due to one single line.

So let's split that one single line to just be simpler.  I think it ends
up being more legible to humans too at the same time.  Now that single
file compiles in under a second.

Reported-and-reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/all/c83c17bb-be75-4c67-979d-54eee38774c6@lucifer.local/
Link: https://staticthinking.wordpress.com/2023/07/25/wsign-compare-is-garbage/ [1]
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/xen/setup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a0c3e77e3d5b..806ddb2391d9 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -690,6 +690,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long max_pages;
 	unsigned long extra_pages = 0;
+	unsigned long maxmem_pages;
 	int i;
 	int op;
 
@@ -761,8 +762,8 @@ char * __init xen_memory_setup(void)
 	 * Make sure we have no memory above max_pages, as this area
 	 * isn't handled by the p2m management.
 	 */
-	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-			   extra_pages, max_pages - max_pfn);
+	maxmem_pages = EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM));
+	extra_pages = min3(maxmem_pages, extra_pages, max_pages - max_pfn);
 	i = 0;
 	addr = xen_e820_table.entries[0].addr;
 	size = xen_e820_table.entries[0].size;

From 3a7e02c040b130b5545e4b115aada7bacd80a2b6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 26 Jul 2024 15:32:27 -0700
Subject: [PATCH 333/346] minmax: avoid overly complicated constant expressions
 in VM code

The minmax infrastructure is overkill for simple constants, and can
cause huge expansions because those simple constants are then used by
other things.

For example, 'pageblock_order' is a core VM constant, but because it was
implemented using 'min_t()' and all the type-checking that involves, it
actually expanded to something like 2.5kB of preprocessor noise.

And when that simple constant was then used inside other expansions:

  #define pageblock_nr_pages      (1UL << pageblock_order)
  #define pageblock_start_pfn(pfn)  ALIGN_DOWN((pfn), pageblock_nr_pages)

and we then use that inside a 'max()' macro:

	case ISOLATE_SUCCESS:
		update_cached = false;
		last_migrated_pfn = max(cc->zone->zone_start_pfn,
			pageblock_start_pfn(cc->migrate_pfn - 1));

the end result was that one statement expanding to 253kB in size.

There are probably other cases of this, but this one case certainly
stood out.

I've added 'MIN_T()' and 'MAX_T()' macros for this kind of "core simple
constant with specific type" use.  These macros skip the type checking,
and as such need to be very sparingly used only for obvious cases that
have active issues like this.

Reported-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/all/36aa2cad-1db1-4abf-8dd2-fb20484aabc3@lucifer.local/
Cc: David Laight <David.Laight@aculab.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h          | 7 +++++++
 include/linux/pageblock-flags.h | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 2ec559284a9f..a7ef65f78933 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -270,4 +270,11 @@ static inline bool in_range32(u32 val, u32 start, u32 len)
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/*
+ * Use these carefully: no type checking, and uses the arguments
+ * multiple times. Use for obvious constants only.
+ */
+#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b))
+#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b))
+
 #endif	/* _LINUX_MINMAX_H */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 547e82cdc89a..fc6b9c87cb0a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,13 +41,13 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
 
-#define pageblock_order		min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 

From 00e3913b0416fe69d28745c0a2a340e2f76c219c Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 26 Jul 2024 01:16:48 +0900
Subject: [PATCH 334/346] Revert "firewire: Annotate struct fw_iso_packet with
 __counted_by()"

This reverts commit d3155742db89df3b3c96da383c400e6ff4d23c25.

The header_length field is byte unit, thus it can not express the number of
elements in header field. It seems that the argument for counted_by
attribute can have no arithmetic expression, therefore this commit just
reverts the issued commit.

Suggested-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20240725161648.130404-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index 00abe0e5d602..1cca14cf5652 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -462,9 +462,8 @@ struct fw_iso_packet {
 				/* rx: Sync bit, wait for matching sy	*/
 	u32 tag:2;		/* tx: Tag in packet header		*/
 	u32 sy:4;		/* tx: Sy in packet header		*/
-	u32 header_length:8;	/* Length of immediate header		*/
-				/* tx: Top of 1394 isoch. data_block    */
-	u32 header[] __counted_by(header_length);
+	u32 header_length:8;	/* Size of immediate header		*/
+	u32 header[];		/* tx: Top of 1394 isoch. data_block	*/
 };
 
 #define FW_ISO_CONTEXT_TRANSMIT			0

From c1839501fe3e67d98635f159dba8b170d08f6521 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Fri, 26 Jul 2024 00:56:40 +0900
Subject: [PATCH 335/346] ALSA: firewire-lib: fix wrong value as length of
 header for CIP_NO_HEADER case

In a commit 1d717123bb1a ("ALSA: firewire-lib: Avoid
-Wflex-array-member-not-at-end warning"), DEFINE_FLEX() macro was used to
handle variable length of array for header field in struct fw_iso_packet
structure. The usage of macro has a side effect that the designated
initializer assigns the count of array to the given field. Therefore
CIP_HEADER_QUADLETS (=2) is assigned to struct fw_iso_packet.header,
while the original designated initializer assigns zero to all fields.

With CIP_NO_HEADER flag, the change causes invalid length of header in
isochronous packet for 1394 OHCI IT context. This bug affects all of
devices supported by ALSA fireface driver; RME Fireface 400, 800, UCX, UFX,
and 802.

This commit fixes the bug by replacing it with the alternative version of
macro which corresponds no initializer.

Cc: stable@vger.kernel.org
Fixes: 1d717123bb1a ("ALSA: firewire-lib: Avoid -Wflex-array-member-not-at-end warning")
Reported-by: Edmund Raile <edmund.raile@proton.me>
Closes: https://lore.kernel.org/r/rrufondjeynlkx2lniot26ablsltnynfaq2gnqvbiso7ds32il@qk4r6xps7jh2/
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20240725155640.128442-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 sound/firewire/amdtp-stream.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c
index d35d0a420ee0..1a163bbcabd7 100644
--- a/sound/firewire/amdtp-stream.c
+++ b/sound/firewire/amdtp-stream.c
@@ -1180,8 +1180,7 @@ static void process_rx_packets(struct fw_iso_context *context, u32 tstamp, size_
 		(void)fw_card_read_cycle_time(fw_parent_device(s->unit)->card, &curr_cycle_time);
 
 	for (i = 0; i < packets; ++i) {
-		DEFINE_FLEX(struct fw_iso_packet, template, header,
-			    header_length, CIP_HEADER_QUADLETS);
+		DEFINE_RAW_FLEX(struct fw_iso_packet, template, header, CIP_HEADER_QUADLETS);
 		bool sched_irq = false;
 
 		build_it_pkt_header(s, desc->cycle, template, pkt_header_length,

From e1c5ae59c0f22f7fe5c07fb5513a29e4aad868c9 Mon Sep 17 00:00:00 2001
From: "Seth Forshee (DigitalOcean)" <sforshee@kernel.org>
Date: Wed, 24 Jul 2024 09:53:59 -0500
Subject: [PATCH 336/346] fs: don't allow non-init s_user_ns for filesystems
 without FS_USERNS_MOUNT

Christian noticed that it is possible for a privileged user to mount
most filesystems with a non-initial user namespace in sb->s_user_ns.
When fsopen() is called in a non-init namespace the caller's namespace
is recorded in fs_context->user_ns. If the returned file descriptor is
then passed to a process priviliged in init_user_ns, that process can
call fsconfig(fd_fs, FSCONFIG_CMD_CREATE), creating a new superblock
with sb->s_user_ns set to the namespace of the process which called
fsopen().

This is problematic. We cannot assume that any filesystem which does not
set FS_USERNS_MOUNT has been written with a non-initial s_user_ns in
mind, increasing the risk for bugs and security issues.

Prevent this by returning EPERM from sget_fc() when FS_USERNS_MOUNT is
not set for the filesystem and a non-initial user namespace will be
used. sget() does not need to be updated as it always uses the user
namespace of the current context, or the initial user namespace if
SB_SUBMOUNT is set.

Fixes: cb50b348c71f ("convenience helpers: vfs_get_super() and sget_fc()")
Reported-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Link: https://lore.kernel.org/r/20240724-s_user_ns-fix-v1-1-895d07c94701@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/super.c b/fs/super.c
index 095ba793e10c..38d72a3cf6fc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -736,6 +736,17 @@ struct super_block *sget_fc(struct fs_context *fc,
 	struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
 	int err;
 
+	/*
+	 * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is
+	 * not set, as the filesystem is likely unprepared to handle it.
+	 * This can happen when fsconfig() is called from init_user_ns with
+	 * an fs_fd opened in another user namespace.
+	 */
+	if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
+		errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed");
+		return ERR_PTR(-EPERM);
+	}
+
 retry:
 	spin_lock(&sb_lock);
 	if (test) {

From ef9ca17ca458ac7253ae71b552e601e49311fc48 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Thu, 25 Jul 2024 14:51:30 +0800
Subject: [PATCH 337/346] hostfs: fix the host directory parse when mounting.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hostfs not keep the host directory when mounting. When the host
directory is none (default), fc->source is used as the host root
directory, and this is wrong. Here we use `parse_monolithic` to
handle the old mount path for parsing the root directory. For new
mount path, The `parse_param` is used for the host directory parse.

Reported-and-tested-by: Maciej Żenczykowski <maze@google.com>
Fixes: cd140ce9f611 ("hostfs: convert hostfs to use the new mount API")
Link: https://lore.kernel.org/all/CANP3RGceNzwdb7w=vPf5=7BCid5HVQDmz1K5kC9JG42+HVAh_g@mail.gmail.com/
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Link: https://lore.kernel.org/r/20240725065130.1821964-1-lihongbo22@huawei.com
[brauner: minor fixes]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/hostfs/hostfs_kern.c | 65 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 10 deletions(-)

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2b532670a561..e866b08e448f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -17,6 +17,7 @@
 #include <linux/writeback.h>
 #include <linux/mount.h>
 #include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/namei.h>
 #include "hostfs.h"
 #include <init.h>
@@ -927,7 +928,6 @@ static const struct inode_operations hostfs_link_iops = {
 static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct hostfs_fs_info *fsi = sb->s_fs_info;
-	const char *host_root = fc->source;
 	struct inode *root_inode;
 	int err;
 
@@ -941,15 +941,6 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (err)
 		return err;
 
-	/* NULL is printed as '(null)' by printf(): avoid that. */
-	if (fc->source == NULL)
-		host_root = "";
-
-	fsi->host_root_path =
-		kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
-	if (fsi->host_root_path == NULL)
-		return -ENOMEM;
-
 	root_inode = hostfs_iget(sb, fsi->host_root_path);
 	if (IS_ERR(root_inode))
 		return PTR_ERR(root_inode);
@@ -975,6 +966,58 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	return 0;
 }
 
+enum hostfs_parma {
+	Opt_hostfs,
+};
+
+static const struct fs_parameter_spec hostfs_param_specs[] = {
+	fsparam_string_empty("hostfs",		Opt_hostfs),
+	{}
+};
+
+static int hostfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct hostfs_fs_info *fsi = fc->s_fs_info;
+	struct fs_parse_result result;
+	char *host_root;
+	int opt;
+
+	opt = fs_parse(fc, hostfs_param_specs, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_hostfs:
+		host_root = param->string;
+		if (!*host_root)
+			host_root = "";
+		fsi->host_root_path =
+			kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
+		if (fsi->host_root_path == NULL)
+			return -ENOMEM;
+		break;
+	}
+
+	return 0;
+}
+
+static int hostfs_parse_monolithic(struct fs_context *fc, void *data)
+{
+	struct hostfs_fs_info *fsi = fc->s_fs_info;
+	char *host_root = (char *)data;
+
+	/* NULL is printed as '(null)' by printf(): avoid that. */
+	if (host_root == NULL)
+		host_root = "";
+
+	fsi->host_root_path =
+		kasprintf(GFP_KERNEL, "%s/%s", root_ino, host_root);
+	if (fsi->host_root_path == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
 static int hostfs_fc_get_tree(struct fs_context *fc)
 {
 	return get_tree_nodev(fc, hostfs_fill_super);
@@ -992,6 +1035,8 @@ static void hostfs_fc_free(struct fs_context *fc)
 }
 
 static const struct fs_context_operations hostfs_context_ops = {
+	.parse_monolithic = hostfs_parse_monolithic,
+	.parse_param	= hostfs_parse_param,
 	.get_tree	= hostfs_fc_get_tree,
 	.free		= hostfs_fc_free,
 };

From d01c14074be79e5f5270498f90530a12583fbf7a Mon Sep 17 00:00:00 2001
From: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
Date: Fri, 26 Jul 2024 11:00:26 +0200
Subject: [PATCH 338/346] kbuild: rpm-pkg: ghost modules.weakdep file

In the same way as for other similar files, mark as ghost the new file
generated by depmod for configured weak dependencies for modules,
modules.weakdep, so that although it is not included in the package,
claim the ownership on it.

Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 scripts/package/kernel.spec | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/package/kernel.spec b/scripts/package/kernel.spec
index 74355ff0e106..ac3e5ac01d8a 100644
--- a/scripts/package/kernel.spec
+++ b/scripts/package/kernel.spec
@@ -74,7 +74,7 @@ ln -fns /usr/src/kernels/%{KERNELRELEASE} %{buildroot}/lib/modules/%{KERNELRELEA
 	echo "/lib/modules/%{KERNELRELEASE}"
 
 	for x in alias alias.bin builtin.alias.bin builtin.bin dep dep.bin \
-					devname softdep symbols symbols.bin; do
+				devname softdep symbols symbols.bin weakdep; do
 		echo "%ghost /lib/modules/%{KERNELRELEASE}/modules.${x}"
 	done
 

From 92a286e90203ce3e6c3a6d945fa36da419c3671f Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Sat, 13 Jul 2024 09:35:19 +0200
Subject: [PATCH 339/346] ubi: Fix ubi_init() ubiblock_exit() section mismatch

Since ubiblock_exit() is now called from an init function,
the __exit section no longer makes sense.

Cc: Ben Hutchings <bwh@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202407131403.wZJpd8n2-lkp@intel.com/
Signed-off-by: Richard Weinberger <richard@nod.at>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
---
 drivers/mtd/ubi/block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index bf7308e8ec2f..60d0155be869 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -670,7 +670,7 @@ int __init ubiblock_init(void)
 	return ret;
 }
 
-void __exit ubiblock_exit(void)
+void ubiblock_exit(void)
 {
 	ubi_unregister_volume_notifier(&ubiblock_notifier);
 	ubiblock_remove_all();

From 3415b10a03945b0da4a635e146750dfe5ce0f448 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 26 Jul 2024 11:05:00 -0700
Subject: [PATCH 340/346] kbuild: Fix '-S -c' in x86 stack protector scripts

After a recent change in clang to stop consuming all instances of '-S'
and '-c' [1], the stack protector scripts break due to the kernel's use
of -Werror=unused-command-line-argument to catch cases where flags are
not being properly consumed by the compiler driver:

  $ echo | clang -o - -x c - -S -c -Werror=unused-command-line-argument
  clang: error: argument unused during compilation: '-c' [-Werror,-Wunused-command-line-argument]

This results in CONFIG_STACKPROTECTOR getting disabled because
CONFIG_CC_HAS_SANE_STACKPROTECTOR is no longer set.

'-c' and '-S' both instruct the compiler to stop at different stages of
the pipeline ('-S' after compiling, '-c' after assembling), so having
them present together in the same command makes little sense. In this
case, the test wants to stop before assembling because it is looking at
the textual assembly output of the compiler for either '%fs' or '%gs',
so remove '-c' from the list of arguments to resolve the error.

All versions of GCC continue to work after this change, along with
versions of clang that do or do not contain the change mentioned above.

Cc: stable@vger.kernel.org
Fixes: 4f7fd4d7a791 ("[PATCH] Add the -fstack-protector option to the CFLAGS")
Fixes: 60a5317ff0f4 ("x86: implement x86_32 stack protector")
Link: https://github.com/llvm/llvm-project/commit/6461e537815f7fa68cef06842505353cf5600e9c [1]
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 scripts/gcc-x86_32-has-stack-protector.sh | 2 +-
 scripts/gcc-x86_64-has-stack-protector.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh
index 825c75c5b715..9459ca4f0f11 100755
--- a/scripts/gcc-x86_32-has-stack-protector.sh
+++ b/scripts/gcc-x86_32-has-stack-protector.sh
@@ -5,4 +5,4 @@
 # -mstack-protector-guard-reg, added by
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81708
 
-echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs"
+echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m32 -O0 -fstack-protector -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard - -o - 2> /dev/null | grep -q "%fs"
diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh
index 75e4e22b986a..f680bb01aeeb 100755
--- a/scripts/gcc-x86_64-has-stack-protector.sh
+++ b/scripts/gcc-x86_64-has-stack-protector.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
-echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs"
+echo "int foo(void) { char X[200]; return 3; }" | $* -S -x c -m64 -O0 -mcmodel=kernel -fno-PIE -fstack-protector - -o - 2> /dev/null | grep -q "%gs"

From 4477b39c32fdc03363affef4b11d48391e6dc9ff Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 13:03:48 -0700
Subject: [PATCH 341/346] minmax: add a few more MIN_T/MAX_T users

Commit 3a7e02c040b1 ("minmax: avoid overly complicated constant
expressions in VM code") added the simpler MIN_T/MAX_T macros in order
to avoid some excessive expansion from the rather complicated regular
min/max macros.

The complexity of those macros stems from two issues:

 (a) trying to use them in situations that require a C constant
     expression (in static initializers and for array sizes)

 (b) the type sanity checking

and MIN_T/MAX_T avoids both of these issues.

Now, in the whole (long) discussion about all this, it was pointed out
that the whole type sanity checking is entirely unnecessary for
min_t/max_t which get a fixed type that the comparison is done in.

But that still leaves min_t/max_t unnecessarily complicated due to
worries about the C constant expression case.

However, it turns out that there really aren't very many cases that use
min_t/max_t for this, and we can just force-convert those.

This does exactly that.

Which in turn will then allow for much simpler implementations of
min_t()/max_t().  All the usual "macros in all upper case will evaluate
the arguments multiple times" rules apply.

We should do all the same things for the regular min/max() vs MIN/MAX()
cases, but that has the added complexity of various drivers defining
their own local versions of MIN/MAX, so that needs another level of
fixes first.

Link: https://lore.kernel.org/all/b47fad1d0cf8449886ad148f8c013dae@AcuMS.aculab.com/
Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/pgtable.c                             | 2 +-
 drivers/edac/sb_edac.c                            | 4 ++--
 drivers/gpu/drm/drm_color_mgmt.c                  | 2 +-
 drivers/md/dm-integrity.c                         | 6 +++---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +-
 net/ipv4/proc.c                                   | 2 +-
 net/ipv6/proc.c                                   | 2 +-
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 93e54ba91fbf..f5931499c2d6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -110,7 +110,7 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
 #define MAX_UNSHARED_PTRS_PER_PGD			\
-	max_t(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
+	MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)
 
 
 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index cbc92d3683e6..e5c05a876947 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -109,8 +109,8 @@ static const u32 knl_interleave_list[] = {
 	0x104, 0x10c, 0x114, 0x11c,   /* 20-23 */
 };
 #define MAX_INTERLEAVE							\
-	(max_t(unsigned int, ARRAY_SIZE(sbridge_interleave_list),	\
-	       max_t(unsigned int, ARRAY_SIZE(ibridge_interleave_list),	\
+	(MAX_T(unsigned int, ARRAY_SIZE(sbridge_interleave_list),	\
+	       MAX_T(unsigned int, ARRAY_SIZE(ibridge_interleave_list),	\
 		     ARRAY_SIZE(knl_interleave_list))))
 
 struct interleave_pkg {
diff --git a/drivers/gpu/drm/drm_color_mgmt.c b/drivers/gpu/drm/drm_color_mgmt.c
index d021497841b8..3969dc548cff 100644
--- a/drivers/gpu/drm/drm_color_mgmt.c
+++ b/drivers/gpu/drm/drm_color_mgmt.c
@@ -532,7 +532,7 @@ int drm_plane_create_color_properties(struct drm_plane *plane,
 {
 	struct drm_device *dev = plane->dev;
 	struct drm_property *prop;
-	struct drm_prop_enum_list enum_list[max_t(int, DRM_COLOR_ENCODING_MAX,
+	struct drm_prop_enum_list enum_list[MAX_T(int, DRM_COLOR_ENCODING_MAX,
 						       DRM_COLOR_RANGE_MAX)];
 	int i, len;
 
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 48ac628f471b..51e6964c1305 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1776,7 +1776,7 @@ static void integrity_metadata(struct work_struct *w)
 		struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 		char *checksums;
 		unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
-		char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+		char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 		sector_t sector;
 		unsigned int sectors_to_process;
 
@@ -2064,7 +2064,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 				} while (++s < ic->sectors_per_block);
 #ifdef INTERNAL_VERIFY
 				if (ic->internal_hash) {
-					char checksums_onstack[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+					char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 
 					integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
 					if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
@@ -2837,7 +2837,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
 				    unlikely(from_replay) &&
 #endif
 				    ic->internal_hash) {
-					char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+					char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 
 					integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
 								  (char *)access_journal_data(ic, i, l), test_tag);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 12689774d755..f3a1b179aaea 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2912,7 +2912,7 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv)
 	u32 channels_to_check = tx_channel_count > rx_channel_count ?
 				tx_channel_count : rx_channel_count;
 	u32 chan;
-	int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)];
+	int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)];
 
 	/* Make sure we never check beyond our status buffer. */
 	if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status)))
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 6c4664c681ca..40053a02bae1 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -44,7 +44,7 @@
 #include <net/sock.h>
 #include <net/raw.h>
 
-#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
+#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX)
 
 /*
  *	Report socket allocation statistics [mea@utu.fi]
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 6d1d9221649d..752327b10dde 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -27,7 +27,7 @@
 #include <net/ipv6.h>
 
 #define MAX4(a, b, c, d) \
-	max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
+	MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d))
 #define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
 			IPSTATS_MIB_MAX, ICMP_MIB_MAX)
 

From 017fa3e89187848fd056af757769c9e66ac3e93d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 13:50:01 -0700
Subject: [PATCH 342/346] minmax: simplify and clarify min_t()/max_t()
 implementation

This simplifies the min_t() and max_t() macros by no longer making them
work in the context of a C constant expression.

That means that you can no longer use them for static initializers or
for array sizes in type definitions, but there were only a couple of
such uses, and all of them were converted (famous last words) to use
MIN_T/MAX_T instead.

Cc: David Laight <David.Laight@aculab.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/minmax.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index a7ef65f78933..9c2848abc804 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -45,17 +45,20 @@
 
 #define __cmp(op, x, y)	((x) __cmp_op_##op (y) ? (x) : (y))
 
-#define __cmp_once(op, x, y, unique_x, unique_y) ({	\
-	typeof(x) unique_x = (x);			\
-	typeof(y) unique_y = (y);			\
+#define __cmp_once_unique(op, type, x, y, ux, uy) \
+	({ type ux = (x); type uy = (y); __cmp(op, ux, uy); })
+
+#define __cmp_once(op, type, x, y) \
+	__cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_))
+
+#define __careful_cmp_once(op, x, y) ({			\
 	static_assert(__types_ok(x, y),			\
 		#op "(" #x ", " #y ") signedness error, fix types or consider u" #op "() before " #op "_t()"); \
-	__cmp(op, unique_x, unique_y); })
+	__cmp_once(op, __auto_type, x, y); })
 
 #define __careful_cmp(op, x, y)					\
 	__builtin_choose_expr(__is_constexpr((x) - (y)),	\
-		__cmp(op, x, y),				\
-		__cmp_once(op, x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y)))
+		__cmp(op, x, y), __careful_cmp_once(op, x, y))
 
 #define __clamp(val, lo, hi)	\
 	((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))
@@ -158,7 +161,7 @@
  * @x: first value
  * @y: second value
  */
-#define min_t(type, x, y)	__careful_cmp(min, (type)(x), (type)(y))
+#define min_t(type, x, y) __cmp_once(min, type, x, y)
 
 /**
  * max_t - return maximum of two values, using the specified type
@@ -166,7 +169,7 @@
  * @x: first value
  * @y: second value
  */
-#define max_t(type, x, y)	__careful_cmp(max, (type)(x), (type)(y))
+#define max_t(type, x, y) __cmp_once(max, type, x, y)
 
 /*
  * Do not check the array parameter using __must_be_array().

From 8400291e289ee6b2bf9779ff1c83a291501f017b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 28 Jul 2024 14:19:55 -0700
Subject: [PATCH 343/346] Linux 6.11-rc1

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 2c02dafb48ce..8ad55d6e7b60 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 6
-PATCHLEVEL = 10
+PATCHLEVEL = 11
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Baby Opossum Posse
 
 # *DOCUMENTATION*

From 1da91ea87aefe2c25b68c9f96947a9271ba6325d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 14:12:01 -0400
Subject: [PATCH 344/346] introduce fd_file(), convert all accessors to it.

	For any changes of struct fd representation we need to
turn existing accesses to fields into calls of wrappers.
Accesses to struct fd::flags are very few (3 in linux/file.h,
1 in net/socket.c, 3 in fs/overlayfs/file.c and 3 more in
explicit initializers).
	Those can be dealt with in the commit converting to
new layout; accesses to struct fd::file are too many for that.
	This commit converts (almost) all of f.file to
fd_file(f).  It's not entirely mechanical ('file' is used as
a member name more than just in struct fd) and it does not
even attempt to distinguish the uses in pointer context from
those in boolean context; the latter will be eventually turned
into a separate helper (fd_empty()).

	NOTE: mass conversion to fd_empty(), tempting as it
might be, is a bad idea; better do that piecewise in commit
that convert from fdget...() to CLASS(...).

[conflicts in fs/fhandle.c, kernel/bpf/syscall.c, mm/memcontrol.c
caught by git; fs/stat.c one got caught by git grep]
[fs/xattr.c conflict]

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c                |   4 +-
 arch/arm/kernel/sys_oabi-compat.c          |  10 +-
 arch/powerpc/kvm/book3s_64_vio.c           |   4 +-
 arch/powerpc/kvm/powerpc.c                 |  12 +--
 arch/powerpc/platforms/cell/spu_syscalls.c |   8 +-
 arch/x86/kernel/cpu/sgx/main.c             |   4 +-
 arch/x86/kvm/svm/sev.c                     |  16 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c  |   8 +-
 drivers/gpu/drm/drm_syncobj.c              |   6 +-
 drivers/infiniband/core/ucma.c             |   6 +-
 drivers/infiniband/core/uverbs_cmd.c       |   8 +-
 drivers/media/mc/mc-request.c              |   6 +-
 drivers/media/rc/lirc_dev.c                |   8 +-
 drivers/vfio/group.c                       |   6 +-
 drivers/vfio/virqfd.c                      |   6 +-
 drivers/virt/acrn/irqfd.c                  |   6 +-
 drivers/xen/privcmd.c                      |  10 +-
 fs/btrfs/ioctl.c                           |   4 +-
 fs/coda/inode.c                            |   4 +-
 fs/eventfd.c                               |   4 +-
 fs/eventpoll.c                             |  30 +++---
 fs/ext4/ioctl.c                            |   6 +-
 fs/f2fs/file.c                             |   6 +-
 fs/fcntl.c                                 |  38 +++----
 fs/fhandle.c                               |   4 +-
 fs/fsopen.c                                |   6 +-
 fs/fuse/dev.c                              |   6 +-
 fs/ioctl.c                                 |  30 +++---
 fs/kernel_read_file.c                      |   4 +-
 fs/locks.c                                 |  14 +--
 fs/namei.c                                 |  10 +-
 fs/namespace.c                             |  12 +--
 fs/notify/fanotify/fanotify_user.c         |  12 +--
 fs/notify/inotify/inotify_user.c           |  12 +--
 fs/ocfs2/cluster/heartbeat.c               |   6 +-
 fs/open.c                                  |  24 ++---
 fs/overlayfs/file.c                        |  40 +++----
 fs/quota/quota.c                           |   8 +-
 fs/read_write.c                            | 118 ++++++++++-----------
 fs/readdir.c                               |  20 ++--
 fs/remap_range.c                           |   2 +-
 fs/select.c                                |   8 +-
 fs/signalfd.c                              |   6 +-
 fs/smb/client/ioctl.c                      |   8 +-
 fs/splice.c                                |  22 ++--
 fs/stat.c                                  |   8 +-
 fs/statfs.c                                |   4 +-
 fs/sync.c                                  |  14 +--
 fs/timerfd.c                               |   8 +-
 fs/utimes.c                                |   4 +-
 fs/xattr.c                                 |  36 +++----
 fs/xfs/xfs_exchrange.c                     |   4 +-
 fs/xfs/xfs_handle.c                        |   4 +-
 fs/xfs/xfs_ioctl.c                         |  28 ++---
 include/linux/cleanup.h                    |   2 +-
 include/linux/file.h                       |   6 +-
 io_uring/sqpoll.c                          |  10 +-
 ipc/mqueue.c                               |  50 ++++-----
 kernel/bpf/bpf_inode_storage.c             |  14 +--
 kernel/bpf/btf.c                           |   6 +-
 kernel/bpf/syscall.c                       |  42 ++++----
 kernel/bpf/token.c                         |  10 +-
 kernel/cgroup/cgroup.c                     |   4 +-
 kernel/events/core.c                       |  12 +--
 kernel/module/main.c                       |   2 +-
 kernel/nsproxy.c                           |  12 +--
 kernel/pid.c                               |  10 +-
 kernel/signal.c                            |   6 +-
 kernel/sys.c                               |  10 +-
 kernel/taskstats.c                         |   4 +-
 kernel/watch_queue.c                       |   4 +-
 mm/fadvise.c                               |   4 +-
 mm/filemap.c                               |   6 +-
 mm/memcontrol-v1.c                         |  12 +--
 mm/readahead.c                             |  10 +-
 net/core/net_namespace.c                   |   6 +-
 net/socket.c                               |  12 +--
 security/integrity/ima/ima_main.c          |   4 +-
 security/landlock/syscalls.c               |  22 ++--
 security/loadpin/loadpin.c                 |   4 +-
 sound/core/pcm_native.c                    |   6 +-
 virt/kvm/eventfd.c                         |   6 +-
 virt/kvm/vfio.c                            |   8 +-
 83 files changed, 504 insertions(+), 502 deletions(-)

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index e5f881bc8288..56fea57f9642 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -160,10 +160,10 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 		.count = count
 	};
 
-	if (!arg.file)
+	if (!fd_file(arg))
 		return -EBADF;
 
-	error = iterate_dir(arg.file, &buf.ctx);
+	error = iterate_dir(fd_file(arg), &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (count != buf.count)
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index d00f4040a9f5..f5781ff54a5c 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -239,19 +239,19 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
 	struct flock64 flock;
 	long err = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		goto out;
 
 	switch (cmd) {
 	case F_GETLK64:
 	case F_OFD_GETLK:
-		err = security_file_fcntl(f.file, cmd, arg);
+		err = security_file_fcntl(fd_file(f), cmd, arg);
 		if (err)
 			break;
 		err = get_oabi_flock(&flock, argp);
 		if (err)
 			break;
-		err = fcntl_getlk64(f.file, cmd, &flock);
+		err = fcntl_getlk64(fd_file(f), cmd, &flock);
 		if (!err)
 		       err = put_oabi_flock(&flock, argp);
 		break;
@@ -259,13 +259,13 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
 	case F_SETLKW64:
 	case F_OFD_SETLK:
 	case F_OFD_SETLKW:
-		err = security_file_fcntl(f.file, cmd, arg);
+		err = security_file_fcntl(fd_file(f), cmd, arg);
 		if (err)
 			break;
 		err = get_oabi_flock(&flock, argp);
 		if (err)
 			break;
-		err = fcntl_setlk64(fd, f.file, cmd, &flock);
+		err = fcntl_setlk64(fd, fd_file(f), cmd, &flock);
 		break;
 	default:
 		err = sys_fcntl64(fd, cmd, arg);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 3ff3de9a52ac..34c0adb9fdbf 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -118,12 +118,12 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
 	struct fd f;
 
 	f = fdget(tablefd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt == f.file->private_data) {
+		if (stt == fd_file(f)->private_data) {
 			found = true;
 			break;
 		}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 5e6c7b527677..f14329989e9a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1938,11 +1938,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
 		r = -EBADF;
 		f = fdget(cap->args[0]);
-		if (!f.file)
+		if (!fd_file(f))
 			break;
 
 		r = -EPERM;
-		dev = kvm_device_from_filp(f.file);
+		dev = kvm_device_from_filp(fd_file(f));
 		if (dev)
 			r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
 
@@ -1957,11 +1957,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
 		r = -EBADF;
 		f = fdget(cap->args[0]);
-		if (!f.file)
+		if (!fd_file(f))
 			break;
 
 		r = -EPERM;
-		dev = kvm_device_from_filp(f.file);
+		dev = kvm_device_from_filp(fd_file(f));
 		if (dev) {
 			if (xics_on_xive())
 				r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
@@ -1980,7 +1980,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 
 		r = -EBADF;
 		f = fdget(cap->args[0]);
-		if (!f.file)
+		if (!fd_file(f))
 			break;
 
 		r = -ENXIO;
@@ -1990,7 +1990,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		}
 
 		r = -EPERM;
-		dev = kvm_device_from_filp(f.file);
+		dev = kvm_device_from_filp(fd_file(f));
 		if (dev)
 			r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
 							    cap->args[1]);
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 87ad7d563cfa..cd7d42fc12a6 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -66,8 +66,8 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,
 	if (flags & SPU_CREATE_AFFINITY_SPU) {
 		struct fd neighbor = fdget(neighbor_fd);
 		ret = -EBADF;
-		if (neighbor.file) {
-			ret = calls->create_thread(name, flags, mode, neighbor.file);
+		if (fd_file(neighbor)) {
+			ret = calls->create_thread(name, flags, mode, fd_file(neighbor));
 			fdput(neighbor);
 		}
 	} else
@@ -89,8 +89,8 @@ SYSCALL_DEFINE3(spu_run,int, fd, __u32 __user *, unpc, __u32 __user *, ustatus)
 
 	ret = -EBADF;
 	arg = fdget(fd);
-	if (arg.file) {
-		ret = calls->spu_run(arg.file, unpc, ustatus);
+	if (fd_file(arg)) {
+		ret = calls->spu_run(fd_file(arg), unpc, ustatus);
 		fdput(arg);
 	}
 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 27892e57c4ef..d01deb386395 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -895,10 +895,10 @@ int sgx_set_attribute(unsigned long *allowed_attributes,
 {
 	struct fd f = fdget(attribute_fd);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
-	if (f.file->f_op != &sgx_provision_fops) {
+	if (fd_file(f)->f_op != &sgx_provision_fops) {
 		fdput(f);
 		return -EINVAL;
 	}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a16c873b3232..0e38b5223263 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -534,10 +534,10 @@ static int __sev_issue_cmd(int fd, int id, void *data, int *error)
 	int ret;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	ret = sev_issue_cmd_external_user(f.file, id, data, error);
+	ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
 
 	fdput(f);
 	return ret;
@@ -2078,15 +2078,15 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	bool charged = false;
 	int ret;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (!file_is_kvm(f.file)) {
+	if (!file_is_kvm(fd_file(f))) {
 		ret = -EBADF;
 		goto out_fput;
 	}
 
-	source_kvm = f.file->private_data;
+	source_kvm = fd_file(f)->private_data;
 	ret = sev_lock_two_vms(kvm, source_kvm);
 	if (ret)
 		goto out_fput;
@@ -2801,15 +2801,15 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
 	struct kvm_sev_info *source_sev, *mirror_sev;
 	int ret;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (!file_is_kvm(f.file)) {
+	if (!file_is_kvm(fd_file(f))) {
 		ret = -EBADF;
 		goto e_source_fput;
 	}
 
-	source_kvm = f.file->private_data;
+	source_kvm = fd_file(f)->private_data;
 	ret = sev_lock_two_vms(kvm, source_kvm);
 	if (ret)
 		goto e_source_fput;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index 863b2a34b2d6..a9298cb8d19a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -43,10 +43,10 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
 	uint32_t id;
 	int r;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
-	r = amdgpu_file_to_fpriv(f.file, &fpriv);
+	r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
 	if (r) {
 		fdput(f);
 		return r;
@@ -72,10 +72,10 @@ static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev,
 	struct amdgpu_ctx *ctx;
 	int r;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
-	r = amdgpu_file_to_fpriv(f.file, &fpriv);
+	r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
 	if (r) {
 		fdput(f);
 		return r;
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index a0e94217b511..7fb31ca3b5fc 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -715,16 +715,16 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
 	struct fd f = fdget(fd);
 	int ret;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
-	if (f.file->f_op != &drm_syncobj_file_fops) {
+	if (fd_file(f)->f_op != &drm_syncobj_file_fops) {
 		fdput(f);
 		return -EINVAL;
 	}
 
 	/* take a reference to put in the idr */
-	syncobj = f.file->private_data;
+	syncobj = fd_file(f)->private_data;
 	drm_syncobj_get(syncobj);
 
 	idr_preload(GFP_KERNEL);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 5f5ad8faf86e..dc57d07a1f45 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1624,13 +1624,13 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
 
 	/* Get current fd to protect against it being closed */
 	f = fdget(cmd.fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -ENOENT;
-	if (f.file->f_op != &ucma_fops) {
+	if (fd_file(f)->f_op != &ucma_fops) {
 		ret = -EINVAL;
 		goto file_put;
 	}
-	cur_file = f.file->private_data;
+	cur_file = fd_file(f)->private_data;
 
 	/* Validate current fd and prevent destruction of id. */
 	ctx = ucma_get_ctx(cur_file, cmd.id);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 1b3ea71f2c33..3f85575cf971 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -584,12 +584,12 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 	if (cmd.fd != -1) {
 		/* search for file descriptor */
 		f = fdget(cmd.fd);
-		if (!f.file) {
+		if (!fd_file(f)) {
 			ret = -EBADF;
 			goto err_tree_mutex_unlock;
 		}
 
-		inode = file_inode(f.file);
+		inode = file_inode(fd_file(f));
 		xrcd = find_xrcd(ibudev, inode);
 		if (!xrcd && !(cmd.oflags & O_CREAT)) {
 			/* no file descriptor. Need CREATE flag */
@@ -632,7 +632,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 		atomic_inc(&xrcd->usecnt);
 	}
 
-	if (f.file)
+	if (fd_file(f))
 		fdput(f);
 
 	mutex_unlock(&ibudev->xrcd_tree_mutex);
@@ -648,7 +648,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 	uobj_alloc_abort(&obj->uobject, attrs);
 
 err_tree_mutex_unlock:
-	if (f.file)
+	if (fd_file(f))
 		fdput(f);
 
 	mutex_unlock(&ibudev->xrcd_tree_mutex);
diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c
index addb8f2d8939..e064914c476e 100644
--- a/drivers/media/mc/mc-request.c
+++ b/drivers/media/mc/mc-request.c
@@ -254,12 +254,12 @@ media_request_get_by_fd(struct media_device *mdev, int request_fd)
 		return ERR_PTR(-EBADR);
 
 	f = fdget(request_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		goto err_no_req_fd;
 
-	if (f.file->f_op != &request_fops)
+	if (fd_file(f)->f_op != &request_fops)
 		goto err_fput;
-	req = f.file->private_data;
+	req = fd_file(f)->private_data;
 	if (req->mdev != mdev)
 		goto err_fput;
 
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index 717c441b4a86..b8dfd530fab7 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -820,20 +820,20 @@ struct rc_dev *rc_dev_get_from_fd(int fd, bool write)
 	struct lirc_fh *fh;
 	struct rc_dev *dev;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	if (f.file->f_op != &lirc_fops) {
+	if (fd_file(f)->f_op != &lirc_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (write && !(f.file->f_mode & FMODE_WRITE)) {
+	if (write && !(fd_file(f)->f_mode & FMODE_WRITE)) {
 		fdput(f);
 		return ERR_PTR(-EPERM);
 	}
 
-	fh = f.file->private_data;
+	fh = fd_file(f)->private_data;
 	dev = fh->rc;
 
 	get_device(&dev->dev);
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index ded364588d29..95b336de8a17 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -112,7 +112,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
 		return -EFAULT;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	mutex_lock(&group->group_lock);
@@ -125,13 +125,13 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
 		goto out_unlock;
 	}
 
-	container = vfio_container_from_file(f.file);
+	container = vfio_container_from_file(fd_file(f));
 	if (container) {
 		ret = vfio_container_attach_group(container, group);
 		goto out_unlock;
 	}
 
-	iommufd = iommufd_ctx_from_file(f.file);
+	iommufd = iommufd_ctx_from_file(fd_file(f));
 	if (!IS_ERR(iommufd)) {
 		if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
 		    group->type == VFIO_NO_IOMMU)
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 532269133801..d22881245e89 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -134,12 +134,12 @@ int vfio_virqfd_enable(void *opaque,
 	INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject);
 
 	irqfd = fdget(fd);
-	if (!irqfd.file) {
+	if (!fd_file(irqfd)) {
 		ret = -EBADF;
 		goto err_fd;
 	}
 
-	ctx = eventfd_ctx_fileget(irqfd.file);
+	ctx = eventfd_ctx_fileget(fd_file(irqfd));
 	if (IS_ERR(ctx)) {
 		ret = PTR_ERR(ctx);
 		goto err_ctx;
@@ -171,7 +171,7 @@ int vfio_virqfd_enable(void *opaque,
 	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
 	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
 
-	events = vfs_poll(irqfd.file, &virqfd->pt);
+	events = vfs_poll(fd_file(irqfd), &virqfd->pt);
 
 	/*
 	 * Check if there was an event already pending on the eventfd
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
index d4ad211dce7a..9994d818bb7e 100644
--- a/drivers/virt/acrn/irqfd.c
+++ b/drivers/virt/acrn/irqfd.c
@@ -125,12 +125,12 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
 	INIT_WORK(&irqfd->shutdown, hsm_irqfd_shutdown_work);
 
 	f = fdget(args->fd);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	eventfd = eventfd_ctx_fileget(f.file);
+	eventfd = eventfd_ctx_fileget(fd_file(f));
 	if (IS_ERR(eventfd)) {
 		ret = PTR_ERR(eventfd);
 		goto fail;
@@ -157,7 +157,7 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
 	mutex_unlock(&vm->irqfds_lock);
 
 	/* Check the pending event in this stage */
-	events = vfs_poll(f.file, &irqfd->pt);
+	events = vfs_poll(fd_file(f), &irqfd->pt);
 
 	if (events & EPOLLIN)
 		acrn_irqfd_inject(irqfd);
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 9563650dfbaf..54e4f285c0f4 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -959,12 +959,12 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
 	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
 
 	f = fdget(irqfd->fd);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		ret = -EBADF;
 		goto error_kfree;
 	}
 
-	kirqfd->eventfd = eventfd_ctx_fileget(f.file);
+	kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f));
 	if (IS_ERR(kirqfd->eventfd)) {
 		ret = PTR_ERR(kirqfd->eventfd);
 		goto error_fd_put;
@@ -995,7 +995,7 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
 	 * Check if there was an event already pending on the eventfd before we
 	 * registered, and trigger it as if we didn't miss it.
 	 */
-	events = vfs_poll(f.file, &kirqfd->pt);
+	events = vfs_poll(fd_file(f), &kirqfd->pt);
 	if (events & EPOLLIN)
 		irqfd_inject(kirqfd);
 
@@ -1345,12 +1345,12 @@ static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
 		return -ENOMEM;
 
 	f = fdget(ioeventfd->event_fd);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		ret = -EBADF;
 		goto error_kfree;
 	}
 
-	kioeventfd->eventfd = eventfd_ctx_fileget(f.file);
+	kioeventfd->eventfd = eventfd_ctx_fileget(fd_file(f));
 	fdput(f);
 
 	if (IS_ERR(kioeventfd->eventfd)) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e0a664b8a46a..32ddd3d31719 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1312,12 +1312,12 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
 	} else {
 		struct fd src = fdget(fd);
 		struct inode *src_inode;
-		if (!src.file) {
+		if (!fd_file(src)) {
 			ret = -EINVAL;
 			goto out_drop_write;
 		}
 
-		src_inode = file_inode(src.file);
+		src_inode = file_inode(fd_file(src));
 		if (src_inode->i_sb != file_inode(file)->i_sb) {
 			btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
 				   "Snapshot src from another FS");
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6898dc621011..7d56b6d1e4c3 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -127,9 +127,9 @@ static int coda_parse_fd(struct fs_context *fc, int fd)
 	int idx;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	inode = file_inode(f.file);
+	inode = file_inode(fd_file(f));
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
 		fdput(f);
 		return invalf(fc, "code: Not coda psdev");
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 9afdb722fa92..22c934f3a080 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -349,9 +349,9 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 {
 	struct eventfd_ctx *ctx;
 	struct fd f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
-	ctx = eventfd_ctx_fileget(f.file);
+	ctx = eventfd_ctx_fileget(fd_file(f));
 	fdput(f);
 	return ctx;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f53ca4f7fced..28d1a754cf33 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2266,17 +2266,17 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 
 	error = -EBADF;
 	f = fdget(epfd);
-	if (!f.file)
+	if (!fd_file(f))
 		goto error_return;
 
 	/* Get the "struct file *" for the target file */
 	tf = fdget(fd);
-	if (!tf.file)
+	if (!fd_file(tf))
 		goto error_fput;
 
 	/* The target file descriptor must support poll */
 	error = -EPERM;
-	if (!file_can_poll(tf.file))
+	if (!file_can_poll(fd_file(tf)))
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
@@ -2289,7 +2289,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * adding an epoll file descriptor inside itself.
 	 */
 	error = -EINVAL;
-	if (f.file == tf.file || !is_file_epoll(f.file))
+	if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
 		goto error_tgt_fput;
 
 	/*
@@ -2300,7 +2300,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
 			goto error_tgt_fput;
-		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
 				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 			goto error_tgt_fput;
 	}
@@ -2309,7 +2309,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
 	 */
-	ep = f.file->private_data;
+	ep = fd_file(f)->private_data;
 
 	/*
 	 * When we insert an epoll file descriptor inside another epoll file
@@ -2330,16 +2330,16 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	if (error)
 		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
-		if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen ||
-		    is_file_epoll(tf.file)) {
+		if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
+		    is_file_epoll(fd_file(tf))) {
 			mutex_unlock(&ep->mtx);
 			error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
 			if (error)
 				goto error_tgt_fput;
 			loop_check_gen++;
 			full_check = 1;
-			if (is_file_epoll(tf.file)) {
-				tep = tf.file->private_data;
+			if (is_file_epoll(fd_file(tf))) {
+				tep = fd_file(tf)->private_data;
 				error = -ELOOP;
 				if (ep_loop_check(ep, tep) != 0)
 					goto error_tgt_fput;
@@ -2355,14 +2355,14 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 	 * above, we can be sure to be able to use the item looked up by
 	 * ep_find() till we release the mutex.
 	 */
-	epi = ep_find(ep, tf.file, fd);
+	epi = ep_find(ep, fd_file(tf), fd);
 
 	error = -EINVAL;
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds->events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, epds, tf.file, fd, full_check);
+			error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
 		} else
 			error = -EEXIST;
 		break;
@@ -2443,7 +2443,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 
 	/* Get the "struct file *" for the eventpoll file */
 	f = fdget(epfd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	/*
@@ -2451,14 +2451,14 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
 	 * the user passed to us _is_ an eventpoll file.
 	 */
 	error = -EINVAL;
-	if (!is_file_epoll(f.file))
+	if (!is_file_epoll(fd_file(f)))
 		goto error_fput;
 
 	/*
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
 	 */
-	ep = f.file->private_data;
+	ep = fd_file(f)->private_data;
 
 	/* Time to fish for events ... */
 	error = ep_poll(ep, events, maxevents, to);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index e8bf5972dd47..1c77400bd88e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1343,10 +1343,10 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		me.moved_len = 0;
 
 		donor = fdget(me.donor_fd);
-		if (!donor.file)
+		if (!fd_file(donor))
 			return -EBADF;
 
-		if (!(donor.file->f_mode & FMODE_WRITE)) {
+		if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
 			err = -EBADF;
 			goto mext_out;
 		}
@@ -1367,7 +1367,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			goto mext_out;
 
-		err = ext4_move_extents(filp, donor.file, me.orig_start,
+		err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
 					me.donor_start, me.len, &me.moved_len);
 		mnt_drop_write_file(filp);
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 168f08507004..903337f8d21a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3014,10 +3014,10 @@ static int __f2fs_ioc_move_range(struct file *filp,
 		return -EBADF;
 
 	dst = fdget(range->dst_fd);
-	if (!dst.file)
+	if (!fd_file(dst))
 		return -EBADF;
 
-	if (!(dst.file->f_mode & FMODE_WRITE)) {
+	if (!(fd_file(dst)->f_mode & FMODE_WRITE)) {
 		err = -EBADF;
 		goto err_out;
 	}
@@ -3026,7 +3026,7 @@ static int __f2fs_ioc_move_range(struct file *filp,
 	if (err)
 		goto err_out;
 
-	err = f2fs_move_file_range(filp, range->pos_in, dst.file,
+	err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst),
 					range->pos_out, range->len);
 
 	mnt_drop_write_file(filp);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 300e5d9ad913..2b5616762354 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -340,7 +340,7 @@ static long f_dupfd_query(int fd, struct file *filp)
 	 * overkill, but given our lockless file pointer lookup, the
 	 * alternatives are complicated.
 	 */
-	return f.file == filp;
+	return fd_file(f) == filp;
 }
 
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -479,17 +479,17 @@ SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 	struct fd f = fdget_raw(fd);
 	long err = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		goto out;
 
-	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+	if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
 		if (!check_fcntl_cmd(cmd))
 			goto out1;
 	}
 
-	err = security_file_fcntl(f.file, cmd, arg);
+	err = security_file_fcntl(fd_file(f), cmd, arg);
 	if (!err)
-		err = do_fcntl(fd, cmd, arg, f.file);
+		err = do_fcntl(fd, cmd, arg, fd_file(f));
 
 out1:
  	fdput(f);
@@ -506,15 +506,15 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 	struct flock64 flock;
 	long err = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		goto out;
 
-	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+	if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
 		if (!check_fcntl_cmd(cmd))
 			goto out1;
 	}
 
-	err = security_file_fcntl(f.file, cmd, arg);
+	err = security_file_fcntl(fd_file(f), cmd, arg);
 	if (err)
 		goto out1;
 	
@@ -524,7 +524,7 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		err = -EFAULT;
 		if (copy_from_user(&flock, argp, sizeof(flock)))
 			break;
-		err = fcntl_getlk64(f.file, cmd, &flock);
+		err = fcntl_getlk64(fd_file(f), cmd, &flock);
 		if (!err && copy_to_user(argp, &flock, sizeof(flock)))
 			err = -EFAULT;
 		break;
@@ -535,10 +535,10 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		err = -EFAULT;
 		if (copy_from_user(&flock, argp, sizeof(flock)))
 			break;
-		err = fcntl_setlk64(fd, f.file, cmd, &flock);
+		err = fcntl_setlk64(fd, fd_file(f), cmd, &flock);
 		break;
 	default:
-		err = do_fcntl(fd, cmd, arg, f.file);
+		err = do_fcntl(fd, cmd, arg, fd_file(f));
 		break;
 	}
 out1:
@@ -643,15 +643,15 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
 	struct flock flock;
 	long err = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return err;
 
-	if (unlikely(f.file->f_mode & FMODE_PATH)) {
+	if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
 		if (!check_fcntl_cmd(cmd))
 			goto out_put;
 	}
 
-	err = security_file_fcntl(f.file, cmd, arg);
+	err = security_file_fcntl(fd_file(f), cmd, arg);
 	if (err)
 		goto out_put;
 
@@ -660,7 +660,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
 		err = get_compat_flock(&flock, compat_ptr(arg));
 		if (err)
 			break;
-		err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+		err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
 		if (err)
 			break;
 		err = fixup_compat_flock(&flock);
@@ -672,7 +672,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
 		err = get_compat_flock64(&flock, compat_ptr(arg));
 		if (err)
 			break;
-		err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock);
+		err = fcntl_getlk(fd_file(f), convert_fcntl_cmd(cmd), &flock);
 		if (!err)
 			err = put_compat_flock64(&flock, compat_ptr(arg));
 		break;
@@ -681,7 +681,7 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
 		err = get_compat_flock(&flock, compat_ptr(arg));
 		if (err)
 			break;
-		err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+		err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
 		break;
 	case F_SETLK64:
 	case F_SETLKW64:
@@ -690,10 +690,10 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
 		err = get_compat_flock64(&flock, compat_ptr(arg));
 		if (err)
 			break;
-		err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock);
+		err = fcntl_setlk(fd, fd_file(f), convert_fcntl_cmd(cmd), &flock);
 		break;
 	default:
-		err = do_fcntl(fd, cmd, arg, f.file);
+		err = do_fcntl(fd, cmd, arg, fd_file(f));
 		break;
 	}
 out_put:
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6e8cea16790e..3f07b52874a8 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -125,9 +125,9 @@ static int get_path_from_fd(int fd, struct path *root)
 		spin_unlock(&fs->lock);
 	} else {
 		struct fd f = fdget(fd);
-		if (!f.file)
+		if (!fd_file(f))
 			return -EBADF;
-		*root = f.file->f_path;
+		*root = fd_file(f)->f_path;
 		path_get(root);
 		fdput(f);
 	}
diff --git a/fs/fsopen.c b/fs/fsopen.c
index ed2dd000622e..ee92ca58429e 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -394,13 +394,13 @@ SYSCALL_DEFINE5(fsconfig,
 	}
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 	ret = -EINVAL;
-	if (f.file->f_op != &fscontext_fops)
+	if (fd_file(f)->f_op != &fscontext_fops)
 		goto out_f;
 
-	fc = f.file->private_data;
+	fc = fd_file(f)->private_data;
 	if (fc->ops == &legacy_fs_context_ops) {
 		switch (cmd) {
 		case FSCONFIG_SET_BINARY:
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9eb191b5c4de..991b9ae8e7c9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2321,15 +2321,15 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
 		return -EFAULT;
 
 	f = fdget(oldfd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
 	/*
 	 * Check against file->f_op because CUSE
 	 * uses the same ioctl handler.
 	 */
-	if (f.file->f_op == file->f_op)
-		fud = fuse_get_dev(f.file);
+	if (fd_file(f)->f_op == file->f_op)
+		fud = fuse_get_dev(fd_file(f));
 
 	res = -EINVAL;
 	if (fud) {
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 64776891120c..6e0c954388d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -235,9 +235,9 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
 	loff_t cloned;
 	int ret;
 
-	if (!src_file.file)
+	if (!fd_file(src_file))
 		return -EBADF;
-	cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
+	cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
 				      olen, 0);
 	if (cloned < 0)
 		ret = cloned;
@@ -895,16 +895,16 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 	struct fd f = fdget(fd);
 	int error;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = security_file_ioctl(f.file, cmd, arg);
+	error = security_file_ioctl(fd_file(f), cmd, arg);
 	if (error)
 		goto out;
 
-	error = do_vfs_ioctl(f.file, fd, cmd, arg);
+	error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
 	if (error == -ENOIOCTLCMD)
-		error = vfs_ioctl(f.file, cmd, arg);
+		error = vfs_ioctl(fd_file(f), cmd, arg);
 
 out:
 	fdput(f);
@@ -953,32 +953,32 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 	struct fd f = fdget(fd);
 	int error;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = security_file_ioctl_compat(f.file, cmd, arg);
+	error = security_file_ioctl_compat(fd_file(f), cmd, arg);
 	if (error)
 		goto out;
 
 	switch (cmd) {
 	/* FICLONE takes an int argument, so don't use compat_ptr() */
 	case FICLONE:
-		error = ioctl_file_clone(f.file, arg, 0, 0, 0);
+		error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0);
 		break;
 
 #if defined(CONFIG_X86_64)
 	/* these get messy on amd64 due to alignment differences */
 	case FS_IOC_RESVSP_32:
 	case FS_IOC_RESVSP64_32:
-		error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg));
+		error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg));
 		break;
 	case FS_IOC_UNRESVSP_32:
 	case FS_IOC_UNRESVSP64_32:
-		error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE,
+		error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE,
 				compat_ptr(arg));
 		break;
 	case FS_IOC_ZERO_RANGE_32:
-		error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE,
+		error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE,
 				compat_ptr(arg));
 		break;
 #endif
@@ -998,13 +998,13 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 	 * argument.
 	 */
 	default:
-		error = do_vfs_ioctl(f.file, fd, cmd,
+		error = do_vfs_ioctl(fd_file(f), fd, cmd,
 				     (unsigned long)compat_ptr(arg));
 		if (error != -ENOIOCTLCMD)
 			break;
 
-		if (f.file->f_op->compat_ioctl)
-			error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
+		if (fd_file(f)->f_op->compat_ioctl)
+			error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg);
 		if (error == -ENOIOCTLCMD)
 			error = -ENOTTY;
 		break;
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index c429c42a6867..9ff37ae650ea 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -178,10 +178,10 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
 	struct fd f = fdget(fd);
 	ssize_t ret = -EBADF;
 
-	if (!f.file || !(f.file->f_mode & FMODE_READ))
+	if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
 		goto out;
 
-	ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id);
+	ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
 out:
 	fdput(f);
 	return ret;
diff --git a/fs/locks.c b/fs/locks.c
index 9afb16e0683f..239759a356e9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2153,15 +2153,15 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 
 	error = -EBADF;
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return error;
 
-	if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
+	if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
 		goto out_putf;
 
-	flock_make_lock(f.file, &fl, type);
+	flock_make_lock(fd_file(f), &fl, type);
 
-	error = security_file_lock(f.file, fl.c.flc_type);
+	error = security_file_lock(fd_file(f), fl.c.flc_type);
 	if (error)
 		goto out_putf;
 
@@ -2169,12 +2169,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	if (can_sleep)
 		fl.c.flc_flags |= FL_SLEEP;
 
-	if (f.file->f_op->flock)
-		error = f.file->f_op->flock(f.file,
+	if (fd_file(f)->f_op->flock)
+		error = fd_file(f)->f_op->flock(fd_file(f),
 					    (can_sleep) ? F_SETLKW : F_SETLK,
 					    &fl);
 	else
-		error = locks_lock_file_wait(f.file, &fl);
+		error = locks_lock_file_wait(fd_file(f), &fl);
 
 	locks_release_private(&fl);
  out_putf:
diff --git a/fs/namei.c b/fs/namei.c
index 5512cb10fa89..af86e3330594 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2492,25 +2492,25 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		struct fd f = fdget_raw(nd->dfd);
 		struct dentry *dentry;
 
-		if (!f.file)
+		if (!fd_file(f))
 			return ERR_PTR(-EBADF);
 
 		if (flags & LOOKUP_LINKAT_EMPTY) {
-			if (f.file->f_cred != current_cred() &&
-			    !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+			if (fd_file(f)->f_cred != current_cred() &&
+			    !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
 				fdput(f);
 				return ERR_PTR(-ENOENT);
 			}
 		}
 
-		dentry = f.file->f_path.dentry;
+		dentry = fd_file(f)->f_path.dentry;
 
 		if (*s && unlikely(!d_can_lookup(dentry))) {
 			fdput(f);
 			return ERR_PTR(-ENOTDIR);
 		}
 
-		nd->path = f.file->f_path;
+		nd->path = fd_file(f)->f_path;
 		if (flags & LOOKUP_RCU) {
 			nd->inode = nd->path.dentry->d_inode;
 			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
diff --git a/fs/namespace.c b/fs/namespace.c
index 328087a4df8a..c46d48bb38cd 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4099,14 +4099,14 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
 	}
 
 	f = fdget(fs_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	ret = -EINVAL;
-	if (f.file->f_op != &fscontext_fops)
+	if (fd_file(f)->f_op != &fscontext_fops)
 		goto err_fsfd;
 
-	fc = f.file->private_data;
+	fc = fd_file(f)->private_data;
 
 	ret = mutex_lock_interruptible(&fc->uapi_mutex);
 	if (ret < 0)
@@ -4649,15 +4649,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
 		return -EINVAL;
 
 	f = fdget(attr->userns_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (!proc_ns_file(f.file)) {
+	if (!proc_ns_file(fd_file(f))) {
 		err = -EINVAL;
 		goto out_fput;
 	}
 
-	ns = get_proc_ns(file_inode(f.file));
+	ns = get_proc_ns(file_inode(fd_file(f)));
 	if (ns->ops->type != CLONE_NEWUSER) {
 		err = -EINVAL;
 		goto out_fput;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9ec313e9f6e1..13454e5fd3fb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1006,17 +1006,17 @@ static int fanotify_find_path(int dfd, const char __user *filename,
 		struct fd f = fdget(dfd);
 
 		ret = -EBADF;
-		if (!f.file)
+		if (!fd_file(f))
 			goto out;
 
 		ret = -ENOTDIR;
 		if ((flags & FAN_MARK_ONLYDIR) &&
-		    !(S_ISDIR(file_inode(f.file)->i_mode))) {
+		    !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
 			fdput(f);
 			goto out;
 		}
 
-		*path = f.file->f_path;
+		*path = fd_file(f)->f_path;
 		path_get(path);
 		fdput(f);
 	} else {
@@ -1753,14 +1753,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	}
 
 	f = fdget(fanotify_fd);
-	if (unlikely(!f.file))
+	if (unlikely(!fd_file(f)))
 		return -EBADF;
 
 	/* verify that this is indeed an fanotify instance */
 	ret = -EINVAL;
-	if (unlikely(f.file->f_op != &fanotify_fops))
+	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
 		goto fput_and_out;
-	group = f.file->private_data;
+	group = fd_file(f)->private_data;
 
 	/*
 	 * An unprivileged user is not allowed to setup mount nor filesystem
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4ffc30606e0b..c7e451d5bd51 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -753,7 +753,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (unlikely(!f.file))
+	if (unlikely(!fd_file(f)))
 		return -EBADF;
 
 	/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
@@ -763,7 +763,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	}
 
 	/* verify that this is indeed an inotify instance */
-	if (unlikely(f.file->f_op != &inotify_fops)) {
+	if (unlikely(fd_file(f)->f_op != &inotify_fops)) {
 		ret = -EINVAL;
 		goto fput_and_out;
 	}
@@ -780,7 +780,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 
 	/* inode held in place by reference to path; group by fget on fd */
 	inode = path.dentry->d_inode;
-	group = f.file->private_data;
+	group = fd_file(f)->private_data;
 
 	/* create/update an inode mark */
 	ret = inotify_update_watch(group, inode, mask);
@@ -798,14 +798,14 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 	int ret = -EINVAL;
 
 	f = fdget(fd);
-	if (unlikely(!f.file))
+	if (unlikely(!fd_file(f)))
 		return -EBADF;
 
 	/* verify that this is indeed an inotify instance */
-	if (unlikely(f.file->f_op != &inotify_fops))
+	if (unlikely(fd_file(f)->f_op != &inotify_fops))
 		goto out;
 
-	group = f.file->private_data;
+	group = fd_file(f)->private_data;
 
 	i_mark = inotify_idr_find(group, wd);
 	if (unlikely(!i_mark))
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1bde1281d514..4b9f45d7049e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1785,17 +1785,17 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 		goto out;
 
 	f = fdget(fd);
-	if (f.file == NULL)
+	if (fd_file(f) == NULL)
 		goto out;
 
 	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
 	    reg->hr_block_bytes == 0)
 		goto out2;
 
-	if (!S_ISBLK(f.file->f_mapping->host->i_mode))
+	if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
 		goto out2;
 
-	reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev,
+	reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
 			BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
 	if (IS_ERR(reg->hr_bdev_file)) {
 		ret = PTR_ERR(reg->hr_bdev_file);
diff --git a/fs/open.c b/fs/open.c
index 22adbef7ecc2..a388828ccd22 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -193,10 +193,10 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 	if (length < 0)
 		return -EINVAL;
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = do_ftruncate(f.file, length, small);
+	error = do_ftruncate(fd_file(f), length, small);
 
 	fdput(f);
 	return error;
@@ -353,8 +353,8 @@ int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 	struct fd f = fdget(fd);
 	int error = -EBADF;
 
-	if (f.file) {
-		error = vfs_fallocate(f.file, mode, offset, len);
+	if (fd_file(f)) {
+		error = vfs_fallocate(fd_file(f), mode, offset, len);
 		fdput(f);
 	}
 	return error;
@@ -585,16 +585,16 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 	int error;
 
 	error = -EBADF;
-	if (!f.file)
+	if (!fd_file(f))
 		goto out;
 
 	error = -ENOTDIR;
-	if (!d_can_lookup(f.file->f_path.dentry))
+	if (!d_can_lookup(fd_file(f)->f_path.dentry))
 		goto out_putf;
 
-	error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
+	error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
 	if (!error)
-		set_fs_pwd(current->fs, &f.file->f_path);
+		set_fs_pwd(current->fs, &fd_file(f)->f_path);
 out_putf:
 	fdput(f);
 out:
@@ -675,8 +675,8 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
 	struct fd f = fdget(fd);
 	int err = -EBADF;
 
-	if (f.file) {
-		err = vfs_fchmod(f.file, mode);
+	if (fd_file(f)) {
+		err = vfs_fchmod(fd_file(f), mode);
 		fdput(f);
 	}
 	return err;
@@ -869,8 +869,8 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
 	struct fd f = fdget(fd);
 	int error = -EBADF;
 
-	if (f.file) {
-		error = vfs_fchown(f.file, user, group);
+	if (fd_file(f)) {
+		error = vfs_fchown(fd_file(f), user, group);
 		fdput(f);
 	}
 	return error;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 1a411cae57ed..c4963d0c5549 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -209,13 +209,13 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
 	 * files, so we use the real file to perform seeks.
 	 */
 	ovl_inode_lock(inode);
-	real.file->f_pos = file->f_pos;
+	fd_file(real)->f_pos = file->f_pos;
 
 	old_cred = ovl_override_creds(inode->i_sb);
-	ret = vfs_llseek(real.file, offset, whence);
+	ret = vfs_llseek(fd_file(real), offset, whence);
 	revert_creds(old_cred);
 
-	file->f_pos = real.file->f_pos;
+	file->f_pos = fd_file(real)->f_pos;
 	ovl_inode_unlock(inode);
 
 	fdput(real);
@@ -275,7 +275,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (ret)
 		return ret;
 
-	ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+	ret = backing_file_read_iter(fd_file(real), iter, iocb, iocb->ki_flags,
 				     &ctx);
 	fdput(real);
 
@@ -314,7 +314,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	 * this property in case it is set by the issuer.
 	 */
 	ifl &= ~IOCB_DIO_CALLER_COMP;
-	ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
+	ret = backing_file_write_iter(fd_file(real), iter, iocb, ifl, &ctx);
 	fdput(real);
 
 out_unlock:
@@ -339,7 +339,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
 	if (ret)
 		return ret;
 
-	ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
+	ret = backing_file_splice_read(fd_file(real), ppos, pipe, len, flags, &ctx);
 	fdput(real);
 
 	return ret;
@@ -348,7 +348,7 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
 /*
  * Calling iter_file_splice_write() directly from overlay's f_op may deadlock
  * due to lock order inversion between pipe->mutex in iter_file_splice_write()
- * and file_start_write(real.file) in ovl_write_iter().
+ * and file_start_write(fd_file(real)) in ovl_write_iter().
  *
  * So do everything ovl_write_iter() does and call iter_file_splice_write() on
  * the real file.
@@ -373,7 +373,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	if (ret)
 		goto out_unlock;
 
-	ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
+	ret = backing_file_splice_write(pipe, fd_file(real), ppos, len, flags, &ctx);
 	fdput(real);
 
 out_unlock:
@@ -397,9 +397,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		return ret;
 
 	/* Don't sync lower file for fear of receiving EROFS error */
-	if (file_inode(real.file) == ovl_inode_upper(file_inode(file))) {
+	if (file_inode(fd_file(real)) == ovl_inode_upper(file_inode(file))) {
 		old_cred = ovl_override_creds(file_inode(file)->i_sb);
-		ret = vfs_fsync_range(real.file, start, end, datasync);
+		ret = vfs_fsync_range(fd_file(real), start, end, datasync);
 		revert_creds(old_cred);
 	}
 
@@ -439,7 +439,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
 		goto out_unlock;
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = vfs_fallocate(real.file, mode, offset, len);
+	ret = vfs_fallocate(fd_file(real), mode, offset, len);
 	revert_creds(old_cred);
 
 	/* Update size */
@@ -464,7 +464,7 @@ static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 		return ret;
 
 	old_cred = ovl_override_creds(file_inode(file)->i_sb);
-	ret = vfs_fadvise(real.file, offset, len, advice);
+	ret = vfs_fadvise(fd_file(real), offset, len, advice);
 	revert_creds(old_cred);
 
 	fdput(real);
@@ -509,18 +509,18 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in,
 	old_cred = ovl_override_creds(file_inode(file_out)->i_sb);
 	switch (op) {
 	case OVL_COPY:
-		ret = vfs_copy_file_range(real_in.file, pos_in,
-					  real_out.file, pos_out, len, flags);
+		ret = vfs_copy_file_range(fd_file(real_in), pos_in,
+					  fd_file(real_out), pos_out, len, flags);
 		break;
 
 	case OVL_CLONE:
-		ret = vfs_clone_file_range(real_in.file, pos_in,
-					   real_out.file, pos_out, len, flags);
+		ret = vfs_clone_file_range(fd_file(real_in), pos_in,
+					   fd_file(real_out), pos_out, len, flags);
 		break;
 
 	case OVL_DEDUPE:
-		ret = vfs_dedupe_file_range_one(real_in.file, pos_in,
-						real_out.file, pos_out, len,
+		ret = vfs_dedupe_file_range_one(fd_file(real_in), pos_in,
+						fd_file(real_out), pos_out, len,
 						flags);
 		break;
 	}
@@ -583,9 +583,9 @@ static int ovl_flush(struct file *file, fl_owner_t id)
 	if (err)
 		return err;
 
-	if (real.file->f_op->flush) {
+	if (fd_file(real)->f_op->flush) {
 		old_cred = ovl_override_creds(file_inode(file)->i_sb);
-		err = real.file->f_op->flush(real.file, id);
+		err = fd_file(real)->f_op->flush(fd_file(real), id);
 		revert_creds(old_cred);
 	}
 	fdput(real);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0e41fb84060f..290157bc7bec 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -980,7 +980,7 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
 	int ret;
 
 	f = fdget_raw(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	ret = -EINVAL;
@@ -988,12 +988,12 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
 		goto out;
 
 	if (quotactl_cmd_write(cmds)) {
-		ret = mnt_want_write(f.file->f_path.mnt);
+		ret = mnt_want_write(fd_file(f)->f_path.mnt);
 		if (ret)
 			goto out;
 	}
 
-	sb = f.file->f_path.mnt->mnt_sb;
+	sb = fd_file(f)->f_path.mnt->mnt_sb;
 	if (quotactl_cmd_onoff(cmds))
 		down_write(&sb->s_umount);
 	else
@@ -1007,7 +1007,7 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
 		up_read(&sb->s_umount);
 
 	if (quotactl_cmd_write(cmds))
-		mnt_drop_write(f.file->f_path.mnt);
+		mnt_drop_write(fd_file(f)->f_path.mnt);
 out:
 	fdput(f);
 	return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 90e283b31ca1..59d6d0dee579 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -294,12 +294,12 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
 {
 	off_t retval;
 	struct fd f = fdget_pos(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	retval = -EINVAL;
 	if (whence <= SEEK_MAX) {
-		loff_t res = vfs_llseek(f.file, offset, whence);
+		loff_t res = vfs_llseek(fd_file(f), offset, whence);
 		retval = res;
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
@@ -330,14 +330,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 	struct fd f = fdget_pos(fd);
 	loff_t offset;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	retval = -EINVAL;
 	if (whence > SEEK_MAX)
 		goto out_putf;
 
-	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
+	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
 			whence);
 
 	retval = (int)offset;
@@ -610,15 +610,15 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
-	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+	if (fd_file(f)) {
+		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
 		}
-		ret = vfs_read(f.file, buf, count, ppos);
+		ret = vfs_read(fd_file(f), buf, count, ppos);
 		if (ret >= 0 && ppos)
-			f.file->f_pos = pos;
+			fd_file(f)->f_pos = pos;
 		fdput_pos(f);
 	}
 	return ret;
@@ -634,15 +634,15 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
-	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+	if (fd_file(f)) {
+		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
 		}
-		ret = vfs_write(f.file, buf, count, ppos);
+		ret = vfs_write(fd_file(f), buf, count, ppos);
 		if (ret >= 0 && ppos)
-			f.file->f_pos = pos;
+			fd_file(f)->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -665,10 +665,10 @@ ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (f.file) {
+	if (fd_file(f)) {
 		ret = -ESPIPE;
-		if (f.file->f_mode & FMODE_PREAD)
-			ret = vfs_read(f.file, buf, count, &pos);
+		if (fd_file(f)->f_mode & FMODE_PREAD)
+			ret = vfs_read(fd_file(f), buf, count, &pos);
 		fdput(f);
 	}
 
@@ -699,10 +699,10 @@ ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (f.file) {
+	if (fd_file(f)) {
 		ret = -ESPIPE;
-		if (f.file->f_mode & FMODE_PWRITE)  
-			ret = vfs_write(f.file, buf, count, &pos);
+		if (fd_file(f)->f_mode & FMODE_PWRITE)
+			ret = vfs_write(fd_file(f), buf, count, &pos);
 		fdput(f);
 	}
 
@@ -985,15 +985,15 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
-	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+	if (fd_file(f)) {
+		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
 		}
-		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
+		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
 		if (ret >= 0 && ppos)
-			f.file->f_pos = pos;
+			fd_file(f)->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -1009,15 +1009,15 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
 	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
-	if (f.file) {
-		loff_t pos, *ppos = file_ppos(f.file);
+	if (fd_file(f)) {
+		loff_t pos, *ppos = file_ppos(fd_file(f));
 		if (ppos) {
 			pos = *ppos;
 			ppos = &pos;
 		}
-		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
+		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
 		if (ret >= 0 && ppos)
-			f.file->f_pos = pos;
+			fd_file(f)->f_pos = pos;
 		fdput_pos(f);
 	}
 
@@ -1043,10 +1043,10 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (f.file) {
+	if (fd_file(f)) {
 		ret = -ESPIPE;
-		if (f.file->f_mode & FMODE_PREAD)
-			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
+		if (fd_file(f)->f_mode & FMODE_PREAD)
+			ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
 		fdput(f);
 	}
 
@@ -1066,10 +1066,10 @@ static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (f.file) {
+	if (fd_file(f)) {
 		ret = -ESPIPE;
-		if (f.file->f_mode & FMODE_PWRITE)
-			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
+		if (fd_file(f)->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
 		fdput(f);
 	}
 
@@ -1235,19 +1235,19 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	 */
 	retval = -EBADF;
 	in = fdget(in_fd);
-	if (!in.file)
+	if (!fd_file(in))
 		goto out;
-	if (!(in.file->f_mode & FMODE_READ))
+	if (!(fd_file(in)->f_mode & FMODE_READ))
 		goto fput_in;
 	retval = -ESPIPE;
 	if (!ppos) {
-		pos = in.file->f_pos;
+		pos = fd_file(in)->f_pos;
 	} else {
 		pos = *ppos;
-		if (!(in.file->f_mode & FMODE_PREAD))
+		if (!(fd_file(in)->f_mode & FMODE_PREAD))
 			goto fput_in;
 	}
-	retval = rw_verify_area(READ, in.file, &pos, count);
+	retval = rw_verify_area(READ, fd_file(in), &pos, count);
 	if (retval < 0)
 		goto fput_in;
 	if (count > MAX_RW_COUNT)
@@ -1258,13 +1258,13 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	 */
 	retval = -EBADF;
 	out = fdget(out_fd);
-	if (!out.file)
+	if (!fd_file(out))
 		goto fput_in;
-	if (!(out.file->f_mode & FMODE_WRITE))
+	if (!(fd_file(out)->f_mode & FMODE_WRITE))
 		goto fput_out;
-	in_inode = file_inode(in.file);
-	out_inode = file_inode(out.file);
-	out_pos = out.file->f_pos;
+	in_inode = file_inode(fd_file(in));
+	out_inode = file_inode(fd_file(out));
+	out_pos = fd_file(out)->f_pos;
 
 	if (!max)
 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
@@ -1284,33 +1284,33 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	 * and the application is arguably buggy if it doesn't expect
 	 * EAGAIN on a non-blocking file descriptor.
 	 */
-	if (in.file->f_flags & O_NONBLOCK)
+	if (fd_file(in)->f_flags & O_NONBLOCK)
 		fl = SPLICE_F_NONBLOCK;
 #endif
-	opipe = get_pipe_info(out.file, true);
+	opipe = get_pipe_info(fd_file(out), true);
 	if (!opipe) {
-		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
+		retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
 		if (retval < 0)
 			goto fput_out;
-		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
+		retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
 					  count, fl);
 	} else {
-		if (out.file->f_flags & O_NONBLOCK)
+		if (fd_file(out)->f_flags & O_NONBLOCK)
 			fl |= SPLICE_F_NONBLOCK;
 
-		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
+		retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
 	}
 
 	if (retval > 0) {
 		add_rchar(current, retval);
 		add_wchar(current, retval);
-		fsnotify_access(in.file);
-		fsnotify_modify(out.file);
-		out.file->f_pos = out_pos;
+		fsnotify_access(fd_file(in));
+		fsnotify_modify(fd_file(out));
+		fd_file(out)->f_pos = out_pos;
 		if (ppos)
 			*ppos = pos;
 		else
-			in.file->f_pos = pos;
+			fd_file(in)->f_pos = pos;
 	}
 
 	inc_syscr(current);
@@ -1583,11 +1583,11 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
 	ssize_t ret = -EBADF;
 
 	f_in = fdget(fd_in);
-	if (!f_in.file)
+	if (!fd_file(f_in))
 		goto out2;
 
 	f_out = fdget(fd_out);
-	if (!f_out.file)
+	if (!fd_file(f_out))
 		goto out1;
 
 	ret = -EFAULT;
@@ -1595,21 +1595,21 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
 		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
 			goto out;
 	} else {
-		pos_in = f_in.file->f_pos;
+		pos_in = fd_file(f_in)->f_pos;
 	}
 
 	if (off_out) {
 		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
 			goto out;
 	} else {
-		pos_out = f_out.file->f_pos;
+		pos_out = fd_file(f_out)->f_pos;
 	}
 
 	ret = -EINVAL;
 	if (flags != 0)
 		goto out;
 
-	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+	ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
 				  flags);
 	if (ret > 0) {
 		pos_in += ret;
@@ -1619,14 +1619,14 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
 			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
 				ret = -EFAULT;
 		} else {
-			f_in.file->f_pos = pos_in;
+			fd_file(f_in)->f_pos = pos_in;
 		}
 
 		if (off_out) {
 			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
 				ret = -EFAULT;
 		} else {
-			f_out.file->f_pos = pos_out;
+			fd_file(f_out)->f_pos = pos_out;
 		}
 	}
 
diff --git a/fs/readdir.c b/fs/readdir.c
index d6c82421902a..6d29cab8576e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -225,10 +225,10 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		.dirent = dirent
 	};
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -318,10 +318,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	int error;
 
 	f = fdget_pos(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (buf.prev_reclen) {
@@ -401,10 +401,10 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	int error;
 
 	f = fdget_pos(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (buf.prev_reclen) {
@@ -483,10 +483,10 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		.dirent = dirent
 	};
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -569,10 +569,10 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	int error;
 
 	f = fdget_pos(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(fd_file(f), &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (buf.prev_reclen) {
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 28246dfc8485..4403d5c68fcb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -537,7 +537,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
 
 	for (i = 0, info = same->info; i < count; i++, info++) {
 		struct fd dst_fd = fdget(info->dest_fd);
-		struct file *dst_file = dst_fd.file;
+		struct file *dst_file = fd_file(dst_fd);
 
 		if (!dst_file) {
 			info->status = -EBADF;
diff --git a/fs/select.c b/fs/select.c
index 9515c3fa1a03..97e1009dde00 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -532,10 +532,10 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
 					continue;
 				mask = EPOLLNVAL;
 				f = fdget(i);
-				if (f.file) {
+				if (fd_file(f)) {
 					wait_key_set(wait, in, out, bit,
 						     busy_flag);
-					mask = vfs_poll(f.file, wait);
+					mask = vfs_poll(fd_file(f), wait);
 
 					fdput(f);
 				}
@@ -864,13 +864,13 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 		goto out;
 	mask = EPOLLNVAL;
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		goto out;
 
 	/* userland u16 ->events contains POLL... bitmap */
 	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
 	pwait->_key = filter | busy_flag;
-	mask = vfs_poll(f.file, pwait);
+	mask = vfs_poll(fd_file(f), pwait);
 	if (mask & busy_flag)
 		*can_busy_poll = true;
 	mask &= filter;		/* Mask out unneeded events. */
diff --git a/fs/signalfd.c b/fs/signalfd.c
index ec7b2da2477a..777e889ab0e8 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -289,10 +289,10 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
 		fd_install(ufd, file);
 	} else {
 		struct fd f = fdget(ufd);
-		if (!f.file)
+		if (!fd_file(f))
 			return -EBADF;
-		ctx = f.file->private_data;
-		if (f.file->f_op != &signalfd_fops) {
+		ctx = fd_file(f)->private_data;
+		if (fd_file(f)->f_op != &signalfd_fops) {
 			fdput(f);
 			return -EINVAL;
 		}
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 855ac5a62edf..94bf2e5014d9 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -90,23 +90,23 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
 	}
 
 	src_file = fdget(srcfd);
-	if (!src_file.file) {
+	if (!fd_file(src_file)) {
 		rc = -EBADF;
 		goto out_drop_write;
 	}
 
-	if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+	if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
 		rc = -EBADF;
 		cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
 		goto out_fput;
 	}
 
-	src_inode = file_inode(src_file.file);
+	src_inode = file_inode(fd_file(src_file));
 	rc = -EINVAL;
 	if (S_ISDIR(src_inode->i_mode))
 		goto out_fput;
 
-	rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
+	rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
 					src_inode->i_size, 0);
 	if (rc > 0)
 		rc = 0;
diff --git a/fs/splice.c b/fs/splice.c
index 60aed8de21f8..06232d7e505f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1566,11 +1566,11 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
 
 static int vmsplice_type(struct fd f, int *type)
 {
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	if (f.file->f_mode & FMODE_WRITE) {
+	if (fd_file(f)->f_mode & FMODE_WRITE) {
 		*type = ITER_SOURCE;
-	} else if (f.file->f_mode & FMODE_READ) {
+	} else if (fd_file(f)->f_mode & FMODE_READ) {
 		*type = ITER_DEST;
 	} else {
 		fdput(f);
@@ -1621,9 +1621,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	if (!iov_iter_count(&iter))
 		error = 0;
 	else if (type == ITER_SOURCE)
-		error = vmsplice_to_pipe(f.file, &iter, flags);
+		error = vmsplice_to_pipe(fd_file(f), &iter, flags);
 	else
-		error = vmsplice_to_user(f.file, &iter, flags);
+		error = vmsplice_to_user(fd_file(f), &iter, flags);
 
 	kfree(iov);
 out_fdput:
@@ -1646,10 +1646,10 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 
 	error = -EBADF;
 	in = fdget(fd_in);
-	if (in.file) {
+	if (fd_file(in)) {
 		out = fdget(fd_out);
-		if (out.file) {
-			error = __do_splice(in.file, off_in, out.file, off_out,
+		if (fd_file(out)) {
+			error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
 					    len, flags);
 			fdput(out);
 		}
@@ -2016,10 +2016,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 
 	error = -EBADF;
 	in = fdget(fdin);
-	if (in.file) {
+	if (fd_file(in)) {
 		out = fdget(fdout);
-		if (out.file) {
-			error = do_tee(in.file, out.file, len, flags);
+		if (fd_file(out)) {
+			error = do_tee(fd_file(in), fd_file(out), len, flags);
 			fdput(out);
 		}
  		fdput(in);
diff --git a/fs/stat.c b/fs/stat.c
index 89ce1be56310..41e598376d7e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -224,9 +224,9 @@ int vfs_fstat(int fd, struct kstat *stat)
 	int error;
 
 	f = fdget_raw(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	error = vfs_getattr(&f.file->f_path, stat, STATX_BASIC_STATS, 0);
+	error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
 	fdput(f);
 	return error;
 }
@@ -277,9 +277,9 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
 			  u32 request_mask)
 {
 	CLASS(fd_raw, f)(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	return vfs_statx_path(&f.file->f_path, flags, stat, request_mask);
+	return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask);
 }
 
 /**
diff --git a/fs/statfs.c b/fs/statfs.c
index 96d1c3edf289..9c7bb27e7932 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -116,8 +116,8 @@ int fd_statfs(int fd, struct kstatfs *st)
 {
 	struct fd f = fdget_raw(fd);
 	int error = -EBADF;
-	if (f.file) {
-		error = vfs_statfs(&f.file->f_path, st);
+	if (fd_file(f)) {
+		error = vfs_statfs(&fd_file(f)->f_path, st);
 		fdput(f);
 	}
 	return error;
diff --git a/fs/sync.c b/fs/sync.c
index dc725914e1ed..67df255eb189 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -152,15 +152,15 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 	struct super_block *sb;
 	int ret, ret2;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	sb = f.file->f_path.dentry->d_sb;
+	sb = fd_file(f)->f_path.dentry->d_sb;
 
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
 
-	ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
+	ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err);
 
 	fdput(f);
 	return ret ? ret : ret2;
@@ -208,8 +208,8 @@ static int do_fsync(unsigned int fd, int datasync)
 	struct fd f = fdget(fd);
 	int ret = -EBADF;
 
-	if (f.file) {
-		ret = vfs_fsync(f.file, datasync);
+	if (fd_file(f)) {
+		ret = vfs_fsync(fd_file(f), datasync);
 		fdput(f);
 	}
 	return ret;
@@ -360,8 +360,8 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 
 	ret = -EBADF;
 	f = fdget(fd);
-	if (f.file)
-		ret = sync_file_range(f.file, offset, nbytes, flags);
+	if (fd_file(f))
+		ret = sync_file_range(fd_file(f), offset, nbytes, flags);
 
 	fdput(f);
 	return ret;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 4bf2f8bfec11..137523e0bb21 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -397,9 +397,9 @@ static const struct file_operations timerfd_fops = {
 static int timerfd_fget(int fd, struct fd *p)
 {
 	struct fd f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	if (f.file->f_op != &timerfd_fops) {
+	if (fd_file(f)->f_op != &timerfd_fops) {
 		fdput(f);
 		return -EINVAL;
 	}
@@ -482,7 +482,7 @@ static int do_timerfd_settime(int ufd, int flags,
 	ret = timerfd_fget(ufd, &f);
 	if (ret)
 		return ret;
-	ctx = f.file->private_data;
+	ctx = fd_file(f)->private_data;
 
 	if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
 		fdput(f);
@@ -546,7 +546,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
 	int ret = timerfd_fget(ufd, &f);
 	if (ret)
 		return ret;
-	ctx = f.file->private_data;
+	ctx = fd_file(f)->private_data;
 
 	spin_lock_irq(&ctx->wqh.lock);
 	if (ctx->expired && ctx->tintv) {
diff --git a/fs/utimes.c b/fs/utimes.c
index 3701b3946f88..99b26f792b89 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -115,9 +115,9 @@ static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
 		return -EINVAL;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	error = vfs_utimes(&f.file->f_path, times);
+	error = vfs_utimes(&fd_file(f)->f_path, times);
 	fdput(f);
 	return error;
 }
diff --git a/fs/xattr.c b/fs/xattr.c
index 7672ce5486c5..05ec7e7d9e87 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -697,19 +697,19 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 	int error;
 
 	CLASS(fd, f)(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	audit_file(f.file);
+	audit_file(fd_file(f));
 	error = setxattr_copy(name, &ctx);
 	if (error)
 		return error;
 
-	error = mnt_want_write_file(f.file);
+	error = mnt_want_write_file(fd_file(f));
 	if (!error) {
-		error = do_setxattr(file_mnt_idmap(f.file),
-				    f.file->f_path.dentry, &ctx);
-		mnt_drop_write_file(f.file);
+		error = do_setxattr(file_mnt_idmap(fd_file(f)),
+				    fd_file(f)->f_path.dentry, &ctx);
+		mnt_drop_write_file(fd_file(f));
 	}
 	kvfree(ctx.kvalue);
 	return error;
@@ -812,10 +812,10 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return error;
-	audit_file(f.file);
-	error = getxattr(file_mnt_idmap(f.file), f.file->f_path.dentry,
+	audit_file(fd_file(f));
+	error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry,
 			 name, value, size);
 	fdput(f);
 	return error;
@@ -888,10 +888,10 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return error;
-	audit_file(f.file);
-	error = listxattr(f.file->f_path.dentry, list, size);
+	audit_file(fd_file(f));
+	error = listxattr(fd_file(f)->f_path.dentry, list, size);
 	fdput(f);
 	return error;
 }
@@ -954,9 +954,9 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 	char kname[XATTR_NAME_MAX + 1];
 	int error = -EBADF;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return error;
-	audit_file(f.file);
+	audit_file(fd_file(f));
 
 	error = strncpy_from_user(kname, name, sizeof(kname));
 	if (error == 0 || error == sizeof(kname))
@@ -964,11 +964,11 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 	if (error < 0)
 		return error;
 
-	error = mnt_want_write_file(f.file);
+	error = mnt_want_write_file(fd_file(f));
 	if (!error) {
-		error = removexattr(file_mnt_idmap(f.file),
-				    f.file->f_path.dentry, kname);
-		mnt_drop_write_file(f.file);
+		error = removexattr(file_mnt_idmap(fd_file(f)),
+				    fd_file(f)->f_path.dentry, kname);
+		mnt_drop_write_file(fd_file(f));
 	}
 	fdput(f);
 	return error;
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index c8a655c92c92..9790e0f45d14 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -794,9 +794,9 @@ xfs_ioc_exchange_range(
 	fxr.flags		= args.flags;
 
 	file1 = fdget(args.file1_fd);
-	if (!file1.file)
+	if (!fd_file(file1))
 		return -EBADF;
-	fxr.file1 = file1.file;
+	fxr.file1 = fd_file(file1);
 
 	error = xfs_exchange_range(&fxr);
 	fdput(file1);
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index cf5acbd3c7ca..7bcc4f519cb8 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -92,9 +92,9 @@ xfs_find_handle(
 
 	if (cmd == XFS_IOC_FD_TO_HANDLE) {
 		f = fdget(hreq->fd);
-		if (!f.file)
+		if (!fd_file(f))
 			return -EBADF;
-		inode = file_inode(f.file);
+		inode = file_inode(fd_file(f));
 	} else {
 		error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
 		if (error)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4e933db75b12..95641817370b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1005,33 +1005,33 @@ xfs_ioc_swapext(
 
 	/* Pull information for the target fd */
 	f = fdget((int)sxp->sx_fdtarget);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		error = -EINVAL;
 		goto out;
 	}
 
-	if (!(f.file->f_mode & FMODE_WRITE) ||
-	    !(f.file->f_mode & FMODE_READ) ||
-	    (f.file->f_flags & O_APPEND)) {
+	if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
+	    !(fd_file(f)->f_mode & FMODE_READ) ||
+	    (fd_file(f)->f_flags & O_APPEND)) {
 		error = -EBADF;
 		goto out_put_file;
 	}
 
 	tmp = fdget((int)sxp->sx_fdtmp);
-	if (!tmp.file) {
+	if (!fd_file(tmp)) {
 		error = -EINVAL;
 		goto out_put_file;
 	}
 
-	if (!(tmp.file->f_mode & FMODE_WRITE) ||
-	    !(tmp.file->f_mode & FMODE_READ) ||
-	    (tmp.file->f_flags & O_APPEND)) {
+	if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
+	    !(fd_file(tmp)->f_mode & FMODE_READ) ||
+	    (fd_file(tmp)->f_flags & O_APPEND)) {
 		error = -EBADF;
 		goto out_put_tmp_file;
 	}
 
-	if (IS_SWAPFILE(file_inode(f.file)) ||
-	    IS_SWAPFILE(file_inode(tmp.file))) {
+	if (IS_SWAPFILE(file_inode(fd_file(f))) ||
+	    IS_SWAPFILE(file_inode(fd_file(tmp)))) {
 		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
@@ -1041,14 +1041,14 @@ xfs_ioc_swapext(
 	 * before we cast and access them as XFS structures as we have no
 	 * control over what the user passes us here.
 	 */
-	if (f.file->f_op != &xfs_file_operations ||
-	    tmp.file->f_op != &xfs_file_operations) {
+	if (fd_file(f)->f_op != &xfs_file_operations ||
+	    fd_file(tmp)->f_op != &xfs_file_operations) {
 		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
-	ip = XFS_I(file_inode(f.file));
-	tip = XFS_I(file_inode(tmp.file));
+	ip = XFS_I(file_inode(fd_file(f)));
+	tip = XFS_I(file_inode(fd_file(tmp)));
 
 	if (ip->i_mount != tip->i_mount) {
 		error = -EINVAL;
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index d9e613803df1..a3d3e888cf1f 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -98,7 +98,7 @@ const volatile void * __must_check_fn(const volatile void *val)
  * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
  *
  *	CLASS(fdget, f)(fd);
- *	if (!f.file)
+ *	if (!fd_file(f))
  *		return -EBADF;
  *
  *	// use 'f' without concern
diff --git a/include/linux/file.h b/include/linux/file.h
index 237931f20739..0f3f369f2450 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -42,10 +42,12 @@ struct fd {
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
+#define fd_file(f) ((f).file)
+
 static inline void fdput(struct fd fd)
 {
 	if (fd.flags & FDPUT_FPUT)
-		fput(fd.file);
+		fput(fd_file(fd));
 }
 
 extern struct file *fget(unsigned int fd);
@@ -79,7 +81,7 @@ static inline struct fd fdget_pos(int fd)
 static inline void fdput_pos(struct fd f)
 {
 	if (f.flags & FDPUT_POS_UNLOCK)
-		__f_unlock_pos(f.file);
+		__f_unlock_pos(fd_file(f));
 	fdput(f);
 }
 
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index b3722e5275e7..ffa7d341bd95 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -108,14 +108,14 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
 	struct fd f;
 
 	f = fdget(p->wq_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-ENXIO);
-	if (!io_is_uring_fops(f.file)) {
+	if (!io_is_uring_fops(fd_file(f))) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	ctx_attach = f.file->private_data;
+	ctx_attach = fd_file(f)->private_data;
 	sqd = ctx_attach->sq_data;
 	if (!sqd) {
 		fdput(f);
@@ -418,9 +418,9 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		struct fd f;
 
 		f = fdget(p->wq_fd);
-		if (!f.file)
+		if (!fd_file(f))
 			return -ENXIO;
-		if (!io_is_uring_fops(f.file)) {
+		if (!io_is_uring_fops(fd_file(f))) {
 			fdput(f);
 			return -EINVAL;
 		}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index a7cbd69efbef..34fa0bd8bb11 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1085,20 +1085,20 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
 
 	f = fdget(mqdes);
-	if (unlikely(!f.file)) {
+	if (unlikely(!fd_file(f))) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = file_inode(f.file);
-	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+	inode = file_inode(fd_file(f));
+	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
 	info = MQUEUE_I(inode);
-	audit_file(f.file);
+	audit_file(fd_file(f));
 
-	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
+	if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE))) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1138,7 +1138,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
 	}
 
 	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
-		if (f.file->f_flags & O_NONBLOCK) {
+		if (fd_file(f)->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
 		} else {
 			wait.task = current;
@@ -1199,20 +1199,20 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 	audit_mq_sendrecv(mqdes, msg_len, 0, ts);
 
 	f = fdget(mqdes);
-	if (unlikely(!f.file)) {
+	if (unlikely(!fd_file(f))) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = file_inode(f.file);
-	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+	inode = file_inode(fd_file(f));
+	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
 	info = MQUEUE_I(inode);
-	audit_file(f.file);
+	audit_file(fd_file(f));
 
-	if (unlikely(!(f.file->f_mode & FMODE_READ))) {
+	if (unlikely(!(fd_file(f)->f_mode & FMODE_READ))) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1242,7 +1242,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
 	}
 
 	if (info->attr.mq_curmsgs == 0) {
-		if (f.file->f_flags & O_NONBLOCK) {
+		if (fd_file(f)->f_flags & O_NONBLOCK) {
 			spin_unlock(&info->lock);
 			ret = -EAGAIN;
 		} else {
@@ -1356,11 +1356,11 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 			/* and attach it to the socket */
 retry:
 			f = fdget(notification->sigev_signo);
-			if (!f.file) {
+			if (!fd_file(f)) {
 				ret = -EBADF;
 				goto out;
 			}
-			sock = netlink_getsockbyfilp(f.file);
+			sock = netlink_getsockbyfilp(fd_file(f));
 			fdput(f);
 			if (IS_ERR(sock)) {
 				ret = PTR_ERR(sock);
@@ -1379,13 +1379,13 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
 	}
 
 	f = fdget(mqdes);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = file_inode(f.file);
-	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+	inode = file_inode(fd_file(f));
+	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1460,31 +1460,31 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
 		return -EINVAL;
 
 	f = fdget(mqdes);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+	if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
 		fdput(f);
 		return -EBADF;
 	}
 
-	inode = file_inode(f.file);
+	inode = file_inode(fd_file(f));
 	info = MQUEUE_I(inode);
 
 	spin_lock(&info->lock);
 
 	if (old) {
 		*old = info->attr;
-		old->mq_flags = f.file->f_flags & O_NONBLOCK;
+		old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;
 	}
 	if (new) {
 		audit_mq_getsetattr(mqdes, new);
-		spin_lock(&f.file->f_lock);
+		spin_lock(&fd_file(f)->f_lock);
 		if (new->mq_flags & O_NONBLOCK)
-			f.file->f_flags |= O_NONBLOCK;
+			fd_file(f)->f_flags |= O_NONBLOCK;
 		else
-			f.file->f_flags &= ~O_NONBLOCK;
-		spin_unlock(&f.file->f_lock);
+			fd_file(f)->f_flags &= ~O_NONBLOCK;
+		spin_unlock(&fd_file(f)->f_lock);
 
 		inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
 	}
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index b0ef45db207c..0a79aee6523d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -80,10 +80,10 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
 	struct bpf_local_storage_data *sdata;
 	struct fd f = fdget_raw(*(int *)key);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	sdata = inode_storage_lookup(file_inode(f.file), map, true);
+	sdata = inode_storage_lookup(file_inode(fd_file(f)), map, true);
 	fdput(f);
 	return sdata ? sdata->data : NULL;
 }
@@ -94,14 +94,14 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
 	struct bpf_local_storage_data *sdata;
 	struct fd f = fdget_raw(*(int *)key);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
-	if (!inode_storage_ptr(file_inode(f.file))) {
+	if (!inode_storage_ptr(file_inode(fd_file(f)))) {
 		fdput(f);
 		return -EBADF;
 	}
 
-	sdata = bpf_local_storage_update(file_inode(f.file),
+	sdata = bpf_local_storage_update(file_inode(fd_file(f)),
 					 (struct bpf_local_storage_map *)map,
 					 value, map_flags, GFP_ATOMIC);
 	fdput(f);
@@ -126,10 +126,10 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 	struct fd f = fdget_raw(*(int *)key);
 	int err;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	err = inode_storage_delete(file_inode(f.file), map);
+	err = inode_storage_delete(file_inode(fd_file(f)), map);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 520f49f422fe..4de1e3dc2284 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7677,15 +7677,15 @@ struct btf *btf_get_by_fd(int fd)
 
 	f = fdget(fd);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	if (f.file->f_op != &btf_fops) {
+	if (fd_file(f)->f_op != &btf_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	btf = f.file->private_data;
+	btf = fd_file(f)->private_data;
 	refcount_inc(&btf->refcnt);
 	fdput(f);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bf6c5f685ea2..3093bf2cc266 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -829,7 +829,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 
 static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
 {
-	fmode_t mode = f.file->f_mode;
+	fmode_t mode = fd_file(f)->f_mode;
 
 	/* Our file permissions may have been overridden by global
 	 * map permissions facing syscall side.
@@ -1423,14 +1423,14 @@ static int map_create(union bpf_attr *attr)
  */
 struct bpf_map *__bpf_map_get(struct fd f)
 {
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_map_fops) {
+	if (fd_file(f)->f_op != &bpf_map_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	return f.file->private_data;
+	return fd_file(f)->private_data;
 }
 
 void bpf_map_inc(struct bpf_map *map)
@@ -1651,7 +1651,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_key;
 	}
 
-	err = bpf_map_update_value(map, f.file, key, value, attr->flags);
+	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
 	if (!err)
 		maybe_wait_bpf_programs(map);
 
@@ -2409,14 +2409,14 @@ int bpf_prog_new_fd(struct bpf_prog *prog)
 
 static struct bpf_prog *____bpf_prog_get(struct fd f)
 {
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_prog_fops) {
+	if (fd_file(f)->f_op != &bpf_prog_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	return f.file->private_data;
+	return fd_file(f)->private_data;
 }
 
 void bpf_prog_add(struct bpf_prog *prog, int i)
@@ -3259,14 +3259,14 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
 	struct fd f = fdget(ufd);
 	struct bpf_link *link;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
+	if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	link = f.file->private_data;
+	link = fd_file(f)->private_data;
 	bpf_link_inc(link);
 	fdput(f);
 
@@ -4982,19 +4982,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 		return -EINVAL;
 
 	f = fdget(ufd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADFD;
 
-	if (f.file->f_op == &bpf_prog_fops)
-		err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr,
+	if (fd_file(f)->f_op == &bpf_prog_fops)
+		err = bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
 					      uattr);
-	else if (f.file->f_op == &bpf_map_fops)
-		err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr,
+	else if (fd_file(f)->f_op == &bpf_map_fops)
+		err = bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr,
 					     uattr);
-	else if (f.file->f_op == &btf_fops)
-		err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
-	else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
-		err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
+	else if (fd_file(f)->f_op == &btf_fops)
+		err = bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr);
+	else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
+		err = bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
 					      attr, uattr);
 	else
 		err = -EINVAL;
@@ -5215,7 +5215,7 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
 	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
 		BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
 	else if (cmd == BPF_MAP_UPDATE_BATCH)
-		BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
+		BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr);
 	else
 		BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
 err_put:
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index d6ccf8d00eab..9a1d356e79ed 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -122,10 +122,10 @@ int bpf_token_create(union bpf_attr *attr)
 	int err, fd;
 
 	f = fdget(attr->token_create.bpffs_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	path = f.file->f_path;
+	path = fd_file(f)->f_path;
 	path_get(&path);
 	fdput(f);
 
@@ -235,14 +235,14 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd)
 	struct fd f = fdget(ufd);
 	struct bpf_token *token;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
-	if (f.file->f_op != &bpf_token_fops) {
+	if (fd_file(f)->f_op != &bpf_token_fops) {
 		fdput(f);
 		return ERR_PTR(-EINVAL);
 	}
 
-	token = f.file->private_data;
+	token = fd_file(f)->private_data;
 	bpf_token_inc(token);
 	fdput(f);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..b96489277f70 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6901,10 +6901,10 @@ struct cgroup *cgroup_v1v2_get_from_fd(int fd)
 {
 	struct cgroup *cgrp;
 	struct fd f = fdget_raw(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	cgrp = cgroup_v1v2_get_from_file(f.file);
+	cgrp = cgroup_v1v2_get_from_file(fd_file(f));
 	fdput(f);
 	return cgrp;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aa3450bdc227..17b19d3e74ba 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -933,10 +933,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	struct fd f = fdget(fd);
 	int ret = 0;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	css = css_tryget_online_from_dir(f.file->f_path.dentry,
+	css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
 					 &perf_event_cgrp_subsys);
 	if (IS_ERR(css)) {
 		ret = PTR_ERR(css);
@@ -5898,10 +5898,10 @@ static const struct file_operations perf_fops;
 static inline int perf_fget_light(int fd, struct fd *p)
 {
 	struct fd f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (f.file->f_op != &perf_fops) {
+	if (fd_file(f)->f_op != &perf_fops) {
 		fdput(f);
 		return -EBADF;
 	}
@@ -5961,7 +5961,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 			ret = perf_fget_light(arg, &output);
 			if (ret)
 				return ret;
-			output_event = output.file->private_data;
+			output_event = fd_file(output)->private_data;
 			ret = perf_event_set_output(event, output_event);
 			fdput(output);
 		} else {
@@ -12549,7 +12549,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		err = perf_fget_light(group_fd, &group);
 		if (err)
 			goto err_fd;
-		group_leader = group.file->private_data;
+		group_leader = fd_file(group)->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d9592195c5bb..6ed334eecc14 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3211,7 +3211,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
 		return -EINVAL;
 
 	f = fdget(fd);
-	err = idempotent_init_module(f.file, uargs, flags);
+	err = idempotent_init_module(fd_file(f), uargs, flags);
 	fdput(f);
 	return err;
 }
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 6ec3deec68c2..dc952c3b05af 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -550,15 +550,15 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
 	struct nsset nsset = {};
 	int err = 0;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	if (proc_ns_file(f.file)) {
-		ns = get_proc_ns(file_inode(f.file));
+	if (proc_ns_file(fd_file(f))) {
+		ns = get_proc_ns(file_inode(fd_file(f)));
 		if (flags && (ns->ops->type != flags))
 			err = -EINVAL;
 		flags = ns->ops->type;
-	} else if (!IS_ERR(pidfd_pid(f.file))) {
+	} else if (!IS_ERR(pidfd_pid(fd_file(f)))) {
 		err = check_setns_flags(flags);
 	} else {
 		err = -EINVAL;
@@ -570,10 +570,10 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
 	if (err)
 		goto out;
 
-	if (proc_ns_file(f.file))
+	if (proc_ns_file(fd_file(f)))
 		err = validate_ns(&nsset, ns);
 	else
-		err = validate_nsset(&nsset, pidfd_pid(f.file));
+		err = validate_nsset(&nsset, pidfd_pid(fd_file(f)));
 	if (!err) {
 		commit_nsset(&nsset);
 		perf_event_namespaces(current);
diff --git a/kernel/pid.c b/kernel/pid.c
index da76ed1873f7..2715afb77eab 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -540,13 +540,13 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
 	struct pid *pid;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	pid = pidfd_pid(f.file);
+	pid = pidfd_pid(fd_file(f));
 	if (!IS_ERR(pid)) {
 		get_pid(pid);
-		*flags = f.file->f_flags;
+		*flags = fd_file(f)->f_flags;
 	}
 
 	fdput(f);
@@ -755,10 +755,10 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
 		return -EINVAL;
 
 	f = fdget(pidfd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	pid = pidfd_pid(f.file);
+	pid = pidfd_pid(fd_file(f));
 	if (IS_ERR(pid))
 		ret = PTR_ERR(pid);
 	else
diff --git a/kernel/signal.c b/kernel/signal.c
index 60c737e423a1..cc5d87cfa7c0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3922,11 +3922,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		return -EINVAL;
 
 	f = fdget(pidfd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	/* Is this a pidfd? */
-	pid = pidfd_to_pid(f.file);
+	pid = pidfd_to_pid(fd_file(f));
 	if (IS_ERR(pid)) {
 		ret = PTR_ERR(pid);
 		goto err;
@@ -3939,7 +3939,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	switch (flags) {
 	case 0:
 		/* Infer scope from the type of pidfd. */
-		if (f.file->f_flags & PIDFD_THREAD)
+		if (fd_file(f)->f_flags & PIDFD_THREAD)
 			type = PIDTYPE_PID;
 		else
 			type = PIDTYPE_TGID;
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a2df1bd9f64..a4be1e568ff5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1916,10 +1916,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	int err;
 
 	exe = fdget(fd);
-	if (!exe.file)
+	if (!fd_file(exe))
 		return -EBADF;
 
-	inode = file_inode(exe.file);
+	inode = file_inode(fd_file(exe));
 
 	/*
 	 * Because the original mm->exe_file points to executable file, make
@@ -1927,14 +1927,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	 * overall picture.
 	 */
 	err = -EACCES;
-	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+	if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
 		goto exit;
 
-	err = file_permission(exe.file, MAY_EXEC);
+	err = file_permission(fd_file(exe), MAY_EXEC);
 	if (err)
 		goto exit;
 
-	err = replace_mm_exe_file(mm, exe.file);
+	err = replace_mm_exe_file(mm, fd_file(exe));
 exit:
 	fdput(exe);
 	return err;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4354ea231fab..0700f40c53ac 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -419,7 +419,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 
 	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return 0;
 
 	size = nla_total_size(sizeof(struct cgroupstats));
@@ -440,7 +440,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	stats = nla_data(na);
 	memset(stats, 0, sizeof(*stats));
 
-	rc = cgroupstats_build(stats, f.file->f_path.dentry);
+	rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry);
 	if (rc < 0) {
 		nlmsg_free(rep_skb);
 		goto err;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 03b90d7d2175..d36242fd4936 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -666,8 +666,8 @@ struct watch_queue *get_watch_queue(int fd)
 	struct fd f;
 
 	f = fdget(fd);
-	if (f.file) {
-		pipe = get_pipe_info(f.file, false);
+	if (fd_file(f)) {
+		pipe = get_pipe_info(fd_file(f), false);
 		if (pipe && pipe->watch_queue) {
 			wqueue = pipe->watch_queue;
 			kref_get(&wqueue->usage);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 6c39d42f16dc..532dee205c6e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -193,10 +193,10 @@ int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	struct fd f = fdget(fd);
 	int ret;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
-	ret = vfs_fadvise(f.file, offset, len, advice);
+	ret = vfs_fadvise(fd_file(f), offset, len, advice);
 
 	fdput(f);
 	return ret;
diff --git a/mm/filemap.c b/mm/filemap.c
index d62150418b91..0b5cbd644fdd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4393,7 +4393,7 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
 	struct cachestat cs;
 	pgoff_t first_index, last_index;
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	if (copy_from_user(&csr, cstat_range,
@@ -4403,7 +4403,7 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
 	}
 
 	/* hugetlbfs is not supported */
-	if (is_file_hugepages(f.file)) {
+	if (is_file_hugepages(fd_file(f))) {
 		fdput(f);
 		return -EOPNOTSUPP;
 	}
@@ -4417,7 +4417,7 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
 	last_index =
 		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
 	memset(&cs, 0, sizeof(struct cachestat));
-	mapping = f.file->f_mapping;
+	mapping = fd_file(f)->f_mapping;
 	filemap_cachestat(mapping, first_index, last_index, &cs);
 	fdput(f);
 
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 2aeea4d8bf8e..da626ddccffd 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1857,26 +1857,26 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	INIT_WORK(&event->remove, memcg_event_remove);
 
 	efile = fdget(efd);
-	if (!efile.file) {
+	if (!fd_file(efile)) {
 		ret = -EBADF;
 		goto out_kfree;
 	}
 
-	event->eventfd = eventfd_ctx_fileget(efile.file);
+	event->eventfd = eventfd_ctx_fileget(fd_file(efile));
 	if (IS_ERR(event->eventfd)) {
 		ret = PTR_ERR(event->eventfd);
 		goto out_put_efile;
 	}
 
 	cfile = fdget(cfd);
-	if (!cfile.file) {
+	if (!fd_file(cfile)) {
 		ret = -EBADF;
 		goto out_put_eventfd;
 	}
 
 	/* the process need read permission on control file */
 	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = file_permission(cfile.file, MAY_READ);
+	ret = file_permission(fd_file(cfile), MAY_READ);
 	if (ret < 0)
 		goto out_put_cfile;
 
@@ -1884,7 +1884,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	 * The control file must be a regular cgroup1 file. As a regular cgroup
 	 * file can't be renamed, it's safe to access its name afterwards.
 	 */
-	cdentry = cfile.file->f_path.dentry;
+	cdentry = fd_file(cfile)->f_path.dentry;
 	if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
 		ret = -EINVAL;
 		goto out_put_cfile;
@@ -1936,7 +1936,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	if (ret)
 		goto out_put_css;
 
-	vfs_poll(efile.file, &event->pt);
+	vfs_poll(fd_file(efile), &event->pt);
 
 	spin_lock_irq(&memcg->event_list_lock);
 	list_add(&event->list, &memcg->event_list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 517c0be7ce66..e83fe1c6e5ac 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -646,7 +646,7 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 
 	ret = -EBADF;
 	f = fdget(fd);
-	if (!f.file || !(f.file->f_mode & FMODE_READ))
+	if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
 		goto out;
 
 	/*
@@ -655,12 +655,12 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 	 * on this file, then we must return -EINVAL.
 	 */
 	ret = -EINVAL;
-	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
-	    (!S_ISREG(file_inode(f.file)->i_mode) &&
-	    !S_ISBLK(file_inode(f.file)->i_mode)))
+	if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
+	    (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
+	    !S_ISBLK(file_inode(fd_file(f))->i_mode)))
 		goto out;
 
-	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
+	ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
 out:
 	fdput(f);
 	return ret;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6a823ba906c6..18b7c8f31234 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -711,11 +711,11 @@ struct net *get_net_ns_by_fd(int fd)
 	struct fd f = fdget(fd);
 	struct net *net = ERR_PTR(-EINVAL);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return ERR_PTR(-EBADF);
 
-	if (proc_ns_file(f.file)) {
-		struct ns_common *ns = get_proc_ns(file_inode(f.file));
+	if (proc_ns_file(fd_file(f))) {
+		struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
 		if (ns->ops == &netns_operations)
 			net = get_net(container_of(ns, struct net, ns));
 	}
diff --git a/net/socket.c b/net/socket.c
index fcbdd5bc47ac..f77a42a74510 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -556,8 +556,8 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 	struct socket *sock;
 
 	*err = -EBADF;
-	if (f.file) {
-		sock = sock_from_file(f.file);
+	if (fd_file(f)) {
+		sock = sock_from_file(fd_file(f));
 		if (likely(sock)) {
 			*fput_needed = f.flags & FDPUT_FPUT;
 			return sock;
@@ -2008,8 +2008,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 	struct fd f;
 
 	f = fdget(fd);
-	if (f.file) {
-		ret = __sys_accept4_file(f.file, upeer_sockaddr,
+	if (fd_file(f)) {
+		ret = __sys_accept4_file(fd_file(f), upeer_sockaddr,
 					 upeer_addrlen, flags);
 		fdput(f);
 	}
@@ -2070,12 +2070,12 @@ int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
 	struct fd f;
 
 	f = fdget(fd);
-	if (f.file) {
+	if (fd_file(f)) {
 		struct sockaddr_storage address;
 
 		ret = move_addr_to_kernel(uservaddr, addrlen, &address);
 		if (!ret)
-			ret = __sys_connect_file(f.file, &address, addrlen, 0);
+			ret = __sys_connect_file(fd_file(f), &address, addrlen, 0);
 		fdput(f);
 	}
 
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index f04f43af651c..e7c1d3ae33fe 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -1068,10 +1068,10 @@ void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
 		return;
 
 	f = fdget(kernel_fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return;
 
-	process_buffer_measurement(file_mnt_idmap(f.file), file_inode(f.file),
+	process_buffer_measurement(file_mnt_idmap(fd_file(f)), file_inode(fd_file(f)),
 				   buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
 				   NULL, false, NULL, 0);
 	fdput(f);
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index ccc8bc6c1584..00b63971ab64 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -238,19 +238,19 @@ static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
 	struct landlock_ruleset *ruleset;
 
 	ruleset_f = fdget(fd);
-	if (!ruleset_f.file)
+	if (!fd_file(ruleset_f))
 		return ERR_PTR(-EBADF);
 
 	/* Checks FD type and access right. */
-	if (ruleset_f.file->f_op != &ruleset_fops) {
+	if (fd_file(ruleset_f)->f_op != &ruleset_fops) {
 		ruleset = ERR_PTR(-EBADFD);
 		goto out_fdput;
 	}
-	if (!(ruleset_f.file->f_mode & mode)) {
+	if (!(fd_file(ruleset_f)->f_mode & mode)) {
 		ruleset = ERR_PTR(-EPERM);
 		goto out_fdput;
 	}
-	ruleset = ruleset_f.file->private_data;
+	ruleset = fd_file(ruleset_f)->private_data;
 	if (WARN_ON_ONCE(ruleset->num_layers != 1)) {
 		ruleset = ERR_PTR(-EINVAL);
 		goto out_fdput;
@@ -277,22 +277,22 @@ static int get_path_from_fd(const s32 fd, struct path *const path)
 
 	/* Handles O_PATH. */
 	f = fdget_raw(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 	/*
 	 * Forbids ruleset FDs, internal filesystems (e.g. nsfs), including
 	 * pseudo filesystems that will never be mountable (e.g. sockfs,
 	 * pipefs).
 	 */
-	if ((f.file->f_op == &ruleset_fops) ||
-	    (f.file->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
-	    (f.file->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
-	    d_is_negative(f.file->f_path.dentry) ||
-	    IS_PRIVATE(d_backing_inode(f.file->f_path.dentry))) {
+	if ((fd_file(f)->f_op == &ruleset_fops) ||
+	    (fd_file(f)->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
+	    (fd_file(f)->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
+	    d_is_negative(fd_file(f)->f_path.dentry) ||
+	    IS_PRIVATE(d_backing_inode(fd_file(f)->f_path.dentry))) {
 		err = -EBADFD;
 		goto out_fdput;
 	}
-	*path = f.file->f_path;
+	*path = fd_file(f)->f_path;
 	path_get(path);
 
 out_fdput:
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 93fd4d47b334..02144ec39f43 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -296,7 +296,7 @@ static int read_trusted_verity_root_digests(unsigned int fd)
 		return -EPERM;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EINVAL;
 
 	data = kzalloc(SZ_4K, GFP_KERNEL);
@@ -305,7 +305,7 @@ static int read_trusted_verity_root_digests(unsigned int fd)
 		goto err;
 	}
 
-	rc = kernel_read_file(f.file, 0, (void **)&data, SZ_4K - 1, NULL, READING_POLICY);
+	rc = kernel_read_file(fd_file(f), 0, (void **)&data, SZ_4K - 1, NULL, READING_POLICY);
 	if (rc < 0)
 		goto err;
 
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 4057f9f10aee..cbb9c972cb93 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2250,12 +2250,12 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)
 	bool nonatomic = substream->pcm->nonatomic;
 	CLASS(fd, f)(fd);
 
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADFD;
-	if (!is_pcm_file(f.file))
+	if (!is_pcm_file(fd_file(f)))
 		return -EBADFD;
 
-	pcm_file = f.file->private_data;
+	pcm_file = fd_file(f)->private_data;
 	substream1 = pcm_file->substream;
 
 	if (substream == substream1)
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 229570059a1b..65efb3735e79 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -327,12 +327,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
 
 	f = fdget(args->fd);
-	if (!f.file) {
+	if (!fd_file(f)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	eventfd = eventfd_ctx_fileget(f.file);
+	eventfd = eventfd_ctx_fileget(fd_file(f));
 	if (IS_ERR(eventfd)) {
 		ret = PTR_ERR(eventfd);
 		goto fail;
@@ -419,7 +419,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered, and trigger it as if we didn't miss it.
 	 */
-	events = vfs_poll(f.file, &irqfd->pt);
+	events = vfs_poll(fd_file(f), &irqfd->pt);
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 76b7f6085dcd..388ae471d258 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -194,7 +194,7 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
 	int ret;
 
 	f = fdget(fd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	ret = -ENOENT;
@@ -202,7 +202,7 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
 	mutex_lock(&kv->lock);
 
 	list_for_each_entry(kvf, &kv->file_list, node) {
-		if (kvf->file != f.file)
+		if (kvf->file != fd_file(f))
 			continue;
 
 		list_del(&kvf->node);
@@ -240,7 +240,7 @@ static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
 		return -EFAULT;
 
 	f = fdget(param.groupfd);
-	if (!f.file)
+	if (!fd_file(f))
 		return -EBADF;
 
 	ret = -ENOENT;
@@ -248,7 +248,7 @@ static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
 	mutex_lock(&kv->lock);
 
 	list_for_each_entry(kvf, &kv->file_list, node) {
-		if (kvf->file != f.file)
+		if (kvf->file != fd_file(f))
 			continue;
 
 		if (!kvf->iommu_group) {

From 88a2f6468d013ca1163490dbddfc95135d1c27a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 15:45:12 -0400
Subject: [PATCH 345/346] struct fd: representation change

	We want the compiler to see that fdput() on empty instance
is a no-op.  The emptiness check is that file reference is NULL,
while fdput() is "fput() if FDPUT_FPUT is present in flags".
The reason why fdput() on empty instance is a no-op is something
compiler can't see - it's that we never generate instances with
NULL file reference combined with non-zero flags.

	It's not that hard to deal with - the real primitives behind
fdget() et.al. are returning an unsigned long value, unpacked by (inlined)
__to_fd() into the current struct file * + int.  The lower bits are
used to store flags, while the rest encodes the pointer.  Linus suggested
that keeping this unsigned long around with the extractions done by inlined
accessors should generate a sane code and that turns out to be the case.
Namely, turning struct fd into a struct-wrapped unsinged long, with
        fd_empty(f) => unlikely(f.word == 0)
	fd_file(f) => (struct file *)(f.word & ~3)
	fdput(f) => if (f.word & 1) fput(fd_file(f))
ends up with compiler doing the right thing.  The cost is the patch
footprint, of course - we need to switch f.file to fd_file(f) all over
the tree, and it's not doable with simple search and replace; there are
false positives, etc.

	Note that the sole member of that structure is an opaque
unsigned long - all accesses should be done via wrappers and I don't
want to use a name that would invite manual casts to file pointers,
etc.  The value of that member is equal either to (unsigned long)p | flags,
p being an address of some struct file instance, or to 0 for an empty fd.

	For now the new predicate (fd_empty(f)) has no users; all the
existing checks have form (!fd_file(f)).  We will convert to fd_empty()
use later; here we only define it (and tell the compiler that it's
unlikely to return true).

	This commit only deals with representation change; there will
be followups.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/infiniband/core/uverbs_cmd.c |  2 +-
 fs/overlayfs/file.c                  | 28 +++++++++++++++-------------
 fs/xfs/xfs_handle.c                  |  2 +-
 include/linux/file.h                 | 22 ++++++++++++++++------
 kernel/events/core.c                 |  2 +-
 net/socket.c                         |  2 +-
 6 files changed, 35 insertions(+), 23 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 3f85575cf971..a4cce360df21 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -572,7 +572,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 	struct inode                   *inode = NULL;
 	int				new_xrcd = 0;
 	struct ib_device *ib_dev;
-	struct fd f = {};
+	struct fd f = EMPTY_FD;
 	int ret;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index c4963d0c5549..2b7a5a3a7a2f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -93,11 +93,11 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
 			       bool allow_meta)
 {
 	struct dentry *dentry = file_dentry(file);
+	struct file *realfile = file->private_data;
 	struct path realpath;
 	int err;
 
-	real->flags = 0;
-	real->file = file->private_data;
+	real->word = (unsigned long)realfile;
 
 	if (allow_meta) {
 		ovl_path_real(dentry, &realpath);
@@ -113,16 +113,17 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
 		return -EIO;
 
 	/* Has it been copied up since we'd opened it? */
-	if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) {
-		real->flags = FDPUT_FPUT;
-		real->file = ovl_open_realfile(file, &realpath);
-
-		return PTR_ERR_OR_ZERO(real->file);
+	if (unlikely(file_inode(realfile) != d_inode(realpath.dentry))) {
+		struct file *f = ovl_open_realfile(file, &realpath);
+		if (IS_ERR(f))
+			return PTR_ERR(f);
+		real->word = (unsigned long)ovl_open_realfile(file, &realpath) | FDPUT_FPUT;
+		return 0;
 	}
 
 	/* Did the flags change since open? */
-	if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
-		return ovl_change_flags(real->file, file->f_flags);
+	if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS))
+		return ovl_change_flags(realfile, file->f_flags);
 
 	return 0;
 }
@@ -130,10 +131,11 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
 static int ovl_real_fdget(const struct file *file, struct fd *real)
 {
 	if (d_is_dir(file_dentry(file))) {
-		real->flags = 0;
-		real->file = ovl_dir_real_file(file, false);
-
-		return PTR_ERR_OR_ZERO(real->file);
+		struct file *f = ovl_dir_real_file(file, false);
+		if (IS_ERR(f))
+			return PTR_ERR(f);
+		real->word = (unsigned long)f;
+		return 0;
 	}
 
 	return ovl_real_fdget_meta(file, real, false);
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index 7bcc4f519cb8..49e5e5f04e60 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -85,7 +85,7 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
-	struct fd		f = {NULL};
+	struct fd		f = EMPTY_FD;
 	struct path		path;
 	int			error;
 	struct xfs_inode	*ip;
diff --git a/include/linux/file.h b/include/linux/file.h
index 0f3f369f2450..eb28469b1c16 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -35,18 +35,28 @@ static inline void fput_light(struct file *file, int fput_needed)
 		fput(file);
 }
 
+/* either a reference to struct file + flags
+ * (cloned vs. borrowed, pos locked), with
+ * flags stored in lower bits of value,
+ * or empty (represented by 0).
+ */
 struct fd {
-	struct file *file;
-	unsigned int flags;
+	unsigned long word;
 };
 #define FDPUT_FPUT       1
 #define FDPUT_POS_UNLOCK 2
 
-#define fd_file(f) ((f).file)
+#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
+static inline bool fd_empty(struct fd f)
+{
+	return unlikely(!f.word);
+}
+
+#define EMPTY_FD (struct fd){0}
 
 static inline void fdput(struct fd fd)
 {
-	if (fd.flags & FDPUT_FPUT)
+	if (fd.word & FDPUT_FPUT)
 		fput(fd_file(fd));
 }
 
@@ -60,7 +70,7 @@ extern void __f_unlock_pos(struct file *);
 
 static inline struct fd __to_fd(unsigned long v)
 {
-	return (struct fd){(struct file *)(v & ~3),v & 3};
+	return (struct fd){v};
 }
 
 static inline struct fd fdget(unsigned int fd)
@@ -80,7 +90,7 @@ static inline struct fd fdget_pos(int fd)
 
 static inline void fdput_pos(struct fd f)
 {
-	if (f.flags & FDPUT_POS_UNLOCK)
+	if (f.word & FDPUT_POS_UNLOCK)
 		__f_unlock_pos(fd_file(f));
 	fdput(f);
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 17b19d3e74ba..fd2ac9c7fd77 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -12474,7 +12474,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
-	struct fd group = {NULL, 0};
+	struct fd group = EMPTY_FD;
 	struct task_struct *task = NULL;
 	struct pmu *pmu;
 	int event_fd;
diff --git a/net/socket.c b/net/socket.c
index f77a42a74510..c0d4f5032374 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -559,7 +559,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
 	if (fd_file(f)) {
 		sock = sock_from_file(fd_file(f));
 		if (likely(sock)) {
-			*fput_needed = f.flags & FDPUT_FPUT;
+			*fput_needed = f.word & FDPUT_FPUT;
 			return sock;
 		}
 		*err = -ENOTSOCK;

From de12c3391bce10504c0e7bd767516c74110cfce1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 31 May 2024 16:34:25 -0400
Subject: [PATCH 346/346] add struct fd constructors, get rid of __to_fd()

	Make __fdget() et.al. return struct fd directly.
New helpers: BORROWED_FD(file) and CLONED_FD(file), for
borrowed and cloned file references resp.

	NOTE: this might need tuning; in particular, inline on
__fget_light() is there to keep the code generation same as
before - we probably want to keep it inlined in fdget() et.al.
(especially so in fdget_pos()), but that needs profiling.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c            | 26 +++++++++++++-------------
 include/linux/file.h | 33 +++++++++++----------------------
 2 files changed, 24 insertions(+), 35 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index a3b72aa64f11..6f5ed4daafd2 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1128,7 +1128,7 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
  * The fput_needed flag returned by fget_light should be passed to the
  * corresponding fput_light.
  */
-static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
 {
 	struct files_struct *files = current->files;
 	struct file *file;
@@ -1145,22 +1145,22 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
 	if (likely(atomic_read_acquire(&files->count) == 1)) {
 		file = files_lookup_fd_raw(files, fd);
 		if (!file || unlikely(file->f_mode & mask))
-			return 0;
-		return (unsigned long)file;
+			return EMPTY_FD;
+		return BORROWED_FD(file);
 	} else {
 		file = __fget_files(files, fd, mask);
 		if (!file)
-			return 0;
-		return FDPUT_FPUT | (unsigned long)file;
+			return EMPTY_FD;
+		return CLONED_FD(file);
 	}
 }
-unsigned long __fdget(unsigned int fd)
+struct fd fdget(unsigned int fd)
 {
 	return __fget_light(fd, FMODE_PATH);
 }
-EXPORT_SYMBOL(__fdget);
+EXPORT_SYMBOL(fdget);
 
-unsigned long __fdget_raw(unsigned int fd)
+struct fd fdget_raw(unsigned int fd)
 {
 	return __fget_light(fd, 0);
 }
@@ -1181,16 +1181,16 @@ static inline bool file_needs_f_pos_lock(struct file *file)
 		(file_count(file) > 1 || file->f_op->iterate_shared);
 }
 
-unsigned long __fdget_pos(unsigned int fd)
+struct fd fdget_pos(unsigned int fd)
 {
-	unsigned long v = __fdget(fd);
-	struct file *file = (struct file *)(v & ~3);
+	struct fd f = fdget(fd);
+	struct file *file = fd_file(f);
 
 	if (file && file_needs_f_pos_lock(file)) {
-		v |= FDPUT_POS_UNLOCK;
+		f.word |= FDPUT_POS_UNLOCK;
 		mutex_lock(&file->f_pos_lock);
 	}
-	return v;
+	return f;
 }
 
 void __f_unlock_pos(struct file *f)
diff --git a/include/linux/file.h b/include/linux/file.h
index eb28469b1c16..aab1caff6713 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -53,6 +53,14 @@ static inline bool fd_empty(struct fd f)
 }
 
 #define EMPTY_FD (struct fd){0}
+static inline struct fd BORROWED_FD(struct file *f)
+{
+	return (struct fd){(unsigned long)f};
+}
+static inline struct fd CLONED_FD(struct file *f)
+{
+	return (struct fd){(unsigned long)f | FDPUT_FPUT};
+}
 
 static inline void fdput(struct fd fd)
 {
@@ -63,30 +71,11 @@ static inline void fdput(struct fd fd)
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_raw(unsigned int fd);
 extern struct file *fget_task(struct task_struct *task, unsigned int fd);
-extern unsigned long __fdget(unsigned int fd);
-extern unsigned long __fdget_raw(unsigned int fd);
-extern unsigned long __fdget_pos(unsigned int fd);
 extern void __f_unlock_pos(struct file *);
 
-static inline struct fd __to_fd(unsigned long v)
-{
-	return (struct fd){v};
-}
-
-static inline struct fd fdget(unsigned int fd)
-{
-	return __to_fd(__fdget(fd));
-}
-
-static inline struct fd fdget_raw(unsigned int fd)
-{
-	return __to_fd(__fdget_raw(fd));
-}
-
-static inline struct fd fdget_pos(int fd)
-{
-	return __to_fd(__fdget_pos(fd));
-}
+struct fd fdget(unsigned int fd);
+struct fd fdget_raw(unsigned int fd);
+struct fd fdget_pos(unsigned int fd);
 
 static inline void fdput_pos(struct fd f)
 {