From 6c1bfbd9df8c42a1156b9f4769ca78dfc5e89045 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 13 Oct 2020 13:55:01 +0800 Subject: [PATCH 001/395] MIPS: Loongson64: Add /sys/firmware/lefi/boardinfo Add /sys/firmware/lefi/boardinfo to get mainboard and BIOS info easily on the Loongson platform, this is useful to point out the current used mainboard type and BIOS version when there exists problems related with hardware or firmware. The related structures are already defined in the interface specification about firmware and kernel which are common requirement and specific for Loongson64, so only add a new boardinfo.c file in arch/mips/loongson64. E.g. with this patch: [loongson@linux ~]$ cat /sys/firmware/lefi/boardinfo Board Info Manufacturer : LEMOTE Board Name : LEMOTE-LS3A4000-7A1000-1w-V01-pc Family : LOONGSON3 BIOS Info Vendor : Kunlun Version : Kunlun-A1901-V4.1.3-20200414093938 ROM Size : 4 KB Release Date : 2020-04-14 By the way, using dmidecode command can get the similar info if there exists SMBIOS in firmware, but the fact is that there is no SMBIOS on some machines, we can see nothing when execute dmidecode, like this: [root@linux loongson]# dmidecode # dmidecode 2.12 # No SMBIOS nor DMI entry point found, sorry. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- .../include/asm/mach-loongson64/boot_param.h | 4 ++ arch/mips/loongson64/Makefile | 1 + arch/mips/loongson64/boardinfo.c | 48 +++++++++++++++++++ arch/mips/loongson64/env.c | 10 ++++ 4 files changed, 63 insertions(+) create mode 100644 arch/mips/loongson64/boardinfo.c diff --git a/arch/mips/include/asm/mach-loongson64/boot_param.h b/arch/mips/include/asm/mach-loongson64/boot_param.h index afc92b7a61c6..4592841b6b0c 100644 --- a/arch/mips/include/asm/mach-loongson64/boot_param.h +++ b/arch/mips/include/asm/mach-loongson64/boot_param.h @@ -228,6 +228,10 @@ struct loongson_system_configuration { extern struct efi_memory_map_loongson *loongson_memmap; extern struct loongson_system_configuration loongson_sysconf; +extern struct board_devices *eboard; +extern struct interface_info *einter; +extern struct loongson_special_attribute *especial; + extern u32 node_id_offset; extern void ls7a_early_config(void); extern void rs780e_early_config(void); diff --git a/arch/mips/loongson64/Makefile b/arch/mips/loongson64/Makefile index 39c06f52b08f..cc76944b1a9d 100644 --- a/arch/mips/loongson64/Makefile +++ b/arch/mips/loongson64/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_RS780_HPET) += hpet.o obj-$(CONFIG_SUSPEND) += pm.o obj-$(CONFIG_PCI_QUIRKS) += vbios_quirk.o obj-$(CONFIG_CPU_LOONGSON3_CPUCFG_EMULATION) += cpucfg-emul.o +obj-$(CONFIG_SYSFS) += boardinfo.o diff --git a/arch/mips/loongson64/boardinfo.c b/arch/mips/loongson64/boardinfo.c new file mode 100644 index 000000000000..280989c5a137 --- /dev/null +++ b/arch/mips/loongson64/boardinfo.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +static ssize_t boardinfo_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + char board_manufacturer[64] = {0}; + char *tmp_board_manufacturer = board_manufacturer; + char bios_vendor[64] = {0}; + char *tmp_bios_vendor = bios_vendor; + + strcpy(board_manufacturer, eboard->name); + strcpy(bios_vendor, einter->description); + + return sprintf(buf, + "Board Info\n" + "Manufacturer\t\t: %s\n" + "Board Name\t\t: %s\n" + "Family\t\t\t: LOONGSON3\n\n" + "BIOS Info\n" + "Vendor\t\t\t: %s\n" + "Version\t\t\t: %s\n" + "ROM Size\t\t: %d KB\n" + "Release Date\t\t: %s\n", + strsep(&tmp_board_manufacturer, "-"), + eboard->name, + strsep(&tmp_bios_vendor, "-"), + einter->description, + einter->size, + especial->special_name); +} +static struct kobj_attribute boardinfo_attr = __ATTR(boardinfo, 0444, + boardinfo_show, NULL); + +static int __init boardinfo_init(void) +{ + struct kobject *lefi_kobj; + + lefi_kobj = kobject_create_and_add("lefi", firmware_kobj); + if (!lefi_kobj) { + pr_err("lefi: Firmware registration failed.\n"); + return -ENOMEM; + } + + return sysfs_create_file(lefi_kobj, &boardinfo_attr.attr); +} +late_initcall(boardinfo_init); diff --git a/arch/mips/loongson64/env.c b/arch/mips/loongson64/env.c index 134cb8e9efc2..51a5d050a94c 100644 --- a/arch/mips/loongson64/env.c +++ b/arch/mips/loongson64/env.c @@ -28,6 +28,10 @@ EXPORT_SYMBOL(cpu_clock_freq); struct efi_memory_map_loongson *loongson_memmap; struct loongson_system_configuration loongson_sysconf; +struct board_devices *eboard; +struct interface_info *einter; +struct loongson_special_attribute *especial; + u64 loongson_chipcfg[MAX_PACKAGES] = {0xffffffffbfc00180}; u64 loongson_chiptemp[MAX_PACKAGES]; u64 loongson_freqctrl[MAX_PACKAGES]; @@ -57,6 +61,12 @@ void __init prom_init_env(void) ((u64)loongson_p + loongson_p->system_offset); ecpu = (struct efi_cpuinfo_loongson *) ((u64)loongson_p + loongson_p->cpu_offset); + eboard = (struct board_devices *) + ((u64)loongson_p + loongson_p->boarddev_table_offset); + einter = (struct interface_info *) + ((u64)loongson_p + loongson_p->interface_offset); + especial = (struct loongson_special_attribute *) + ((u64)loongson_p + loongson_p->special_offset); eirq_source = (struct irq_source_routing_table *) ((u64)loongson_p + loongson_p->irq_offset); loongson_memmap = (struct efi_memory_map_loongson *) From 673a8c4f8517ea624cd838f7f2246b6990190770 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 13 Oct 2020 13:55:02 +0800 Subject: [PATCH 002/395] Documentation: ABI: Add /sys/firmware/lefi/boardinfo description for Loongson64 Add a description for /sys/firmware/lefi/boardinfo on the Loongson platform. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- .../ABI/testing/sysfs-firmware-lefi-boardinfo | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-lefi-boardinfo diff --git a/Documentation/ABI/testing/sysfs-firmware-lefi-boardinfo b/Documentation/ABI/testing/sysfs-firmware-lefi-boardinfo new file mode 100644 index 000000000000..5e3f6148c52e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-lefi-boardinfo @@ -0,0 +1,35 @@ +What: /sys/firmware/lefi/boardinfo +Date: October 2020 +Contact: Tiezhu Yang +Description: + Get mainboard and BIOS info easily on the Loongson platform, + this is useful to point out the current used mainboard type + and BIOS version when there exists problems related with + hardware or firmware. + + The related structures are already defined in the interface + specification about firmware and kernel which are common + requirement and specific for Loongson64, so only add a new + boardinfo.c file in arch/mips/loongson64. + + For example: + + [loongson@linux ~]$ cat /sys/firmware/lefi/boardinfo + Board Info + Manufacturer : LEMOTE + Board Name : LEMOTE-LS3A4000-7A1000-1w-V01-pc + Family : LOONGSON3 + + BIOS Info + Vendor : Kunlun + Version : Kunlun-A1901-V4.1.3-20200414093938 + ROM Size : 4 KB + Release Date : 2020-04-14 + + By the way, using dmidecode command can get the similar info if there + exists SMBIOS in firmware, but the fact is that there is no SMBIOS on + some machines, we can see nothing when execute dmidecode, like this: + + [root@linux loongson]# dmidecode + # dmidecode 2.12 + # No SMBIOS nor DMI entry point found, sorry. From ab7cffb8d2367e5b088c7c14452724e719a10eba Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 26 Oct 2020 20:20:40 +0100 Subject: [PATCH 003/395] MIPS: ingenic: remove unused platform_data header file There are no users of this headers file in the kernel. All users are likely migrated to device tree which is a good thing. Signed-off-by: Sam Ravnborg Cc: Thomas Bogendoerfer Cc: Paul Cercueil Cc: Harvey Hunt Cc: linux-mips@vger.kernel.org Reviewed-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- .../linux/platform_data/jz4740/jz4740_nand.h | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 include/linux/platform_data/jz4740/jz4740_nand.h diff --git a/include/linux/platform_data/jz4740/jz4740_nand.h b/include/linux/platform_data/jz4740/jz4740_nand.h deleted file mode 100644 index b3f066d63059..000000000000 --- a/include/linux/platform_data/jz4740/jz4740_nand.h +++ /dev/null @@ -1,25 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009-2010, Lars-Peter Clausen - * JZ4740 SoC NAND controller driver - */ - -#ifndef __JZ4740_NAND_H__ -#define __JZ4740_NAND_H__ - -#include -#include - -#define JZ_NAND_NUM_BANKS 4 - -struct jz_nand_platform_data { - int num_partitions; - struct mtd_partition *partitions; - - unsigned char banks[JZ_NAND_NUM_BANKS]; - - void (*ident_callback)(struct platform_device *, struct mtd_info *, - struct mtd_partition **, int *num_partitions); -}; - -#endif From cbb5262192d9a367d89d24e54388f54069ffd2b8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:38 +0200 Subject: [PATCH 004/395] audit: fix a kernel-doc markup typo: kauditd_print_skb -> kauditd_printk_skb Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit.c b/kernel/audit.c index 68cee3bc8cfe..0be42cac086b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -523,7 +523,7 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net) } /** - * kauditd_print_skb - Print the audit record to the ring buffer + * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection From 6d915476e67d99b73a57bceb83cff1cf153d8bf6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Sep 2020 08:44:50 -0400 Subject: [PATCH 005/395] audit: trigger accompanying records when no rules present When there are no audit rules registered, mandatory records (config, etc.) are missing their accompanying records (syscall, proctitle, etc.). This is due to audit context dummy set on syscall entry based on absence of rules that signals that no other records are to be printed. Clear the dummy bit if any record is generated, open coding this in audit_log_start(). The proctitle context and dummy checks are pointless since the proctitle record will not be printed if no syscall records are printed. The fds array is reset to -1 after the first syscall to indicate it isn't valid any more, but was never set to -1 when the context was allocated to indicate it wasn't yet valid. Check ctx->pwd in audit_log_name(). The audit_inode* functions can be called without going through getname_flags() or getname_kernel() that sets audit_names and cwd, so set the cwd in audit_alloc_name() if it has not already been done so due to audit_names being valid and purge all other audit_getcwd() calls. Revert the LSM dump_common_audit_data() LSM_AUDIT_DATA_* cases from the ghak96 patch since they are no longer necessary due to cwd coverage in audit_alloc_name(). Thanks to bauen1 for reporting LSM situations in which context->cwd is not valid, inadvertantly fixed by the ghak96 patch. Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/120 This is also related to upstream github issue https://github.com/linux-audit/audit-kernel/issues/96 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 8 -------- kernel/audit.c | 3 +++ kernel/auditsc.c | 27 +++++++-------------------- security/lsm_audit.c | 5 ----- 4 files changed, 10 insertions(+), 33 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index b3d859831a31..82b7c1116a85 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -292,7 +292,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); -extern void __audit_getcwd(void); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -351,11 +350,6 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_getcwd(void) -{ - if (unlikely(audit_context())) - __audit_getcwd(); -} static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { @@ -584,8 +578,6 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } -static inline void audit_getcwd(void) -{ } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) diff --git a/kernel/audit.c b/kernel/audit.c index 0be42cac086b..ac0aeaa99937 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1865,6 +1865,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); + /* cancel dummy context to enable supporting records */ + if (ctx) + ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..183d79cc2e12 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -929,6 +929,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); + context->fds[0] = -1; return context; } @@ -1367,7 +1368,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); + if (context->pwd.dentry && context->pwd.mnt) + audit_log_d_path(ab, " name=", &context->pwd); + else + audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ @@ -1435,9 +1439,6 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; - if (!context || context->dummy) - return; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1866,6 +1867,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); return aname; } @@ -1894,20 +1897,6 @@ __audit_reusename(const __user char *uptr) return NULL; } -inline void _audit_getcwd(struct audit_context *context) -{ - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); -} - -void __audit_getcwd(void) -{ - struct audit_context *context = audit_context(); - - if (context->in_syscall) - _audit_getcwd(context); -} - /** * __audit_getname - add a name to the list * @name: name to add @@ -1931,8 +1920,6 @@ void __audit_getname(struct filename *name) n->name_len = AUDIT_NAME_FULL; name->aname = n; name->refcnt++; - - _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..221370794d14 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -241,7 +241,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_FILE: { @@ -255,7 +254,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_IOCTL_OP: { @@ -271,7 +269,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); - audit_getcwd(); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -286,7 +283,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_INODE: { @@ -304,7 +300,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); - audit_getcwd(); break; } case LSM_AUDIT_DATA_TASK: { From 83370b31a915493231e5b9addc72e4bef69f8d31 Mon Sep 17 00:00:00 2001 From: Tianyue Ren Date: Fri, 9 Oct 2020 09:36:30 +0800 Subject: [PATCH 006/395] selinux: fix error initialization in inode_doinit_with_dentry() Mark the inode security label as invalid if we cannot find a dentry so that we will retry later rather than marking it initialized with the unlabeled SID. Fixes: 9287aed2ad1f ("selinux: Convert isec->lock into a spinlock") Signed-off-by: Tianyue Ren [PM: minor comment tweaks] Signed-off-by: Paul Moore --- security/selinux/hooks.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6b1826fc3658..158fc47d8620 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1451,7 +1451,13 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ - goto out; + isec->initialized = LABEL_INVALID; + /* + * There is nothing useful to jump to the "out" + * label, except a needless spin lock/unlock + * cycle. + */ + return 0; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, @@ -1507,8 +1513,15 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent * inode_doinit() with a dentry, before these inodes * could be used again by userspace. */ - if (!dentry) - goto out; + if (!dentry) { + isec->initialized = LABEL_INVALID; + /* + * There is nothing useful to jump to the "out" + * label, except a needless spin lock/unlock + * cycle. + */ + return 0; + } rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { From 44141f58e14317853698f994ca5c3785a0c230d0 Mon Sep 17 00:00:00 2001 From: bauen1 Date: Fri, 9 Oct 2020 14:47:11 +0200 Subject: [PATCH 007/395] selinux: allow dontauditx and auditallowx rules to take effect without allowx This allows for dontauditing very specific ioctls e.g. TCGETS without dontauditing every ioctl or granting additional permissions. Now either an allowx, dontauditx or auditallowx rules enables checking for extended permissions. Signed-off-by: Jonathan Hettwer Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 9704c8a32303..597b79703584 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -596,9 +596,7 @@ void services_compute_xperms_drivers( node->datum.u.xperms->driver); } - /* If no ioctl commands are allowed, ignore auditallow and auditdeny */ - if (node->key.specified & AVTAB_XPERMS_ALLOWED) - xperms->len = 1; + xperms->len = 1; } /* From b000d5cb954fe25ac1ea929ae6da321033ace927 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 13 Oct 2020 10:18:04 +0200 Subject: [PATCH 008/395] ima: defer arch_ima_get_secureboot() call to IMA init time Chester reports that it is necessary to introduce a new way to pass the EFI secure boot status between the EFI stub and the core kernel on ARM systems. The usual way of obtaining this information is by checking the SecureBoot and SetupMode EFI variables, but this can only be done after the EFI variable workqueue is created, which occurs in a subsys_initcall(), whereas arch_ima_get_secureboot() is called much earlier by the IMA framework. However, the IMA framework itself is started as a late_initcall, and the only reason the call to arch_ima_get_secureboot() occurs so early is because it happens in the context of a __setup() callback that parses the ima_appraise= command line parameter. So let's refactor this code a little bit, by using a core_param() callback to capture the command line argument, and deferring any reasoning based on its contents to the IMA init routine. Cc: Chester Lin Cc: Dmitry Kasatkin Cc: James Morris Cc: "Serge E. Hallyn" Link: https://lore.kernel.org/linux-arm-kernel/20200904072905.25332-2-clin@suse.com/ Signed-off-by: Ard Biesheuvel Reported-by: kernel test robot [missing core_param()] [zohar@linux.ibm.com: included linux/module.h] Tested-by: Chester Lin Signed-off-by: Mimi Zohar --- include/linux/ima.h | 6 ++++++ security/integrity/ima/ima_appraise.c | 17 +++++++++++------ security/integrity/ima/ima_main.c | 1 + 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/ima.h b/include/linux/ima.h index 8fa7bcfb2da2..ac3d82f962f2 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -31,6 +31,12 @@ extern void ima_post_path_mknod(struct dentry *dentry); extern int ima_file_hash(struct file *file, char *buf, size_t buf_size); extern void ima_kexec_cmdline(int kernel_fd, const void *buf, int size); +#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM +extern void ima_appraise_parse_cmdline(void); +#else +static inline void ima_appraise_parse_cmdline(void) {} +#endif + #ifdef CONFIG_IMA_KEXEC extern void ima_add_kexec_buffer(struct kimage *image); #endif diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 3dd8c2e4314e..8361941ee0a1 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -5,6 +5,7 @@ * Author: * Mimi Zohar */ +#include #include #include #include @@ -16,12 +17,19 @@ #include "ima.h" -static int __init default_appraise_setup(char *str) -{ #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM +static char *ima_appraise_cmdline_default __initdata; +core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); + +void __init ima_appraise_parse_cmdline(void) +{ + const char *str = ima_appraise_cmdline_default; bool sb_state = arch_ima_get_secureboot(); int appraisal_state = ima_appraise; + if (!str) + return; + if (strncmp(str, "off", 3) == 0) appraisal_state = 0; else if (strncmp(str, "log", 3) == 0) @@ -42,11 +50,8 @@ static int __init default_appraise_setup(char *str) } else { ima_appraise = appraisal_state; } -#endif - return 1; } - -__setup("ima_appraise=", default_appraise_setup); +#endif /* * is_ima_appraise_enabled - return appraise status diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 2d1af8899cab..a962b23e0429 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -904,6 +904,7 @@ static int __init init_ima(void) { int error; + ima_appraise_parse_cmdline(); ima_init_template_list(); hash_setup(CONFIG_IMA_DEFAULT_HASH); error = ima_init(); From 200ea5a2292dc444a818b096ae6a32ba3caa51b9 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 3 Nov 2020 11:49:38 -0500 Subject: [PATCH 009/395] selinux: fix inode_doinit_with_dentry() LABEL_INVALID error handling A previous fix, commit 83370b31a915 ("selinux: fix error initialization in inode_doinit_with_dentry()"), changed how failures were handled before a SELinux policy was loaded. Unfortunately that patch was potentially problematic for two reasons: it set the isec->initialized state without holding a lock, and it didn't set the inode's SELinux label to the "default" for the particular filesystem. The later can be a problem if/when a later attempt to revalidate the inode fails and SELinux reverts to the existing inode label. This patch should restore the default inode labeling that existed before the original fix, without affecting the LABEL_INVALID marking such that revalidation will still be attempted in the future. Fixes: 83370b31a915 ("selinux: fix error initialization in inode_doinit_with_dentry()") Reported-by: Sven Schnelle Tested-by: Sven Schnelle Reviewed-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- security/selinux/hooks.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 158fc47d8620..c46312710e73 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1451,13 +1451,7 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent * inode_doinit with a dentry, before these inodes could * be used again by userspace. */ - isec->initialized = LABEL_INVALID; - /* - * There is nothing useful to jump to the "out" - * label, except a needless spin lock/unlock - * cycle. - */ - return 0; + goto out_invalid; } rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid, @@ -1513,15 +1507,8 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent * inode_doinit() with a dentry, before these inodes * could be used again by userspace. */ - if (!dentry) { - isec->initialized = LABEL_INVALID; - /* - * There is nothing useful to jump to the "out" - * label, except a needless spin lock/unlock - * cycle. - */ - return 0; - } + if (!dentry) + goto out_invalid; rc = selinux_genfs_get_sid(dentry, sclass, sbsec->flags, &sid); if (rc) { @@ -1546,11 +1533,10 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent out: spin_lock(&isec->lock); if (isec->initialized == LABEL_PENDING) { - if (!sid || rc) { + if (rc) { isec->initialized = LABEL_INVALID; goto out_unlock; } - isec->initialized = LABEL_INITIALIZED; isec->sid = sid; } @@ -1558,6 +1544,15 @@ static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dent out_unlock: spin_unlock(&isec->lock); return rc; + +out_invalid: + spin_lock(&isec->lock); + if (isec->initialized == LABEL_PENDING) { + isec->initialized = LABEL_INVALID; + isec->sid = sid; + } + spin_unlock(&isec->lock); + return 0; } /* Convert a Linux signal to an access vector. */ From d2e850e961835825492f37346ae76670bf4758f5 Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Mon, 26 Oct 2020 20:29:25 +0800 Subject: [PATCH 010/395] MIPS: zboot: put appended dtb into a section This will make a separated section for dtb appear in ELF, and we can then use objcopy to patch a dtb into vmlinuz when RAW_APPENDED_DTB is set in kernel config. command to patch a dtb: objcopy --set-section-flags=.appended_dtb=alloc,contents \ --update-section=.appended_dtb=.dtb vmlinuz Signed-off-by: Chuanhong Guo Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/compressed/ld.script | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/mips/boot/compressed/ld.script b/arch/mips/boot/compressed/ld.script index 2ed08fbef8e7..0ebb667274d6 100644 --- a/arch/mips/boot/compressed/ld.script +++ b/arch/mips/boot/compressed/ld.script @@ -31,9 +31,12 @@ SECTIONS CONSTRUCTORS . = ALIGN(16); } - __appended_dtb = .; - /* leave space for appended DTB */ - . += 0x100000; + + .appended_dtb : { + __appended_dtb = .; + /* leave space for appended DTB */ + . += 0x100000; + } _edata = .; /* End of data section */ From 18ff14c87d904f7968a143a6202740c27b4d8e7f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 27 Oct 2020 18:34:30 +0000 Subject: [PATCH 011/395] MIPS: Kconfig: fix a few trivial spelling mistakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a few spelling mistakes in the Kconfig, fix these. Signed-off-by: Colin Ian King Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2000bb2b0220..ddaff19a9580 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2645,7 +2645,7 @@ config WAR_R4600_V1_INDEX_ICACHEOP # 18. The CACHE instructions Hit_Writeback_Invalidate_D, Hit_Writeback_D, # Hit_Invalidate_D and Create_Dirty_Excl_D should only be # executed if there is no other dcache activity. If the dcache is -# accessed for another instruction immeidately preceding when these +# accessed for another instruction immediately preceding when these # cache instructions are executing, it is possible that the dcache # tag match outputs used by these cache instructions will be # incorrect. These cache instructions should be preceded by at least @@ -3086,7 +3086,7 @@ config MIPS_O32_FP64_SUPPORT Although binutils currently supports use of this flag the details concerning its effect upon the O32 ABI in userland are still being - worked on. In order to avoid userland becoming dependant upon current + worked on. In order to avoid userland becoming dependent upon current behaviour before the details have been finalised, this option should be considered experimental and only enabled by those working upon said details. @@ -3124,7 +3124,7 @@ choice objcopy --update-section .appended_dtb=.dtb vmlinux - This is meant as a backward compatiblity convenience for those + This is meant as a backward compatibility convenience for those systems with a bootloader that can't be upgraded to accommodate the documented boot protocol using a device tree. From 5a5aa912f687204d50455d0db36f94dd8de601c2 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Wed, 28 Oct 2020 17:15:45 +0800 Subject: [PATCH 012/395] mips: ar7: add missing iounmap() on error in ar7_gpio_init Add the missing iounmap() of gpch->regs before return from ar7_gpio_init() in the error handling case. Signed-off-by: Qinglang Miao Signed-off-by: Thomas Bogendoerfer --- arch/mips/ar7/gpio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/ar7/gpio.c b/arch/mips/ar7/gpio.c index 8b006addd6ba..ae0e01b9438f 100644 --- a/arch/mips/ar7/gpio.c +++ b/arch/mips/ar7/gpio.c @@ -319,6 +319,7 @@ int __init ar7_gpio_init(void) if (ret) { printk(KERN_ERR "%s: failed to add gpiochip\n", gpch->chip.label); + iounmap(gpch->regs); return ret; } printk(KERN_INFO "%s: registered %d GPIOs\n", From 2673ecf9586551c5bcee499c1cc1949f6f7cc9a1 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Wed, 28 Oct 2020 17:15:46 +0800 Subject: [PATCH 013/395] mips: cm: add missing iounmap() on error in mips_cm_probe() Add the missing iounmap() of iounmap(mips_gcr_base) before return from mips_cm_probe() in the error handling case. Signed-off-by: Qinglang Miao Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/mips-cm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/mips-cm.c b/arch/mips/kernel/mips-cm.c index f60af512c877..90f1c3df1f0e 100644 --- a/arch/mips/kernel/mips-cm.c +++ b/arch/mips/kernel/mips-cm.c @@ -266,6 +266,7 @@ int mips_cm_probe(void) if ((base_reg & CM_GCR_BASE_GCRBASE) != addr) { pr_err("GCRs appear to have been moved (expected them at 0x%08lx)!\n", (unsigned long)addr); + iounmap(mips_gcr_base); mips_gcr_base = NULL; return -ENODEV; } From c424dc4cd1809e181a161723f770b7bc1e0f9927 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Wed, 28 Oct 2020 17:15:48 +0800 Subject: [PATCH 014/395] mips: Vr41xx: add missing iounmap() on error in vr41xx_pciu_init() add missing iounmap() of pciu_base on error when failed to init io_map_base. Signed-off-by: Qinglang Miao Signed-off-by: Thomas Bogendoerfer --- arch/mips/pci/pci-vr41xx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/pci/pci-vr41xx.c b/arch/mips/pci/pci-vr41xx.c index 1fa4e101470c..4f250c55b6e6 100644 --- a/arch/mips/pci/pci-vr41xx.c +++ b/arch/mips/pci/pci-vr41xx.c @@ -293,8 +293,10 @@ static int __init vr41xx_pciu_init(void) master = setup->master_io; io_map_base = ioremap(master->bus_base_address, resource_size(res)); - if (!io_map_base) + if (!io_map_base) { + iounmap(pciu_base); return -EBUSY; + } vr41xx_pci_controller.io_map_base = (unsigned long)io_map_base; } From 53855e12588743ea128ee31f913d1c6e2f1d32c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Oct 2020 07:15:37 -0600 Subject: [PATCH 015/395] arc: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for arc. Cc: linux-snps-arc@lists.infradead.org Acked-by: Vineet Gupta Signed-off-by: Jens Axboe --- arch/arc/include/asm/thread_info.h | 4 +++- arch/arc/kernel/entry.S | 3 ++- arch/arc/kernel/signal.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index f9eef0e8f0b7..c0942c24d401 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -79,6 +79,7 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 15 /* syscall trace active */ /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -89,11 +90,12 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1< Date: Thu, 22 Oct 2020 20:09:23 -0600 Subject: [PATCH 016/395] arm64: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for arm64. Cc: linux-arm-kernel@lists.infradead.org Acked-by: Will Deacon Acked-by: Catalin Marinas Signed-off-by: Jens Axboe --- arch/arm64/include/asm/thread_info.h | 5 ++++- arch/arm64/kernel/signal.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1fbab854a51b..cdcf307764aa 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -68,6 +68,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -98,10 +99,12 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT) + _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | \ + _TIF_NOTIFY_SIGNAL) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index a8184cad8890..bec6ef69704f 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -942,7 +942,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, (void __user *)NULL, current); } - if (thread_flags & _TIF_SIGPENDING) + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) { From e660653cd9f2df470d156c249631f68b9dee51ee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:10:55 -0600 Subject: [PATCH 017/395] m68k: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for m68k. Cc: linux-m68k@lists.linux-m68k.org Acked-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- arch/m68k/include/asm/thread_info.h | 1 + arch/m68k/kernel/signal.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 3689c6718c88..15a757073fa5 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -60,6 +60,7 @@ static inline struct thread_info *current_thread_info(void) * bits 0-7 are tested at every exception exit * bits 8-15 are also tested at syscall exit */ +#define TIF_NOTIFY_SIGNAL 4 #define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ #define TIF_SIGPENDING 6 /* signal pending */ #define TIF_NEED_RESCHED 7 /* rescheduling necessary */ diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 46f91e0f6a08..349570f16a78 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1133,7 +1133,8 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs) { - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || + test_thread_flag(TIF_SIGPENDING)) do_signal(regs); if (test_thread_flag(TIF_NOTIFY_RESUME)) From 42020064274c235d720d9c4b7d9a678b133e59cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:21:21 -0600 Subject: [PATCH 018/395] nios32: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for nios32. Cc: Ley Foon Tan Acked-by: Ley Foon Tan Signed-off-by: Jens Axboe --- arch/nios2/include/asm/thread_info.h | 2 ++ arch/nios2/kernel/signal.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/nios2/include/asm/thread_info.h b/arch/nios2/include/asm/thread_info.h index 7349a4fa635b..272d2c72a727 100644 --- a/arch/nios2/include/asm/thread_info.h +++ b/arch/nios2/include/asm/thread_info.h @@ -86,6 +86,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 4 /* is terminating due to OOM killer */ #define TIF_SECCOMP 5 /* secure computing */ #define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling @@ -97,6 +98,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c index cf2dca2ac7c3..2009ae2d3c3b 100644 --- a/arch/nios2/kernel/signal.c +++ b/arch/nios2/kernel/signal.c @@ -306,7 +306,8 @@ asmlinkage int do_notify_resume(struct pt_regs *regs) if (!user_mode(regs)) return 0; - if (test_thread_flag(TIF_SIGPENDING)) { + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) { int restart = do_signal(regs); if (unlikely(restart)) { From 18cb3281285d2190c0605d2e53543802319bd1a1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:27:02 -0600 Subject: [PATCH 019/395] parisc: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for parisc. Cc: linux-parisc@vger.kernel.org Acked-by: Helge Deller Signed-off-by: Jens Axboe --- arch/parisc/include/asm/thread_info.h | 4 +++- arch/parisc/kernel/signal.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 285757544cca..0bd38a972cea 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -52,6 +52,7 @@ struct thread_info { #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 4 /* 32 bit binary */ #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ +#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_NOTIFY_RESUME 8 /* callback before returning to user */ #define TIF_SINGLESTEP 9 /* single stepping? */ @@ -61,6 +62,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_32BIT (1 << TIF_32BIT) @@ -72,7 +74,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ - _TIF_NEED_RESCHED) + _TIF_NEED_RESCHED | _TIF_NOTIFY_SIGNAL) #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_BLOCKSTEP | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 9f43eaeb0b0a..fb1e94a3982b 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -603,7 +603,8 @@ do_signal(struct pt_regs *regs, long in_syscall) void do_notify_resume(struct pt_regs *regs, long in_syscall) { - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) do_signal(regs, in_syscall); if (test_thread_flag(TIF_NOTIFY_RESUME)) From 900f0713fdd730fab0f0bfa4a8ca4db2a8985bbe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Oct 2020 20:11:56 -0600 Subject: [PATCH 020/395] powerpc: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for powerpc. Cc: linuxppc-dev@lists.ozlabs.org Acked-by: Michael Ellerman Signed-off-by: Jens Axboe --- arch/powerpc/include/asm/thread_info.h | 5 ++++- arch/powerpc/kernel/signal.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 46a210b03d2b..53115ae61495 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -90,6 +90,7 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */ #define TIF_SYSCALL_EMU 4 /* syscall emulation active */ #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ #define TIF_PATCH_PENDING 6 /* pending live patching update */ @@ -115,6 +116,7 @@ void arch_setup_new_exec(void); #define _TIF_SYSCALL_TRACE (1<thread.regs); do_signal(current); } From f45c184bce15f4a314c0210519bc3b4aab408838 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:16:02 -0600 Subject: [PATCH 021/395] mips: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for mips. Cc: linux-mips@vger.kernel.org Acked-By: Thomas Bogendoerfer Signed-off-by: Jens Axboe --- arch/mips/include/asm/thread_info.h | 4 +++- arch/mips/kernel/signal.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index ee26f9a4575d..e2c352da3877 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -115,6 +115,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SECCOMP 4 /* secure computing */ #define TIF_NOTIFY_RESUME 5 /* callback before returning to user */ #define TIF_UPROBE 6 /* breakpointed or singlestepping */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -139,6 +140,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SECCOMP (1< Date: Fri, 9 Oct 2020 15:34:12 -0600 Subject: [PATCH 022/395] s390: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for s390. Cc: linux-s390@vger.kernel.org Acked-by: Heiko Carstens Acked-by: Sven Schnelle Signed-off-by: Jens Axboe --- arch/s390/include/asm/thread_info.h | 2 ++ arch/s390/kernel/entry.S | 11 ++++++----- arch/s390/kernel/signal.c | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 13a04fcf7762..0045341ade48 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -65,6 +65,7 @@ void arch_setup_new_exec(void); #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_ISOLATE_BP 8 /* Run process with isolated BP */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ @@ -82,6 +83,7 @@ void arch_setup_new_exec(void); #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) #define _TIF_UPROBE BIT(TIF_UPROBE) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 86235919c2d1..19a89f292290 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -52,7 +52,8 @@ STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE _TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ - _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING) + _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING | \ + _TIF_NOTIFY_SIGNAL) _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) _CIF_WORK = (_CIF_ASCE_PRIMARY | _CIF_ASCE_SECONDARY | _CIF_FPU) @@ -463,8 +464,8 @@ ENTRY(system_call) #endif TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART jo .Lsysc_syscall_restart - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lsysc_sigpending + TSTMSK __TI_flags(%r12),(_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL) + jnz .Lsysc_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME jo .Lsysc_notify_resume TSTMSK __LC_CPU_FLAGS,(_CIF_ASCE_PRIMARY|_CIF_ASCE_SECONDARY) @@ -855,8 +856,8 @@ ENTRY(io_int_handler) TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING jo .Lio_patch_pending #endif - TSTMSK __TI_flags(%r12),_TIF_SIGPENDING - jo .Lio_sigpending + TSTMSK __TI_flags(%r12),(_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL) + jnz .Lio_sigpending TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME jo .Lio_notify_resume TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 9e900a8977bd..b27b6c1f058d 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -472,7 +472,7 @@ void do_signal(struct pt_regs *regs) current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; - if (get_signal(&ksig)) { + if (test_thread_flag(TIF_SIGPENDING) && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ if (current->thread.system_call) { regs->int_code = current->thread.system_call; From a5b3cd32ff238b87e94d47b927aff117e22d13c0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:47:28 -0600 Subject: [PATCH 023/395] um: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for um. Cc: linux-um@lists.infradead.org Acked-By: Anton Ivanov Signed-off-by: Jens Axboe --- arch/um/include/asm/thread_info.h | 2 ++ arch/um/kernel/process.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 4c19ce4c49f1..3b1cb8b3b186 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -57,6 +57,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */ #define TIF_RESTART_BLOCK 4 #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 6 @@ -67,6 +68,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 3bed09538dd9..9e19da9e6198 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -99,7 +99,8 @@ void interrupt_end(void) if (need_resched()) schedule(); - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) do_signal(regs); if (test_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); From 6d3a273355e3c8471ddf9e8ce9a7cc4472bf1ccc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:36:35 -0600 Subject: [PATCH 024/395] sh: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for sh. Cc: linux-sh@vger.kernel.org Signed-off-by: Jens Axboe --- arch/sh/include/asm/thread_info.h | 4 +++- arch/sh/kernel/signal_32.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 243ea5150aa0..598d0184ffea 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -105,6 +105,7 @@ extern void init_thread_xstate(void); #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ +#define TIF_NOTIFY_SIGNAL 3 /* signal notifications exist */ #define TIF_SINGLESTEP 4 /* singlestepping active */ #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ #define TIF_SECCOMP 6 /* secure computing */ @@ -116,6 +117,7 @@ extern void init_thread_xstate(void); #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) @@ -132,7 +134,7 @@ extern void init_thread_xstate(void); #define _TIF_ALLWORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SYSCALL_AUDIT | \ _TIF_SINGLESTEP | _TIF_NOTIFY_RESUME | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_NOTIFY_SIGNAL) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 1add47fd31f6..dd3092911efa 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -499,7 +499,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, unsigned long thread_info_flags) { /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs, save_r0); if (thread_info_flags & _TIF_NOTIFY_RESUME) From e181c0aa2e532af2b17128fbde699f8578cc0562 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:24:46 -0600 Subject: [PATCH 025/395] openrisc: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for openrisc. Cc: openrisc@lists.librecores.org Acked-by: Stafford Horne Signed-off-by: Jens Axboe --- arch/openrisc/include/asm/thread_info.h | 2 ++ arch/openrisc/kernel/signal.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 9afe68bc423b..4f9d2a261455 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -98,6 +98,7 @@ register struct thread_info *current_thread_info_reg asm("r10"); #define TIF_SINGLESTEP 4 /* restore singlestep on return to user * mode */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_SYSCALL_TRACEPOINT 8 /* for ftrace syscall instrumentation */ #define TIF_RESTORE_SIGMASK 9 #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling * TIF_NEED_RESCHED @@ -109,6 +110,7 @@ register struct thread_info *current_thread_info_reg asm("r10"); #define _TIF_SIGPENDING (1< Date: Fri, 9 Oct 2020 14:39:00 -0600 Subject: [PATCH 026/395] csky: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for csky. Cc: linux-csky@vger.kernel.org Acked-by: Guo Ren Signed-off-by: Jens Axboe --- arch/csky/include/asm/thread_info.h | 5 ++++- arch/csky/kernel/signal.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/csky/include/asm/thread_info.h b/arch/csky/include/asm/thread_info.h index 68e7a1227170..21456a3737c2 100644 --- a/arch/csky/include/asm/thread_info.h +++ b/arch/csky/include/asm/thread_info.h @@ -64,6 +64,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ #define TIF_SYSCALL_TRACEPOINT 5 /* syscall tracepoint instrumentation */ #define TIF_SYSCALL_AUDIT 6 /* syscall auditing */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_POLLING_NRFLAG 16 /* poll_idle() is TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */ @@ -75,6 +76,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_MEMDIE (1 << TIF_MEMDIE) @@ -82,7 +84,8 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c index 8b068cf37447..37ea64ed3c12 100644 --- a/arch/csky/kernel/signal.c +++ b/arch/csky/kernel/signal.c @@ -257,7 +257,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, uprobe_notify_resume(regs); /* Handle pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) { From aeec8193578a71d0aee21218351849d38121ce90 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 14:45:22 -0600 Subject: [PATCH 027/395] hexagon: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for hexagon. Cc: linux-hexagon@vger.kernel.org Acked-by: Brian Cain Signed-off-by: Jens Axboe --- arch/hexagon/include/asm/thread_info.h | 2 ++ arch/hexagon/kernel/process.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index 563da1986464..535976665bf0 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -95,6 +95,7 @@ register struct thread_info *__current_thread_info asm(QUOTED_THREADINFO_REG); #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore ss @ return to usr mode */ #define TIF_RESTORE_SIGMASK 6 /* restore sig mask in do_signal() */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 /* OOM killer killed process */ @@ -103,6 +104,7 @@ register struct thread_info *__current_thread_info asm(QUOTED_THREADINFO_REG); #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) /* work to do on interrupt/exception return - All but TIF_SYSCALL_TRACE */ #define _TIF_WORK_MASK (0x0000FFFF & ~_TIF_SYSCALL_TRACE) diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 5a0a95d93ddb..82cd7a026eb3 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -174,7 +174,7 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) return 1; } - if (thread_info_flags & _TIF_SIGPENDING) { + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { do_signal(regs); return 1; } From f4ea089e429e0d366cd1a34a2cbe3c7b13d98d75 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:13:57 -0600 Subject: [PATCH 028/395] microblaze: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for microblaze. Acked-by: Michal Simek Signed-off-by: Jens Axboe --- arch/microblaze/include/asm/thread_info.h | 2 ++ arch/microblaze/kernel/signal.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index ad8e8fcb90d3..44f5ca331862 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -107,6 +107,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ /* restore singlestep on return to user mode */ #define TIF_SINGLESTEP 4 +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_MEMDIE 6 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ @@ -119,6 +120,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index f11a0ccccabc..5a8d173d7b75 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -313,7 +313,8 @@ static void do_signal(struct pt_regs *regs, int in_syscall) asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall) { - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) do_signal(regs, in_syscall); if (test_thread_flag(TIF_NOTIFY_RESUME)) From e167a59c654a788cacbb0c2dd93859a715824d5c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Nov 2020 16:48:51 +0100 Subject: [PATCH 029/395] microblaze: Replace by The MicroBlaze platform code is not a clock provider, and just needs to call of_clk_init(). Hence it can include instead of . Signed-off-by: Geert Uytterhoeven Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201110154851.3285695-1-geert+renesas@glider.be Signed-off-by: Michal Simek --- arch/microblaze/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 333b09658ca8..7fcf5279ad15 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include From ed2124c0b9a8d2c09e3b5b9ca9827187c5fcbe71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2020 10:16:19 -0600 Subject: [PATCH 030/395] microblaze: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for microblaze. Cc: Michal Simek Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/a2b78afc-5f60-8590-9df5-17302e356552@kernel.dk Signed-off-by: Michal Simek --- arch/microblaze/include/asm/thread_info.h | 2 ++ arch/microblaze/kernel/signal.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index ad8e8fcb90d3..44f5ca331862 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -107,6 +107,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ /* restore singlestep on return to user mode */ #define TIF_SINGLESTEP 4 +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_MEMDIE 6 /* is terminating due to OOM killer */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ @@ -119,6 +120,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index f11a0ccccabc..5a8d173d7b75 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -313,7 +313,8 @@ static void do_signal(struct pt_regs *regs, int in_syscall) asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall) { - if (test_thread_flag(TIF_SIGPENDING)) + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) do_signal(regs, in_syscall); if (test_thread_flag(TIF_NOTIFY_RESUME)) From 93346da8ff47cc00f953c7f38a2d6ba11977fc42 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 24 Oct 2020 12:43:11 +0200 Subject: [PATCH 031/395] parisc: Drop loops_per_jiffy from per_cpu struct There is no need to keep a loops_per_jiffy value per cpu. Drop it. Signed-off-by: Helge Deller --- arch/parisc/include/asm/processor.h | 1 - arch/parisc/kernel/processor.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 6e2a8176b0dd..40135be97965 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -97,7 +97,6 @@ struct cpuinfo_parisc { unsigned long cpu_loc; /* CPU location from PAT firmware */ unsigned int state; struct parisc_device *dev; - unsigned long loops_per_jiffy; }; extern struct system_cpuinfo_parisc boot_cpu_data; diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 7f2d0c0ecc80..1b6129e7d776 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -163,7 +163,6 @@ static int __init processor_probe(struct parisc_device *dev) if (cpuid) memset(p, 0, sizeof(struct cpuinfo_parisc)); - p->loops_per_jiffy = loops_per_jiffy; p->dev = dev; /* Save IODC data in case we need it */ p->hpa = dev->hpa.start; /* save CPU hpa */ p->cpuid = cpuid; /* save CPU id */ @@ -434,8 +433,8 @@ show_cpuinfo (struct seq_file *m, void *v) show_cache_info(m); seq_printf(m, "bogomips\t: %lu.%02lu\n", - cpuinfo->loops_per_jiffy / (500000 / HZ), - (cpuinfo->loops_per_jiffy / (5000 / HZ)) % 100); + loops_per_jiffy / (500000 / HZ), + loops_per_jiffy / (5000 / HZ) % 100); seq_printf(m, "software id\t: %ld\n\n", boot_cpu_data.pdc.model.sw_id); From c984baad3d8dd8555d23f0598fb81c3e0ea04c0e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 29 Oct 2020 22:01:06 +0100 Subject: [PATCH 032/395] parisc: Use _TIF_USER_WORK_MASK in entry.S The constant _TIF_USER_WORK_MASK will get extended by additional flags in the future, so check against the bits set in this mask - with the exception of _TIF_NEED_RESCHED which was tested a few lines above. Signed-off-by: Helge Deller --- arch/parisc/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index f6f28e41bb5e..beba9816cc6c 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -887,7 +887,7 @@ intr_check_sig: /* As above */ mfctl %cr30,%r1 LDREG TI_FLAGS(%r1),%r19 - ldi (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME), %r20 + ldi (_TIF_USER_WORK_MASK & ~_TIF_NEED_RESCHED), %r20 and,COND(<>) %r19, %r20, %r0 b,n intr_restore /* skip past if we've nothing to do */ @@ -1810,7 +1810,7 @@ syscall_check_resched: .import do_signal,code syscall_check_sig: LDREG TI_FLAGS-THREAD_SZ_ALGN-FRAME_SIZE(%r30),%r19 - ldi (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME), %r26 + ldi (_TIF_USER_WORK_MASK & ~_TIF_NEED_RESCHED), %r26 and,COND(<>) %r19, %r26, %r0 b,n syscall_restore /* skip past if we've nothing to do */ From 22ee3ea588dfc84ccb8cea5ea37051dfed91b9b9 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 6 Nov 2020 19:41:36 +0100 Subject: [PATCH 033/395] parisc: Make user stack size configurable On parisc we need to initialize the memory layout for the user stack at process start time to a fixed size, which up until now was limited to the size as given by CONFIG_MAX_STACK_SIZE_MB at compile time. This hard limit was too small and showed problems when compiling ruby2.7, qmlcachegen and some Qt packages. This patch changes two things: a) It increases the default maximum stack size to 100MB. b) Users can modify the stack hard limit size with ulimit and then newly forked processes will use the given stack size which can even be bigger than the default 100MB. Reported-by: John David Anglin Signed-off-by: Helge Deller --- arch/parisc/include/asm/processor.h | 7 ++----- arch/parisc/kernel/sys_parisc.c | 23 +++++++++++++++++++++-- fs/exec.c | 4 ++-- mm/Kconfig | 12 +++++------- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 40135be97965..11ece0d07374 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -45,15 +45,12 @@ #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX DEFAULT_TASK_SIZE -/* Allow bigger stacks for 64-bit processes */ -#define STACK_SIZE_MAX (USER_WIDE_MODE \ - ? (1 << 30) /* 1 GB */ \ - : (CONFIG_MAX_STACK_SIZE_MB*1024*1024)) - #endif #ifndef __ASSEMBLY__ +unsigned long calc_max_stack_size(unsigned long stack_max); + /* * Data detected about CPUs at boot time which is the same for all CPU's. * HP boxes are SMP - ie identical processors. diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 9549496f5523..5f12537318ab 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -53,6 +53,25 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } + +#define STACK_SIZE_DEFAULT (USER_WIDE_MODE \ + ? (1 << 30) /* 1 GB */ \ + : (CONFIG_STACK_MAX_DEFAULT_SIZE_MB*1024*1024)) + +unsigned long calc_max_stack_size(unsigned long stack_max) +{ +#ifdef CONFIG_COMPAT + if (!USER_WIDE_MODE && (stack_max == COMPAT_RLIM_INFINITY)) + stack_max = STACK_SIZE_DEFAULT; + else +#endif + if (stack_max == RLIM_INFINITY) + stack_max = STACK_SIZE_DEFAULT; + + return stack_max; +} + + /* * Top of mmap area (just below the process stack). */ @@ -69,8 +88,8 @@ static unsigned long mmap_upper_limit(struct rlimit *rlim_stack) /* Limit stack size - see setup_arg_pages() in fs/exec.c */ stack_base = rlim_stack ? rlim_stack->rlim_max : rlimit_max(RLIMIT_STACK); - if (stack_base > STACK_SIZE_MAX) - stack_base = STACK_SIZE_MAX; + + stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..a6f2c27f875b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -756,8 +756,8 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; - if (stack_base > STACK_SIZE_MAX) - stack_base = STACK_SIZE_MAX; + + stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..4ec0f3dbfb11 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -733,19 +733,17 @@ config ZSMALLOC_STAT config GENERIC_EARLY_IOREMAP bool -config MAX_STACK_SIZE_MB - int "Maximum user stack size for 32-bit processes (MB)" - default 80 +config STACK_MAX_DEFAULT_SIZE_MB + int "Default maximum user stack size for 32-bit processes (MB)" + default 100 range 8 2048 depends on STACK_GROWSUP && (!64BIT || COMPAT) help This is the maximum stack size in Megabytes in the VM layout of 32-bit user processes when the stack grows upwards (currently only on parisc - arch). The stack will be located at the highest memory address minus - the given value, unless the RLIMIT_STACK hard limit is changed to a - smaller value in which case that is used. + arch) when the RLIMIT_STACK hard limit is unlimited. - A sane initial value is 80 MB. + A sane initial value is 100 MB. config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" From 6ca753a3a72e4c848d91f10da270c58000ada53c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Nov 2020 17:51:36 +0100 Subject: [PATCH 034/395] parisc/uapi: Use Kbuild logic to provide Uapi just includes Signed-off-by: Geert Uytterhoeven Signed-off-by: Helge Deller --- arch/parisc/include/uapi/asm/types.h | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 arch/parisc/include/uapi/asm/types.h diff --git a/arch/parisc/include/uapi/asm/types.h b/arch/parisc/include/uapi/asm/types.h deleted file mode 100644 index 28c7d7453b10..000000000000 --- a/arch/parisc/include/uapi/asm/types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _PARISC_TYPES_H -#define _PARISC_TYPES_H - -#include - -#endif From e1717283250aa426cd5f3ba9a1fc9faba3a2e529 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 10 Nov 2020 17:36:31 +0100 Subject: [PATCH 035/395] mips: Remove #include from Everything in arch/mips/include/uapi/asm/types.h is protected by "#ifndef __KERNEL__", so it's unused for kernelspace. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/mips/include/asm/types.h b/arch/mips/include/asm/types.h index 148d42a17f30..638ef88e2f8e 100644 --- a/arch/mips/include/asm/types.h +++ b/arch/mips/include/asm/types.h @@ -12,6 +12,5 @@ #define _ASM_TYPES_H #include -#include #endif /* _ASM_TYPES_H */ From 4abaacc704729ec93a6ca23f6b3a92532337959b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 10 Nov 2020 19:21:05 -0800 Subject: [PATCH 036/395] MIPS: remove GCC < 4.9 support Remove a tautology; since commit 0bddd227f3dc ("Documentation: update for gcc 4.9 requirement") which raised the minimally supported version of GCC to 4.9, this case is always true. Link: https://github.com/ClangBuiltLinux/linux/issues/427 Signed-off-by: Nick Desaulniers Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/compiler.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/mips/include/asm/compiler.h b/arch/mips/include/asm/compiler.h index a2cb2d2b1c07..2b06090a78b2 100644 --- a/arch/mips/include/asm/compiler.h +++ b/arch/mips/include/asm/compiler.h @@ -43,14 +43,7 @@ #undef barrier_before_unreachable #define barrier_before_unreachable() asm volatile(".insn") -#if !defined(CONFIG_CC_IS_GCC) || \ - (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) -# define GCC_OFF_SMALL_ASM() "ZC" -#elif defined(CONFIG_CPU_MICROMIPS) -# error "microMIPS compilation unsupported with GCC older than 4.9" -#else -# define GCC_OFF_SMALL_ASM() "R" -#endif +#define GCC_OFF_SMALL_ASM() "ZC" #ifdef CONFIG_CPU_MIPSR6 #define MIPS_ISA_LEVEL "mips64r6" From 4d1b3ac886b5c03e966199edcc7e0efddd5c4cb2 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 3 Nov 2020 15:12:00 +0800 Subject: [PATCH 037/395] MIPS: Loongson64: Do not write the read only field LPA of CP0_CONFIG3 The field LPA of CP0_CONFIG3 register is read only for Loongson64, so the write operations are meaningless, remove them. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/mach-loongson64/kernel-entry-init.h | 8 -------- arch/mips/loongson64/numa.c | 3 --- 2 files changed, 11 deletions(-) diff --git a/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h b/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h index 87a5bfbf8cfe..e4d77f4f0fe3 100644 --- a/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h +++ b/arch/mips/include/asm/mach-loongson64/kernel-entry-init.h @@ -19,10 +19,6 @@ .macro kernel_entry_setup .set push .set mips64 - /* Set LPA on LOONGSON3 config3 */ - mfc0 t0, CP0_CONFIG3 - or t0, (0x1 << 7) - mtc0 t0, CP0_CONFIG3 /* Set ELPA on LOONGSON3 pagegrain */ mfc0 t0, CP0_PAGEGRAIN or t0, (0x1 << 29) @@ -54,10 +50,6 @@ .macro smp_slave_setup .set push .set mips64 - /* Set LPA on LOONGSON3 config3 */ - mfc0 t0, CP0_CONFIG3 - or t0, (0x1 << 7) - mtc0 t0, CP0_CONFIG3 /* Set ELPA on LOONGSON3 pagegrain */ mfc0 t0, CP0_PAGEGRAIN or t0, (0x1 << 29) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index cf9459f79f9b..c7e3ccedc74f 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -39,9 +39,6 @@ static void enable_lpa(void) { unsigned long value; - value = __read_32bit_c0_register($16, 3); - value |= 0x00000080; - __write_32bit_c0_register($16, 3, value); value = __read_32bit_c0_register($16, 3); pr_info("CP0_Config3: CP0 16.3 (0x%lx)\n", value); From fe9863a19a5a73af8227548603fb521050769611 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 3 Nov 2020 15:12:01 +0800 Subject: [PATCH 038/395] MIPS: Loongson64: Set the field ELPA of CP0_PAGEGRAIN only once The field ELPA of CP0_PAGEGRAIN register is set at the beginning of the kernel entry point in kernel-entry-init.h, no need to set it again in numa.c, we can remove enable_lpa() and only print the related information. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/loongson64/numa.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index c7e3ccedc74f..509b360e4d28 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -35,20 +35,6 @@ EXPORT_SYMBOL(__node_data); cpumask_t __node_cpumask[MAX_NUMNODES]; EXPORT_SYMBOL(__node_cpumask); -static void enable_lpa(void) -{ - unsigned long value; - - value = __read_32bit_c0_register($16, 3); - pr_info("CP0_Config3: CP0 16.3 (0x%lx)\n", value); - - value = __read_32bit_c0_register($5, 1); - value |= 0x20000000; - __write_32bit_c0_register($5, 1, value); - value = __read_32bit_c0_register($5, 1); - pr_info("CP0_PageGrain: CP0 5.1 (0x%lx)\n", value); -} - static void cpu_node_probe(void) { int i; @@ -240,7 +226,8 @@ EXPORT_SYMBOL(pcibus_to_node); void __init prom_init_numa_memory(void) { - enable_lpa(); + pr_info("CP0_Config3: CP0 16.3 (0x%x)\n", read_c0_config3()); + pr_info("CP0_PageGrain: CP0 5.1 (0x%x)\n", read_c0_pagegrain()); prom_meminit(); } EXPORT_SYMBOL(prom_init_numa_memory); From 42831cd70805211c240a5bba5b4fb6be9470c91d Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 3 Nov 2020 15:12:02 +0800 Subject: [PATCH 039/395] MIPS: Loongson64: Set IPI_Enable register per core by itself In the current code, for example, core 1 sets Core[0, 1, 2, 3]_IPI_Enalbe register and core 2, 3 do the same thing on the 1-way Loongson64 platform, this is not necessary. Set IPI_Enable register per core by itself to avoid duplicate operations and make the logic more clear. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/loongson64/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index e744e1bee49e..7d58853a0631 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -348,8 +348,7 @@ static void loongson3_init_secondary(void) /* Set interrupt mask, but don't enable */ change_c0_status(ST0_IM, imask); - for (i = 0; i < num_possible_cpus(); i++) - loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(i)]); + loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(cpu)]); per_cpu(cpu_state, cpu) = CPU_ONLINE; cpu_set_core(&cpu_data[cpu], @@ -420,6 +419,7 @@ static void __init loongson3_smp_setup(void) ipi_status0_regs_init(); ipi_en0_regs_init(); ipi_mailbox_buf_init(); + loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(0)]); cpu_set_core(&cpu_data[0], cpu_logical_map(0) % loongson_sysconf.cores_per_package); cpu_data[0].package = cpu_logical_map(0) / loongson_sysconf.cores_per_package; From fed4955f304eb62acfdf86ecf05ea164856e09d8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 3 Nov 2020 15:12:03 +0800 Subject: [PATCH 040/395] MIPS: Loongson64: Add Mail_Send support for 3A4000+ CPU Loongson 3A4000+ CPU has per-core Mail_Send register to send mail, there is no need to maintain register address of each core and node, just simply specify cpu number. Signed-off-by: Lu Zeng Signed-off-by: Jianmin Lv Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- .../asm/mach-loongson64/loongson_regs.h | 10 ++ arch/mips/loongson64/smp.c | 120 ++++++++++++++---- 2 files changed, 107 insertions(+), 23 deletions(-) diff --git a/arch/mips/include/asm/mach-loongson64/loongson_regs.h b/arch/mips/include/asm/mach-loongson64/loongson_regs.h index 83dbb9fdf9c2..165993514762 100644 --- a/arch/mips/include/asm/mach-loongson64/loongson_regs.h +++ b/arch/mips/include/asm/mach-loongson64/loongson_regs.h @@ -227,6 +227,16 @@ static inline void csr_writeq(u64 val, u32 reg) #define CSR_IPI_SEND_CPU_SHIFT 16 #define CSR_IPI_SEND_BLOCK BIT(31) +#define LOONGSON_CSR_MAIL_BUF0 0x1020 +#define LOONGSON_CSR_MAIL_SEND 0x1048 +#define CSR_MAIL_SEND_BLOCK BIT_ULL(31) +#define CSR_MAIL_SEND_BOX_LOW(box) (box << 1) +#define CSR_MAIL_SEND_BOX_HIGH(box) ((box << 1) + 1) +#define CSR_MAIL_SEND_BOX_SHIFT 2 +#define CSR_MAIL_SEND_CPU_SHIFT 16 +#define CSR_MAIL_SEND_BUF_SHIFT 32 +#define CSR_MAIL_SEND_H32_MASK 0xFFFFFFFF00000000ULL + static inline u64 drdtime(void) { int rID = 0; diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index 7d58853a0631..736e98ddd596 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -53,6 +53,29 @@ static uint32_t core0_c0count[NR_CPUS]; u32 (*ipi_read_clear)(int cpu); void (*ipi_write_action)(int cpu, u32 action); +void (*ipi_write_enable)(int cpu); +void (*ipi_clear_buf)(int cpu); +void (*ipi_write_buf)(int cpu, struct task_struct *idle); + +/* send mail via Mail_Send register for 3A4000+ CPU */ +static void csr_mail_send(uint64_t data, int cpu, int mailbox) +{ + uint64_t val; + + /* send high 32 bits */ + val = CSR_MAIL_SEND_BLOCK; + val |= (CSR_MAIL_SEND_BOX_HIGH(mailbox) << CSR_MAIL_SEND_BOX_SHIFT); + val |= (cpu << CSR_MAIL_SEND_CPU_SHIFT); + val |= (data & CSR_MAIL_SEND_H32_MASK); + csr_writeq(val, LOONGSON_CSR_MAIL_SEND); + + /* send low 32 bits */ + val = CSR_MAIL_SEND_BLOCK; + val |= (CSR_MAIL_SEND_BOX_LOW(mailbox) << CSR_MAIL_SEND_BOX_SHIFT); + val |= (cpu << CSR_MAIL_SEND_CPU_SHIFT); + val |= (data << CSR_MAIL_SEND_BUF_SHIFT); + csr_writeq(val, LOONGSON_CSR_MAIL_SEND); +}; static u32 csr_ipi_read_clear(int cpu) { @@ -79,6 +102,35 @@ static void csr_ipi_write_action(int cpu, u32 action) } } +static void csr_ipi_write_enable(int cpu) +{ + csr_writel(0xffffffff, LOONGSON_CSR_IPI_EN); +} + +static void csr_ipi_clear_buf(int cpu) +{ + csr_writeq(0, LOONGSON_CSR_MAIL_BUF0); +} + +static void csr_ipi_write_buf(int cpu, struct task_struct *idle) +{ + unsigned long startargs[4]; + + /* startargs[] are initial PC, SP and GP for secondary CPU */ + startargs[0] = (unsigned long)&smp_bootstrap; + startargs[1] = (unsigned long)__KSTK_TOS(idle); + startargs[2] = (unsigned long)task_thread_info(idle); + startargs[3] = 0; + + pr_debug("CPU#%d, func_pc=%lx, sp=%lx, gp=%lx\n", + cpu, startargs[0], startargs[1], startargs[2]); + + csr_mail_send(startargs[3], cpu_logical_map(cpu), 3); + csr_mail_send(startargs[2], cpu_logical_map(cpu), 2); + csr_mail_send(startargs[1], cpu_logical_map(cpu), 1); + csr_mail_send(startargs[0], cpu_logical_map(cpu), 0); +} + static u32 legacy_ipi_read_clear(int cpu) { u32 action; @@ -96,14 +148,53 @@ static void legacy_ipi_write_action(int cpu, u32 action) loongson3_ipi_write32((u32)action, ipi_set0_regs[cpu]); } +static void legacy_ipi_write_enable(int cpu) +{ + loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(cpu)]); +} + +static void legacy_ipi_clear_buf(int cpu) +{ + loongson3_ipi_write64(0, ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x0); +} + +static void legacy_ipi_write_buf(int cpu, struct task_struct *idle) +{ + unsigned long startargs[4]; + + /* startargs[] are initial PC, SP and GP for secondary CPU */ + startargs[0] = (unsigned long)&smp_bootstrap; + startargs[1] = (unsigned long)__KSTK_TOS(idle); + startargs[2] = (unsigned long)task_thread_info(idle); + startargs[3] = 0; + + pr_debug("CPU#%d, func_pc=%lx, sp=%lx, gp=%lx\n", + cpu, startargs[0], startargs[1], startargs[2]); + + loongson3_ipi_write64(startargs[3], + ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x18); + loongson3_ipi_write64(startargs[2], + ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x10); + loongson3_ipi_write64(startargs[1], + ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x8); + loongson3_ipi_write64(startargs[0], + ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x0); +} + static void csr_ipi_probe(void) { if (cpu_has_csr() && csr_readl(LOONGSON_CSR_FEATURES) & LOONGSON_CSRF_IPI) { ipi_read_clear = csr_ipi_read_clear; ipi_write_action = csr_ipi_write_action; + ipi_write_enable = csr_ipi_write_enable; + ipi_clear_buf = csr_ipi_clear_buf; + ipi_write_buf = csr_ipi_write_buf; } else { ipi_read_clear = legacy_ipi_read_clear; ipi_write_action = legacy_ipi_write_action; + ipi_write_enable = legacy_ipi_write_enable; + ipi_clear_buf = legacy_ipi_clear_buf; + ipi_write_buf = legacy_ipi_write_buf; } } @@ -347,8 +438,7 @@ static void loongson3_init_secondary(void) /* Set interrupt mask, but don't enable */ change_c0_status(ST0_IM, imask); - - loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(cpu)]); + ipi_write_enable(cpu); per_cpu(cpu_state, cpu) = CPU_ONLINE; cpu_set_core(&cpu_data[cpu], @@ -380,8 +470,8 @@ static void loongson3_smp_finish(void) write_c0_compare(read_c0_count() + mips_hpt_frequency/HZ); local_irq_enable(); - loongson3_ipi_write64(0, - ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x0); + ipi_clear_buf(cpu); + pr_info("CPU#%d finished, CP0_ST=%x\n", smp_processor_id(), read_c0_status()); } @@ -419,7 +509,8 @@ static void __init loongson3_smp_setup(void) ipi_status0_regs_init(); ipi_en0_regs_init(); ipi_mailbox_buf_init(); - loongson3_ipi_write32(0xffffffff, ipi_en0_regs[cpu_logical_map(0)]); + ipi_write_enable(0); + cpu_set_core(&cpu_data[0], cpu_logical_map(0) % loongson_sysconf.cores_per_package); cpu_data[0].package = cpu_logical_map(0) / loongson_sysconf.cores_per_package; @@ -439,27 +530,10 @@ static void __init loongson3_prepare_cpus(unsigned int max_cpus) */ static int loongson3_boot_secondary(int cpu, struct task_struct *idle) { - unsigned long startargs[4]; - pr_info("Booting CPU#%d...\n", cpu); - /* startargs[] are initial PC, SP and GP for secondary CPU */ - startargs[0] = (unsigned long)&smp_bootstrap; - startargs[1] = (unsigned long)__KSTK_TOS(idle); - startargs[2] = (unsigned long)task_thread_info(idle); - startargs[3] = 0; + ipi_write_buf(cpu, idle); - pr_debug("CPU#%d, func_pc=%lx, sp=%lx, gp=%lx\n", - cpu, startargs[0], startargs[1], startargs[2]); - - loongson3_ipi_write64(startargs[3], - ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x18); - loongson3_ipi_write64(startargs[2], - ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x10); - loongson3_ipi_write64(startargs[1], - ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x8); - loongson3_ipi_write64(startargs[0], - ipi_mailbox_buf[cpu_logical_map(cpu)] + 0x0); return 0; } From 381ad3843b26ec9b461e7973729ef64b36ed4627 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 3 Nov 2020 15:12:04 +0800 Subject: [PATCH 041/395] MIPS: Loongson64: SMP: Fix up play_dead jump indicator In play_dead function, the whole 64-bit PC mailbox was used as a indicator to determine if the master core had written boot jump information. However, after we introduced CSR mailsend, the hardware will not guarante an atomic write for the 64-bit PC mailbox. Thus we have to use the lower 32-bit which is written at the last as the jump indicator instead. Signed-off-by: Lu Zeng Signed-off-by: Jun Yi Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/loongson64/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index 736e98ddd596..aa0cd723e61d 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -764,9 +764,10 @@ static void loongson3_type3_play_dead(int *state_addr) "1: li %[count], 0x100 \n" /* wait for init loop */ "2: bnez %[count], 2b \n" /* limit mailbox access */ " addiu %[count], -1 \n" - " ld %[initfunc], 0x20(%[base]) \n" /* get PC via mailbox */ + " lw %[initfunc], 0x20(%[base]) \n" /* check lower 32-bit as jump indicator */ " beqz %[initfunc], 1b \n" " nop \n" + " ld %[initfunc], 0x20(%[base]) \n" /* get PC (whole 64-bit) via mailbox */ " ld $sp, 0x28(%[base]) \n" /* get SP via mailbox */ " ld $gp, 0x30(%[base]) \n" /* get GP via mailbox */ " ld $a1, 0x38(%[base]) \n" From 32d59773da38cd83e497a70eb9754d4bbae3aeae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:00:49 -0600 Subject: [PATCH 042/395] arm: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for arm. Cc: linux-arm-kernel@lists.infradead.org Acked-by: Russell King Signed-off-by: Jens Axboe --- arch/arm/include/asm/thread_info.h | 7 ++++++- arch/arm/kernel/entry-common.S | 6 +++--- arch/arm/kernel/entry-v7m.S | 2 +- arch/arm/kernel/signal.c | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 536b6b979f63..eb7ce2747eb0 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -126,6 +126,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, * thread information flags: * TIF_USEDFPU - FPU was used by this task this quantum (SMP) * TIF_POLLING_NRFLAG - true if poll_idle() is polling TIF_NEED_RESCHED + * + * Any bit in the range of 0..15 will cause do_work_pending() to be invoked. */ #define TIF_SIGPENDING 0 /* signal pending */ #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ @@ -135,6 +137,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_NOTIFY_SIGNAL 8 /* signal notifications exist */ #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -148,6 +151,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) /* Checks for any syscall work in entry-common.S */ @@ -158,7 +162,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, * Change these and you break ASM code in entry-common.S */ #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 271cb8a1eba1..77d16390a524 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -53,7 +53,7 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + movs r1, r1, lsl #16 bne fast_work_pending @@ -90,7 +90,7 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + movs r1, r1, lsl #16 beq no_work_pending UNWIND(.fnend ) ENDPROC(ret_fast_syscall) @@ -131,7 +131,7 @@ ENTRY(ret_to_user_from_irq) cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] - tst r1, #_TIF_WORK_MASK + movs r1, r1, lsl #16 bne slow_work_pending no_work_pending: asm_trace_hardirqs_on save = 0 diff --git a/arch/arm/kernel/entry-v7m.S b/arch/arm/kernel/entry-v7m.S index de1f20624be1..d0e898608d30 100644 --- a/arch/arm/kernel/entry-v7m.S +++ b/arch/arm/kernel/entry-v7m.S @@ -59,7 +59,7 @@ __irq_entry: get_thread_info tsk ldr r2, [tsk, #TI_FLAGS] - tst r2, #_TIF_WORK_MASK + movs r2, r2, lsl #16 beq 2f @ no work pending mov r0, #V7M_SCB_ICSR_PENDSVSET str r0, [r1, V7M_SCB_ICSR] @ raise PendSV diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 585edbfccf6d..9d2e916121be 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -655,7 +655,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) if (unlikely(!user_mode(regs))) return 0; local_irq_enable(); - if (thread_flags & _TIF_SIGPENDING) { + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { int restart = do_signal(regs, syscall); if (unlikely(restart)) { /* From bec58f40d6c5372d812c93cc3947f3bc97440e57 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:56:07 -0600 Subject: [PATCH 043/395] xtensa: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for xtensa. Thanks to Max Filippov for making the asm correct. Cc: linux-xtensa@linux-xtensa.org Signed-off-by: Jens Axboe --- arch/xtensa/include/asm/thread_info.h | 5 ++++- arch/xtensa/kernel/entry.S | 4 ++-- arch/xtensa/kernel/signal.c | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 6acbbe0d87d3..a312333a9add 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -111,18 +111,21 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SINGLESTEP 3 /* restore singlestep on return to user mode */ #define TIF_SYSCALL_TRACEPOINT 4 /* syscall tracepoint instrumentation */ -#define TIF_MEMDIE 5 /* is terminating due to OOM killer */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ #define TIF_NOTIFY_RESUME 7 /* callback before returning to user */ #define TIF_DB_DISABLED 8 /* debug trap disabled for syscall */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ +#define TIF_MEMDIE 11 /* is terminating due to OOM killer */ #define _TIF_SYSCALL_TRACE (1< Date: Wed, 11 Nov 2020 12:15:41 +0300 Subject: [PATCH 044/395] mips: dts: jz47x: Harmonize EHCI/OHCI DT nodes name In accordance with the Generic EHCI/OHCI bindings the corresponding node name is suppose to comply with the Generic USB HCD DT schema, which requires the USB nodes to have the name acceptable by the regexp: "^usb(@.*)?" . Make sure the "generic-ehci" and "generic-ohci"-compatible nodes are correctly named. Signed-off-by: Serge Semin Acked-by: Paul Cercueil Acked-by: Krzysztof Kozlowski Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/ingenic/jz4740.dtsi | 2 +- arch/mips/boot/dts/ingenic/jz4770.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/boot/dts/ingenic/jz4740.dtsi b/arch/mips/boot/dts/ingenic/jz4740.dtsi index eee523678ce5..c1afdfdaa8a3 100644 --- a/arch/mips/boot/dts/ingenic/jz4740.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4740.dtsi @@ -295,7 +295,7 @@ dmac: dma-controller@13020000 { clocks = <&cgu JZ4740_CLK_DMA>; }; - uhc: uhc@13030000 { + uhc: usb@13030000 { compatible = "ingenic,jz4740-ohci", "generic-ohci"; reg = <0x13030000 0x1000>; diff --git a/arch/mips/boot/dts/ingenic/jz4770.dtsi b/arch/mips/boot/dts/ingenic/jz4770.dtsi index 018721a9eea9..05c00b93088e 100644 --- a/arch/mips/boot/dts/ingenic/jz4770.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4770.dtsi @@ -430,7 +430,7 @@ dmac1: dma-controller@13420100 { interrupts = <23>; }; - uhc: uhc@13430000 { + uhc: usb@13430000 { compatible = "generic-ohci"; reg = <0x13430000 0x1000>; From f4fc91af93924994348a88ff886129cc0322d282 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 11 Nov 2020 12:15:42 +0300 Subject: [PATCH 045/395] mips: dts: sead3: Harmonize EHCI/OHCI DT nodes name In accordance with the Generic EHCI/OHCI bindings the corresponding node name is suppose to comply with the Generic USB HCD DT schema, which requires the USB nodes to have the name acceptable by the regexp: "^usb(@.*)?" . Make sure the "generic-ehci" and "generic-ohci"-compatible nodes are correctly named. Signed-off-by: Serge Semin Acked-by: Krzysztof Kozlowski Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mti/sead3.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/mti/sead3.dts b/arch/mips/boot/dts/mti/sead3.dts index 192c26ff1d3d..1cf6728af8fe 100644 --- a/arch/mips/boot/dts/mti/sead3.dts +++ b/arch/mips/boot/dts/mti/sead3.dts @@ -56,7 +56,7 @@ gic: interrupt-controller@1b1c0000 { interrupt-parent = <&cpu_intc>; }; - ehci@1b200000 { + usb@1b200000 { compatible = "generic-ehci"; reg = <0x1b200000 0x1000>; From 3180b64aa9f1f25e0a6a743091d5c786a1a85cbd Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Wed, 11 Nov 2020 12:15:43 +0300 Subject: [PATCH 046/395] mips: dts: ralink: mt7628a: Harmonize EHCI/OHCI DT nodes name In accordance with the Generic EHCI/OHCI bindings the corresponding node name is suppose to comply with the Generic USB HCD DT schema, which requires the USB nodes to have the name acceptable by the regexp: "^usb(@.*)?" . Make sure the "generic-ehci" and "generic-ohci"-compatible nodes are correctly named. Signed-off-by: Serge Semin Acked-by: Krzysztof Kozlowski Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/ralink/mt7628a.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/dts/ralink/mt7628a.dtsi b/arch/mips/boot/dts/ralink/mt7628a.dtsi index 892e8ab863c5..45bf96a3d17a 100644 --- a/arch/mips/boot/dts/ralink/mt7628a.dtsi +++ b/arch/mips/boot/dts/ralink/mt7628a.dtsi @@ -275,7 +275,7 @@ usb_phy: usb-phy@10120000 { reset-names = "host", "device"; }; - ehci@101c0000 { + usb@101c0000 { compatible = "generic-ehci"; reg = <0x101c0000 0x1000>; From 09a48cbcd7af9203296938044f1100bb113ce01a Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Wed, 4 Nov 2020 19:41:27 +0300 Subject: [PATCH 047/395] MIPS: BMC47xx: fix kconfig dependency bug for BCM47XX_SSB When BCM47XX_SSB is enabled and SSB_PCIHOST is disabled, it results in the following Kbuild warning: WARNING: unmet direct dependencies detected for SSB_B43_PCI_BRIDGE Depends on [n]: SSB [=y] && SSB_PCIHOST [=n] Selected by [y]: - BCM47XX_SSB [=y] && BCM47XX [=y] && PCI [=y] The reason is that BCM47XX_SSB selects SSB_B43_PCI_BRIDGE without depending on or selecting SSB_PCIHOST while SSB_B43_PCI_BRIDGE depends on SSB_PCIHOST. This can also fail building the kernel as demonstrated in a bug report. Honor the kconfig dependency to remove unmet direct dependency warnings and avoid any potential build failures. Link: https://bugzilla.kernel.org/show_bug.cgi?id=210051 Signed-off-by: Necip Fazil Yildiran Signed-off-by: Thomas Bogendoerfer --- arch/mips/bcm47xx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/bcm47xx/Kconfig b/arch/mips/bcm47xx/Kconfig index 6889f74e06f5..40876654423c 100644 --- a/arch/mips/bcm47xx/Kconfig +++ b/arch/mips/bcm47xx/Kconfig @@ -9,6 +9,7 @@ config BCM47XX_SSB select SSB_DRIVER_MIPS select SSB_DRIVER_EXTIF select SSB_EMBEDDED + select SSB_PCIHOST if PCI select SSB_B43_PCI_BRIDGE if PCI select SSB_DRIVER_PCICORE if PCI select SSB_PCICORE_HOSTMODE if PCI From 3a5fe2fb9635c43359c9729352f45044f3c8df6b Mon Sep 17 00:00:00 2001 From: Necip Fazil Yildiran Date: Tue, 3 Nov 2020 00:34:01 +0300 Subject: [PATCH 048/395] MIPS: BCM47XX: fix kconfig dependency bug for BCM47XX_BCMA When BCM47XX_BCMA is enabled and BCMA_DRIVER_PCI is disabled, it results in the following Kbuild warning: WARNING: unmet direct dependencies detected for BCMA_DRIVER_PCI_HOSTMODE Depends on [n]: MIPS [=y] && BCMA_DRIVER_PCI [=n] && PCI_DRIVERS_LEGACY [=y] && BCMA [=y]=y Selected by [y]: - BCM47XX_BCMA [=y] && BCM47XX [=y] && PCI [=y] The reason is that BCM47XX_BCMA selects BCMA_DRIVER_PCI_HOSTMODE without depending on or selecting BCMA_DRIVER_PCI while BCMA_DRIVER_PCI_HOSTMODE depends on BCMA_DRIVER_PCI. This can also fail building the kernel. Honor the kconfig dependency to remove unmet direct dependency warnings and avoid any potential build failures. Fixes: c1d1c5d4213e ("bcm47xx: add support for bcma bus") Link: https://bugzilla.kernel.org/show_bug.cgi?id=209879 Signed-off-by: Necip Fazil Yildiran Signed-off-by: Thomas Bogendoerfer --- arch/mips/bcm47xx/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/bcm47xx/Kconfig b/arch/mips/bcm47xx/Kconfig index 40876654423c..13d315ee676b 100644 --- a/arch/mips/bcm47xx/Kconfig +++ b/arch/mips/bcm47xx/Kconfig @@ -28,6 +28,7 @@ config BCM47XX_BCMA select BCMA select BCMA_HOST_SOC select BCMA_DRIVER_MIPS + select BCMA_DRIVER_PCI if PCI select BCMA_DRIVER_PCI_HOSTMODE if PCI select BCMA_DRIVER_GPIO default y From fc3553cb9fc5fee18299b599040d2cc2eb17666d Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:00 +0100 Subject: [PATCH 049/395] dt-bindings: mips: Add Luton Luton SoC belongs to the same family as Ocelot. Acked-by: Rob Herring Signed-off-by: Gregory CLEMENT Reviewed-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- Documentation/devicetree/bindings/mips/mscc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mips/mscc.txt b/Documentation/devicetree/bindings/mips/mscc.txt index bc817e984628..cc93fd302553 100644 --- a/Documentation/devicetree/bindings/mips/mscc.txt +++ b/Documentation/devicetree/bindings/mips/mscc.txt @@ -4,7 +4,7 @@ Boards with a SoC of the Microsemi MIPS family shall have the following properties: Required properties: -- compatible: "mscc,ocelot" +- compatible: "mscc,ocelot" or "mscc,luton" * Other peripherals: From 597fa616c49ae06a1a307750a7df9b59205f462f Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:01 +0100 Subject: [PATCH 050/395] dt-bindings: mips: Add Serval and Jaguar2 Serval and Jaguar2 SoCs belong to the same family as Ocelot and Luton. Acked-by: Rob Herring Signed-off-by: Gregory CLEMENT Reviewed-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- Documentation/devicetree/bindings/mips/mscc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mips/mscc.txt b/Documentation/devicetree/bindings/mips/mscc.txt index cc93fd302553..cc916eaeed0a 100644 --- a/Documentation/devicetree/bindings/mips/mscc.txt +++ b/Documentation/devicetree/bindings/mips/mscc.txt @@ -4,7 +4,7 @@ Boards with a SoC of the Microsemi MIPS family shall have the following properties: Required properties: -- compatible: "mscc,ocelot" or "mscc,luton" +- compatible: "mscc,ocelot", "mscc,luton", "mscc,serval" or "mscc,jr2" * Other peripherals: From 700364dadceb8d60ca1bb6ccd3da619a36ecdcde Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:02 +0100 Subject: [PATCH 051/395] MIPS: mscc: Prepare configuration to handle more SoCs Ocelot belongs to a family of SoC named the VCore III. In order to add these new Soc, use the new symbol SOC_VCOREIII instead of a one dedicated to Ocelot. In order to avoid regression on driver building, the MSCC_OCELOT configuration symbol is kept until the driver will be converted. Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/Makefile | 2 +- arch/mips/boot/dts/mscc/Makefile | 2 +- arch/mips/generic/Kconfig | 11 ++++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile index 19027129add8..0259238d7a2e 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -6,7 +6,7 @@ subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img subdir-$(CONFIG_MACH_INGENIC) += ingenic subdir-$(CONFIG_LANTIQ) += lantiq subdir-$(CONFIG_MACH_LOONGSON64) += loongson -subdir-$(CONFIG_MSCC_OCELOT) += mscc +subdir-$(CONFIG_SOC_VCOREIII) += mscc subdir-$(CONFIG_MIPS_MALTA) += mti subdir-$(CONFIG_LEGACY_BOARD_SEAD3) += mti subdir-$(CONFIG_NLM_XLP_BOARD) += netlogic diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile index eb71515871f6..5015ccbbfb23 100644 --- a/arch/mips/boot/dts/mscc/Makefile +++ b/arch/mips/boot/dts/mscc/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -dtb-$(CONFIG_MSCC_OCELOT) += ocelot_pcb123.dtb ocelot_pcb120.dtb +dtb-$(CONFIG_SOC_VCOREIII) += ocelot_pcb123.dtb ocelot_pcb120.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index 55d9aed7ced9..45431b88dded 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -34,14 +34,19 @@ config LEGACY_BOARD_OCELOT bool "Support MSCC Ocelot boards" depends on LEGACY_BOARD_SEAD3=n select LEGACY_BOARDS - select MSCC_OCELOT + select SOC_VCOREIII select SYS_HAS_EARLY_PRINTK select USE_GENERIC_EARLY_PRINTK_8250 -config MSCC_OCELOT +config SOC_VCOREIII bool select GPIOLIB select MSCC_OCELOT_IRQ + select MSCC_OCELOT #will be removed when driver no more use it + +#Will be removed when the driver using it will be converted to SOC_VCOREIII +config MSCC_OCELOT + bool comment "FIT/UHI Boards" @@ -67,7 +72,7 @@ config FIT_IMAGE_FDT_XILFPGA config FIT_IMAGE_FDT_OCELOT bool "Include FDT for Microsemi Ocelot development platforms" - select MSCC_OCELOT + select SOC_VCOREIII help Enable this to include the FDT for the Ocelot development platforms from Microsemi in the FIT kernel image. From 2825f4c0ffcdf6e519e30ecdae4cd8c1c89a1cb9 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:03 +0100 Subject: [PATCH 052/395] MIPS: mscc: Fix configuration name for ocelot legacy boards Ocelots is supported by the generic MIPS build so make it clears that LEGACY_BOARD_OCELOT is only needed for legacy boards which didn't have bootloader supporting device tree. Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index 45431b88dded..eb2a3fa9fcd7 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -31,7 +31,7 @@ comment "MSCC Ocelot doesn't work with SEAD3 enabled" depends on LEGACY_BOARD_SEAD3 config LEGACY_BOARD_OCELOT - bool "Support MSCC Ocelot boards" + bool "Legacy support for Ocelot based boards" depends on LEGACY_BOARD_SEAD3=n select LEGACY_BOARDS select SOC_VCOREIII From 93b834e6cf0e6bbb16f9bd678106648442960aba Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:04 +0100 Subject: [PATCH 053/395] MIPS: mscc: Add luton dtsi Add a device tree include file for the Microsemi Luton SoC which belongs to same family of the Ocelot SoC. It is based on the work of Lars Povlsen . Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/luton.dtsi | 116 +++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 arch/mips/boot/dts/mscc/luton.dtsi diff --git a/arch/mips/boot/dts/mscc/luton.dtsi b/arch/mips/boot/dts/mscc/luton.dtsi new file mode 100644 index 000000000000..2a170b84c5a9 --- /dev/null +++ b/arch/mips/boot/dts/mscc/luton.dtsi @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 Microsemi Corporation */ + +/ { + #address-cells = <1>; + #size-cells = <1>; + compatible = "mscc,luton"; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + compatible = "mips,mips24KEc"; + device_type = "cpu"; + clocks = <&cpu_clk>; + reg = <0>; + }; + }; + + aliases { + serial0 = &uart0; + }; + + cpuintc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + compatible = "mti,cpu-interrupt-controller"; + }; + + cpu_clk: cpu-clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <416666666>; + }; + + ahb_clk: ahb-clk { + compatible = "fixed-factor-clock"; + #clock-cells = <0>; + clocks = <&cpu_clk>; + clock-div = <2>; + clock-mult = <1>; + }; + + ahb@60000000 { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0 0x60000000 0x20000000>; + + interrupt-parent = <&intc>; + + cpu_ctrl: syscon@10000000 { + compatible = "mscc,ocelot-cpu-syscon", "syscon"; + reg = <0x10000000 0x2c>; + }; + + intc: interrupt-controller@10000084 { + compatible = "mscc,luton-icpu-intr"; + reg = <0x10000084 0x70>; + #interrupt-cells = <1>; + interrupt-controller; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + }; + + uart0: serial@10100000 { + pinctrl-0 = <&uart_pins>; + pinctrl-names = "default"; + compatible = "ns16550a"; + reg = <0x10100000 0x20>; + interrupts = <6>; + clocks = <&ahb_clk>; + reg-io-width = <4>; + reg-shift = <2>; + + status = "disabled"; + }; + + i2c0: i2c@10100400 { + compatible = "mscc,ocelot-i2c", "snps,designware-i2c"; + pinctrl-0 = <&i2c_pins>; + pinctrl-names = "default"; + reg = <0x10100400 0x100>, <0x100002a4 0x8>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = <11>; + clocks = <&ahb_clk>; + + status = "disabled"; + }; + + gpio: pinctrl@70068 { + compatible = "mscc,luton-pinctrl"; + reg = <0x70068 0x28>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&gpio 0 0 32>; + interrupt-controller; + interrupts = <13>; + #interrupt-cells = <2>; + + i2c_pins: i2c-pins { + pins = "GPIO_5", "GPIO_6"; + function = "twi"; + }; + + uart_pins: uart-pins { + pins = "GPIO_30", "GPIO_31"; + function = "uart"; + }; + + }; + }; +}; From 72bc5e8b25a0031354a27c1da2b89104e2bf32e3 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:05 +0100 Subject: [PATCH 054/395] MIPS: mscc: Add luton PC0B91 device tree Add a device tree for the Microsemi Luton PCB091 evaluation board. It is based on the work of Lars Povlsen . Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/Makefile | 5 +++- arch/mips/boot/dts/mscc/luton_pcb091.dts | 30 ++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 arch/mips/boot/dts/mscc/luton_pcb091.dts diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile index 5015ccbbfb23..40699b44ed50 100644 --- a/arch/mips/boot/dts/mscc/Makefile +++ b/arch/mips/boot/dts/mscc/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only -dtb-$(CONFIG_SOC_VCOREIII) += ocelot_pcb123.dtb ocelot_pcb120.dtb +dtb-$(CONFIG_SOC_VCOREIII) += \ + luton_pcb091.dtb \ + ocelot_pcb120.dtb \ + ocelot_pcb123.dtb obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/mips/boot/dts/mscc/luton_pcb091.dts b/arch/mips/boot/dts/mscc/luton_pcb091.dts new file mode 100644 index 000000000000..26ef6285d71d --- /dev/null +++ b/arch/mips/boot/dts/mscc/luton_pcb091.dts @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 Microsemi Corporation + */ + +/dts-v1/; + +#include "luton.dtsi" + +/ { + model = "Luton10 PCB091 Reference Board"; + compatible = "mscc,luton-pcb091", "mscc,luton"; + + aliases { + serial0 = &uart0; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; +}; + +&uart0 { + status = "okay"; +}; + +&i2c0 { + status = "okay"; + i2c-sda-hold-time-ns = <300>; +}; From 378e413fe97e8d66ff4dc90b0b5b6ef3bbc15252 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:06 +0100 Subject: [PATCH 055/395] MIPS: mscc: build FIT image for Luton Luton now has already an u-boot port so let's build FIT images. Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/generic/Kconfig | 8 ++++++++ arch/mips/generic/Platform | 1 + arch/mips/generic/board-luton.its.S | 23 +++++++++++++++++++++++ 3 files changed, 32 insertions(+) create mode 100644 arch/mips/generic/board-luton.its.S diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index eb2a3fa9fcd7..e5a7a1314e71 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -78,6 +78,14 @@ config FIT_IMAGE_FDT_OCELOT from Microsemi in the FIT kernel image. This requires u-boot on the platform. +config FIT_IMAGE_FDT_LUTON + bool "Include FDT for Microsemi Luton development platforms" + select SOC_VCOREIII + help + Enable this to include the FDT for the Luton development platforms + from Microsemi in the FIT kernel image. + This requires u-boot on the platform. + config BOARD_INGENIC bool "Support boards based on Ingenic SoCs" select MACH_INGENIC_GENERIC diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index f8ef2f9d107e..4b6905daa39c 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -20,4 +20,5 @@ its-y := vmlinux.its.S its-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += board-boston.its.S its-$(CONFIG_FIT_IMAGE_FDT_NI169445) += board-ni169445.its.S its-$(CONFIG_FIT_IMAGE_FDT_OCELOT) += board-ocelot.its.S +its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S diff --git a/arch/mips/generic/board-luton.its.S b/arch/mips/generic/board-luton.its.S new file mode 100644 index 000000000000..39a543f62f25 --- /dev/null +++ b/arch/mips/generic/board-luton.its.S @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/ { + images { + fdt@luton_pcb091 { + description = "MSCC Luton PCB091 Device Tree"; + data = /incbin/("boot/dts/mscc/luton_pcb091.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash@0 { + algo = "sha1"; + }; + }; + }; + + configurations { + pcb091 { + description = "Luton Linux kernel"; + kernel = "kernel@0"; + fdt = "fdt@luton_pcb091"; + }; + }; +}; From f84778f7d8c3b867d6aaca9361d7b43b222f9f6a Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:07 +0100 Subject: [PATCH 056/395] MIPS: mscc: Add jaguar2 support Add a device trees and FIT image support for the Microsemi Jaguar2 SoC which belongs to same family of the Ocelot SoC. It is based on the work of Lars Povlsen . Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/Makefile | 3 + arch/mips/boot/dts/mscc/jaguar2.dtsi | 167 ++++++++++++ arch/mips/boot/dts/mscc/jaguar2_common.dtsi | 25 ++ arch/mips/boot/dts/mscc/jaguar2_pcb110.dts | 267 ++++++++++++++++++++ arch/mips/boot/dts/mscc/jaguar2_pcb111.dts | 107 ++++++++ arch/mips/boot/dts/mscc/jaguar2_pcb118.dts | 57 +++++ arch/mips/generic/Kconfig | 8 + arch/mips/generic/Platform | 1 + arch/mips/generic/board-jaguar2.its.S | 40 +++ 9 files changed, 675 insertions(+) create mode 100644 arch/mips/boot/dts/mscc/jaguar2.dtsi create mode 100644 arch/mips/boot/dts/mscc/jaguar2_common.dtsi create mode 100644 arch/mips/boot/dts/mscc/jaguar2_pcb110.dts create mode 100644 arch/mips/boot/dts/mscc/jaguar2_pcb111.dts create mode 100644 arch/mips/boot/dts/mscc/jaguar2_pcb118.dts create mode 100644 arch/mips/generic/board-jaguar2.its.S diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile index 40699b44ed50..befda72ceb26 100644 --- a/arch/mips/boot/dts/mscc/Makefile +++ b/arch/mips/boot/dts/mscc/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only dtb-$(CONFIG_SOC_VCOREIII) += \ + jaguar2_pcb110.dtb \ + jaguar2_pcb111.dtb \ + jaguar2_pcb118.dtb \ luton_pcb091.dtb \ ocelot_pcb120.dtb \ ocelot_pcb123.dtb diff --git a/arch/mips/boot/dts/mscc/jaguar2.dtsi b/arch/mips/boot/dts/mscc/jaguar2.dtsi new file mode 100644 index 000000000000..42b2b0a51ddc --- /dev/null +++ b/arch/mips/boot/dts/mscc/jaguar2.dtsi @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 Microsemi Corporation + */ + +/ { + #address-cells = <1>; + #size-cells = <1>; + compatible = "mscc,jr2"; + + aliases { + serial0 = &uart0; + serial1 = &uart2; + gpio0 = &gpio; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + compatible = "mips,mips24KEc"; + device_type = "cpu"; + clocks = <&cpu_clk>; + reg = <0>; + }; + }; + + cpuintc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + compatible = "mti,cpu-interrupt-controller"; + }; + + cpu_clk: cpu-clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <500000000>; + }; + + ahb_clk: ahb-clk { + compatible = "fixed-factor-clock"; + #clock-cells = <0>; + clocks = <&cpu_clk>; + clock-div = <2>; + clock-mult = <1>; + }; + + ahb: ahb { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + interrupt-parent = <&intc>; + + cpu_ctrl: syscon@70000000 { + compatible = "mscc,ocelot-cpu-syscon", "syscon"; + reg = <0x70000000 0x2c>; + }; + + intc: interrupt-controller@70000070 { + compatible = "mscc,jaguar2-icpu-intr"; + reg = <0x70000070 0x94>; + #interrupt-cells = <1>; + interrupt-controller; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + }; + + uart0: serial@70100000 { + pinctrl-0 = <&uart_pins>; + pinctrl-names = "default"; + compatible = "ns16550a"; + reg = <0x70100000 0x20>; + interrupts = <6>; + clocks = <&ahb_clk>; + reg-io-width = <4>; + reg-shift = <2>; + + status = "disabled"; + }; + + uart2: serial@70100800 { + pinctrl-0 = <&uart2_pins>; + pinctrl-names = "default"; + compatible = "ns16550a"; + reg = <0x70100800 0x20>; + interrupts = <7>; + clocks = <&ahb_clk>; + reg-io-width = <4>; + reg-shift = <2>; + + status = "disabled"; + }; + + gpio: pinctrl@71010038 { + compatible = "mscc,jaguar2-pinctrl"; + reg = <0x71010038 0x90>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&gpio 0 0 64>; + + uart_pins: uart-pins { + pins = "GPIO_10", "GPIO_11"; + function = "uart"; + }; + + uart2_pins: uart2-pins { + pins = "GPIO_24", "GPIO_25"; + function = "uart2"; + }; + + cs1_pins: cs1-pins { + pins = "GPIO_16"; + function = "si"; + }; + + cs2_pins: cs2-pins { + pins = "GPIO_17"; + function = "si"; + }; + + cs3_pins: cs3-pins { + pins = "GPIO_18"; + function = "si"; + }; + + i2c_pins: i2c-pins { + pins = "GPIO_14", "GPIO_15"; + function = "twi"; + }; + + i2c2_pins: i2c2-pins { + pins = "GPIO_28", "GPIO_29"; + function = "twi2"; + }; + }; + + i2c0: i2c@70100400 { + compatible = "mscc,ocelot-i2c", "snps,designware-i2c"; + status = "disabled"; + pinctrl-0 = <&i2c_pins>; + pinctrl-names = "default"; + reg = <0x70100400 0x100>, <0x700001b8 0x8>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = <8>; + clock-frequency = <100000>; + clocks = <&ahb_clk>; + }; + + i2c2: i2c@70100c00 { + compatible = "mscc,ocelot-i2c", "snps,designware-i2c"; + status = "disabled"; + pinctrl-0 = <&i2c2_pins>; + pinctrl-names = "default"; + reg = <0x70100c00 0x100>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = <8>; + clock-frequency = <100000>; + clocks = <&ahb_clk>; + }; + }; +}; diff --git a/arch/mips/boot/dts/mscc/jaguar2_common.dtsi b/arch/mips/boot/dts/mscc/jaguar2_common.dtsi new file mode 100644 index 000000000000..679ff0d8eda8 --- /dev/null +++ b/arch/mips/boot/dts/mscc/jaguar2_common.dtsi @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 Microsemi Corporation + */ + +#include "jaguar2.dtsi" + +/ { + chosen { + stdout-path = "serial0:115200n8"; + }; +}; + +&uart0 { + status = "okay"; +}; + +&uart2 { + status = "okay"; +}; + +&i2c0 { + status = "okay"; + i2c-sda-hold-time-ns = <300>; +}; diff --git a/arch/mips/boot/dts/mscc/jaguar2_pcb110.dts b/arch/mips/boot/dts/mscc/jaguar2_pcb110.dts new file mode 100644 index 000000000000..d80cd6842b2a --- /dev/null +++ b/arch/mips/boot/dts/mscc/jaguar2_pcb110.dts @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 Microsemi Corporation + */ + +/dts-v1/; +#include "jaguar2_common.dtsi" +#include + +/ { + model = "Jaguar2 Cu8-Sfp16 PCB110 Reference Board"; + compatible = "mscc,jr2-pcb110", "mscc,jr2"; + + aliases { + i2c0 = &i2c0; + i2c108 = &i2c108; + i2c109 = &i2c109; + i2c110 = &i2c110; + i2c111 = &i2c111; + i2c112 = &i2c112; + i2c113 = &i2c113; + i2c114 = &i2c114; + i2c115 = &i2c115; + i2c116 = &i2c116; + i2c117 = &i2c117; + i2c118 = &i2c118; + i2c119 = &i2c119; + i2c120 = &i2c120; + i2c121 = &i2c121; + i2c122 = &i2c122; + i2c123 = &i2c123; + i2c124 = &i2c124; + i2c125 = &i2c125; + i2c126 = &i2c126; + i2c127 = &i2c127; + i2c128 = &i2c128; + i2c129 = &i2c129; + i2c130 = &i2c130; + i2c131 = &i2c131; + i2c149 = &i2c149; + i2c150 = &i2c150; + i2c151 = &i2c151; + i2c152 = &i2c152; + }; + i2c0_imux: i2c0-imux { + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + i2c-parent = <&i2c0>; + pinctrl-names = + "i2c149", "i2c150", "i2c151", "i2c152", "idle"; + pinctrl-0 = <&i2cmux_0>; + pinctrl-1 = <&i2cmux_1>; + pinctrl-2 = <&i2cmux_2>; + pinctrl-3 = <&i2cmux_3>; + pinctrl-4 = <&i2cmux_pins_i>; + i2c149: i2c@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c150: i2c@1 { + reg = <0x1>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c151: i2c@2 { + reg = <0x2>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c152: i2c@3 { + reg = <0x3>; + #address-cells = <1>; + #size-cells = <0>; + }; + }; + i2c0_emux: i2c0-emux { + compatible = "i2c-mux-gpio"; + #address-cells = <1>; + #size-cells = <0>; + i2c-parent = <&i2c0>; + mux-gpios = <&gpio 51 GPIO_ACTIVE_HIGH + &gpio 52 GPIO_ACTIVE_HIGH + &gpio 53 GPIO_ACTIVE_HIGH + &gpio 58 GPIO_ACTIVE_HIGH + &gpio 59 GPIO_ACTIVE_HIGH>; + idle-state = <0x0>; + i2c108: i2c@10 { + reg = <0x10>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c109: i2c@11 { + reg = <0x11>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c110: i2c@12 { + reg = <0x12>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c111: i2c@13 { + reg = <0x13>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c112: i2c@14 { + reg = <0x14>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c113: i2c@15 { + reg = <0x15>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c114: i2c@16 { + reg = <0x16>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c115: i2c@17 { + reg = <0x17>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c116: i2c@8 { + reg = <0x8>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c117: i2c@9 { + reg = <0x9>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c118: i2c@a { + reg = <0xa>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c119: i2c@b { + reg = <0xb>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c120: i2c@c { + reg = <0xc>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c121: i2c@d { + reg = <0xd>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c122: i2c@e { + reg = <0xe>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c123: i2c@f { + reg = <0xf>; + #address-cells = <1>; + #size-cells = <0>; + }; + }; +}; + +&gpio { + synce_pins: synce-pins { + // GPIO 16 == SI_nCS1 + pins = "GPIO_16"; + function = "si"; + }; + synce_builtin_pins: synce-builtin-pins { + // GPIO 49 == SI_nCS13 + pins = "GPIO_49"; + function = "si"; + }; + i2cmux_pins_i: i2cmux-pins-i { + pins = "GPIO_17", "GPIO_18", "GPIO_20", "GPIO_21"; + function = "twi_scl_m"; + output-low; + }; + i2cmux_0: i2cmux-0 { + pins = "GPIO_17"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_1: i2cmux-1 { + pins = "GPIO_18"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_2: i2cmux-2 { + pins = "GPIO_20"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_3: i2cmux-3 { + pins = "GPIO_21"; + function = "twi_scl_m"; + output-high; + }; +}; + +&i2c0 { + pca9545@70 { + compatible = "nxp,pca9545"; + reg = <0x70>; + #address-cells = <1>; + #size-cells = <0>; + i2c-mux-idle-disconnect; + i2c124: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + i2c125: i2c@1 { + /* FMC B */ + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + i2c126: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + i2c127: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; + pca9545@71 { + compatible = "nxp,pca9545"; + reg = <0x71>; + #address-cells = <1>; + #size-cells = <0>; + i2c-mux-idle-disconnect; + i2c128: i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + }; + i2c129: i2c@1 { + /* FMC B */ + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + }; + i2c130: i2c@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <2>; + }; + i2c131: i2c@3 { + #address-cells = <1>; + #size-cells = <0>; + reg = <3>; + }; + }; +}; diff --git a/arch/mips/boot/dts/mscc/jaguar2_pcb111.dts b/arch/mips/boot/dts/mscc/jaguar2_pcb111.dts new file mode 100644 index 000000000000..813c5e16013c --- /dev/null +++ b/arch/mips/boot/dts/mscc/jaguar2_pcb111.dts @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2018 Microsemi Corporation + */ + +/dts-v1/; +#include "jaguar2_common.dtsi" + +/ { + model = "Jaguar2 Cu48 PCB111 Reference Board"; + compatible = "mscc,jr2-pcb111", "mscc,jr2"; + + aliases { + i2c0 = &i2c0; + i2c149 = &i2c149; + i2c150 = &i2c150; + i2c151 = &i2c151; + i2c152 = &i2c152; + i2c203 = &i2c203; + }; + + i2c0_imux: i2c0-imux { + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + i2c-parent = <&i2c0>; + pinctrl-names = + "i2c149", "i2c150", "i2c151", "i2c152", "i2c203", "idle"; + pinctrl-0 = <&i2cmux_0>; + pinctrl-1 = <&i2cmux_1>; + pinctrl-2 = <&i2cmux_2>; + pinctrl-3 = <&i2cmux_3>; + pinctrl-4 = <&i2cmux_pins_i>; // Added by convention for PoE + pinctrl-5 = <&i2cmux_pins_i>; + i2c149: i2c@0 { + reg = <0x0>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c150: i2c@1 { + reg = <0x1>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c151: i2c@2 { + reg = <0x2>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c152: i2c@3 { + reg = <0x3>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c203: i2c@4 { + reg = <0x4>; + #address-cells = <1>; + #size-cells = <0>; + }; + }; +}; + +&gpio { + synce_builtin_pins: synce-builtin-pins { + // GPIO 49 == SI_nCS13 + pins = "GPIO_49"; + function = "si"; + }; + cpld_pins: cpld-pins { + // GPIO 50 == SI_nCS14 + pins = "GPIO_50"; + function = "si"; + }; + cpld_fifo_pins: synce-builtin-pins { + // GPIO 51 == SI_nCS15 + pins = "GPIO_51"; + function = "si"; + }; +}; + +&gpio { + i2cmux_pins_i: i2cmux-pins-i { + pins = "GPIO_17", "GPIO_18"; + function = "twi_scl_m"; + output-low; + }; + i2cmux_0: i2cmux-0 { + pins = "GPIO_17"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_1: i2cmux-1 { + pins = "GPIO_18"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_2: i2cmux-2 { + pins = "GPIO_20"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_3: i2cmux-3 { + pins = "GPIO_21"; + function = "twi_scl_m"; + output-high; + }; +}; diff --git a/arch/mips/boot/dts/mscc/jaguar2_pcb118.dts b/arch/mips/boot/dts/mscc/jaguar2_pcb118.dts new file mode 100644 index 000000000000..27c644f2d17f --- /dev/null +++ b/arch/mips/boot/dts/mscc/jaguar2_pcb118.dts @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2018 Microsemi Corporation + */ + +/dts-v1/; +#include "jaguar2_common.dtsi" + +/ { + model = "Jaguar2/Aquantia PCB118 Reference Board"; + compatible = "mscc,jr2-pcb118", "mscc,jr2"; + + aliases { + i2c150 = &i2c150; + i2c151 = &i2c151; + }; + + i2c0_imux: i2c0-imux { + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + i2c-parent = <&i2c0>; + pinctrl-names = + "i2c150", "i2c151", "idle"; + pinctrl-0 = <&i2cmux_0>; + pinctrl-1 = <&i2cmux_1>; + pinctrl-2 = <&i2cmux_pins_i>; + i2c150: i2c@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c151: i2c@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + }; + }; +}; + +&gpio { + i2cmux_pins_i: i2cmux-pins-i { + pins = "GPIO_17", "GPIO_16"; + function = "twi_scl_m"; + output-low; + }; + i2cmux_0: i2cmux-0 { + pins = "GPIO_17"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_1: i2cmux-1 { + pins = "GPIO_16"; + function = "twi_scl_m"; + output-high; + }; +}; diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index e5a7a1314e71..c7a840b8eaa6 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -86,6 +86,14 @@ config FIT_IMAGE_FDT_LUTON from Microsemi in the FIT kernel image. This requires u-boot on the platform. +config FIT_IMAGE_FDT_JAGUAR2 + bool "Include FDT for Microsemi Jaguar2 development platforms" + select SOC_VCOREIII + help + Enable this to include the FDT for the Jaguar2 development platforms + from Microsemi in the FIT kernel image. + This requires u-boot on the platform. + config BOARD_INGENIC bool "Support boards based on Ingenic SoCs" select MACH_INGENIC_GENERIC diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index 4b6905daa39c..3f2055bea596 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -21,4 +21,5 @@ its-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += board-boston.its.S its-$(CONFIG_FIT_IMAGE_FDT_NI169445) += board-ni169445.its.S its-$(CONFIG_FIT_IMAGE_FDT_OCELOT) += board-ocelot.its.S its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S +its-$(CONFIG_FIT_IMAGE_FDT_JAGUAR2) += board-jaguar2.its.S its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S diff --git a/arch/mips/generic/board-jaguar2.its.S b/arch/mips/generic/board-jaguar2.its.S new file mode 100644 index 000000000000..fb0e589eeff7 --- /dev/null +++ b/arch/mips/generic/board-jaguar2.its.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/ { + images { + fdt@jaguar2_pcb110 { + description = "MSCC Jaguar2 PCB110 Device Tree"; + data = /incbin/("boot/dts/mscc/jaguar2_pcb110.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash@0 { + algo = "sha1"; + }; + }; + fdt@jaguar2_pcb111 { + description = "MSCC Jaguar2 PCB111 Device Tree"; + data = /incbin/("boot/dts/mscc/jaguar2_pcb111.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash@0 { + algo = "sha1"; + }; + }; + }; + + configurations { + pcb110 { + description = "Jaguar2 Linux kernel"; + kernel = "kernel@0"; + fdt = "fdt@jaguar2_pcb110"; + ramdisk = "ramdisk"; + }; + pcb111 { + description = "Jaguar2 Linux kernel"; + kernel = "kernel@0"; + fdt = "fdt@jaguar2_pcb111"; + ramdisk = "ramdisk"; + }; + }; +}; From fe0052018a84d50be034449b4175177f569fbf5c Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Tue, 10 Nov 2020 12:45:08 +0100 Subject: [PATCH 057/395] MIPS: mscc: Add serval support Add a device trees and FIT image support for the Microsemi Serval SoC which belongs to same family of the Ocelot SoC. It is based on the work of Lars Povlsen . Signed-off-by: Gregory CLEMENT Acked-by: Alexandre Belloni Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/mscc/Makefile | 5 +- arch/mips/boot/dts/mscc/serval.dtsi | 153 +++++++++++++++++++++ arch/mips/boot/dts/mscc/serval_common.dtsi | 127 +++++++++++++++++ arch/mips/boot/dts/mscc/serval_pcb105.dts | 17 +++ arch/mips/boot/dts/mscc/serval_pcb106.dts | 17 +++ arch/mips/generic/Kconfig | 8 ++ arch/mips/generic/Platform | 1 + arch/mips/generic/board-serval.its.S | 24 ++++ 8 files changed, 351 insertions(+), 1 deletion(-) create mode 100644 arch/mips/boot/dts/mscc/serval.dtsi create mode 100644 arch/mips/boot/dts/mscc/serval_common.dtsi create mode 100644 arch/mips/boot/dts/mscc/serval_pcb105.dts create mode 100644 arch/mips/boot/dts/mscc/serval_pcb106.dts create mode 100644 arch/mips/generic/board-serval.its.S diff --git a/arch/mips/boot/dts/mscc/Makefile b/arch/mips/boot/dts/mscc/Makefile index befda72ceb26..eeb6b7aae83b 100644 --- a/arch/mips/boot/dts/mscc/Makefile +++ b/arch/mips/boot/dts/mscc/Makefile @@ -5,6 +5,9 @@ dtb-$(CONFIG_SOC_VCOREIII) += \ jaguar2_pcb118.dtb \ luton_pcb091.dtb \ ocelot_pcb120.dtb \ - ocelot_pcb123.dtb + ocelot_pcb123.dtb \ + serval_pcb105.dtb \ + serval_pcb106.dtb + obj-$(CONFIG_BUILTIN_DTB) += $(addsuffix .o, $(dtb-y)) diff --git a/arch/mips/boot/dts/mscc/serval.dtsi b/arch/mips/boot/dts/mscc/serval.dtsi new file mode 100644 index 000000000000..089ce89df190 --- /dev/null +++ b/arch/mips/boot/dts/mscc/serval.dtsi @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2018 Microsemi Corporation + */ + +/ { + #address-cells = <1>; + #size-cells = <1>; + compatible = "mscc,serval"; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + compatible = "mips,mips24KEc"; + device_type = "cpu"; + clocks = <&cpu_clk>; + reg = <0>; + }; + }; + + aliases { + serial0 = &uart0; + gpio0 = &gpio; + }; + + cpuintc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + compatible = "mti,cpu-interrupt-controller"; + }; + + cpu_clk: cpu-clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <416666666>; + }; + + ahb_clk: ahb-clk { + compatible = "fixed-factor-clock"; + #clock-cells = <0>; + clocks = <&cpu_clk>; + clock-div = <2>; + clock-mult = <1>; + }; + + ahb: ahb { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + + interrupt-parent = <&intc>; + + cpu_ctrl: syscon@70000000 { + compatible = "mscc,ocelot-cpu-syscon", "syscon"; + reg = <0x70000000 0x2c>; + }; + + intc: interrupt-controller@70000070 { + compatible = "mscc,serval-icpu-intr"; + reg = <0x70000070 0x70>; + #interrupt-cells = <1>; + interrupt-controller; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + }; + + uart0: serial@70100000 { + pinctrl-0 = <&uart_pins>; + pinctrl-names = "default"; + compatible = "ns16550a"; + reg = <0x70100000 0x20>; + interrupts = <6>; + clocks = <&ahb_clk>; + reg-io-width = <4>; + reg-shift = <2>; + + status = "disabled"; + }; + + uart2: serial@70100800 { + pinctrl-0 = <&uart2_pins>; + pinctrl-names = "default"; + compatible = "ns16550a"; + reg = <0x70100800 0x20>; + interrupts = <7>; + clocks = <&ahb_clk>; + reg-io-width = <4>; + reg-shift = <2>; + + status = "disabled"; + }; + + gpio: pinctrl@71070034 { + compatible = "mscc,serval-pinctrl"; + reg = <0x71070034 0x28>; + gpio-controller; + #gpio-cells = <2>; + gpio-ranges = <&gpio 0 0 22>; + + sgpio_pins: sgpio-pins { + pins = "GPIO_0", "GPIO_2", "GPIO_3", "GPIO_1"; + function = "sg0"; + }; + + i2c_pins: i2c-pins { + pins = "GPIO_6", "GPIO_7"; + function = "twi"; + }; + + uart_pins: uart-pins { + pins = "GPIO_26", "GPIO_27"; + function = "uart"; + }; + + uart2_pins: uart2-pins { + pins = "GPIO_13", "GPIO_14"; + function = "uart2"; + }; + + cs1_pins: cs1-pins { + pins = "GPIO_8"; + function = "si"; + }; + + irqext0_pins: irqext0-pins { + pins = "GPIO_28"; + function = "irq0"; + }; + + irqext1_pins: irqext1-pins { + pins = "GPIO_29"; + function = "irq1"; + }; + }; + + i2c0: i2c@70100400 { + compatible = "mscc,ocelot-i2c", "snps,designware-i2c"; + status = "disabled"; + pinctrl-0 = <&i2c_pins>; + pinctrl-names = "default"; + reg = <0x70100400 0x100>, <0x70000190 0x8>; + #address-cells = <1>; + #size-cells = <0>; + interrupts = <8>; + clock-frequency = <100000>; + clocks = <&ahb_clk>; + }; + }; +}; diff --git a/arch/mips/boot/dts/mscc/serval_common.dtsi b/arch/mips/boot/dts/mscc/serval_common.dtsi new file mode 100644 index 000000000000..5b404836db5e --- /dev/null +++ b/arch/mips/boot/dts/mscc/serval_common.dtsi @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2020 Microsemi Corporation + */ + +#include "serval.dtsi" + +/ { + aliases { + serial0 = &uart0; + i2c104 = &i2c104; + i2c105 = &i2c105; + i2c106 = &i2c106; + i2c107 = &i2c107; + i2c108 = &i2c108; + i2c109 = &i2c109; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + i2c0_imux: i2c0-imux{ + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + i2c-parent = <&i2c0>; + pinctrl-names = + "i2c104", "i2c105", "i2c106", "i2c107", + "i2c108", "i2c109", "idle"; + pinctrl-0 = <&i2cmux_0>; + pinctrl-1 = <&i2cmux_1>; + pinctrl-2 = <&i2cmux_2>; + pinctrl-3 = <&i2cmux_3>; + pinctrl-4 = <&i2cmux_4>; + pinctrl-5 = <&i2cmux_5>; + pinctrl-6 = <&i2cmux_pins_i>; + i2c104: i2c_sfp0@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c105: i2c_sfp1@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c106: i2c_sfp2@2 { + reg = <2>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c107: i2c_sfp3@3 { + reg = <3>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c108: i2c_sfp4@4 { + reg = <4>; + #address-cells = <1>; + #size-cells = <0>; + }; + i2c109: i2c_sfp5@5 { + reg = <5>; + #address-cells = <1>; + #size-cells = <0>; + }; +}; + +}; + +&uart0 { + status = "okay"; +}; + +&uart2 { + status = "okay"; +}; + +&gpio { + i2c_pins: i2c-pins { + pins = "GPIO_7"; /* No "default" scl for i2c0 */ + function = "twi"; + }; + i2cmux_pins_i: i2cmux-pins-i { + pins = "GPIO_11", "GPIO_12", "GPIO_18", "GPIO_19", + "GPIO_20", "GPIO_21"; + function = "twi_scl_m"; + output-low; + }; + i2cmux_0: i2cmux-0 { + pins = "GPIO_11"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_1: i2cmux-1 { + pins = "GPIO_12"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_2: i2cmux-2 { + pins = "GPIO_18"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_3: i2cmux-3 { + pins = "GPIO_19"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_4: i2cmux-4 { + pins = "GPIO_20"; + function = "twi_scl_m"; + output-high; + }; + i2cmux_5: i2cmux-5 { + pins = "GPIO_21"; + function = "twi_scl_m"; + output-high; + }; +}; + +&i2c0 { + status = "okay"; + i2c-sda-hold-time-ns = <300>; +}; + diff --git a/arch/mips/boot/dts/mscc/serval_pcb105.dts b/arch/mips/boot/dts/mscc/serval_pcb105.dts new file mode 100644 index 000000000000..a1b0012b79d3 --- /dev/null +++ b/arch/mips/boot/dts/mscc/serval_pcb105.dts @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2018 Microsemi Corporation + */ + +/dts-v1/; +#include "serval_common.dtsi" + +/ { + model = "Serval PCB105 Reference Board"; + compatible = "mscc,serval-pcb105", "mscc,serval"; + + aliases { + }; + +}; + diff --git a/arch/mips/boot/dts/mscc/serval_pcb106.dts b/arch/mips/boot/dts/mscc/serval_pcb106.dts new file mode 100644 index 000000000000..237be7c8da57 --- /dev/null +++ b/arch/mips/boot/dts/mscc/serval_pcb106.dts @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR MIT) +/* + * Copyright (c) 2018 Microsemi Corporation + */ + +/dts-v1/; +#include "serval_common.dtsi" + +/ { + model = "Serval PCB106 Reference Board"; + compatible = "mscc,serval-pcb106", "mscc,serval"; + + aliases { + }; + +}; + diff --git a/arch/mips/generic/Kconfig b/arch/mips/generic/Kconfig index c7a840b8eaa6..657dd93c5e76 100644 --- a/arch/mips/generic/Kconfig +++ b/arch/mips/generic/Kconfig @@ -94,6 +94,14 @@ config FIT_IMAGE_FDT_JAGUAR2 from Microsemi in the FIT kernel image. This requires u-boot on the platform. +config FIT_IMAGE_FDT_SERVAL + bool "Include FDT for Microsemi Serval development platforms" + select SOC_VCOREIII + help + Enable this to include the FDT for the Serval development platforms + from Microsemi in the FIT kernel image. + This requires u-boot on the platform. + config BOARD_INGENIC bool "Support boards based on Ingenic SoCs" select MACH_INGENIC_GENERIC diff --git a/arch/mips/generic/Platform b/arch/mips/generic/Platform index 3f2055bea596..b871af16b5b6 100644 --- a/arch/mips/generic/Platform +++ b/arch/mips/generic/Platform @@ -22,4 +22,5 @@ its-$(CONFIG_FIT_IMAGE_FDT_NI169445) += board-ni169445.its.S its-$(CONFIG_FIT_IMAGE_FDT_OCELOT) += board-ocelot.its.S its-$(CONFIG_FIT_IMAGE_FDT_LUTON) += board-luton.its.S its-$(CONFIG_FIT_IMAGE_FDT_JAGUAR2) += board-jaguar2.its.S +its-$(CONFIG_FIT_IMAGE_FDT_SERVAL) += board-serval.its.S its-$(CONFIG_FIT_IMAGE_FDT_XILFPGA) += board-xilfpga.its.S diff --git a/arch/mips/generic/board-serval.its.S b/arch/mips/generic/board-serval.its.S new file mode 100644 index 000000000000..4ea4fc9d757f --- /dev/null +++ b/arch/mips/generic/board-serval.its.S @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/ { + images { + fdt@serval_pcb105 { + description = "MSCC Serval PCB105 Device Tree"; + data = /incbin/("boot/dts/mscc/serval_pcb105.dtb"); + type = "flat_dt"; + arch = "mips"; + compression = "none"; + hash@0 { + algo = "sha1"; + }; + }; + }; + + configurations { + pcb105 { + description = "Serval Linux kernel"; + kernel = "kernel@0"; + fdt = "fdt@serval_pcb105"; + ramdisk = "ramdisk"; + }; + }; +}; From 9d63bcb87157c90899525d9db25b37106cd0afe3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 1 Nov 2020 15:12:52 +0000 Subject: [PATCH 058/395] mips: boot: clean up self-extracting targets scenarios 1. All final targets like vmlinuz.{bin,ecoff,srec} etc. should reside in $(objtree)/arch/mips/boot, not in the root $(objtree) directory. The only file that should be left there is vmlinuz, similar to other architectures. 2. Add all the targets to $(targets) variable, so they'll be properly accounted by Kbuild. This also allows to remove redundant $(clean-files) (which were missing uzImage BTW). 3. Prefix all targets with $(obj)/$(objtree), depending on their locations. Misc: fix the identation of the 'STRIP' quiet message. Signed-off-by: Alexander Lobakin Acked-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- arch/mips/Makefile | 2 +- arch/mips/boot/.gitignore | 1 + arch/mips/boot/compressed/Makefile | 50 +++++++++++++++++++++--------- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 0d0f29d662c9..622ee83dbb9b 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -378,7 +378,7 @@ ifdef CONFIG_SYS_SUPPORTS_ZBOOT # boot/compressed $(bootz-y): $(vmlinux-32) FORCE $(Q)$(MAKE) $(build)=arch/mips/boot/compressed \ - $(bootvars-y) 32bit-bfd=$(32bit-bfd) $@ + $(bootvars-y) 32bit-bfd=$(32bit-bfd) arch/mips/boot/$@ else vmlinuz: FORCE @echo ' CONFIG_SYS_SUPPORTS_ZBOOT is not enabled' diff --git a/arch/mips/boot/.gitignore b/arch/mips/boot/.gitignore index 2adc8581a175..1c7adddf2e60 100644 --- a/arch/mips/boot/.gitignore +++ b/arch/mips/boot/.gitignore @@ -2,6 +2,7 @@ mkboot elf2ecoff vmlinux.* +vmlinuz.* zImage zImage.tmp calc_vmlinuz_load_addr diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index d66511825fe1..fa2c1e1b303f 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -65,7 +65,9 @@ $(obj)/bswapsi.c: $(obj)/%.c: $(srctree)/arch/mips/lib/%.c FORCE targets := $(notdir $(vmlinuzobjs-y)) targets += vmlinux.bin + OBJCOPYFLAGS_vmlinux.bin := $(OBJCOPYFLAGS) -O binary -R .comment -S + $(obj)/vmlinux.bin: $(KBUILD_IMAGE) FORCE $(call if_changed,objcopy) @@ -78,12 +80,15 @@ tool_$(CONFIG_KERNEL_XZ) = xzkern tool_$(CONFIG_KERNEL_ZSTD) = zstd22 targets += vmlinux.bin.z + $(obj)/vmlinux.bin.z: $(obj)/vmlinux.bin FORCE $(call if_changed,$(tool_y)) targets += piggy.o dummy.o + OBJCOPYFLAGS_piggy.o := --add-section=.image=$(obj)/vmlinux.bin.z \ --set-section-flags=.image=contents,alloc,load,readonly,data + $(obj)/piggy.o: $(obj)/dummy.o $(obj)/vmlinux.bin.z FORCE $(call if_changed,objcopy) @@ -102,14 +107,21 @@ UIMAGE_LOADADDR = $(VMLINUZ_LOAD_ADDRESS) vmlinuzobjs-y += $(obj)/piggy.o +targets += ../../../../vmlinuz + quiet_cmd_zld = LD $@ cmd_zld = $(LD) $(KBUILD_LDFLAGS) -Ttext $(VMLINUZ_LOAD_ADDRESS) -T $< $(vmlinuzobjs-y) -o $@ -quiet_cmd_strip = STRIP $@ +quiet_cmd_strip = STRIP $@ cmd_strip = $(STRIP) -s $@ -vmlinuz: $(src)/ld.script $(vmlinuzobjs-y) $(obj)/calc_vmlinuz_load_addr + +$(objtree)/vmlinuz: $(src)/ld.script $(vmlinuzobjs-y) $(obj)/calc_vmlinuz_load_addr $(call cmd,zld) $(call cmd,strip) +objboot := $(objtree)/arch/mips/boot + +$(objboot)/vmlinuz: $(objtree)/vmlinuz FORCE + # # Some DECstations need all possible sections of an ECOFF executable # @@ -121,34 +133,42 @@ endif hostprogs += ../elf2ecoff ifdef CONFIG_32BIT - VMLINUZ = vmlinuz + VMLINUZ = $(objtree)/vmlinuz else - VMLINUZ = vmlinuz.32 + VMLINUZ = $(objboot)/vmlinuz.32 endif +targets += ../vmlinuz.32 + quiet_cmd_32 = OBJCOPY $@ cmd_32 = $(OBJCOPY) -O $(32bit-bfd) $(OBJCOPYFLAGS) $< $@ -vmlinuz.32: vmlinuz + +$(objboot)/vmlinuz.32: $(objtree)/vmlinuz $(call cmd,32) +targets += ../vmlinuz.ecoff + quiet_cmd_ecoff = ECOFF $@ cmd_ecoff = $< $(VMLINUZ) $@ $(e2eflag) -vmlinuz.ecoff: $(obj)/../elf2ecoff $(VMLINUZ) + +$(objboot)/vmlinuz.ecoff: $(objboot)/elf2ecoff $(VMLINUZ) $(call cmd,ecoff) +targets += ../vmlinuz.bin + OBJCOPYFLAGS_vmlinuz.bin := $(OBJCOPYFLAGS) -O binary -vmlinuz.bin: vmlinuz + +$(objboot)/vmlinuz.bin: $(objtree)/vmlinuz $(call cmd,objcopy) +targets += ../vmlinuz.srec + OBJCOPYFLAGS_vmlinuz.srec := $(OBJCOPYFLAGS) -S -O srec -vmlinuz.srec: vmlinuz + +$(objboot)/vmlinuz.srec: $(objtree)/vmlinuz $(call cmd,objcopy) -uzImage.bin: vmlinuz.bin FORCE - $(call if_changed,uimage,none) +targets += ../uzImage.bin -clean-files += $(objtree)/vmlinuz -clean-files += $(objtree)/vmlinuz.32 -clean-files += $(objtree)/vmlinuz.ecoff -clean-files += $(objtree)/vmlinuz.bin -clean-files += $(objtree)/vmlinuz.srec +$(objboot)/uzImage.bin: $(objboot)/vmlinuz.bin FORCE + $(call if_changed,uimage,none) From a3fb655027c33a9281d3b813798b15bdf1e75d43 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 1 Nov 2020 15:13:01 +0000 Subject: [PATCH 059/395] mips: boot: add support for self-extracting FIT images (vmlinuz.itb) Commit c3e2ee657418 ("MIPS: generic: Add support for zboot") added support for self-extracting images to Generic MIPS. However, the intended way to boot Generic MIPS kernels is using FIT Images and UHI boot protocol, but currently there's no way to make self-extracting FIT Image (only legacy uzImages). Add a target for this named "vmlinuz.itb", which will consist of vmlinuz.bin and selected DT blobs. It will allow to have the advantages of both UHI and self-extracting images. Signed-off-by: Alexander Lobakin Acked-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- arch/mips/Makefile | 1 + arch/mips/boot/compressed/Makefile | 48 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 622ee83dbb9b..cd4343edeb11 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -347,6 +347,7 @@ bootz-y += vmlinuz.srec ifeq ($(shell expr $(zload-y) \< 0xffffffff80000000 2> /dev/null), 0) bootz-y += uzImage.bin endif +bootz-y += vmlinuz.itb # # Some machines like the Indy need 32-bit ELF binaries for booting purposes. diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index fa2c1e1b303f..4c3bc7e3d56d 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -172,3 +172,51 @@ targets += ../uzImage.bin $(objboot)/uzImage.bin: $(objboot)/vmlinuz.bin FORCE $(call if_changed,uimage,none) + +# +# Flattened Image Tree (.itb) image +# + +ifeq ($(ADDR_BITS),32) +itb_addr_cells = 1 +endif +ifeq ($(ADDR_BITS),64) +itb_addr_cells = 2 +endif + +targets += ../vmlinuz.its.S + +quiet_cmd_its_cat = CAT $@ + cmd_its_cat = cat $(real-prereqs) >$@ + +$(objboot)/vmlinuz.its.S: $(addprefix $(srctree)/arch/mips/$(PLATFORM)/,$(ITS_INPUTS)) FORCE + $(call if_changed,its_cat) + +targets += ../vmlinuz.its + +quiet_cmd_cpp_its_S = ITS $@ + cmd_cpp_its_S = $(CPP) -P -C -o $@ $< \ + -DKERNEL_NAME="\"Linux $(KERNELRELEASE)\"" \ + -DVMLINUX_BINARY="\"$(2)\"" \ + -DVMLINUX_COMPRESSION="\"none\"" \ + -DVMLINUX_LOAD_ADDRESS=$(VMLINUZ_LOAD_ADDRESS) \ + -DVMLINUX_ENTRY_ADDRESS=$(VMLINUZ_LOAD_ADDRESS) \ + -DADDR_BITS=$(ADDR_BITS) \ + -DADDR_CELLS=$(itb_addr_cells) + +$(objboot)/vmlinuz.its: $(objboot)/vmlinuz.its.S FORCE + $(call if_changed,cpp_its_S,vmlinuz.bin) + +targets += ../vmlinuz.itb + +quiet_cmd_itb-image = ITB $@ + cmd_itb-image = \ + env PATH="$(objtree)/scripts/dtc:$(PATH)" \ + $(BASH) $(MKIMAGE) \ + -D "-I dts -O dtb -p 500 \ + --include $(objtree)/arch/mips \ + --warning no-unit_address_vs_reg" \ + -f $(2) $@ + +$(objboot)/vmlinuz.itb: $(objboot)/vmlinuz.its $(objboot)/vmlinuz.bin FORCE + $(call if_changed,itb-image,$<) From b159e86b5a2ab826b3a292756072f4cc523675ab Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 4 Nov 2020 13:01:10 +0100 Subject: [PATCH 060/395] selinux: drop super_block backpointer from superblock_security_struct It appears to have been needed for selinux_complete_init() in the past, but today it's useless. Signed-off-by: Ondrej Mosnacek Signed-off-by: Paul Moore --- security/selinux/hooks.c | 5 ++--- security/selinux/include/objsec.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c46312710e73..ca5c5623b46d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -600,7 +600,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, { const struct cred *cred = current_cred(); struct superblock_security_struct *sbsec = sb->s_security; - struct dentry *root = sbsec->sb->s_root; + struct dentry *root = sb->s_root; struct selinux_mnt_opts *opts = mnt_opts; struct inode_security_struct *root_isec; u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0; @@ -1080,7 +1080,7 @@ static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) return rc; } if (sbsec->flags & ROOTCONTEXT_MNT) { - struct dentry *root = sbsec->sb->s_root; + struct dentry *root = sb->s_root; struct inode_security_struct *isec = backing_inode_security(root); seq_putc(m, ','); seq_puts(m, ROOTCONTEXT_STR); @@ -2568,7 +2568,6 @@ static int selinux_sb_alloc_security(struct super_block *sb) mutex_init(&sbsec->lock); INIT_LIST_HEAD(&sbsec->isec_head); spin_lock_init(&sbsec->isec_lock); - sbsec->sb = sb; sbsec->sid = SECINITSID_UNLABELED; sbsec->def_sid = SECINITSID_FILE; sbsec->mntpoint_sid = SECINITSID_UNLABELED; diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 330b7b6d44e0..ca4d7ab6a835 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -61,7 +61,6 @@ struct file_security_struct { }; struct superblock_security_struct { - struct super_block *sb; /* back pointer to sb object */ u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */ From 7da31b858ec278f90603506ce7fa7eed3c53c8d7 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:26:59 +0800 Subject: [PATCH 061/395] Smack: fix kernel-doc interface on functions The are some kernel-doc interface issues: security/smack/smackfs.c:1950: warning: Function parameter or member 'list' not described in 'smk_parse_label_list' security/smack/smackfs.c:1950: warning: Excess function parameter 'private' description in 'smk_parse_label_list' security/smack/smackfs.c:1979: warning: Function parameter or member 'list' not described in 'smk_destroy_label_list' security/smack/smackfs.c:1979: warning: Excess function parameter 'head' description in 'smk_destroy_label_list' security/smack/smackfs.c:2141: warning: Function parameter or member 'count' not described in 'smk_read_logging' security/smack/smackfs.c:2141: warning: Excess function parameter 'cn' description in 'smk_read_logging' security/smack/smackfs.c:2278: warning: Function parameter or member 'format' not described in 'smk_user_access' Correct them in this patch. Signed-off-by: Alex Shi Cc: Casey Schaufler Cc: James Morris Cc: "Serge E. Hallyn" Cc: linux-security-module@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Casey Schaufler --- security/smack/smackfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index e567b4baf3a0..5d44b7d258ef 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -1942,7 +1942,7 @@ static void smk_list_swap_rcu(struct list_head *public, * smk_parse_label_list - parse list of Smack labels, separated by spaces * * @data: the string to parse - * @private: destination list + * @list: destination list * * Returns zero on success or error code, as appropriate */ @@ -1973,7 +1973,7 @@ static int smk_parse_label_list(char *data, struct list_head *list) /** * smk_destroy_label_list - destroy a list of smack_known_list_elem - * @head: header pointer of the list to destroy + * @list: header pointer of the list to destroy */ void smk_destroy_label_list(struct list_head *list) { @@ -2131,7 +2131,7 @@ static const struct file_operations smk_unconfined_ops = { * smk_read_logging - read() for /smack/logging * @filp: file pointer, not actually used * @buf: where to put the result - * @cn: maximum to send along + * @count: maximum to send along * @ppos: where to start * * Returns number of bytes read or error code, as appropriate @@ -2272,6 +2272,7 @@ static const struct file_operations smk_load_self_ops = { * @buf: data from user space * @count: bytes sent * @ppos: where to start - must be 0 + * @format: /smack/load or /smack/load2 or /smack/change-rule format. */ static ssize_t smk_user_access(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int format) From daaedb820ad716e00210af8859b194c404202b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:09 +0100 Subject: [PATCH 062/395] mtd_blkdevs: don't override BLKFLSBUF BLKFLSBUF is not supposed to actually send a flush command to the device, but to tear down buffer cache structures. Remove the mtd_blkdevs implementation and just use the default semantics instead. Signed-off-by: Christoph Hellwig Acked-by: Richard Weinberger Signed-off-by: Jens Axboe --- drivers/mtd/mtd_blkdevs.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 0c05f77f9b21..fb8e12d590a1 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -298,38 +298,10 @@ static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) return ret; } -static int blktrans_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct mtd_blktrans_dev *dev = blktrans_dev_get(bdev->bd_disk); - int ret = -ENXIO; - - if (!dev) - return ret; - - mutex_lock(&dev->lock); - - if (!dev->mtd) - goto unlock; - - switch (cmd) { - case BLKFLSBUF: - ret = dev->tr->flush ? dev->tr->flush(dev) : 0; - break; - default: - ret = -ENOTTY; - } -unlock: - mutex_unlock(&dev->lock); - blktrans_dev_put(dev); - return ret; -} - static const struct block_device_operations mtd_block_ops = { .owner = THIS_MODULE, .open = blktrans_open, .release = blktrans_release, - .ioctl = blktrans_ioctl, .getgeo = blktrans_getgeo, }; From 4a9d6d667f0bafed55a9e9f5ae8bceb3680749d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:10 +0100 Subject: [PATCH 063/395] block: don't call into the driver for BLKFLSBUF BLKFLSBUF is entirely contained in the block core, and there is no good reason to give the driver a hook into processing it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 3fbc382eb926..c6d8863f0409 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -369,15 +369,8 @@ static inline int is_unrecognized_ioctl(int ret) static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - fsync_bdev(bdev); invalidate_bdev(bdev); return 0; From e00adcadf3af7a8335026d71ab9f0e0a922191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:11 +0100 Subject: [PATCH 064/395] block: add a new set_read_only method Add a new method to allow for driver-specific processing when setting or clearing the block device read-only state. This allows to replace the cumbersome and error-prone override of the whole ioctl implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++++ include/linux/blkdev.h | 1 + 2 files changed, 6 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index c6d8863f0409..a6fa16b97705 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -389,6 +389,11 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } set_device_ro(bdev, n); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..5c1ba8a8d2bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1850,6 +1850,7 @@ struct block_device_operations { void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*set_read_only)(struct block_device *bdev, bool ro); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, From 34919e3ba23a106e04f042f19348d9e55739b35a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:12 +0100 Subject: [PATCH 065/395] rbd: implement ->set_read_only to hook into BLKROSET processing Implement the ->set_read_only method instead of parsing the actual ioctl command. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f84128abade3..671733e459cf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -692,12 +692,9 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) put_device(&rbd_dev->dev); } -static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) +static int rbd_set_read_only(struct block_device *bdev, bool ro) { - int ro; - - if (get_user(ro, (int __user *)arg)) - return -EFAULT; + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; /* * Both images mapped read-only and snapshots can't be marked @@ -710,43 +707,14 @@ static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) rbd_assert(!rbd_is_snap(rbd_dev)); } - /* Let blkdev_roset() handle it */ - return -ENOTTY; + return 0; } -static int rbd_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct rbd_device *rbd_dev = bdev->bd_disk->private_data; - int ret; - - switch (cmd) { - case BLKROSET: - ret = rbd_ioctl_set_ro(rbd_dev, arg); - break; - default: - ret = -ENOTTY; - } - - return ret; -} - -#ifdef CONFIG_COMPAT -static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - return rbd_ioctl(bdev, mode, cmd, arg); -} -#endif /* CONFIG_COMPAT */ - static const struct block_device_operations rbd_bd_ops = { .owner = THIS_MODULE, .open = rbd_open, .release = rbd_release, - .ioctl = rbd_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = rbd_compat_ioctl, -#endif + .set_read_only = rbd_set_read_only, }; /* From 118cf084adb3964d06e1667cf7d702e56e5cd2c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:13 +0100 Subject: [PATCH 066/395] md: implement ->set_read_only to hook into BLKROSET processing Implement the ->set_read_only method instead of parsing the actual ioctl command. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md.c | 62 ++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 98bac4f304ae..96d31336ad43 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7480,7 +7480,6 @@ static inline bool md_ioctl_valid(unsigned int cmd) { switch (cmd) { case ADD_NEW_DISK: - case BLKROSET: case GET_ARRAY_INFO: case GET_BITMAP_FILE: case GET_DISK_INFO: @@ -7507,7 +7506,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; - int ro; bool did_set_md_closing = false; if (!md_ioctl_valid(cmd)) @@ -7687,35 +7685,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto unlock; } break; - - case BLKROSET: - if (get_user(ro, (int __user *)(arg))) { - err = -EFAULT; - goto unlock; - } - err = -EINVAL; - - /* if the bdev is going readonly the value of mddev->ro - * does not matter, no writes are coming - */ - if (ro) - goto unlock; - - /* are we are already prepared for writes? */ - if (mddev->ro != 1) - goto unlock; - - /* transitioning to readauto need only happen for - * arrays that call md_write_start - */ - if (mddev->pers) { - err = restart_array(mddev); - if (err == 0) { - mddev->ro = 2; - set_disk_ro(mddev->gendisk, 0); - } - } - goto unlock; } /* @@ -7809,6 +7778,36 @@ static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif /* CONFIG_COMPAT */ +static int md_set_read_only(struct block_device *bdev, bool ro) +{ + struct mddev *mddev = bdev->bd_disk->private_data; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + + if (!mddev->raid_disks && !mddev->external) { + err = -ENODEV; + goto out_unlock; + } + + /* + * Transitioning to read-auto need only happen for arrays that call + * md_write_start and which are not ready for writes yet. + */ + if (!ro && mddev->ro == 1 && mddev->pers) { + err = restart_array(mddev); + if (err) + goto out_unlock; + mddev->ro = 2; + } + +out_unlock: + mddev_unlock(mddev); + return err; +} + static int md_open(struct block_device *bdev, fmode_t mode) { /* @@ -7886,6 +7885,7 @@ const struct block_device_operations md_fops = #endif .getgeo = md_getgeo, .check_events = md_check_events, + .set_read_only = md_set_read_only, }; static int md_thread(void *arg) From 2c1b6ec170423bb032e825d4d9b2780bac4ce1a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:14 +0100 Subject: [PATCH 067/395] dasd: implement ->set_read_only to hook into BLKROSET processing Implement the ->set_read_only method instead of parsing the actual ioctl command. Signed-off-by: Christoph Hellwig Reviewed-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 1 + drivers/s390/block/dasd_int.h | 3 ++- drivers/s390/block/dasd_ioctl.c | 27 +++++++++------------------ 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index eb17fea8075c..db24e04ee978 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3394,6 +3394,7 @@ dasd_device_operations = { .ioctl = dasd_ioctl, .compat_ioctl = dasd_ioctl, .getgeo = dasd_getgeo, + .set_read_only = dasd_set_read_only, }; /******************************************************************************* diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index fa552f9f1666..c59a0d63b506 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -844,7 +844,8 @@ int dasd_scan_partitions(struct dasd_block *); void dasd_destroy_partitions(struct dasd_block *); /* externals in dasd_ioctl.c */ -int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); +int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); +int dasd_set_read_only(struct block_device *bdev, bool ro); /* externals in dasd_proc.c */ int dasd_proc_init(void); diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index cb6427fb9f3d..3359559517bf 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -532,28 +532,22 @@ static int dasd_ioctl_information(struct dasd_block *block, void __user *argp, /* * Set read only */ -static int -dasd_ioctl_set_ro(struct block_device *bdev, void __user *argp) +int dasd_set_read_only(struct block_device *bdev, bool ro) { struct dasd_device *base; - int intval, rc; + int rc; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + /* do not manipulate hardware state for partitions */ if (bdev_is_partition(bdev)) - // ro setting is not allowed for partitions - return -EINVAL; - if (get_user(intval, (int __user *)argp)) - return -EFAULT; + return 0; + base = dasd_device_from_gendisk(bdev->bd_disk); if (!base) return -ENODEV; - if (!intval && test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) { - dasd_put_device(base); - return -EROFS; - } - set_disk_ro(bdev->bd_disk, intval); - rc = dasd_set_feature(base->cdev, DASD_FEATURE_READONLY, intval); + if (!ro && test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) + rc = -EROFS; + else + rc = dasd_set_feature(base->cdev, DASD_FEATURE_READONLY, ro); dasd_put_device(base); return rc; } @@ -633,9 +627,6 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode, case BIODASDPRRST: rc = dasd_ioctl_reset_profile(block); break; - case BLKROSET: - rc = dasd_ioctl_set_ro(bdev, argp); - break; case DASDAPIVER: rc = dasd_ioctl_api_version(argp); break; From 732e12d805a77f74c907c0a28ece271ef1e81e01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:15 +0100 Subject: [PATCH 068/395] block: don't call into the driver for BLKROSET Now that all drivers that want to hook into setting or clearing the read-only flag use the set_read_only method, this code can be removed. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index a6fa16b97705..96cb45447364 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -346,26 +346,6 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. - */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -384,9 +364,6 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { From 7a2f0ce19f2e2ed93b5ace60375b908392ec8afb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:16 +0100 Subject: [PATCH 069/395] loop: use set_disk_ro Use set_disk_ro instead of set_device_ro to match all other block drivers and to ensure all partitions mirror the read-only flag. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a58084c2ed7c..027f5724fe2c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1138,7 +1138,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (error) goto out_unlock; - set_device_ro(bdev, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); + set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; From 98f49b63e84d4ee1a5c327d0b5f4e8699f6c70fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:17 +0100 Subject: [PATCH 070/395] block: remove set_device_ro Fold set_device_ro into its only remaining caller. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ------- block/ioctl.c | 2 +- include/linux/genhd.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 9387f050c248..b85db1f2233c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1846,13 +1846,6 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; diff --git a/block/ioctl.c b/block/ioctl.c index 96cb45447364..04255dc5f3bf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -371,7 +371,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - set_device_ro(bdev, n); + bdev->bd_part->policy = n; return 0; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..52eb592c874a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -304,7 +304,6 @@ extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) From a7cb3d2f09c8405aed59d97a7d02cebea43cd3c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:18 +0100 Subject: [PATCH 071/395] block: remove __blkdev_driver_ioctl Just open code it in the few callers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 25 +++++-------------------- drivers/block/pktcdvd.c | 6 ++++-- drivers/md/bcache/request.c | 5 +++-- drivers/md/dm.c | 5 ++++- include/linux/blkdev.h | 2 -- 5 files changed, 16 insertions(+), 27 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 04255dc5f3bf..6b785181344f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -219,23 +219,6 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. - */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block @@ -594,10 +577,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 467dbd06b7cd..ef1c1f094ea4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2584,9 +2584,11 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case CDROM_LAST_WRITTEN: case CDROM_SEND_PACKET: case SCSI_IOCTL_SEND_COMMAND: - ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + ret = -ENOTTY; + else + ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); break; - default: pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..afac8d07c1bd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1230,8 +1230,9 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); } void bch_cached_dev_request_init(struct cached_dev *dc) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c18fc2548518..6db395c3d28b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -570,7 +570,10 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, } } - r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + r = -ENOTTY; + else + r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); out: dm_unprepare_ioctl(md, srcu_idx); return r; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c1ba8a8d2bc..05b346a68c2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1867,8 +1867,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, - unsigned long); extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); From 6b3ba9762f9f9f651873af34481ca20e4a6791e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:24 +0100 Subject: [PATCH 072/395] block: cleanup del_gendisk a bit Merge three hidden gendisk checks into one. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index b85db1f2233c..d41176eb1f36 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -895,6 +895,9 @@ void del_gendisk(struct gendisk *disk) might_sleep(); + if (WARN_ON_ONCE(!disk->queue)) + return; + blk_integrity_del(disk); disk_del_events(disk); @@ -917,20 +920,18 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); - if (!(disk->flags & GENHD_FL_HIDDEN)) + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - if (disk->queue) { + /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - if (!(disk->flags & GENHD_FL_HIDDEN)) - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); - } else { - WARN_ON(1); + bdi_unregister(disk->queue->backing_dev_info); } + blk_unregister_queue(disk); + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); /* From 62b508f8b6b1b52843cd90f0b2068ed963f25bd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:25 +0100 Subject: [PATCH 073/395] block: open code kobj_map into in block/genhd.c Copy and paste the kobj_map functionality in the block code in preparation for completely rewriting it. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 130 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index d41176eb1f36..667d1d6fd70a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,16 @@ static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; +struct bdev_map { + struct bdev_map *next; + dev_t dev; + unsigned long range; + struct module *owner; + struct kobject *(*probe)(dev_t, int *, void *); + int (*lock)(dev_t, void *); + void *data; +} *bdev_map[255]; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -520,8 +529,6 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -static struct kobj_map *bdev_map; - /** * blk_mangle_minor - scatter minor numbers apart * @minor: minor number to mangle @@ -648,16 +655,60 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, struct kobject *(*probe)(dev_t, int *, void *), int (*lock)(dev_t, void *), void *data) { - kobj_map(bdev_map, devt, range, module, probe, lock, data); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *p; + n = min(n, 255u); + p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); + if (p == NULL) + return; + + for (i = 0; i < n; i++, p++) { + p->owner = module; + p->probe = probe; + p->lock = lock; + p->dev = devt; + p->range = range; + p->data = data; + } + + mutex_lock(&block_class_lock); + for (i = 0, p -= n; i < n; i++, p++, index++) { + struct bdev_map **s = &bdev_map[index % 255]; + while (*s && (*s)->range < range) + s = &(*s)->next; + p->next = *s; + *s = p; + } + mutex_unlock(&block_class_lock); +} EXPORT_SYMBOL(blk_register_region); void blk_unregister_region(dev_t devt, unsigned long range) { - kobj_unmap(bdev_map, devt, range); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *found = NULL; + mutex_lock(&block_class_lock); + for (i = 0; i < min(n, 255u); i++, index++) { + struct bdev_map **s; + for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { + struct bdev_map *p = *s; + if (p->dev == devt && p->range == range) { + *s = p->next; + if (!found) + found = p; + break; + } + } + } + mutex_unlock(&block_class_lock); + kfree(found); +} EXPORT_SYMBOL(blk_unregister_region); static struct kobject *exact_match(dev_t devt, int *partno, void *data) @@ -979,6 +1030,47 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +{ + struct kobject *kobj; + struct bdev_map *p; + unsigned long best = ~0UL; + +retry: + mutex_lock(&block_class_lock); + for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { + struct kobject *(*probe)(dev_t, int *, void *); + struct module *owner; + void *data; + + if (p->dev > dev || p->dev + p->range - 1 < dev) + continue; + if (p->range - 1 >= best) + break; + if (!try_module_get(p->owner)) + continue; + owner = p->owner; + data = p->data; + probe = p->probe; + best = p->range - 1; + *partno = dev - p->dev; + if (p->lock && p->lock(dev, data) < 0) { + module_put(owner); + continue; + } + mutex_unlock(&block_class_lock); + kobj = probe(dev, partno, data); + /* Currently ->owner protects _only_ ->probe() itself. */ + module_put(owner); + if (kobj) + return dev_to_disk(kobj_to_dev(kobj)); + goto retry; + } + mutex_unlock(&block_class_lock); + return NULL; +} + + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -996,11 +1088,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); + disk = lookup_gendisk(devt, partno); } else { struct hd_struct *part; @@ -1213,6 +1301,22 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data) return NULL; } +static void bdev_map_init(void) +{ + struct bdev_map *base; + int i; + + base = kzalloc(sizeof(*base), GFP_KERNEL); + if (!base) + panic("cannot allocate bdev_map"); + + base->dev = 1; + base->range = ~0 ; + base->probe = base_probe; + for (i = 0; i < 255; i++) + bdev_map[i] = base; +} + static int __init genhd_device_init(void) { int error; @@ -1221,7 +1325,7 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); + bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); From e49fbbbf0aa14f011ab037086f37f58bd058a6ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:26 +0100 Subject: [PATCH 074/395] block: split block_class_lock Split the block_class_lock mutex into one each to protect bdev_map and major_names. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 667d1d6fd70a..8226add353be 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,7 +25,6 @@ #include "blk.h" -static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; struct bdev_map { @@ -37,6 +36,7 @@ struct bdev_map { int (*lock)(dev_t, void *); void *data; } *bdev_map[255]; +static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -403,6 +403,7 @@ static struct blk_major_name { int major; char name[16]; } *major_names[BLKDEV_MAJOR_HASH_SIZE]; +static DEFINE_MUTEX(major_names_lock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -415,11 +416,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); } #endif /* CONFIG_PROC_FS */ @@ -448,7 +449,7 @@ int register_blkdev(unsigned int major, const char *name) struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { @@ -501,7 +502,7 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); return ret; } @@ -513,7 +514,7 @@ void unregister_blkdev(unsigned int major, const char *name) struct blk_major_name *p = NULL; int index = major_to_index(major); - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -523,7 +524,7 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); kfree(p); } @@ -674,7 +675,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->data = data; } - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0, p -= n; i < n; i++, p++, index++) { struct bdev_map **s = &bdev_map[index % 255]; while (*s && (*s)->range < range) @@ -682,7 +683,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->next = *s; *s = p; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); } EXPORT_SYMBOL(blk_register_region); @@ -693,7 +694,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) unsigned i; struct bdev_map *found = NULL; - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0; i < min(n, 255u); i++, index++) { struct bdev_map **s; for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { @@ -706,7 +707,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) } } } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kfree(found); } EXPORT_SYMBOL(blk_unregister_region); @@ -1037,7 +1038,7 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) unsigned long best = ~0UL; retry: - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { struct kobject *(*probe)(dev_t, int *, void *); struct module *owner; @@ -1058,7 +1059,7 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) module_put(owner); continue; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kobj = probe(dev, partno, data); /* Currently ->owner protects _only_ ->probe() itself. */ module_put(owner); @@ -1066,7 +1067,7 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) return dev_to_disk(kobj_to_dev(kobj)); goto retry; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); return NULL; } From bd8eff3ba2caca53ea72cf3cc87a7797771dd7d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:27 +0100 Subject: [PATCH 075/395] block: rework requesting modules for unclaimed devices Instead of reusing the ranges in bdev_map, add a new helper that is called if no ranges was found. This is a first step to unpeel and eventually remove the complex ranges structure. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 8226add353be..81017bd3b333 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1031,6 +1031,13 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static void request_gendisk_module(dev_t devt) +{ + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); +} + static struct gendisk *lookup_gendisk(dev_t dev, int *partno) { struct kobject *kobj; @@ -1055,6 +1062,14 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) probe = p->probe; best = p->range - 1; *partno = dev - p->dev; + + if (!probe) { + mutex_unlock(&bdev_map_lock); + module_put(owner); + request_gendisk_module(dev); + goto retry; + } + if (p->lock && p->lock(dev, data) < 0) { module_put(owner); continue; @@ -1293,15 +1308,6 @@ static const struct seq_operations partitions_op = { }; #endif - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - static void bdev_map_init(void) { struct bdev_map *base; @@ -1313,7 +1319,6 @@ static void bdev_map_init(void) base->dev = 1; base->range = ~0 ; - base->probe = base_probe; for (i = 0; i < 255; i++) bdev_map[i] = base; } From a160c6159d4a0cf82f28bc1658a958e278ec3688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:28 +0100 Subject: [PATCH 076/395] block: add an optional probe callback to major_names Add a callback to the major_names array that allows a driver to override how to probe for dev_t that doesn't currently have a gendisk registered. This will help separating the lookup of the gendisk by dev_t vs probe action for a not currently registered dev_t. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++--- include/linux/genhd.h | 5 ++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 81017bd3b333..20521163fd06 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -402,6 +402,7 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; + void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); @@ -444,7 +445,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. */ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; @@ -483,6 +485,7 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; + p->probe = probe; strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -505,8 +508,7 @@ int register_blkdev(unsigned int major, const char *name) mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -1033,6 +1035,19 @@ static ssize_t disk_badblocks_store(struct device *dev, static void request_gendisk_module(dev_t devt) { + unsigned int major = MAJOR(devt); + struct blk_major_name **n; + + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; + } + } + mutex_unlock(&major_names_lock); + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 52eb592c874a..811d0f83c4cf 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -367,7 +367,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) -int register_blkdev(unsigned int major, const char *name); +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); void revalidate_disk_size(struct gendisk *disk, bool verbose); From d18e8b1bf9e2ee814a7f886a156bf762d52e178b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:29 +0100 Subject: [PATCH 077/395] ide: remove ide_{,un}register_region There is no need to ever register the fake gendisk used for ide-tape. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/ide/ide-probe.c | 32 -------------------------------- drivers/ide/ide-tape.c | 2 -- include/linux/ide.h | 3 --- 3 files changed, 37 deletions(-) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1ddc45a04418..076d34b38172 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -929,38 +929,6 @@ static struct kobject *ata_probe(dev_t dev, int *part, void *data) return NULL; } -static struct kobject *exact_match(dev_t dev, int *part, void *data) -{ - struct gendisk *p = data; - *part &= (1 << PARTN_BITS) - 1; - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t dev, void *data) -{ - struct gendisk *p = data; - - if (!get_disk_and_module(p)) - return -1; - return 0; -} - -void ide_register_region(struct gendisk *disk) -{ - blk_register_region(MKDEV(disk->major, disk->first_minor), - disk->minors, NULL, exact_match, exact_lock, disk); -} - -EXPORT_SYMBOL_GPL(ide_register_region); - -void ide_unregister_region(struct gendisk *disk) -{ - blk_unregister_region(MKDEV(disk->major, disk->first_minor), - disk->minors); -} - -EXPORT_SYMBOL_GPL(ide_unregister_region); - void ide_init_disk(struct gendisk *disk, ide_drive_t *drive) { ide_hwif_t *hwif = drive->hwif; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 6f26634b22bb..88b96437b22e 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1822,7 +1822,6 @@ static void ide_tape_remove(ide_drive_t *drive) ide_proc_unregister_driver(drive, tape->driver); device_del(&tape->dev); - ide_unregister_region(tape->disk); mutex_lock(&idetape_ref_mutex); put_device(&tape->dev); @@ -2026,7 +2025,6 @@ static int ide_tape_probe(ide_drive_t *drive) "n%s", tape->name); g->fops = &idetape_block_ops; - ide_register_region(g); return 0; diff --git a/include/linux/ide.h b/include/linux/ide.h index 62653769509f..2c300689a51a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1493,9 +1493,6 @@ static inline void ide_acpi_port_init_devices(ide_hwif_t *hwif) { ; } static inline void ide_acpi_set_state(ide_hwif_t *hwif, int on) {} #endif -void ide_register_region(struct gendisk *); -void ide_unregister_region(struct gendisk *); - void ide_check_nien_quirk_list(ide_drive_t *); void ide_undecoded_slave(ide_drive_t *); From f9550f1b3981755b0ba9bd385c3c6efdfadd8fe4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:30 +0100 Subject: [PATCH 078/395] swim: don't call blk_register_region The swim driver (unlike various other floppy drivers) doesn't have magic device nodes for certain modes, and already registers a gendisk for each of the floppies supported by a device. Thus the region registered is a no-op and can be removed. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/swim.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 52dd1efa00f9..cc6a0bc6c005 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -745,18 +745,6 @@ static const struct block_device_operations floppy_fops = { .check_events = floppy_check_events, }; -static struct kobject *floppy_find(dev_t dev, int *part, void *data) -{ - struct swim_priv *swd = data; - int drive = (*part & 3); - - if (drive >= swd->floppy_count) - return NULL; - - *part = 0; - return get_disk_and_module(swd->unit[drive].disk); -} - static int swim_add_floppy(struct swim_priv *swd, enum drive_location location) { struct floppy_state *fs = &swd->unit[swd->floppy_count]; @@ -846,9 +834,6 @@ static int swim_floppy_init(struct swim_priv *swd) add_disk(swd->unit[drive].disk); } - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, - floppy_find, NULL, swd); - return 0; exit_put_disks: @@ -932,8 +917,6 @@ static int swim_remove(struct platform_device *dev) int drive; struct resource *res; - blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); - for (drive = 0; drive < swd->floppy_count; drive++) { del_gendisk(swd->unit[drive].disk); blk_cleanup_queue(swd->unit[drive].disk->queue); From 996e509bbc956d60f761dc955f5c5131e9a8fb13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:31 +0100 Subject: [PATCH 079/395] sd: use __register_blkdev to avoid a modprobe for an unregistered dev_t Switch from using blk_register_region to the probe callback passed to __register_blkdev to disable the request_module call for an unclaimed dev_t in the SD majors. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 656bcf4940d6..106a9cda0eb7 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -630,13 +630,11 @@ static struct scsi_driver sd_template = { }; /* - * Dummy kobj_map->probe function. - * The default ->probe function will call modprobe, which is - * pointless as this module is already loaded. + * Don't request a new module, as that could deadlock in multipath + * environment. */ -static struct kobject *sd_default_probe(dev_t devt, int *partno, void *data) +static void sd_default_probe(dev_t devt) { - return NULL; } /* @@ -3528,9 +3526,6 @@ static int sd_remove(struct device *dev) free_opal_dev(sdkp->opal_dev); - blk_register_region(devt, SD_MINORS, NULL, - sd_default_probe, NULL, NULL); - mutex_lock(&sd_ref_mutex); dev_set_drvdata(dev, NULL); put_device(&sdkp->dev); @@ -3720,11 +3715,9 @@ static int __init init_sd(void) SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { - if (register_blkdev(sd_major(i), "sd") != 0) + if (__register_blkdev(sd_major(i), "sd", sd_default_probe)) continue; majors++; - blk_register_region(sd_major(i), SD_MINORS, NULL, - sd_default_probe, NULL, NULL); } if (!majors) @@ -3797,10 +3790,8 @@ static void __exit exit_sd(void) class_unregister(&sd_disk_class); - for (i = 0; i < SD_MAJORS; i++) { - blk_unregister_region(sd_major(i), SD_MINORS); + for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); - } } module_init(init_sd); From 7cc178a6b994b7c994f1811c4b9fedc015ee0c9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:32 +0100 Subject: [PATCH 080/395] brd: use __register_blkdev to allocate devices on demand Use the simpler mechanism attached to major_name to allocate a brd device when a currently unregistered minor is accessed. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/brd.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index cc49a921339f..c43a6ab4b1f3 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -426,14 +426,15 @@ static void brd_free(struct brd_device *brd) kfree(brd); } -static struct brd_device *brd_init_one(int i, bool *new) +static void brd_probe(dev_t dev) { struct brd_device *brd; + int i = MINOR(dev) / max_part; - *new = false; + mutex_lock(&brd_devices_mutex); list_for_each_entry(brd, &brd_devices, brd_list) { if (brd->brd_number == i) - goto out; + goto out_unlock; } brd = brd_alloc(i); @@ -442,9 +443,9 @@ static struct brd_device *brd_init_one(int i, bool *new) add_disk(brd->brd_disk); list_add_tail(&brd->brd_list, &brd_devices); } - *new = true; -out: - return brd; + +out_unlock: + mutex_unlock(&brd_devices_mutex); } static void brd_del_one(struct brd_device *brd) @@ -454,23 +455,6 @@ static void brd_del_one(struct brd_device *brd) brd_free(brd); } -static struct kobject *brd_probe(dev_t dev, int *part, void *data) -{ - struct brd_device *brd; - struct kobject *kobj; - bool new; - - mutex_lock(&brd_devices_mutex); - brd = brd_init_one(MINOR(dev) / max_part, &new); - kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL; - mutex_unlock(&brd_devices_mutex); - - if (new) - *part = 0; - - return kobj; -} - static inline void brd_check_and_reset_par(void) { if (unlikely(!max_part)) @@ -510,11 +494,12 @@ static int __init brd_init(void) * dynamically. */ - if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) return -EIO; brd_check_and_reset_par(); + mutex_lock(&brd_devices_mutex); for (i = 0; i < rd_nr; i++) { brd = brd_alloc(i); if (!brd) @@ -532,9 +517,7 @@ static int __init brd_init(void) brd->brd_disk->queue = brd->brd_queue; add_disk(brd->brd_disk); } - - blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, - THIS_MODULE, brd_probe, NULL, NULL); + mutex_unlock(&brd_devices_mutex); pr_info("brd: module loaded\n"); return 0; @@ -544,6 +527,7 @@ static int __init brd_init(void) list_del(&brd->brd_list); brd_free(brd); } + mutex_unlock(&brd_devices_mutex); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); pr_info("brd: module NOT loaded !!!\n"); @@ -557,7 +541,6 @@ static void __exit brd_exit(void) list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); - blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); pr_info("brd: module unloaded\n"); From 8410d38c255200f71b67ddb37021c36273e7e78c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:33 +0100 Subject: [PATCH 081/395] loop: use __register_blkdev to allocate devices on demand Use the simpler mechanism attached to major_name to allocate a brd device when a currently unregistered minor is accessed. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/loop.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 027f5724fe2c..22e59410b971 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2235,24 +2235,18 @@ static int loop_lookup(struct loop_device **l, int i) return ret; } -static struct kobject *loop_probe(dev_t dev, int *part, void *data) +static void loop_probe(dev_t dev) { + int idx = MINOR(dev) >> part_shift; struct loop_device *lo; - struct kobject *kobj; - int err; + + if (max_loop && idx >= max_loop) + return; mutex_lock(&loop_ctl_mutex); - err = loop_lookup(&lo, MINOR(dev) >> part_shift); - if (err < 0) - err = loop_add(&lo, MINOR(dev) >> part_shift); - if (err < 0) - kobj = NULL; - else - kobj = get_disk_and_module(lo->lo_disk); + if (loop_lookup(&lo, idx) < 0) + loop_add(&lo, idx); mutex_unlock(&loop_ctl_mutex); - - *part = 0; - return kobj; } static long loop_control_ioctl(struct file *file, unsigned int cmd, @@ -2372,14 +2366,11 @@ static int __init loop_init(void) goto err_out; - if (register_blkdev(LOOP_MAJOR, "loop")) { + if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) { err = -EIO; goto misc_out; } - blk_register_region(MKDEV(LOOP_MAJOR, 0), range, - THIS_MODULE, loop_probe, NULL, NULL); - /* pre-create number of devices given by config or max_loop */ mutex_lock(&loop_ctl_mutex); for (i = 0; i < nr; i++) @@ -2405,16 +2396,11 @@ static int loop_exit_cb(int id, void *ptr, void *data) static void __exit loop_exit(void) { - unsigned long range; - - range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; - mutex_lock(&loop_ctl_mutex); idr_for_each(&loop_index_idr, &loop_exit_cb, NULL); idr_destroy(&loop_index_idr); - blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); unregister_blkdev(LOOP_MAJOR, "loop"); misc_deregister(&loop_misc); From 28144f9998e047a9bac31421914335c6bc6eaa67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:34 +0100 Subject: [PATCH 082/395] md: use __register_blkdev to allocate devices on demand Use the simpler mechanism attached to major_name to allocate a md device when a currently unregistered minor is accessed. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 96d31336ad43..3c79243e9d07 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5765,11 +5765,12 @@ static int md_alloc(dev_t dev, char *name) return error; } -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static void md_probe(dev_t dev) { + if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) + return; if (create_on_open) md_alloc(dev, NULL); - return NULL; } static int add_named_array(const char *val, const struct kernel_param *kp) @@ -6535,7 +6536,7 @@ static void autorun_devices(int part) break; } - md_probe(dev, NULL, NULL); + md_probe(dev); mddev = mddev_find(dev); if (!mddev || !mddev->gendisk) { if (mddev) @@ -9567,18 +9568,15 @@ static int __init md_init(void) if (!md_rdev_misc_wq) goto err_rdev_misc_wq; - if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + ret = __register_blkdev(MD_MAJOR, "md", md_probe); + if (ret < 0) goto err_md; - if ((ret = register_blkdev(0, "mdp")) < 0) + ret = __register_blkdev(0, "mdp", md_probe); + if (ret < 0) goto err_mdp; mdp_major = ret; - blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, - md_probe, NULL, NULL); - blk_register_region(MKDEV(mdp_major, 0), 1UL< Date: Thu, 29 Oct 2020 15:58:35 +0100 Subject: [PATCH 083/395] ide: switch to __register_blkdev for command set probing ide is the last user of the blk_register_region framework except for the tracking of allocated gendisk. Switch to __register_blkdev, even if that doesn't allow us to trivially find out which command set to probe for. That means we now always request all modules when a user tries to access an unclaimed ide device node, but except for a few potentially loaded modules for a fringe use case of a deprecated and soon to be removed driver that doesn't make a difference. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/ide/ide-probe.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 076d34b38172..1c1567bb5194 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -902,31 +902,12 @@ static int init_irq (ide_hwif_t *hwif) return 1; } -static int ata_lock(dev_t dev, void *data) +static void ata_probe(dev_t dev) { - /* FIXME: we want to pin hwif down */ - return 0; -} - -static struct kobject *ata_probe(dev_t dev, int *part, void *data) -{ - ide_hwif_t *hwif = data; - int unit = *part >> PARTN_BITS; - ide_drive_t *drive = hwif->devices[unit]; - - if ((drive->dev_flags & IDE_DFLAG_PRESENT) == 0) - return NULL; - - if (drive->media == ide_disk) - request_module("ide-disk"); - if (drive->media == ide_cdrom || drive->media == ide_optical) - request_module("ide-cd"); - if (drive->media == ide_tape) - request_module("ide-tape"); - if (drive->media == ide_floppy) - request_module("ide-floppy"); - - return NULL; + request_module("ide-disk"); + request_module("ide-cd"); + request_module("ide-tape"); + request_module("ide-floppy"); } void ide_init_disk(struct gendisk *disk, ide_drive_t *drive) @@ -967,7 +948,7 @@ static int hwif_init(ide_hwif_t *hwif) return 0; } - if (register_blkdev(hwif->major, hwif->name)) + if (__register_blkdev(hwif->major, hwif->name, ata_probe)) return 0; if (!hwif->sg_max_nents) @@ -989,8 +970,6 @@ static int hwif_init(ide_hwif_t *hwif) goto out; } - blk_register_region(MKDEV(hwif->major, 0), MAX_DRIVES << PARTN_BITS, - THIS_MODULE, ata_probe, ata_lock, hwif); return 1; out: @@ -1582,7 +1561,6 @@ static void ide_unregister(ide_hwif_t *hwif) /* * Remove us from the kernel's knowledge */ - blk_unregister_region(MKDEV(hwif->major, 0), MAX_DRIVES<sg_table); unregister_blkdev(hwif->major, hwif->name); From 302cfee150291c6cd85b1ca197d062d0b423d09c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:36 +0100 Subject: [PATCH 084/395] floppy: use a separate gendisk for each media format The floppy driver usually autodetects the media when used with the normal /dev/fd? devices, which also are the only nodes created by udev. But it also supports various aliases that force a given media format. That is currently supported using the blk_register_region framework which finds the floppy gendisk even for a 'mismatched' dev_t. The problem with this (besides the code complexity) is that it creates multiple struct block_device instances for the whole device of a single gendisk, which can lead to interesting issues in code not aware of that fact. To fix this just create a separate gendisk for each of the aliases if they are accessed. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 154 ++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 57 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 7df79ae6b0a1..dfe1dfc901cc 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -402,7 +402,6 @@ static struct floppy_drive_params drive_params[N_DRIVE]; static struct floppy_drive_struct drive_state[N_DRIVE]; static struct floppy_write_errors write_errors[N_DRIVE]; static struct timer_list motor_off_timer[N_DRIVE]; -static struct gendisk *disks[N_DRIVE]; static struct blk_mq_tag_set tag_sets[N_DRIVE]; static struct block_device *opened_bdev[N_DRIVE]; static DEFINE_MUTEX(open_lock); @@ -477,6 +476,8 @@ static struct floppy_struct floppy_type[32] = { { 3200,20,2,80,0,0x1C,0x00,0xCF,0x2C,"H1600" }, /* 31 1.6MB 3.5" */ }; +static struct gendisk *disks[N_DRIVE][ARRAY_SIZE(floppy_type)]; + #define SECTSIZE (_FD_SECTSIZE(*floppy)) /* Auto-detection: Disk type used until the next media change occurs. */ @@ -4111,7 +4112,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) new_dev = MINOR(bdev->bd_dev); drive_state[drive].fd_device = new_dev; - set_capacity(disks[drive], floppy_sizes[new_dev]); + set_capacity(disks[drive][ITYPE(new_dev)], floppy_sizes[new_dev]); if (old_dev != -1 && old_dev != new_dev) { if (buffer_drive == drive) buffer_track = -1; @@ -4579,15 +4580,58 @@ static bool floppy_available(int drive) return true; } -static struct kobject *floppy_find(dev_t dev, int *part, void *data) +static int floppy_alloc_disk(unsigned int drive, unsigned int type) { - int drive = (*part & 3) | ((*part & 0x80) >> 5); - if (drive >= N_DRIVE || !floppy_available(drive)) - return NULL; - if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) - return NULL; - *part = 0; - return get_disk_and_module(disks[drive]); + struct gendisk *disk; + int err; + + disk = alloc_disk(1); + if (!disk) + return -ENOMEM; + + disk->queue = blk_mq_init_queue(&tag_sets[drive]); + if (IS_ERR(disk->queue)) { + err = PTR_ERR(disk->queue); + disk->queue = NULL; + put_disk(disk); + return err; + } + + blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); + blk_queue_max_hw_sectors(disk->queue, 64); + disk->major = FLOPPY_MAJOR; + disk->first_minor = TOMINOR(drive) | (type << 2); + disk->fops = &floppy_fops; + disk->events = DISK_EVENT_MEDIA_CHANGE; + if (type) + sprintf(disk->disk_name, "fd%d_type%d", drive, type); + else + sprintf(disk->disk_name, "fd%d", drive); + /* to be cleaned up... */ + disk->private_data = (void *)(long)drive; + disk->flags |= GENHD_FL_REMOVABLE; + + disks[drive][type] = disk; + return 0; +} + +static DEFINE_MUTEX(floppy_probe_lock); + +static void floppy_probe(dev_t dev) +{ + unsigned int drive = (MINOR(dev) & 3) | ((MINOR(dev) & 0x80) >> 5); + unsigned int type = (MINOR(dev) >> 2) & 0x1f; + + if (drive >= N_DRIVE || !floppy_available(drive) || + type >= ARRAY_SIZE(floppy_type)) + return; + + mutex_lock(&floppy_probe_lock); + if (!disks[drive][type]) { + if (floppy_alloc_disk(drive, type) == 0) + add_disk(disks[drive][type]); + } + mutex_unlock(&floppy_probe_lock); } static int __init do_floppy_init(void) @@ -4609,33 +4653,25 @@ static int __init do_floppy_init(void) return -ENOMEM; for (drive = 0; drive < N_DRIVE; drive++) { - disks[drive] = alloc_disk(1); - if (!disks[drive]) { - err = -ENOMEM; + memset(&tag_sets[drive], 0, sizeof(tag_sets[drive])); + tag_sets[drive].ops = &floppy_mq_ops; + tag_sets[drive].nr_hw_queues = 1; + tag_sets[drive].nr_maps = 1; + tag_sets[drive].queue_depth = 2; + tag_sets[drive].numa_node = NUMA_NO_NODE; + tag_sets[drive].flags = BLK_MQ_F_SHOULD_MERGE; + err = blk_mq_alloc_tag_set(&tag_sets[drive]); + if (err) goto out_put_disk; - } - disks[drive]->queue = blk_mq_init_sq_queue(&tag_sets[drive], - &floppy_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(disks[drive]->queue)) { - err = PTR_ERR(disks[drive]->queue); - disks[drive]->queue = NULL; + err = floppy_alloc_disk(drive, 0); + if (err) goto out_put_disk; - } - - blk_queue_bounce_limit(disks[drive]->queue, BLK_BOUNCE_HIGH); - blk_queue_max_hw_sectors(disks[drive]->queue, 64); - disks[drive]->major = FLOPPY_MAJOR; - disks[drive]->first_minor = TOMINOR(drive); - disks[drive]->fops = &floppy_fops; - disks[drive]->events = DISK_EVENT_MEDIA_CHANGE; - sprintf(disks[drive]->disk_name, "fd%d", drive); timer_setup(&motor_off_timer[drive], motor_off_callback, 0); } - err = register_blkdev(FLOPPY_MAJOR, "fd"); + err = __register_blkdev(FLOPPY_MAJOR, "fd", floppy_probe); if (err) goto out_put_disk; @@ -4643,9 +4679,6 @@ static int __init do_floppy_init(void) if (err) goto out_unreg_blkdev; - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, - floppy_find, NULL, NULL); - for (i = 0; i < 256; i++) if (ITYPE(i)) floppy_sizes[i] = floppy_type[ITYPE(i)].size; @@ -4673,7 +4706,7 @@ static int __init do_floppy_init(void) if (fdc_state[0].address == -1) { cancel_delayed_work(&fd_timeout); err = -ENODEV; - goto out_unreg_region; + goto out_unreg_driver; } #if N_FDC > 1 fdc_state[1].address = FDC2; @@ -4684,7 +4717,7 @@ static int __init do_floppy_init(void) if (err) { cancel_delayed_work(&fd_timeout); err = -EBUSY; - goto out_unreg_region; + goto out_unreg_driver; } /* initialise drive state */ @@ -4761,10 +4794,8 @@ static int __init do_floppy_init(void) if (err) goto out_remove_drives; - /* to be cleaned up... */ - disks[drive]->private_data = (void *)(long)drive; - disks[drive]->flags |= GENHD_FL_REMOVABLE; - device_add_disk(&floppy_device[drive].dev, disks[drive], NULL); + device_add_disk(&floppy_device[drive].dev, disks[drive][0], + NULL); } return 0; @@ -4772,30 +4803,27 @@ static int __init do_floppy_init(void) out_remove_drives: while (drive--) { if (floppy_available(drive)) { - del_gendisk(disks[drive]); + del_gendisk(disks[drive][0]); platform_device_unregister(&floppy_device[drive]); } } out_release_dma: if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); -out_unreg_region: - blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); +out_unreg_driver: platform_driver_unregister(&floppy_driver); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: destroy_workqueue(floppy_wq); for (drive = 0; drive < N_DRIVE; drive++) { - if (!disks[drive]) + if (!disks[drive][0]) break; - if (disks[drive]->queue) { - del_timer_sync(&motor_off_timer[drive]); - blk_cleanup_queue(disks[drive]->queue); - disks[drive]->queue = NULL; - blk_mq_free_tag_set(&tag_sets[drive]); - } - put_disk(disks[drive]); + del_timer_sync(&motor_off_timer[drive]); + blk_cleanup_queue(disks[drive][0]->queue); + disks[drive][0]->queue = NULL; + blk_mq_free_tag_set(&tag_sets[drive]); + put_disk(disks[drive][0]); } return err; } @@ -5006,9 +5034,8 @@ module_init(floppy_module_init); static void __exit floppy_module_exit(void) { - int drive; + int drive, i; - blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); unregister_blkdev(FLOPPY_MAJOR, "fd"); platform_driver_unregister(&floppy_driver); @@ -5018,10 +5045,16 @@ static void __exit floppy_module_exit(void) del_timer_sync(&motor_off_timer[drive]); if (floppy_available(drive)) { - del_gendisk(disks[drive]); + for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { + if (disks[drive][i]) + del_gendisk(disks[drive][i]); + } platform_device_unregister(&floppy_device[drive]); } - blk_cleanup_queue(disks[drive]->queue); + for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { + if (disks[drive][i]) + blk_cleanup_queue(disks[drive][i]->queue); + } blk_mq_free_tag_set(&tag_sets[drive]); /* @@ -5029,10 +5062,17 @@ static void __exit floppy_module_exit(void) * queue reference in put_disk(). */ if (!(allowed_drive_mask & (1 << drive)) || - fdc_state[FDC(drive)].version == FDC_NONE) - disks[drive]->queue = NULL; + fdc_state[FDC(drive)].version == FDC_NONE) { + for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { + if (disks[drive][i]) + disks[drive][i]->queue = NULL; + } + } - put_disk(disks[drive]); + for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { + if (disks[drive][i]) + put_disk(disks[drive][i]); + } } cancel_delayed_work_sync(&fd_timeout); From 0033a9b41fc219e1e0d673e0a42179577a7d68ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:37 +0100 Subject: [PATCH 085/395] amiflop: use separate gendisks for Amiga vs MS-DOS mode Use separate gendisks (which share a tag_set) for the native Amgiga vs the MS-DOS mode instead of redirecting the gendisk lookup using a probe callback. This avoids potential problems with aliased block_device instances and will eventually allow for removing the blk_register_region framework. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 100 ++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 71c2b1564558..9e2d0c6a3877 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -201,7 +201,7 @@ struct amiga_floppy_struct { int busy; /* true when drive is active */ int dirty; /* true when trackbuf is not on disk */ int status; /* current error code for unit */ - struct gendisk *gendisk; + struct gendisk *gendisk[2]; struct blk_mq_tag_set tag_set; }; @@ -1669,6 +1669,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return -EBUSY; } + if (unit[drive].type->code == FD_NODRIVE) { + mutex_unlock(&amiflop_mutex); + return -ENXIO; + } + if (mode & (FMODE_READ|FMODE_WRITE)) { bdev_check_media_change(bdev); if (mode & FMODE_WRITE) { @@ -1695,7 +1700,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) unit[drive].dtype=&data_types[system]; unit[drive].blocks=unit[drive].type->heads*unit[drive].type->tracks* data_types[system].sects*unit[drive].type->sect_mult; - set_capacity(unit[drive].gendisk, unit[drive].blocks); + set_capacity(unit[drive].gendisk[system], unit[drive].blocks); printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, unit[drive].type->name, data_types[system].name); @@ -1772,36 +1777,68 @@ static const struct blk_mq_ops amiflop_mq_ops = { .queue_rq = amiflop_queue_rq, }; -static struct gendisk *fd_alloc_disk(int drive) +static int fd_alloc_disk(int drive, int system) { struct gendisk *disk; disk = alloc_disk(1); if (!disk) goto out; - - disk->queue = blk_mq_init_sq_queue(&unit[drive].tag_set, &amiflop_mq_ops, - 2, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(disk->queue)) { - disk->queue = NULL; + disk->queue = blk_mq_init_queue(&unit[drive].tag_set); + if (IS_ERR(disk->queue)) goto out_put_disk; - } - unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL); - if (!unit[drive].trackbuf) - goto out_cleanup_queue; + disk->major = FLOPPY_MAJOR; + disk->first_minor = drive + system; + disk->fops = &floppy_fops; + disk->events = DISK_EVENT_MEDIA_CHANGE; + if (system) + sprintf(disk->disk_name, "fd%d_msdos", drive); + else + sprintf(disk->disk_name, "fd%d", drive); + disk->private_data = &unit[drive]; + set_capacity(disk, 880 * 2); - return disk; + unit[drive].gendisk[system] = disk; + add_disk(disk); + return 0; -out_cleanup_queue: - blk_cleanup_queue(disk->queue); - disk->queue = NULL; - blk_mq_free_tag_set(&unit[drive].tag_set); out_put_disk: + disk->queue = NULL; put_disk(disk); +out: + return -ENOMEM; +} + +static int fd_alloc_drive(int drive) +{ + unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL); + if (!unit[drive].trackbuf) + goto out; + + memset(&unit[drive].tag_set, 0, sizeof(unit[drive].tag_set)); + unit[drive].tag_set.ops = &amiflop_mq_ops; + unit[drive].tag_set.nr_hw_queues = 1; + unit[drive].tag_set.nr_maps = 1; + unit[drive].tag_set.queue_depth = 2; + unit[drive].tag_set.numa_node = NUMA_NO_NODE; + unit[drive].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + if (blk_mq_alloc_tag_set(&unit[drive].tag_set)) + goto out_cleanup_trackbuf; + + pr_cont(" fd%d", drive); + + if (fd_alloc_disk(drive, 0) || fd_alloc_disk(drive, 1)) + goto out_cleanup_tagset; + return 0; + +out_cleanup_tagset: + blk_mq_free_tag_set(&unit[drive].tag_set); +out_cleanup_trackbuf: + kfree(unit[drive].trackbuf); out: unit[drive].type->code = FD_NODRIVE; - return NULL; + return -ENOMEM; } static int __init fd_probe_drives(void) @@ -1812,29 +1849,16 @@ static int __init fd_probe_drives(void) drives=0; nomem=0; for(drive=0;drivecode == FD_NODRIVE) continue; - disk = fd_alloc_disk(drive); - if (!disk) { + if (fd_alloc_drive(drive) < 0) { pr_cont(" no mem for fd%d", drive); nomem = 1; continue; } - unit[drive].gendisk = disk; drives++; - - pr_cont(" fd%d",drive); - disk->major = FLOPPY_MAJOR; - disk->first_minor = drive; - disk->fops = &floppy_fops; - disk->events = DISK_EVENT_MEDIA_CHANGE; - sprintf(disk->disk_name, "fd%d", drive); - disk->private_data = &unit[drive]; - set_capacity(disk, 880*2); - add_disk(disk); } if ((drives > 0) || (nomem == 0)) { if (drives == 0) @@ -1846,15 +1870,6 @@ static int __init fd_probe_drives(void) return -ENOMEM; } -static struct kobject *floppy_find(dev_t dev, int *part, void *data) -{ - int drive = *part & 3; - if (unit[drive].type->code == FD_NODRIVE) - return NULL; - *part = 0; - return get_disk_and_module(unit[drive].gendisk); -} - static int __init amiga_floppy_probe(struct platform_device *pdev) { int i, ret; @@ -1884,9 +1899,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev) if (fd_probe_drives() < 1) /* No usable drives */ goto out_probe; - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, - floppy_find, NULL, NULL); - /* initialize variables */ timer_setup(&motor_on_timer, motor_on_callback, 0); motor_on_timer.expires = 0; From bf9c0538e485b591a2ee02d9adb8a99db4be5a2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:38 +0100 Subject: [PATCH 086/395] ataflop: use a separate gendisk for each media format The Atari floppy driver usually autodetects the media when used with the ormal /dev/fd? devices, which also are the only nodes created by udev. But it also supports various aliases that force a given media format. That is currently supported using the blk_register_region framework which finds the floppy gendisk even for a 'mismatched' dev_t. The problem with this (besides the code complexity) is that it creates multiple struct block_device instances for the whole device of a single gendisk, which can lead to interesting issues in code not aware of that fact. To fix this just create a separate gendisk for each of the aliases if they are accessed. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 135 +++++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 49 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 3e881fdb06e0..104b713f4055 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -297,7 +297,7 @@ static struct atari_floppy_struct { unsigned int wpstat; /* current state of WP signal (for disk change detection) */ int flags; /* flags */ - struct gendisk *disk; + struct gendisk *disk[NUM_DISK_MINORS]; int ref; int type; struct blk_mq_tag_set tag_set; @@ -723,12 +723,16 @@ static void fd_error( void ) static int do_format(int drive, int type, struct atari_format_descr *desc) { - struct request_queue *q = unit[drive].disk->queue; + struct request_queue *q; unsigned char *p; int sect, nsect; unsigned long flags; int ret; + if (type) + type--; + + q = unit[drive].disk[type]->queue; blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); @@ -738,7 +742,7 @@ static int do_format(int drive, int type, struct atari_format_descr *desc) local_irq_restore(flags); if (type) { - if (--type >= NUM_DISK_MINORS || + if (type >= NUM_DISK_MINORS || minor2disktype[type].drive_types > DriveType) { ret = -EINVAL; goto out; @@ -1154,7 +1158,7 @@ static void fd_rwsec_done1(int status) if (SUDT[-1].blocks > ReqBlock) { /* try another disk type */ SUDT--; - set_capacity(unit[SelectedDrive].disk, + set_capacity(unit[SelectedDrive].disk[0], SUDT->blocks); } else Probing = 0; @@ -1169,7 +1173,7 @@ static void fd_rwsec_done1(int status) /* record not found, but not probing. Maybe stretch wrong ? Restart probing */ if (SUD.autoprobe) { SUDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(unit[SelectedDrive].disk, + set_capacity(unit[SelectedDrive].disk[0], SUDT->blocks); Probing = 1; } @@ -1515,7 +1519,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!UDT) { Probing = 1; UDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(floppy->disk, UDT->blocks); + set_capacity(bd->rq->rq_disk, UDT->blocks); UD.autoprobe = 1; } } @@ -1533,7 +1537,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; - set_capacity(floppy->disk, UDT->blocks); + set_capacity(bd->rq->rq_disk, UDT->blocks); UD.autoprobe = 0; } @@ -1658,7 +1662,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, printk (KERN_INFO "floppy%d: setting %s %p!\n", drive, dtp->name, dtp); UDT = dtp; - set_capacity(floppy->disk, UDT->blocks); + set_capacity(disk, UDT->blocks); if (cmd == FDDEFPRM) { /* save settings as permanent default type */ @@ -1702,7 +1706,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, return -EINVAL; UDT = dtp; - set_capacity(floppy->disk, UDT->blocks); + set_capacity(disk, UDT->blocks); return 0; case FDMSGON: @@ -1725,7 +1729,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, UDT = NULL; /* MSch: invalidate default_params */ default_params[drive].blocks = 0; - set_capacity(floppy->disk, MAX_DISK_SIZE * 2); + set_capacity(disk, MAX_DISK_SIZE * 2); fallthrough; case FDFMTEND: case FDFLUSH: @@ -1962,14 +1966,50 @@ static const struct blk_mq_ops ataflop_mq_ops = { .commit_rqs = ataflop_commit_rqs, }; -static struct kobject *floppy_find(dev_t dev, int *part, void *data) +static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { - int drive = *part & 3; - int type = *part >> 2; + struct gendisk *disk; + int ret; + + disk = alloc_disk(1); + if (!disk) + return -ENOMEM; + + disk->queue = blk_mq_init_queue(&unit[drive].tag_set); + if (IS_ERR(disk->queue)) { + ret = PTR_ERR(disk->queue); + disk->queue = NULL; + put_disk(disk); + return ret; + } + + disk->major = FLOPPY_MAJOR; + disk->first_minor = drive + (type << 2); + sprintf(disk->disk_name, "fd%d", drive); + disk->fops = &floppy_fops; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->private_data = &unit[drive]; + set_capacity(disk, MAX_DISK_SIZE * 2); + + unit[drive].disk[type] = disk; + return 0; +} + +static DEFINE_MUTEX(ataflop_probe_lock); + +static void ataflop_probe(dev_t dev) +{ + int drive = MINOR(dev) & 3; + int type = MINOR(dev) >> 2; + if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS) - return NULL; - *part = 0; - return get_disk_and_module(unit[drive].disk); + return; + mutex_lock(&ataflop_probe_lock); + if (!unit[drive].disk[type]) { + if (ataflop_alloc_disk(drive, type) == 0) + add_disk(unit[drive].disk[type]); + } + mutex_unlock(&ataflop_probe_lock); } static int __init atari_floppy_init (void) @@ -1981,23 +2021,26 @@ static int __init atari_floppy_init (void) /* Amiga, Mac, ... don't have Atari-compatible floppy :-) */ return -ENODEV; - if (register_blkdev(FLOPPY_MAJOR,"fd")) - return -EBUSY; + mutex_lock(&ataflop_probe_lock); + ret = __register_blkdev(FLOPPY_MAJOR, "fd", ataflop_probe); + if (ret) + goto out_unlock; for (i = 0; i < FD_MAX_UNITS; i++) { - unit[i].disk = alloc_disk(1); - if (!unit[i].disk) { - ret = -ENOMEM; + memset(&unit[i].tag_set, 0, sizeof(unit[i].tag_set)); + unit[i].tag_set.ops = &ataflop_mq_ops; + unit[i].tag_set.nr_hw_queues = 1; + unit[i].tag_set.nr_maps = 1; + unit[i].tag_set.queue_depth = 2; + unit[i].tag_set.numa_node = NUMA_NO_NODE; + unit[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ret = blk_mq_alloc_tag_set(&unit[i].tag_set); + if (ret) goto err; - } - unit[i].disk->queue = blk_mq_init_sq_queue(&unit[i].tag_set, - &ataflop_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(unit[i].disk->queue)) { - put_disk(unit[i].disk); - ret = PTR_ERR(unit[i].disk->queue); - unit[i].disk->queue = NULL; + ret = ataflop_alloc_disk(i, 0); + if (ret) { + blk_mq_free_tag_set(&unit[i].tag_set); goto err; } } @@ -2027,19 +2070,9 @@ static int __init atari_floppy_init (void) for (i = 0; i < FD_MAX_UNITS; i++) { unit[i].track = -1; unit[i].flags = 0; - unit[i].disk->major = FLOPPY_MAJOR; - unit[i].disk->first_minor = i; - sprintf(unit[i].disk->disk_name, "fd%d", i); - unit[i].disk->fops = &floppy_fops; - unit[i].disk->events = DISK_EVENT_MEDIA_CHANGE; - unit[i].disk->private_data = &unit[i]; - set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); - add_disk(unit[i].disk); + add_disk(unit[i].disk[0]); } - blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, - floppy_find, NULL, NULL); - printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n", DriveType == 0 ? 'D' : DriveType == 1 ? 'H' : 'E', UseTrackbuffer ? "" : "no "); @@ -2049,14 +2082,14 @@ static int __init atari_floppy_init (void) err: while (--i >= 0) { - struct gendisk *disk = unit[i].disk; - - blk_cleanup_queue(disk->queue); + blk_cleanup_queue(unit[i].disk[0]->queue); + put_disk(unit[i].disk[0]); blk_mq_free_tag_set(&unit[i].tag_set); - put_disk(unit[i].disk); } unregister_blkdev(FLOPPY_MAJOR, "fd"); +out_unlock: + mutex_unlock(&ataflop_probe_lock); return ret; } @@ -2101,13 +2134,17 @@ __setup("floppy=", atari_floppy_setup); static void __exit atari_floppy_exit(void) { - int i; - blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + int i, type; + for (i = 0; i < FD_MAX_UNITS; i++) { - del_gendisk(unit[i].disk); - blk_cleanup_queue(unit[i].disk->queue); + for (type = 0; type < NUM_DISK_MINORS; type++) { + if (!unit[i].disk[type]) + continue; + del_gendisk(unit[i].disk[type]); + blk_cleanup_queue(unit[i].disk[type]->queue); + put_disk(unit[i].disk[type]); + } blk_mq_free_tag_set(&unit[i].tag_set); - put_disk(unit[i].disk); } unregister_blkdev(FLOPPY_MAJOR, "fd"); From 6c3a05e127058ed9626a2f39e779c5e595c13a9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:39 +0100 Subject: [PATCH 087/395] z2ram: reindent reindent the driver using Lident as the code style was far away from normal Linux code. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/z2ram.c | 485 ++++++++++++++++++++---------------------- 1 file changed, 232 insertions(+), 253 deletions(-) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 0e734802ee7c..eafecc9a72b3 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -42,7 +42,6 @@ #include - #define Z2MINOR_COMBINED (0) #define Z2MINOR_Z2ONLY (1) #define Z2MINOR_CHIPONLY (2) @@ -50,17 +49,17 @@ #define Z2MINOR_MEMLIST2 (5) #define Z2MINOR_MEMLIST3 (6) #define Z2MINOR_MEMLIST4 (7) -#define Z2MINOR_COUNT (8) /* Move this down when adding a new minor */ +#define Z2MINOR_COUNT (8) /* Move this down when adding a new minor */ #define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 ) static DEFINE_MUTEX(z2ram_mutex); -static u_long *z2ram_map = NULL; -static u_long z2ram_size = 0; -static int z2_count = 0; -static int chip_count = 0; -static int list_count = 0; -static int current_device = -1; +static u_long *z2ram_map = NULL; +static u_long z2ram_size = 0; +static int z2_count = 0; +static int chip_count = 0; +static int list_count = 0; +static int current_device = -1; static DEFINE_SPINLOCK(z2ram_lock); @@ -71,7 +70,7 @@ static blk_status_t z2_queue_rq(struct blk_mq_hw_ctx *hctx, { struct request *req = bd->rq; unsigned long start = blk_rq_pos(req) << 9; - unsigned long len = blk_rq_cur_bytes(req); + unsigned long len = blk_rq_cur_bytes(req); blk_mq_start_request(req); @@ -92,7 +91,7 @@ static blk_status_t z2_queue_rq(struct blk_mq_hw_ctx *hctx, if (len < size) size = len; - addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; + addr += z2ram_map[start >> Z2RAM_CHUNKSHIFT]; if (rq_data_dir(req) == READ) memcpy(buffer, (char *)addr, size); else @@ -106,228 +105,214 @@ static blk_status_t z2_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void -get_z2ram( void ) +static void get_z2ram(void) { - int i; + int i; - for ( i = 0; i < Z2RAM_SIZE / Z2RAM_CHUNKSIZE; i++ ) - { - if ( test_bit( i, zorro_unused_z2ram ) ) - { - z2_count++; - z2ram_map[z2ram_size++] = (unsigned long)ZTWO_VADDR(Z2RAM_START) + - (i << Z2RAM_CHUNKSHIFT); - clear_bit( i, zorro_unused_z2ram ); + for (i = 0; i < Z2RAM_SIZE / Z2RAM_CHUNKSIZE; i++) { + if (test_bit(i, zorro_unused_z2ram)) { + z2_count++; + z2ram_map[z2ram_size++] = + (unsigned long)ZTWO_VADDR(Z2RAM_START) + + (i << Z2RAM_CHUNKSHIFT); + clear_bit(i, zorro_unused_z2ram); + } } - } - return; + return; } -static void -get_chipram( void ) +static void get_chipram(void) { - while ( amiga_chip_avail() > ( Z2RAM_CHUNKSIZE * 4 ) ) - { - chip_count++; - z2ram_map[ z2ram_size ] = - (u_long)amiga_chip_alloc( Z2RAM_CHUNKSIZE, "z2ram" ); + while (amiga_chip_avail() > (Z2RAM_CHUNKSIZE * 4)) { + chip_count++; + z2ram_map[z2ram_size] = + (u_long) amiga_chip_alloc(Z2RAM_CHUNKSIZE, "z2ram"); - if ( z2ram_map[ z2ram_size ] == 0 ) - { - break; + if (z2ram_map[z2ram_size] == 0) { + break; + } + + z2ram_size++; } - z2ram_size++; - } - - return; + return; } static int z2_open(struct block_device *bdev, fmode_t mode) { - int device; - int max_z2_map = ( Z2RAM_SIZE / Z2RAM_CHUNKSIZE ) * - sizeof( z2ram_map[0] ); - int max_chip_map = ( amiga_chip_size / Z2RAM_CHUNKSIZE ) * - sizeof( z2ram_map[0] ); - int rc = -ENOMEM; + int device; + int max_z2_map = (Z2RAM_SIZE / Z2RAM_CHUNKSIZE) * sizeof(z2ram_map[0]); + int max_chip_map = (amiga_chip_size / Z2RAM_CHUNKSIZE) * + sizeof(z2ram_map[0]); + int rc = -ENOMEM; - device = MINOR(bdev->bd_dev); + device = MINOR(bdev->bd_dev); - mutex_lock(&z2ram_mutex); - if ( current_device != -1 && current_device != device ) - { - rc = -EBUSY; - goto err_out; - } + mutex_lock(&z2ram_mutex); + if (current_device != -1 && current_device != device) { + rc = -EBUSY; + goto err_out; + } - if ( current_device == -1 ) - { - z2_count = 0; - chip_count = 0; - list_count = 0; - z2ram_size = 0; + if (current_device == -1) { + z2_count = 0; + chip_count = 0; + list_count = 0; + z2ram_size = 0; - /* Use a specific list entry. */ - if (device >= Z2MINOR_MEMLIST1 && device <= Z2MINOR_MEMLIST4) { - int index = device - Z2MINOR_MEMLIST1 + 1; - unsigned long size, paddr, vaddr; + /* Use a specific list entry. */ + if (device >= Z2MINOR_MEMLIST1 && device <= Z2MINOR_MEMLIST4) { + int index = device - Z2MINOR_MEMLIST1 + 1; + unsigned long size, paddr, vaddr; - if (index >= m68k_realnum_memory) { - printk( KERN_ERR DEVICE_NAME - ": no such entry in z2ram_map\n" ); - goto err_out; - } + if (index >= m68k_realnum_memory) { + printk(KERN_ERR DEVICE_NAME + ": no such entry in z2ram_map\n"); + goto err_out; + } - paddr = m68k_memory[index].addr; - size = m68k_memory[index].size & ~(Z2RAM_CHUNKSIZE-1); + paddr = m68k_memory[index].addr; + size = m68k_memory[index].size & ~(Z2RAM_CHUNKSIZE - 1); #ifdef __powerpc__ - /* FIXME: ioremap doesn't build correct memory tables. */ - { - vfree(vmalloc (size)); - } + /* FIXME: ioremap doesn't build correct memory tables. */ + { + vfree(vmalloc(size)); + } - vaddr = (unsigned long)ioremap_wt(paddr, size); + vaddr = (unsigned long)ioremap_wt(paddr, size); #else - vaddr = (unsigned long)z_remap_nocache_nonser(paddr, size); + vaddr = + (unsigned long)z_remap_nocache_nonser(paddr, size); #endif - z2ram_map = - kmalloc_array(size / Z2RAM_CHUNKSIZE, - sizeof(z2ram_map[0]), - GFP_KERNEL); - if ( z2ram_map == NULL ) - { - printk( KERN_ERR DEVICE_NAME - ": cannot get mem for z2ram_map\n" ); - goto err_out; + z2ram_map = + kmalloc_array(size / Z2RAM_CHUNKSIZE, + sizeof(z2ram_map[0]), GFP_KERNEL); + if (z2ram_map == NULL) { + printk(KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n"); + goto err_out; + } + + while (size) { + z2ram_map[z2ram_size++] = vaddr; + size -= Z2RAM_CHUNKSIZE; + vaddr += Z2RAM_CHUNKSIZE; + list_count++; + } + + if (z2ram_size != 0) + printk(KERN_INFO DEVICE_NAME + ": using %iK List Entry %d Memory\n", + list_count * Z2RAM_CHUNK1024, index); + } else + switch (device) { + case Z2MINOR_COMBINED: + + z2ram_map = + kmalloc(max_z2_map + max_chip_map, + GFP_KERNEL); + if (z2ram_map == NULL) { + printk(KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n"); + goto err_out; + } + + get_z2ram(); + get_chipram(); + + if (z2ram_size != 0) + printk(KERN_INFO DEVICE_NAME + ": using %iK Zorro II RAM and %iK Chip RAM (Total %dK)\n", + z2_count * Z2RAM_CHUNK1024, + chip_count * Z2RAM_CHUNK1024, + (z2_count + + chip_count) * Z2RAM_CHUNK1024); + + break; + + case Z2MINOR_Z2ONLY: + z2ram_map = kmalloc(max_z2_map, GFP_KERNEL); + if (z2ram_map == NULL) { + printk(KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n"); + goto err_out; + } + + get_z2ram(); + + if (z2ram_size != 0) + printk(KERN_INFO DEVICE_NAME + ": using %iK of Zorro II RAM\n", + z2_count * Z2RAM_CHUNK1024); + + break; + + case Z2MINOR_CHIPONLY: + z2ram_map = kmalloc(max_chip_map, GFP_KERNEL); + if (z2ram_map == NULL) { + printk(KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n"); + goto err_out; + } + + get_chipram(); + + if (z2ram_size != 0) + printk(KERN_INFO DEVICE_NAME + ": using %iK Chip RAM\n", + chip_count * Z2RAM_CHUNK1024); + + break; + + default: + rc = -ENODEV; + goto err_out; + + break; + } + + if (z2ram_size == 0) { + printk(KERN_NOTICE DEVICE_NAME + ": no unused ZII/Chip RAM found\n"); + goto err_out_kfree; } - while (size) { - z2ram_map[ z2ram_size++ ] = vaddr; - size -= Z2RAM_CHUNKSIZE; - vaddr += Z2RAM_CHUNKSIZE; - list_count++; - } - - if ( z2ram_size != 0 ) - printk( KERN_INFO DEVICE_NAME - ": using %iK List Entry %d Memory\n", - list_count * Z2RAM_CHUNK1024, index ); - } else - - switch ( device ) - { - case Z2MINOR_COMBINED: - - z2ram_map = kmalloc( max_z2_map + max_chip_map, GFP_KERNEL ); - if ( z2ram_map == NULL ) - { - printk( KERN_ERR DEVICE_NAME - ": cannot get mem for z2ram_map\n" ); - goto err_out; - } - - get_z2ram(); - get_chipram(); - - if ( z2ram_size != 0 ) - printk( KERN_INFO DEVICE_NAME - ": using %iK Zorro II RAM and %iK Chip RAM (Total %dK)\n", - z2_count * Z2RAM_CHUNK1024, - chip_count * Z2RAM_CHUNK1024, - ( z2_count + chip_count ) * Z2RAM_CHUNK1024 ); - - break; - - case Z2MINOR_Z2ONLY: - z2ram_map = kmalloc( max_z2_map, GFP_KERNEL ); - if ( z2ram_map == NULL ) - { - printk( KERN_ERR DEVICE_NAME - ": cannot get mem for z2ram_map\n" ); - goto err_out; - } - - get_z2ram(); - - if ( z2ram_size != 0 ) - printk( KERN_INFO DEVICE_NAME - ": using %iK of Zorro II RAM\n", - z2_count * Z2RAM_CHUNK1024 ); - - break; - - case Z2MINOR_CHIPONLY: - z2ram_map = kmalloc( max_chip_map, GFP_KERNEL ); - if ( z2ram_map == NULL ) - { - printk( KERN_ERR DEVICE_NAME - ": cannot get mem for z2ram_map\n" ); - goto err_out; - } - - get_chipram(); - - if ( z2ram_size != 0 ) - printk( KERN_INFO DEVICE_NAME - ": using %iK Chip RAM\n", - chip_count * Z2RAM_CHUNK1024 ); - - break; - - default: - rc = -ENODEV; - goto err_out; - - break; + current_device = device; + z2ram_size <<= Z2RAM_CHUNKSHIFT; + set_capacity(z2ram_gendisk, z2ram_size >> 9); } - if ( z2ram_size == 0 ) - { - printk( KERN_NOTICE DEVICE_NAME - ": no unused ZII/Chip RAM found\n" ); - goto err_out_kfree; - } - - current_device = device; - z2ram_size <<= Z2RAM_CHUNKSHIFT; - set_capacity(z2ram_gendisk, z2ram_size >> 9); - } - - mutex_unlock(&z2ram_mutex); - return 0; + mutex_unlock(&z2ram_mutex); + return 0; err_out_kfree: - kfree(z2ram_map); + kfree(z2ram_map); err_out: - mutex_unlock(&z2ram_mutex); - return rc; + mutex_unlock(&z2ram_mutex); + return rc; } -static void -z2_release(struct gendisk *disk, fmode_t mode) +static void z2_release(struct gendisk *disk, fmode_t mode) { - mutex_lock(&z2ram_mutex); - if ( current_device == -1 ) { - mutex_unlock(&z2ram_mutex); - return; - } - mutex_unlock(&z2ram_mutex); - /* - * FIXME: unmap memory - */ + mutex_lock(&z2ram_mutex); + if (current_device == -1) { + mutex_unlock(&z2ram_mutex); + return; + } + mutex_unlock(&z2ram_mutex); + /* + * FIXME: unmap memory + */ } -static const struct block_device_operations z2_fops = -{ - .owner = THIS_MODULE, - .open = z2_open, - .release = z2_release, +static const struct block_device_operations z2_fops = { + .owner = THIS_MODULE, + .open = z2_open, + .release = z2_release, }; static struct kobject *z2_find(dev_t dev, int *part, void *data) @@ -340,89 +325,83 @@ static struct request_queue *z2_queue; static struct blk_mq_tag_set tag_set; static const struct blk_mq_ops z2_mq_ops = { - .queue_rq = z2_queue_rq, + .queue_rq = z2_queue_rq, }; -static int __init -z2_init(void) +static int __init z2_init(void) { - int ret; + int ret; - if (!MACH_IS_AMIGA) - return -ENODEV; + if (!MACH_IS_AMIGA) + return -ENODEV; - ret = -EBUSY; - if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME)) - goto err; + ret = -EBUSY; + if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME)) + goto err; - ret = -ENOMEM; - z2ram_gendisk = alloc_disk(1); - if (!z2ram_gendisk) - goto out_disk; + ret = -ENOMEM; + z2ram_gendisk = alloc_disk(1); + if (!z2ram_gendisk) + goto out_disk; - z2_queue = blk_mq_init_sq_queue(&tag_set, &z2_mq_ops, 16, + z2_queue = blk_mq_init_sq_queue(&tag_set, &z2_mq_ops, 16, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(z2_queue)) { - ret = PTR_ERR(z2_queue); - z2_queue = NULL; - goto out_queue; - } + if (IS_ERR(z2_queue)) { + ret = PTR_ERR(z2_queue); + z2_queue = NULL; + goto out_queue; + } - z2ram_gendisk->major = Z2RAM_MAJOR; - z2ram_gendisk->first_minor = 0; - z2ram_gendisk->fops = &z2_fops; - sprintf(z2ram_gendisk->disk_name, "z2ram"); + z2ram_gendisk->major = Z2RAM_MAJOR; + z2ram_gendisk->first_minor = 0; + z2ram_gendisk->fops = &z2_fops; + sprintf(z2ram_gendisk->disk_name, "z2ram"); - z2ram_gendisk->queue = z2_queue; - add_disk(z2ram_gendisk); - blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE, - z2_find, NULL, NULL); + z2ram_gendisk->queue = z2_queue; + add_disk(z2ram_gendisk); + blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE, + z2_find, NULL, NULL); - return 0; + return 0; out_queue: - put_disk(z2ram_gendisk); + put_disk(z2ram_gendisk); out_disk: - unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); + unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); err: - return ret; + return ret; } static void __exit z2_exit(void) { - int i, j; - blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT); - unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); - del_gendisk(z2ram_gendisk); - put_disk(z2ram_gendisk); - blk_cleanup_queue(z2_queue); - blk_mq_free_tag_set(&tag_set); + int i, j; + blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT); + unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); + del_gendisk(z2ram_gendisk); + put_disk(z2ram_gendisk); + blk_cleanup_queue(z2_queue); + blk_mq_free_tag_set(&tag_set); - if ( current_device != -1 ) - { - i = 0; + if (current_device != -1) { + i = 0; - for ( j = 0 ; j < z2_count; j++ ) - { - set_bit( i++, zorro_unused_z2ram ); + for (j = 0; j < z2_count; j++) { + set_bit(i++, zorro_unused_z2ram); + } + + for (j = 0; j < chip_count; j++) { + if (z2ram_map[i]) { + amiga_chip_free((void *)z2ram_map[i++]); + } + } + + if (z2ram_map != NULL) { + kfree(z2ram_map); + } } - for ( j = 0 ; j < chip_count; j++ ) - { - if ( z2ram_map[ i ] ) - { - amiga_chip_free( (void *) z2ram_map[ i++ ] ); - } - } - - if ( z2ram_map != NULL ) - { - kfree( z2ram_map ); - } - } - - return; -} + return; +} module_init(z2_init); module_exit(z2_exit); From 76487f0241423497218f6ec505aa93cc29c8ddcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:40 +0100 Subject: [PATCH 088/395] z2ram: use separate gendisk for the different modes Use separate gendisks (which share a tag_set) for the different operating modes instead of redirecting the gendisk lookup using a probe callback. This avoids potential problems with aliased block_device instances and will eventually allow for removing the blk_register_region framework. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/z2ram.c | 96 +++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index eafecc9a72b3..c1d20818e649 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -63,7 +63,7 @@ static int current_device = -1; static DEFINE_SPINLOCK(z2ram_lock); -static struct gendisk *z2ram_gendisk; +static struct gendisk *z2ram_gendisk[Z2MINOR_COUNT]; static blk_status_t z2_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) @@ -283,7 +283,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode) current_device = device; z2ram_size <<= Z2RAM_CHUNKSHIFT; - set_capacity(z2ram_gendisk, z2ram_size >> 9); + set_capacity(z2ram_gendisk[device], z2ram_size >> 9); } mutex_unlock(&z2ram_mutex); @@ -315,71 +315,87 @@ static const struct block_device_operations z2_fops = { .release = z2_release, }; -static struct kobject *z2_find(dev_t dev, int *part, void *data) -{ - *part = 0; - return get_disk_and_module(z2ram_gendisk); -} - -static struct request_queue *z2_queue; static struct blk_mq_tag_set tag_set; static const struct blk_mq_ops z2_mq_ops = { .queue_rq = z2_queue_rq, }; +static int z2ram_register_disk(int minor) +{ + struct request_queue *q; + struct gendisk *disk; + + disk = alloc_disk(1); + if (!disk) + return -ENOMEM; + + q = blk_mq_init_queue(&tag_set); + if (IS_ERR(q)) { + put_disk(disk); + return PTR_ERR(q); + } + + disk->major = Z2RAM_MAJOR; + disk->first_minor = minor; + disk->fops = &z2_fops; + if (minor) + sprintf(disk->disk_name, "z2ram%d", minor); + else + sprintf(disk->disk_name, "z2ram"); + disk->queue = q; + + z2ram_gendisk[minor] = disk; + add_disk(disk); + return 0; +} + static int __init z2_init(void) { - int ret; + int ret, i; if (!MACH_IS_AMIGA) return -ENODEV; - ret = -EBUSY; if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME)) - goto err; + return -EBUSY; - ret = -ENOMEM; - z2ram_gendisk = alloc_disk(1); - if (!z2ram_gendisk) - goto out_disk; + tag_set.ops = &z2_mq_ops; + tag_set.nr_hw_queues = 1; + tag_set.nr_maps = 1; + tag_set.queue_depth = 16; + tag_set.numa_node = NUMA_NO_NODE; + tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ret = blk_mq_alloc_tag_set(&tag_set); + if (ret) + goto out_unregister_blkdev; - z2_queue = blk_mq_init_sq_queue(&tag_set, &z2_mq_ops, 16, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(z2_queue)) { - ret = PTR_ERR(z2_queue); - z2_queue = NULL; - goto out_queue; + for (i = 0; i < Z2MINOR_COUNT; i++) { + ret = z2ram_register_disk(i); + if (ret && i == 0) + goto out_free_tagset; } - z2ram_gendisk->major = Z2RAM_MAJOR; - z2ram_gendisk->first_minor = 0; - z2ram_gendisk->fops = &z2_fops; - sprintf(z2ram_gendisk->disk_name, "z2ram"); - - z2ram_gendisk->queue = z2_queue; - add_disk(z2ram_gendisk); - blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE, - z2_find, NULL, NULL); - return 0; -out_queue: - put_disk(z2ram_gendisk); -out_disk: +out_free_tagset: + blk_mq_free_tag_set(&tag_set); +out_unregister_blkdev: unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); -err: return ret; } static void __exit z2_exit(void) { int i, j; - blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT); + unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); - del_gendisk(z2ram_gendisk); - put_disk(z2ram_gendisk); - blk_cleanup_queue(z2_queue); + + for (i = 0; i < Z2MINOR_COUNT; i++) { + del_gendisk(z2ram_gendisk[i]); + blk_cleanup_queue(z2ram_gendisk[i]->queue); + put_disk(z2ram_gendisk[i]); + } blk_mq_free_tag_set(&tag_set); if (current_device != -1) { From e418de3abcda8b102f737919e830024d1455938f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:41 +0100 Subject: [PATCH 089/395] block: switch gendisk lookup to a simple xarray Now that bdev_map is only used for finding gendisks, we can use a simple xarray instead of the regions tracking structure for it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 206 ++++++++---------------------------------- include/linux/genhd.h | 7 -- 2 files changed, 36 insertions(+), 177 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 20521163fd06..01d146598fe7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,15 +27,7 @@ static struct kobject *block_depr; -struct bdev_map { - struct bdev_map *next; - dev_t dev; - unsigned long range; - struct module *owner; - struct kobject *(*probe)(dev_t, int *, void *); - int (*lock)(dev_t, void *); - void *data; -} *bdev_map[255]; +static DEFINE_XARRAY(bdev_map); static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ @@ -649,85 +641,26 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. - */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) +static void blk_register_region(struct gendisk *disk) { - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *p; - - n = min(n, 255u); - p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); - if (p == NULL) - return; - - for (i = 0; i < n; i++, p++) { - p->owner = module; - p->probe = probe; - p->lock = lock; - p->dev = devt; - p->range = range; - p->data = data; - } + int i; mutex_lock(&bdev_map_lock); - for (i = 0, p -= n; i < n; i++, p++, index++) { - struct bdev_map **s = &bdev_map[index % 255]; - while (*s && (*s)->range < range) - s = &(*s)->next; - p->next = *s; - *s = p; + for (i = 0; i < disk->minors; i++) { + if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) + WARN_ON_ONCE(1); } mutex_unlock(&bdev_map_lock); } -EXPORT_SYMBOL(blk_register_region); -void blk_unregister_region(dev_t devt, unsigned long range) +static void blk_unregister_region(struct gendisk *disk) { - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *found = NULL; + int i; mutex_lock(&bdev_map_lock); - for (i = 0; i < min(n, 255u); i++, index++) { - struct bdev_map **s; - for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { - struct bdev_map *p = *s; - if (p->dev == devt && p->range == range) { - *s = p->next; - if (!found) - found = p; - break; - } - } - } + for (i = 0; i < disk->minors; i++) + xa_erase(&bdev_map, disk_devt(disk) + i); mutex_unlock(&bdev_map_lock); - kfree(found); -} -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if (!get_disk_and_module(p)) - return -1; - return 0; } static void disk_scan_partitions(struct gendisk *disk) @@ -873,8 +806,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + blk_register_region(disk); } register_disk(parent, disk, groups); if (register_queue) @@ -987,7 +919,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); + blk_unregister_region(disk); /* * Remove gendisk pointer from idr so that it cannot be looked up * while RCU period before freeing gendisk is running to prevent @@ -1053,54 +985,22 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +static bool get_disk_and_module(struct gendisk *disk) { - struct kobject *kobj; - struct bdev_map *p; - unsigned long best = ~0UL; + struct module *owner; -retry: - mutex_lock(&bdev_map_lock); - for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { - struct kobject *(*probe)(dev_t, int *, void *); - struct module *owner; - void *data; - - if (p->dev > dev || p->dev + p->range - 1 < dev) - continue; - if (p->range - 1 >= best) - break; - if (!try_module_get(p->owner)) - continue; - owner = p->owner; - data = p->data; - probe = p->probe; - best = p->range - 1; - *partno = dev - p->dev; - - if (!probe) { - mutex_unlock(&bdev_map_lock); - module_put(owner); - request_gendisk_module(dev); - goto retry; - } - - if (p->lock && p->lock(dev, data) < 0) { - module_put(owner); - continue; - } - mutex_unlock(&bdev_map_lock); - kobj = probe(dev, partno, data); - /* Currently ->owner protects _only_ ->probe() itself. */ + if (!disk->fops) + return false; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return false; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { module_put(owner); - if (kobj) - return dev_to_disk(kobj_to_dev(kobj)); - goto retry; + return false; } - mutex_unlock(&bdev_map_lock); - return NULL; -} + return true; +} /** * get_gendisk - get partitioning information for a given device @@ -1119,7 +1019,19 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - disk = lookup_gendisk(devt, partno); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + if (!disk) { + mutex_unlock(&bdev_map_lock); + request_gendisk_module(devt); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + } + if (disk && !get_disk_and_module(disk)) + disk = NULL; + if (disk) + *partno = devt - disk_devt(disk); + mutex_unlock(&bdev_map_lock); } else { struct hd_struct *part; @@ -1323,21 +1235,6 @@ static const struct seq_operations partitions_op = { }; #endif -static void bdev_map_init(void) -{ - struct bdev_map *base; - int i; - - base = kzalloc(sizeof(*base), GFP_KERNEL); - if (!base) - panic("cannot allocate bdev_map"); - - base->dev = 1; - base->range = ~0 ; - for (i = 0; i < 255; i++) - bdev_map[i] = base; -} - static int __init genhd_device_init(void) { int error; @@ -1346,7 +1243,6 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); @@ -1895,35 +1791,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); -/** - * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to increment the refcount for - * - * This increments the refcount for the struct gendisk, and the gendisk's - * fops module owner. - * - * Context: Any context. - */ -struct kobject *get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - struct kobject *kobj; - - if (!disk->fops) - return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); - return NULL; - } - return kobj; - -} -EXPORT_SYMBOL(get_disk_and_module); - /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1960,7 +1827,6 @@ void put_disk_and_module(struct gendisk *disk) module_put(owner); } } -EXPORT_SYMBOL(put_disk_and_module); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 811d0f83c4cf..3cbc2781ef34 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -339,15 +339,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -extern struct kobject *get_disk_and_module(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void put_disk_and_module(struct gendisk *disk); -extern void blk_register_region(dev_t devt, unsigned long range, - struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), - void *data); -extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk_node(minors, node_id) \ ({ \ From e2b6b301871719d1db0b1ed7a1ed9e06750c80fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 18:08:21 +0100 Subject: [PATCH 090/395] block: fix the kerneldoc comment for __register_blkdev Switch the comment to talk about __register_blkdev instead of register_blkdev and document the new probe parameter. Fixes: 3da1a61e7046 ("block: add an optional probe callback to major_names") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 01d146598fe7..ec2a24799cd9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -418,11 +418,12 @@ void blkdev_show(struct seq_file *seqf, off_t offset) #endif /* CONFIG_PROC_FS */ /** - * register_blkdev - register a new block device + * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string + * @probe: allback that is called on access to any minor number of @major * * The @name must be unique within the system. * @@ -436,6 +437,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. + * + * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) From 8b8b0915ba8daef9b4320d6dc75a2ec14e1fe2df Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 8 Oct 2020 15:13:27 +0200 Subject: [PATCH 091/395] s390/cio: Export information about Endpoint-Security Capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new sysfs attribute 'esc' per chpid. This new attribute exports the Endpoint-Security-Capability byte of channel-path description block, which could be 0-None, 1-Authentication, 2 and 3-Encryption. For example: $ cat /sys/devices/css0/chp0.34/esc 0 [vneethv@linux.ibm.com: cleaned-up & modified description] Signed-off-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Stefan Haberland Reviewed-by: Jan Höppner Reviewed-by: Peter Oberparleiter Reviewed-by: Cornelia Huck Acked-by: Vasily Gorbik Signed-off-by: Jens Axboe --- drivers/s390/cio/chp.c | 15 +++++++++++++++ drivers/s390/cio/chsc.h | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index dfcbe54591fb..8d0de6adcad0 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -384,6 +384,20 @@ static ssize_t chp_chid_external_show(struct device *dev, } static DEVICE_ATTR(chid_external, 0444, chp_chid_external_show, NULL); +static ssize_t chp_esc_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct channel_path *chp = to_channelpath(dev); + ssize_t rc; + + mutex_lock(&chp->lock); + rc = sprintf(buf, "%x\n", chp->desc_fmt1.esc); + mutex_unlock(&chp->lock); + + return rc; +} +static DEVICE_ATTR(esc, 0444, chp_esc_show, NULL); + static ssize_t util_string_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) @@ -414,6 +428,7 @@ static struct attribute *chp_attrs[] = { &dev_attr_shared.attr, &dev_attr_chid.attr, &dev_attr_chid_external.attr, + &dev_attr_esc.attr, NULL, }; static struct attribute_group chp_attr_group = { diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h index c2b83b68bc57..32fa7faa5bf6 100644 --- a/drivers/s390/cio/chsc.h +++ b/drivers/s390/cio/chsc.h @@ -27,7 +27,8 @@ struct channel_path_desc_fmt1 { u8 lsn; u8 desc; u8 chpid; - u32:24; + u32:16; + u8 esc; u8 chpp; u32 unused[2]; u16 chid; From 4cd6094d9d609f73694783553df72572e302a5e9 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Thu, 8 Oct 2020 15:13:28 +0200 Subject: [PATCH 092/395] s390/cio: Provide Endpoint-Security Mode per CU Add an interface in the CIO layer to retrieve the information about the Endpoint-Security Mode (ESM) of the specified CU. The ESM values are defined as 0-None, 1-Authenticated or 2, 3-Encrypted. [vneethv@linux.ibm.com: cleaned-up and modified description] Signed-off-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Stefan Haberland Reviewed-by: Peter Oberparleiter Acked-by: Vasily Gorbik Acked-by: Cornelia Huck Signed-off-by: Jens Axboe --- arch/s390/include/asm/cio.h | 1 + drivers/s390/cio/chsc.c | 83 +++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index 5c58756d6476..e36cb67d2441 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -373,5 +373,6 @@ int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta); int chsc_sstpi(void *page, void *result, size_t size); int chsc_stzi(void *page, void *result, size_t size); int chsc_sgib(u32 origin); +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid); #endif diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index fc06a4002168..4ea466593fd6 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -1428,3 +1428,86 @@ int chsc_sgib(u32 origin) return ret; } EXPORT_SYMBOL_GPL(chsc_sgib); + +#define SCUD_REQ_LEN 0x10 /* SCUD request block length */ +#define SCUD_REQ_CMD 0x4b /* SCUD Command Code */ + +struct chse_cudb { + u16 flags:8; + u16 chp_valid:8; + u16 cu; + u32 esm_valid:8; + u32:24; + u8 chpid[8]; + u32:32; + u32:32; + u8 esm[8]; + u32 efla[8]; +} __packed; + +struct chsc_scud { + struct chsc_header request; + u16:4; + u16 fmt:4; + u16 cssid:8; + u16 first_cu; + u16:16; + u16 last_cu; + u32:32; + struct chsc_header response; + u16:4; + u16 fmt_resp:4; + u32:24; + struct chse_cudb cudb[]; +} __packed; + +/** + * chsc_scud() - Store control-unit description. + * @cu: number of the control-unit + * @esm: 8 1-byte endpoint security mode values + * @esm_valid: validity mask for @esm + * + * Interface to retrieve information about the endpoint security + * modes for up to 8 paths of a control unit. + * + * Returns 0 on success. + */ +int chsc_scud(u16 cu, u64 *esm, u8 *esm_valid) +{ + struct chsc_scud *scud = chsc_page; + int ret; + + spin_lock_irq(&chsc_page_lock); + memset(chsc_page, 0, PAGE_SIZE); + scud->request.length = SCUD_REQ_LEN; + scud->request.code = SCUD_REQ_CMD; + scud->fmt = 0; + scud->cssid = 0; + scud->first_cu = cu; + scud->last_cu = cu; + + ret = chsc(scud); + if (!ret) + ret = chsc_error_from_response(scud->response.code); + + if (!ret && (scud->response.length <= 8 || scud->fmt_resp != 0 + || !(scud->cudb[0].flags & 0x80) + || scud->cudb[0].cu != cu)) { + + CIO_MSG_EVENT(2, "chsc: scud failed rc=%04x, L2=%04x " + "FMT=%04x, cudb.flags=%02x, cudb.cu=%04x", + scud->response.code, scud->response.length, + scud->fmt_resp, scud->cudb[0].flags, scud->cudb[0].cu); + ret = -EINVAL; + } + + if (ret) + goto out; + + memcpy(esm, scud->cudb[0].esm, sizeof(*esm)); + *esm_valid = scud->cudb[0].esm_valid; +out: + spin_unlock_irq(&chsc_page_lock); + return ret; +} +EXPORT_SYMBOL_GPL(chsc_scud); From 32ef938815c1fb42d65212aac860ab153a64de1a Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Thu, 8 Oct 2020 15:13:29 +0200 Subject: [PATCH 093/395] s390/cio: Add support for FCES status notification Fibre Channel Endpoint-Security event is received as an sei:nt0 type in the CIO layer. This information needs to be shared with the CCW device drivers using the path_events callback. Co-developed-by: Sebastian Ott Signed-off-by: Vineeth Vijayan Signed-off-by: Sebastian Ott Signed-off-by: Stefan Haberland Reviewed-by: Peter Oberparleiter Acked-by: Vasily Gorbik Signed-off-by: Jens Axboe --- arch/s390/include/asm/ccwdev.h | 2 ++ drivers/s390/cio/chp.h | 1 + drivers/s390/cio/chsc.c | 62 +++++++++++++++++++++++++++++----- drivers/s390/cio/device.c | 15 +++++++- 4 files changed, 70 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/ccwdev.h b/arch/s390/include/asm/ccwdev.h index c0be5fe1ddba..bf605e1fcf6a 100644 --- a/arch/s390/include/asm/ccwdev.h +++ b/arch/s390/include/asm/ccwdev.h @@ -104,6 +104,8 @@ struct ccw_device { was successfully verified. */ #define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had to be established again. */ +#define PE_PATH_FCES_EVENT 0x8 /* The FCES Status of a path has + * changed. */ /* * Possible CIO actions triggered by the unit check handler. diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h index 20259f3fbf45..7ee9eba0abcb 100644 --- a/drivers/s390/cio/chp.h +++ b/drivers/s390/cio/chp.h @@ -23,6 +23,7 @@ #define CHP_OFFLINE 1 #define CHP_VARY_ON 2 #define CHP_VARY_OFF 3 +#define CHP_FCES_EVENT 4 struct chp_link { struct chp_id chpid; diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index 4ea466593fd6..c22d9ee27ba1 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -37,6 +37,9 @@ static void *sei_page; static void *chsc_page; static DEFINE_SPINLOCK(chsc_page_lock); +#define SEI_VF_FLA 0xc0 /* VF flag for Full Link Address */ +#define SEI_RS_CHPID 0x4 /* 4 in RS field indicates CHPID */ + /** * chsc_error_from_response() - convert a chsc response to an error * @response: chsc response code @@ -287,6 +290,15 @@ static void s390_process_res_acc(struct chp_link *link) css_schedule_reprobe(); } +static int process_fces_event(struct subchannel *sch, void *data) +{ + spin_lock_irq(sch->lock); + if (sch->driver && sch->driver->chp_event) + sch->driver->chp_event(sch, data, CHP_FCES_EVENT); + spin_unlock_irq(sch->lock); + return 0; +} + struct chsc_sei_nt0_area { u8 flags; u8 vf; /* validity flags */ @@ -364,6 +376,16 @@ static char *store_ebcdic(char *dest, const char *src, unsigned long len, return dest + len; } +static void chsc_link_from_sei(struct chp_link *link, + struct chsc_sei_nt0_area *sei_area) +{ + if ((sei_area->vf & SEI_VF_FLA) != 0) { + link->fla = sei_area->fla; + link->fla_mask = ((sei_area->vf & SEI_VF_FLA) == SEI_VF_FLA) ? + 0xffff : 0xff00; + } +} + /* Format node ID and parameters for output in LIR log message. */ static void format_node_data(char *params, char *id, struct node_descriptor *nd) { @@ -453,15 +475,7 @@ static void chsc_process_sei_res_acc(struct chsc_sei_nt0_area *sei_area) } memset(&link, 0, sizeof(struct chp_link)); link.chpid = chpid; - if ((sei_area->vf & 0xc0) != 0) { - link.fla = sei_area->fla; - if ((sei_area->vf & 0xc0) == 0xc0) - /* full link address */ - link.fla_mask = 0xffff; - else - /* link address */ - link.fla_mask = 0xff00; - } + chsc_link_from_sei(&link, sei_area); s390_process_res_acc(&link); } @@ -570,6 +584,33 @@ static void chsc_process_sei_ap_cfg_chg(struct chsc_sei_nt0_area *sei_area) ap_bus_cfg_chg(); } +static void chsc_process_sei_fces_event(struct chsc_sei_nt0_area *sei_area) +{ + struct chp_link link; + struct chp_id chpid; + struct channel_path *chp; + + CIO_CRW_EVENT(4, + "chsc: FCES status notification (rs=%02x, rs_id=%04x, FCES-status=%x)\n", + sei_area->rs, sei_area->rsid, sei_area->ccdf[0]); + + if (sei_area->rs != SEI_RS_CHPID) + return; + chp_id_init(&chpid); + chpid.id = sei_area->rsid; + + /* Ignore the event on unknown/invalid chp */ + chp = chpid_to_chp(chpid); + if (!chp) + return; + + memset(&link, 0, sizeof(struct chp_link)); + link.chpid = chpid; + chsc_link_from_sei(&link, sei_area); + + for_each_subchannel_staged(process_fces_event, NULL, &link); +} + static void chsc_process_sei_nt2(struct chsc_sei_nt2_area *sei_area) { switch (sei_area->cc) { @@ -611,6 +652,9 @@ static void chsc_process_sei_nt0(struct chsc_sei_nt0_area *sei_area) case 14: /* scm available notification */ chsc_process_sei_scm_avail(sei_area); break; + case 15: /* FCES event notification */ + chsc_process_sei_fces_event(sei_area); + break; default: /* other stuff */ CIO_CRW_EVENT(2, "chsc: sei nt0 unhandled cc=%d\n", sei_area->cc); diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index b29fe8d50baf..aab13c78db9f 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1170,7 +1170,8 @@ static int io_subchannel_chp_event(struct subchannel *sch, struct chp_link *link, int event) { struct ccw_device *cdev = sch_get_cdev(sch); - int mask; + int mask, chpid, valid_bit; + int path_event[8]; mask = chp_ssd_get_mask(&sch->ssd_info, link); if (!mask) @@ -1205,6 +1206,18 @@ static int io_subchannel_chp_event(struct subchannel *sch, cdev->private->path_new_mask |= mask; io_subchannel_verify(sch); break; + case CHP_FCES_EVENT: + /* Forward Endpoint Security event */ + for (chpid = 0, valid_bit = 0x80; chpid < 8; chpid++, + valid_bit >>= 1) { + if (mask & valid_bit) + path_event[chpid] = PE_PATH_FCES_EVENT; + else + path_event[chpid] = PE_NONE; + } + if (cdev) + cdev->drv->path_event(cdev, path_event); + break; } return 0; } From e03c5941f904afcc0237295e84e756c36619e058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:30 +0200 Subject: [PATCH 094/395] s390/dasd: Remove unused parameter from dasd_generic_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The discipline argument in dasd_generic_probe() isn't used and there is no history how it was used in the past. Remove it. Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 3 +-- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_fba.c | 2 +- drivers/s390/block/dasd_int.h | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index db24e04ee978..8581b2d46e13 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3466,8 +3466,7 @@ static void dasd_generic_auto_online(void *data, async_cookie_t cookie) * Initial attempt at a probe function. this can be simplified once * the other detection code is gone. */ -int dasd_generic_probe(struct ccw_device *cdev, - struct dasd_discipline *discipline) +int dasd_generic_probe(struct ccw_device *cdev) { int ret; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index ad44d22e8859..2b39d2a5965f 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -143,7 +143,7 @@ dasd_eckd_probe (struct ccw_device *cdev) "ccw-device options"); return ret; } - ret = dasd_generic_probe(cdev, &dasd_eckd_discipline); + ret = dasd_generic_probe(cdev); return ret; } diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 1a44e321b54e..5b0ebf6bf20f 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -58,7 +58,7 @@ static struct ccw_driver dasd_fba_driver; /* see below */ static int dasd_fba_probe(struct ccw_device *cdev) { - return dasd_generic_probe(cdev, &dasd_fba_discipline); + return dasd_generic_probe(cdev); } static int diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index c59a0d63b506..97ee0997a33e 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -774,7 +774,7 @@ void dasd_block_set_timer(struct dasd_block *, int); void dasd_block_clear_timer(struct dasd_block *); int dasd_cancel_req(struct dasd_ccw_req *); int dasd_flush_device_queue(struct dasd_device *); -int dasd_generic_probe (struct ccw_device *, struct dasd_discipline *); +int dasd_generic_probe(struct ccw_device *); void dasd_generic_free_discipline(struct dasd_device *); void dasd_generic_remove (struct ccw_device *cdev); int dasd_generic_set_online(struct ccw_device *, struct dasd_discipline *); From d2a527580c0a0c83f1d98eff32804cde4280d721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:31 +0200 Subject: [PATCH 095/395] s390/dasd: Move duplicate code to separate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For storing retrieved path information both the if and else block in dasd_eckd_read_conf() use the same code. To avoid duplicate code this should be done after the if/else block. To further increase readability, move the code to a new function, dasd_eckd_store_conf_data(). Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 42 ++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 2b39d2a5965f..497aa81778b6 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1000,6 +1000,22 @@ static unsigned char dasd_eckd_path_access(void *conf_data, int conf_len) return 0; } +static void dasd_eckd_store_conf_data(struct dasd_device *device, + struct dasd_conf_data *conf_data, int chp) +{ + struct channel_path_desc_fmt0 *chp_desc; + struct subchannel_id sch_id; + + ccw_device_get_schid(device->cdev, &sch_id); + device->path[chp].conf_data = conf_data; + device->path[chp].cssid = sch_id.cssid; + device->path[chp].ssid = sch_id.ssid; + chp_desc = ccw_device_get_chp_desc(device->cdev, chp); + if (chp_desc) + device->path[chp].chpid = chp_desc->chpid; + kfree(chp_desc); +} + static void dasd_eckd_clear_conf_data(struct dasd_device *device) { struct dasd_eckd_private *private = device->private; @@ -1016,7 +1032,6 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) } } - static int dasd_eckd_read_conf(struct dasd_device *device) { void *conf_data; @@ -1026,12 +1041,9 @@ static int dasd_eckd_read_conf(struct dasd_device *device) struct dasd_eckd_private *private, path_private; struct dasd_uid *uid; char print_path_uid[60], print_device_uid[60]; - struct channel_path_desc_fmt0 *chp_desc; - struct subchannel_id sch_id; private = device->private; opm = ccw_device_get_path_mask(device->cdev); - ccw_device_get_schid(device->cdev, &sch_id); conf_data_saved = 0; path_err = 0; /* get configuration data per operational path */ @@ -1066,15 +1078,6 @@ static int dasd_eckd_read_conf(struct dasd_device *device) kfree(conf_data); continue; } - pos = pathmask_to_pos(lpm); - /* store per path conf_data */ - device->path[pos].conf_data = conf_data; - device->path[pos].cssid = sch_id.cssid; - device->path[pos].ssid = sch_id.ssid; - chp_desc = ccw_device_get_chp_desc(device->cdev, pos); - if (chp_desc) - device->path[pos].chpid = chp_desc->chpid; - kfree(chp_desc); /* * build device UID that other path data * can be compared to it @@ -1132,18 +1135,13 @@ static int dasd_eckd_read_conf(struct dasd_device *device) dasd_path_add_cablepm(device, lpm); continue; } - pos = pathmask_to_pos(lpm); - /* store per path conf_data */ - device->path[pos].conf_data = conf_data; - device->path[pos].cssid = sch_id.cssid; - device->path[pos].ssid = sch_id.ssid; - chp_desc = ccw_device_get_chp_desc(device->cdev, pos); - if (chp_desc) - device->path[pos].chpid = chp_desc->chpid; - kfree(chp_desc); path_private.conf_data = NULL; path_private.conf_len = 0; } + + pos = pathmask_to_pos(lpm); + dasd_eckd_store_conf_data(device, conf_data, pos); + switch (dasd_eckd_path_access(conf_data, conf_len)) { case 0x02: dasd_path_add_nppm(device, lpm); From 460181217a2496defc6c279b0a7eb810b05b9145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:32 +0200 Subject: [PATCH 096/395] s390/dasd: Store path configuration data during path handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the configuration data for a path is retrieved during a path verification and used only temporarily. If a path is newly added to the I/O setup after a boot, no configuration data will be stored for this particular path. However, this data is required for later use and should be present for a valid I/O path anyway. Store this data during the path verification so that newly added paths can provide all information necessary. [sth@linux.ibm.com: fix conf_data memleak] Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 497aa81778b6..3ff7b532a5bf 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1007,6 +1007,11 @@ static void dasd_eckd_store_conf_data(struct dasd_device *device, struct subchannel_id sch_id; ccw_device_get_schid(device->cdev, &sch_id); + /* + * path handling and read_conf allocate data + * free it before replacing the pointer + */ + kfree(device->path[chp].conf_data); device->path[chp].conf_data = conf_data; device->path[chp].cssid = sch_id.cssid; device->path[chp].ssid = sch_id.ssid; @@ -1263,9 +1268,10 @@ static void do_path_verification_work(struct work_struct *work) struct dasd_uid *uid; __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; __u8 lpm, opm, npm, ppm, epm, hpfpm, cablepm; + struct dasd_conf_data *conf_data; unsigned long flags; char print_uid[60]; - int rc; + int rc, pos; data = container_of(work, struct path_verification_work_data, worker); device = data->device; @@ -1395,6 +1401,14 @@ static void do_path_verification_work(struct work_struct *work) } } + conf_data = kzalloc(DASD_ECKD_RCD_DATA_SIZE, GFP_KERNEL); + if (conf_data) { + memcpy(conf_data, data->rcd_buffer, + DASD_ECKD_RCD_DATA_SIZE); + } + pos = pathmask_to_pos(lpm); + dasd_eckd_store_conf_data(device, conf_data, pos); + /* * There is a small chance that a path is lost again between * above path verification and the following modification of From 9e34c8ba91697cb7441805c36d92ab3e695df6e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:33 +0200 Subject: [PATCH 097/395] s390/dasd: Fix operational path inconsistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During online processing and setting up a DASD device, the configuration data for operational paths is read and validated two times (dasd_eckd_read_conf()). The first time to provide information that are necessary for the LCU setup. A second time after the LCU setup as a device might report different configuration data then. When the configuration setup for each operational path is being validated, an initial call to dasd_eckd_clear_conf_data() is issued. This call wipes all previously available configuration data and path information for each path. However, the operational path mask is not updated during this process. As a result, the stored operational path mask might no longer correspond to the operational paths mask reported by the CIO layer, as several paths might be gone between the two dasd_eckd_read_conf() calls. This inconsistency leads to more severe issues in later path handling changes. Fix this by removing the channel paths from the operational path mask during the dasd_eckd_clear_conf_data() call. Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 3ff7b532a5bf..3273b26b25b0 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1034,6 +1034,7 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) device->path[i].cssid = 0; device->path[i].ssid = 0; device->path[i].chpid = 0; + dasd_path_notoper(device, i); } } From 19508b2047403cc88d2255118e2640ab1d3bf8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:34 +0200 Subject: [PATCH 098/395] s390/dasd: Display FC Endpoint Security information via sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new sysfs attribute (fc_security) per device and per operational channel path. The information of the current FC Endpoint Security state is received through the CIO layer. The state of the FC Endpoint Security can be either "Unsupported", "Authentication", or "Encryption". For example: $ cat /sys/bus/ccw/devices/0.0.c600/fc_security Encryption If any of the operational paths is in a state different from all others, the device sysfs attribute will display the additional state "Inconsistent". The sysfs attributes per paths are organised in a new directory called "paths_info" with subdirectories for each path. /sys/bus/ccw/devices/0.0.c600/paths_info/ ├── 0.38 │   └── fc_security ├── 0.39 │   └── fc_security ├── 0.3a │   └── fc_security └── 0.3b └── fc_security Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_devmap.c | 109 +++++++++++++++++++++++++++++++ drivers/s390/block/dasd_eckd.c | 30 +++++++++ drivers/s390/block/dasd_int.h | 68 +++++++++++++++++++ 3 files changed, 207 insertions(+) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 32fc51341d99..16bb135c20aa 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -576,6 +576,11 @@ dasd_create_device(struct ccw_device *cdev) dev_set_drvdata(&cdev->dev, device); spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); + device->paths_info = kset_create_and_add("paths_info", NULL, + &device->cdev->dev.kobj); + if (!device->paths_info) + dev_warn(&cdev->dev, "Could not create paths_info kset\n"); + return device; } @@ -622,6 +627,9 @@ dasd_delete_device(struct dasd_device *device) wait_event(dasd_delete_wq, atomic_read(&device->ref_count) == 0); dasd_generic_free_discipline(device); + + kset_unregister(device->paths_info); + /* Disconnect dasd_device structure from ccw_device structure. */ cdev = device->cdev; device->cdev = NULL; @@ -1641,6 +1649,39 @@ dasd_path_interval_store(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(path_interval, 0644, dasd_path_interval_show, dasd_path_interval_store); +static ssize_t +dasd_device_fcs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dasd_device *device; + int fc_sec; + int rc; + + device = dasd_device_from_cdev(to_ccwdev(dev)); + if (IS_ERR(device)) + return -ENODEV; + fc_sec = dasd_path_get_fcs_device(device); + if (fc_sec == -EINVAL) + rc = snprintf(buf, PAGE_SIZE, "Inconsistent\n"); + else + rc = snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); + dasd_put_device(device); + + return rc; +} +static DEVICE_ATTR(fc_security, 0444, dasd_device_fcs_show, NULL); + +static ssize_t +dasd_path_fcs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + struct dasd_path *path = to_dasd_path(kobj); + unsigned int fc_sec = path->fc_security; + + return snprintf(buf, PAGE_SIZE, "%s\n", dasd_path_get_fcs_str(fc_sec)); +} + +static struct kobj_attribute path_fcs_attribute = + __ATTR(fc_security, 0444, dasd_path_fcs_show, NULL); #define DASD_DEFINE_ATTR(_name, _func) \ static ssize_t dasd_##_name##_show(struct device *dev, \ @@ -1697,6 +1738,7 @@ static struct attribute * dasd_attrs[] = { &dev_attr_path_reset.attr, &dev_attr_hpf.attr, &dev_attr_ese.attr, + &dev_attr_fc_security.attr, NULL, }; @@ -1777,6 +1819,73 @@ dasd_set_feature(struct ccw_device *cdev, int feature, int flag) } EXPORT_SYMBOL(dasd_set_feature); +static struct attribute *paths_info_attrs[] = { + &path_fcs_attribute.attr, + NULL, +}; + +static struct kobj_type path_attr_type = { + .release = dasd_path_release, + .default_attrs = paths_info_attrs, + .sysfs_ops = &kobj_sysfs_ops, +}; + +static void dasd_path_init_kobj(struct dasd_device *device, int chp) +{ + device->path[chp].kobj.kset = device->paths_info; + kobject_init(&device->path[chp].kobj, &path_attr_type); +} + +void dasd_path_create_kobj(struct dasd_device *device, int chp) +{ + int rc; + + if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) + return; + if (!device->paths_info) { + dev_warn(&device->cdev->dev, "Unable to create paths objects\n"); + return; + } + if (device->path[chp].in_sysfs) + return; + if (!device->path[chp].conf_data) + return; + + dasd_path_init_kobj(device, chp); + + rc = kobject_add(&device->path[chp].kobj, NULL, "%x.%02x", + device->path[chp].cssid, device->path[chp].chpid); + if (rc) + kobject_put(&device->path[chp].kobj); + device->path[chp].in_sysfs = true; +} +EXPORT_SYMBOL(dasd_path_create_kobj); + +void dasd_path_create_kobjects(struct dasd_device *device) +{ + u8 lpm, opm; + + opm = dasd_path_get_opm(device); + for (lpm = 0x80; lpm; lpm >>= 1) { + if (!(lpm & opm)) + continue; + dasd_path_create_kobj(device, pathmask_to_pos(lpm)); + } +} +EXPORT_SYMBOL(dasd_path_create_kobjects); + +/* + * As we keep kobjects for the lifetime of a device, this function must not be + * called anywhere but in the context of offlining a device. + */ +void dasd_path_remove_kobj(struct dasd_device *device, int chp) +{ + if (device->path[chp].in_sysfs) { + kobject_put(&device->path[chp].kobj); + device->path[chp].in_sysfs = false; + } +} +EXPORT_SYMBOL(dasd_path_remove_kobj); int dasd_add_sysfs_files(struct ccw_device *cdev) { diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 3273b26b25b0..cfffab4c627b 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -1035,6 +1035,30 @@ static void dasd_eckd_clear_conf_data(struct dasd_device *device) device->path[i].ssid = 0; device->path[i].chpid = 0; dasd_path_notoper(device, i); + dasd_path_remove_kobj(device, i); + } +} + +static void dasd_eckd_read_fc_security(struct dasd_device *device) +{ + struct dasd_eckd_private *private = device->private; + u8 esm_valid; + u8 esm[8]; + int chp; + int rc; + + rc = chsc_scud(private->uid.ssid, (u64 *)esm, &esm_valid); + if (rc) { + for (chp = 0; chp < 8; chp++) + device->path[chp].fc_security = 0; + return; + } + + for (chp = 0; chp < 8; chp++) { + if (esm_valid & (0x80 >> chp)) + device->path[chp].fc_security = esm[chp]; + else + device->path[chp].fc_security = 0; } } @@ -1164,6 +1188,8 @@ static int dasd_eckd_read_conf(struct dasd_device *device) } } + dasd_eckd_read_fc_security(device); + return path_err; } @@ -1430,6 +1456,8 @@ static void do_path_verification_work(struct work_struct *work) dasd_path_add_cablepm(device, cablepm); dasd_path_add_nohpfpm(device, hpfpm); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); + + dasd_path_create_kobj(device, pos); } clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); @@ -2069,6 +2097,8 @@ dasd_eckd_check_characteristics(struct dasd_device *device) if (rc) goto out_err3; + dasd_path_create_kobjects(device); + /* Read Feature Codes */ dasd_eckd_read_features(device); diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 97ee0997a33e..e6823464acca 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -426,6 +426,35 @@ extern struct dasd_discipline *dasd_diag_discipline_pointer; #define DASD_THRHLD_MAX 4294967295U #define DASD_INTERVAL_MAX 4294967295U +/* FC Endpoint Security Capabilities */ +#define DASD_FC_SECURITY_UNSUP 0 +#define DASD_FC_SECURITY_AUTH 1 +#define DASD_FC_SECURITY_ENC_FCSP2 2 +#define DASD_FC_SECURITY_ENC_ERAS 3 + +#define DASD_FC_SECURITY_ENC_STR "Encryption" +static const struct { + u8 value; + char *name; +} dasd_path_fcs_mnemonics[] = { + { DASD_FC_SECURITY_UNSUP, "Unsupported" }, + { DASD_FC_SECURITY_AUTH, "Authentication" }, + { DASD_FC_SECURITY_ENC_FCSP2, DASD_FC_SECURITY_ENC_STR }, + { DASD_FC_SECURITY_ENC_ERAS, DASD_FC_SECURITY_ENC_STR }, +}; + +static inline char *dasd_path_get_fcs_str(int val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dasd_path_fcs_mnemonics); i++) { + if (dasd_path_fcs_mnemonics[i].value == val) + return dasd_path_fcs_mnemonics[i].name; + } + + return dasd_path_fcs_mnemonics[0].name; +} + struct dasd_path { unsigned long flags; u8 cssid; @@ -434,8 +463,18 @@ struct dasd_path { struct dasd_conf_data *conf_data; atomic_t error_count; unsigned long errorclk; + u8 fc_security; + struct kobject kobj; + bool in_sysfs; }; +#define to_dasd_path(path) container_of(path, struct dasd_path, kobj) + +static inline void dasd_path_release(struct kobject *kobj) +{ +/* Memory for the dasd_path kobject is freed when dasd_free_device() is called */ +} + struct dasd_profile_info { /* legacy part of profile data, as in dasd_profile_info_t */ @@ -547,6 +586,7 @@ struct dasd_device { struct dentry *hosts_dentry; struct dasd_profile profile; struct dasd_format_entry format_entry; + struct kset *paths_info; }; struct dasd_block { @@ -824,6 +864,9 @@ int dasd_set_feature(struct ccw_device *, int, int); int dasd_add_sysfs_files(struct ccw_device *); void dasd_remove_sysfs_files(struct ccw_device *); +void dasd_path_create_kobj(struct dasd_device *, int); +void dasd_path_create_kobjects(struct dasd_device *); +void dasd_path_remove_kobj(struct dasd_device *, int); struct dasd_device *dasd_device_from_cdev(struct ccw_device *); struct dasd_device *dasd_device_from_cdev_locked(struct ccw_device *); @@ -1114,6 +1157,31 @@ static inline __u8 dasd_path_get_hpfpm(struct dasd_device *device) return hpfpm; } +static inline u8 dasd_path_get_fcs_path(struct dasd_device *device, int chp) +{ + return device->path[chp].fc_security; +} + +static inline int dasd_path_get_fcs_device(struct dasd_device *device) +{ + u8 fc_sec = 0; + int chp; + + for (chp = 0; chp < 8; chp++) { + if (device->opm & (0x80 >> chp)) { + fc_sec = device->path[chp].fc_security; + break; + } + } + for (; chp < 8; chp++) { + if (device->opm & (0x80 >> chp)) + if (device->path[chp].fc_security != fc_sec) + return -EINVAL; + } + + return fc_sec; +} + /* * add functions for path masks * the existing path mask will be extended by the given path mask From b72949328869dfd45f6452c2410647afd7db5f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:35 +0200 Subject: [PATCH 099/395] s390/dasd: Prepare for additional path event handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As more path events need to be handled for ECKD the current path verification infrastructure can be reused. Rename all path verifcation code to fit the more broadly based task of path event handling and put the path verification in a new separate function. Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 4 +- drivers/s390/block/dasd_eckd.c | 78 +++++++++++++++++++--------------- drivers/s390/block/dasd_int.h | 1 + 3 files changed, 47 insertions(+), 36 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8581b2d46e13..8d7a53e98298 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2115,8 +2115,8 @@ static void __dasd_device_check_path_events(struct dasd_device *device) if (device->stopped & ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM)) return; - rc = device->discipline->verify_path(device, - dasd_path_get_tbvpm(device)); + rc = device->discipline->pe_handler(device, + dasd_path_get_tbvpm(device)); if (rc) dasd_device_set_timer(device, 50); else diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index cfffab4c627b..2e1cfacbf4d8 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -103,7 +103,7 @@ struct ext_pool_exhaust_work_data { }; /* definitions for the path verification worker */ -struct path_verification_work_data { +struct pe_handler_work_data { struct work_struct worker; struct dasd_device *device; struct dasd_ccw_req cqr; @@ -112,8 +112,8 @@ struct path_verification_work_data { int isglobal; __u8 tbvpm; }; -static struct path_verification_work_data *path_verification_worker; -static DEFINE_MUTEX(dasd_path_verification_mutex); +static struct pe_handler_work_data *pe_handler_worker; +static DEFINE_MUTEX(dasd_pe_handler_mutex); struct check_attention_work_data { struct work_struct worker; @@ -1249,7 +1249,7 @@ static int verify_fcx_max_data(struct dasd_device *device, __u8 lpm) } static int rebuild_device_uid(struct dasd_device *device, - struct path_verification_work_data *data) + struct pe_handler_work_data *data) { struct dasd_eckd_private *private = device->private; __u8 lpm, opm = dasd_path_get_opm(device); @@ -1287,10 +1287,9 @@ static int rebuild_device_uid(struct dasd_device *device, return rc; } -static void do_path_verification_work(struct work_struct *work) +static void dasd_eckd_path_available_action(struct dasd_device *device, + struct pe_handler_work_data *data) { - struct path_verification_work_data *data; - struct dasd_device *device; struct dasd_eckd_private path_private; struct dasd_uid *uid; __u8 path_rcd_buf[DASD_ECKD_RCD_DATA_SIZE]; @@ -1300,19 +1299,6 @@ static void do_path_verification_work(struct work_struct *work) char print_uid[60]; int rc, pos; - data = container_of(work, struct path_verification_work_data, worker); - device = data->device; - - /* delay path verification until device was resumed */ - if (test_bit(DASD_FLAG_SUSPENDED, &device->flags)) { - schedule_work(work); - return; - } - /* check if path verification already running and delay if so */ - if (test_and_set_bit(DASD_FLAG_PATH_VERIFY, &device->flags)) { - schedule_work(work); - return; - } opm = 0; npm = 0; ppm = 0; @@ -1459,30 +1445,54 @@ static void do_path_verification_work(struct work_struct *work) dasd_path_create_kobj(device, pos); } +} + +static void do_pe_handler_work(struct work_struct *work) +{ + struct pe_handler_work_data *data; + struct dasd_device *device; + + data = container_of(work, struct pe_handler_work_data, worker); + device = data->device; + + /* delay path verification until device was resumed */ + if (test_bit(DASD_FLAG_SUSPENDED, &device->flags)) { + schedule_work(work); + return; + } + /* check if path verification already running and delay if so */ + if (test_and_set_bit(DASD_FLAG_PATH_VERIFY, &device->flags)) { + schedule_work(work); + return; + } + + dasd_eckd_path_available_action(device, data); + clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); if (data->isglobal) - mutex_unlock(&dasd_path_verification_mutex); + mutex_unlock(&dasd_pe_handler_mutex); else kfree(data); } -static int dasd_eckd_verify_path(struct dasd_device *device, __u8 lpm) +static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) { - struct path_verification_work_data *data; + struct pe_handler_work_data *data; data = kmalloc(sizeof(*data), GFP_ATOMIC | GFP_DMA); if (!data) { - if (mutex_trylock(&dasd_path_verification_mutex)) { - data = path_verification_worker; + if (mutex_trylock(&dasd_pe_handler_mutex)) { + data = pe_handler_worker; data->isglobal = 1; - } else + } else { return -ENOMEM; + } } else { memset(data, 0, sizeof(*data)); data->isglobal = 0; } - INIT_WORK(&data->worker, do_path_verification_work); + INIT_WORK(&data->worker, do_pe_handler_work); dasd_get_device(device); data->device = device; data->tbvpm = lpm; @@ -6725,7 +6735,7 @@ static struct dasd_discipline dasd_eckd_discipline = { .check_device = dasd_eckd_check_characteristics, .uncheck_device = dasd_eckd_uncheck_device, .do_analysis = dasd_eckd_do_analysis, - .verify_path = dasd_eckd_verify_path, + .pe_handler = dasd_eckd_pe_handler, .basic_to_ready = dasd_eckd_basic_to_ready, .online_to_ready = dasd_eckd_online_to_ready, .basic_to_known = dasd_eckd_basic_to_known, @@ -6786,16 +6796,16 @@ dasd_eckd_init(void) GFP_KERNEL | GFP_DMA); if (!dasd_vol_info_req) return -ENOMEM; - path_verification_worker = kmalloc(sizeof(*path_verification_worker), - GFP_KERNEL | GFP_DMA); - if (!path_verification_worker) { + pe_handler_worker = kmalloc(sizeof(*pe_handler_worker), + GFP_KERNEL | GFP_DMA); + if (!pe_handler_worker) { kfree(dasd_reserve_req); kfree(dasd_vol_info_req); return -ENOMEM; } rawpadpage = (void *)__get_free_page(GFP_KERNEL); if (!rawpadpage) { - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); kfree(dasd_vol_info_req); return -ENOMEM; @@ -6804,7 +6814,7 @@ dasd_eckd_init(void) if (!ret) wait_for_device_probe(); else { - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); kfree(dasd_vol_info_req); free_page((unsigned long)rawpadpage); @@ -6816,7 +6826,7 @@ static void __exit dasd_eckd_cleanup(void) { ccw_driver_unregister(&dasd_eckd_driver); - kfree(path_verification_worker); + kfree(pe_handler_worker); kfree(dasd_reserve_req); free_page((unsigned long)rawpadpage); } diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index e6823464acca..4cfed3b6d9a6 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -298,6 +298,7 @@ struct dasd_discipline { * configuration. */ int (*verify_path)(struct dasd_device *, __u8); + int (*pe_handler)(struct dasd_device *, __u8); /* * Last things to do when a device is set online, and first things From 4d063e646b4bfe8e74c0b4b78bf11c3a7b5d962a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 8 Oct 2020 15:13:36 +0200 Subject: [PATCH 100/395] s390/dasd: Process FCES path event notification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the Fibre Channel Endpoint-Security status of a path changes, a corresponding path event is received from the CIO layer. Process this event by re-reading the FCES information. As the information is retrieved for all paths on a single CU in one call, the internal status can also be updated for all paths and no processing per path is necessary. Signed-off-by: Jan Höppner Signed-off-by: Stefan Haberland Reviewed-by: Stefan Haberland Reviewed-by: Cornelia Huck Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 19 +++++++++++---- drivers/s390/block/dasd_eckd.c | 12 +++++++--- drivers/s390/block/dasd_int.h | 42 +++++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 8d7a53e98298..874345e1138c 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2107,20 +2107,25 @@ static void __dasd_device_start_head(struct dasd_device *device) static void __dasd_device_check_path_events(struct dasd_device *device) { + __u8 tbvpm, fcsecpm; int rc; - if (!dasd_path_get_tbvpm(device)) + tbvpm = dasd_path_get_tbvpm(device); + fcsecpm = dasd_path_get_fcsecpm(device); + + if (!tbvpm && !fcsecpm) return; if (device->stopped & ~(DASD_STOPPED_DC_WAIT | DASD_UNRESUMED_PM)) return; - rc = device->discipline->pe_handler(device, - dasd_path_get_tbvpm(device)); - if (rc) + rc = device->discipline->pe_handler(device, tbvpm, fcsecpm); + if (rc) { dasd_device_set_timer(device, 50); - else + } else { dasd_path_clear_all_verify(device); + dasd_path_clear_all_fcsec(device); + } }; /* @@ -3869,6 +3874,10 @@ void dasd_generic_path_event(struct ccw_device *cdev, int *path_event) if (device->discipline->kick_validate) device->discipline->kick_validate(device); } + if (path_event[chp] & PE_PATH_FCES_EVENT) { + dasd_path_fcsec_update(device, chp); + dasd_schedule_device_bh(device); + } } hpfpm = dasd_path_get_hpfpm(device); ifccpm = dasd_path_get_ifccpm(device); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 2e1cfacbf4d8..0d319c21c287 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -111,6 +111,7 @@ struct pe_handler_work_data { __u8 rcd_buffer[DASD_ECKD_RCD_DATA_SIZE]; int isglobal; __u8 tbvpm; + __u8 fcsecpm; }; static struct pe_handler_work_data *pe_handler_worker; static DEFINE_MUTEX(dasd_pe_handler_mutex); @@ -1466,7 +1467,10 @@ static void do_pe_handler_work(struct work_struct *work) return; } - dasd_eckd_path_available_action(device, data); + if (data->tbvpm) + dasd_eckd_path_available_action(device, data); + if (data->fcsecpm) + dasd_eckd_read_fc_security(device); clear_bit(DASD_FLAG_PATH_VERIFY, &device->flags); dasd_put_device(device); @@ -1476,7 +1480,8 @@ static void do_pe_handler_work(struct work_struct *work) kfree(data); } -static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) +static int dasd_eckd_pe_handler(struct dasd_device *device, + __u8 tbvpm, __u8 fcsecpm) { struct pe_handler_work_data *data; @@ -1495,7 +1500,8 @@ static int dasd_eckd_pe_handler(struct dasd_device *device, __u8 lpm) INIT_WORK(&data->worker, do_pe_handler_work); dasd_get_device(device); data->device = device; - data->tbvpm = lpm; + data->tbvpm = tbvpm; + data->fcsecpm = fcsecpm; schedule_work(&data->worker); return 0; } diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 4cfed3b6d9a6..10f411c9b3c0 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -298,7 +298,7 @@ struct dasd_discipline { * configuration. */ int (*verify_path)(struct dasd_device *, __u8); - int (*pe_handler)(struct dasd_device *, __u8); + int (*pe_handler)(struct dasd_device *, __u8, __u8); /* * Last things to do when a device is set online, and first things @@ -423,6 +423,7 @@ extern struct dasd_discipline *dasd_diag_discipline_pointer; #define DASD_PATH_NOHPF 6 #define DASD_PATH_CUIR 7 #define DASD_PATH_IFCC 8 +#define DASD_PATH_FCSEC 9 #define DASD_THRHLD_MAX 4294967295U #define DASD_INTERVAL_MAX 4294967295U @@ -966,6 +967,29 @@ static inline void dasd_path_clear_all_verify(struct dasd_device *device) dasd_path_clear_verify(device, chp); } +static inline void dasd_path_fcsec(struct dasd_device *device, int chp) +{ + __set_bit(DASD_PATH_FCSEC, &device->path[chp].flags); +} + +static inline void dasd_path_clear_fcsec(struct dasd_device *device, int chp) +{ + __clear_bit(DASD_PATH_FCSEC, &device->path[chp].flags); +} + +static inline int dasd_path_need_fcsec(struct dasd_device *device, int chp) +{ + return test_bit(DASD_PATH_FCSEC, &device->path[chp].flags); +} + +static inline void dasd_path_clear_all_fcsec(struct dasd_device *device) +{ + int chp; + + for (chp = 0; chp < 8; chp++) + dasd_path_clear_fcsec(device, chp); +} + static inline void dasd_path_operational(struct dasd_device *device, int chp) { __set_bit(DASD_PATH_OPERATIONAL, &device->path[chp].flags); @@ -1091,6 +1115,17 @@ static inline __u8 dasd_path_get_tbvpm(struct dasd_device *device) return tbvpm; } +static inline int dasd_path_get_fcsecpm(struct dasd_device *device) +{ + int chp; + + for (chp = 0; chp < 8; chp++) + if (dasd_path_need_fcsec(device, chp)) + return 1; + + return 0; +} + static inline __u8 dasd_path_get_nppm(struct dasd_device *device) { int chp; @@ -1348,6 +1383,11 @@ static inline void dasd_path_notoper(struct dasd_device *device, int chp) dasd_path_clear_nonpreferred(device, chp); } +static inline void dasd_path_fcsec_update(struct dasd_device *device, int chp) +{ + dasd_path_fcsec(device, chp); +} + /* * remove all paths from normal operation */ From 99473d9db93a3bf557161373a1eb33d2873edd2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:52 +0100 Subject: [PATCH 101/395] block: remove the call to __invalidate_device in check_disk_size_change __invalidate_device without the kill_dirty parameter just invalidates various clean entries in caches, which doesn't really help us with anything, but can cause all kinds of horrible lock orders due to how it calls into the file system. The only reason this hasn't been a major issue is because so many people use partitions, for which no invalidation was performed anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9e84b1928b94..66ebf594c97f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1334,12 +1334,6 @@ static void check_disk_size_change(struct gendisk *disk, i_size_write(bdev->bd_inode, disk_size); } spin_unlock(&bdev->bd_size_lock); - - if (bdev_size > disk_size) { - if (__invalidate_device(bdev, false)) - pr_warn("VFS: busy inodes on resized disk %s\n", - disk->disk_name); - } } /** From 3b4f85d02a4bd85cbea999a064235a47694bbb7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:53 +0100 Subject: [PATCH 102/395] loop: let set_capacity_revalidate_and_notify update the bdev size There is no good reason to call revalidate_disk_size separately. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 22e59410b971..fcc5e32f0993 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -251,12 +251,8 @@ loop_validate_block_size(unsigned short bsize) */ static void loop_set_size(struct loop_device *lo, loff_t size) { - struct block_device *bdev = lo->lo_device; - - bd_set_nr_sectors(bdev, size); - - if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, false)) - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); + if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, true)) + kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } static inline int From 5dd55749b79cdf471ca0966ad91541daebac3e2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:54 +0100 Subject: [PATCH 103/395] nvme: let set_capacity_revalidate_and_notify update the bdev size There is no good reason to call revalidate_disk_size separately. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b01afcb7777..f6c6479da0e9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2053,7 +2053,7 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; } - set_capacity_revalidate_and_notify(disk, capacity, false); + set_capacity_revalidate_and_notify(disk, capacity, true); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); @@ -2134,7 +2134,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); blk_queue_update_readahead(ns->head->disk->queue); - nvme_update_bdev_size(ns->head->disk); blk_mq_unfreeze_queue(ns->head->disk->queue); } #endif @@ -3963,8 +3962,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids) */ if (ret && ret != -ENOMEM && !(ret > 0 && !(ret & NVME_SC_DNR))) nvme_ns_remove(ns); - else - revalidate_disk_size(ns->disk, true); } static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) From b200e38c493b2a5acff4f86d40a3e45d546c664c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:55 +0100 Subject: [PATCH 104/395] sd: update the bdev size in sd_revalidate_disk This avoids the extra call to revalidate_disk_size in sd_rescan and is otherwise a no-op because the size did not change, or we are in the probe path. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 106a9cda0eb7..e9b898a1a480 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1748,10 +1748,8 @@ static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - int ret; - ret = sd_revalidate_disk(sdkp->disk); - revalidate_disk_size(sdkp->disk, ret == 0); + sd_revalidate_disk(sdkp->disk); } static int sd_ioctl(struct block_device *bdev, fmode_t mode, @@ -3264,7 +3262,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; set_capacity_revalidate_and_notify(disk, - logical_to_sectors(sdp, sdkp->capacity), false); + logical_to_sectors(sdp, sdkp->capacity), true); sd_config_write_same(sdkp); kfree(buffer); @@ -3274,7 +3272,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) - set_capacity_revalidate_and_notify(disk, 0, false); + set_capacity_revalidate_and_notify(disk, 0, true); out: return 0; From 449f4ec9892ebc2f37a7eae6d97db2cf7c65e09a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:56 +0100 Subject: [PATCH 105/395] block: remove the update_bdev parameter to set_capacity_revalidate_and_notify The update_bdev argument is always set to true, so remove it. Also rename the function to the slighly less verbose set_capacity_and_notify, as propagating the disk size to the block device isn't really revalidation. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++++-------- drivers/block/loop.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/scsi/sd.c | 5 ++--- include/linux/genhd.h | 3 +-- 7 files changed, 12 insertions(+), 17 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ec2a24799cd9..4e039524f92b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,17 +47,15 @@ static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); /* - * Set disk capacity and notify if the size is not currently - * zero and will not be set to zero + * Set disk capacity and notify if the size is not currently zero and will not + * be set to zero. Returns true if a uevent was sent, otherwise false. */ -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev) +bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); set_capacity(disk, size); - if (update_bdev) - revalidate_disk_size(disk, true); + revalidate_disk_size(disk, true); if (capacity != size && capacity != 0 && size != 0) { char *envp[] = { "RESIZE=1", NULL }; @@ -68,8 +66,7 @@ bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, return false; } - -EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* * Format the device name of the indicated disk into the supplied buffer and diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fcc5e32f0993..9a27d4f1c08a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -251,7 +251,7 @@ loop_validate_block_size(unsigned short bsize) */ static void loop_set_size(struct loop_device *lo, loff_t size) { - if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, true)) + if (!set_capacity_and_notify(lo->lo_disk, size)) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a314b9382442..3e812b4c32e6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -470,7 +470,7 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) cap_str_10, cap_str_2); - set_capacity_revalidate_and_notify(vblk->disk, capacity, true); + set_capacity_and_notify(vblk->disk, capacity); } static void virtblk_config_changed_work(struct work_struct *work) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 48629d3433b4..79521e33d30e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2370,7 +2370,7 @@ static void blkfront_connect(struct blkfront_info *info) return; printk(KERN_INFO "Setting capacity to %Lu\n", sectors); - set_capacity_revalidate_and_notify(info->gd, sectors, true); + set_capacity_and_notify(info->gd, sectors); return; case BLKIF_STATE_SUSPENDED: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6c6479da0e9..6c144e748f8c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2053,7 +2053,7 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; } - set_capacity_revalidate_and_notify(disk, capacity, true); + set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e9b898a1a480..679c2c025047 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3261,8 +3261,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; - set_capacity_revalidate_and_notify(disk, - logical_to_sectors(sdp, sdkp->capacity), true); + set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); kfree(buffer); @@ -3272,7 +3271,7 @@ static int sd_revalidate_disk(struct gendisk *disk) * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) - set_capacity_revalidate_and_notify(disk, 0, true); + set_capacity_and_notify(disk, 0); out: return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3cbc2781ef34..46553d6d6025 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -314,8 +314,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev); +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; From ee4bf648635055d2b76afadaf34236c8b2d852a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:57 +0100 Subject: [PATCH 106/395] nbd: remove the call to set_blocksize Block driver have no business setting the file system concept of a block size. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index aaae9220f3a0..a9a0b49ff161 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -296,7 +296,7 @@ static void nbd_size_clear(struct nbd_device *nbd) } } -static void nbd_size_update(struct nbd_device *nbd, bool start) +static void nbd_size_update(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; struct block_device *bdev = bdget_disk(nbd->disk, 0); @@ -311,11 +311,9 @@ static void nbd_size_update(struct nbd_device *nbd, bool start) blk_queue_physical_block_size(nbd->disk->queue, config->blksize); set_capacity(nbd->disk, nr_sectors); if (bdev) { - if (bdev->bd_disk) { + if (bdev->bd_disk) bd_set_nr_sectors(bdev, nr_sectors); - if (start) - set_blocksize(bdev, config->blksize); - } else + else set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); bdput(bdev); } @@ -329,7 +327,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; if (nbd->task_recv != NULL) - nbd_size_update(nbd, false); + nbd_size_update(nbd); } static void nbd_complete_rq(struct request *req) @@ -1309,7 +1307,7 @@ static int nbd_start_device(struct nbd_device *nbd) args->index = i; queue_work(nbd->recv_workq, &args->work); } - nbd_size_update(nbd, true); + nbd_size_update(nbd); return error; } From 92f93c3a1bf9dc73181dc6566497d16b690cb576 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:58 +0100 Subject: [PATCH 107/395] nbd: move the task_recv check into nbd_size_update nbd_size_update is about to acquire a few more callers, so lift the check into the function. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9a0b49ff161..48054051e281 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -299,8 +299,11 @@ static void nbd_size_clear(struct nbd_device *nbd) static void nbd_size_update(struct nbd_device *nbd) { struct nbd_config *config = nbd->config; - struct block_device *bdev = bdget_disk(nbd->disk, 0); sector_t nr_sectors = config->bytesize >> 9; + struct block_device *bdev; + + if (!nbd->task_recv) + return; if (config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = config->blksize; @@ -309,7 +312,9 @@ static void nbd_size_update(struct nbd_device *nbd) } blk_queue_logical_block_size(nbd->disk->queue, config->blksize); blk_queue_physical_block_size(nbd->disk->queue, config->blksize); + set_capacity(nbd->disk, nr_sectors); + bdev = bdget_disk(nbd->disk, 0); if (bdev) { if (bdev->bd_disk) bd_set_nr_sectors(bdev, nr_sectors); @@ -326,8 +331,7 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, struct nbd_config *config = nbd->config; config->blksize = blocksize; config->bytesize = blocksize * nr_blocks; - if (nbd->task_recv != NULL) - nbd_size_update(nbd); + nbd_size_update(nbd); } static void nbd_complete_rq(struct request *req) From 2dc691cc4ac259f8b5bb0bd8670645af894d30eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:59 +0100 Subject: [PATCH 108/395] nbd: refactor size updates Merge nbd_size_set and nbd_size_update into a single function that also updates the nbd_config fields. This new function takes the device size in bytes as the first argument, and the blocksize as the second argument, simplifying the calculations required in most callers. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 48054051e281..6e8f2ff715c6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -296,28 +296,30 @@ static void nbd_size_clear(struct nbd_device *nbd) } } -static void nbd_size_update(struct nbd_device *nbd) +static void nbd_set_size(struct nbd_device *nbd, loff_t bytesize, + loff_t blksize) { - struct nbd_config *config = nbd->config; - sector_t nr_sectors = config->bytesize >> 9; struct block_device *bdev; + nbd->config->bytesize = bytesize; + nbd->config->blksize = blksize; + if (!nbd->task_recv) return; - if (config->flags & NBD_FLAG_SEND_TRIM) { - nbd->disk->queue->limits.discard_granularity = config->blksize; - nbd->disk->queue->limits.discard_alignment = config->blksize; + if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { + nbd->disk->queue->limits.discard_granularity = blksize; + nbd->disk->queue->limits.discard_alignment = blksize; blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); } - blk_queue_logical_block_size(nbd->disk->queue, config->blksize); - blk_queue_physical_block_size(nbd->disk->queue, config->blksize); + blk_queue_logical_block_size(nbd->disk->queue, blksize); + blk_queue_physical_block_size(nbd->disk->queue, blksize); - set_capacity(nbd->disk, nr_sectors); + set_capacity(nbd->disk, bytesize >> 9); bdev = bdget_disk(nbd->disk, 0); if (bdev) { if (bdev->bd_disk) - bd_set_nr_sectors(bdev, nr_sectors); + bd_set_nr_sectors(bdev, bytesize >> 9); else set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); bdput(bdev); @@ -325,15 +327,6 @@ static void nbd_size_update(struct nbd_device *nbd) kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); } -static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize, - loff_t nr_blocks) -{ - struct nbd_config *config = nbd->config; - config->blksize = blocksize; - config->bytesize = blocksize * nr_blocks; - nbd_size_update(nbd); -} - static void nbd_complete_rq(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); @@ -1311,7 +1304,7 @@ static int nbd_start_device(struct nbd_device *nbd) args->index = i; queue_work(nbd->recv_workq, &args->work); } - nbd_size_update(nbd); + nbd_set_size(nbd, config->bytesize, config->blksize); return error; } @@ -1390,15 +1383,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, arg = NBD_DEF_BLKSIZE; if (!nbd_is_valid_blksize(arg)) return -EINVAL; - nbd_size_set(nbd, arg, - div_s64(config->bytesize, arg)); + nbd_set_size(nbd, config->bytesize, arg); return 0; case NBD_SET_SIZE: - nbd_size_set(nbd, config->blksize, - div_s64(arg, config->blksize)); + nbd_set_size(nbd, arg, config->blksize); return 0; case NBD_SET_SIZE_BLOCKS: - nbd_size_set(nbd, config->blksize, arg); + nbd_set_size(nbd, arg * config->blksize, + config->blksize); return 0; case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); @@ -1828,7 +1820,7 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) } if (bytes != config->bytesize || bsize != config->blksize) - nbd_size_set(nbd, bsize, div64_u64(bytes, bsize)); + nbd_set_size(nbd, bytes, bsize); return 0; } From dcbddf541f18e367ac9cdad8e223d382cd303161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:00 +0100 Subject: [PATCH 109/395] nbd: validate the block size in nbd_set_size Move the validation of the block from the callers into nbd_set_size. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 47 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6e8f2ff715c6..7478a5e02bc1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -296,16 +296,21 @@ static void nbd_size_clear(struct nbd_device *nbd) } } -static void nbd_set_size(struct nbd_device *nbd, loff_t bytesize, +static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { struct block_device *bdev; + if (!blksize) + blksize = NBD_DEF_BLKSIZE; + if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize)) + return -EINVAL; + nbd->config->bytesize = bytesize; nbd->config->blksize = blksize; if (!nbd->task_recv) - return; + return 0; if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { nbd->disk->queue->limits.discard_granularity = blksize; @@ -325,6 +330,7 @@ static void nbd_set_size(struct nbd_device *nbd, loff_t bytesize, bdput(bdev); } kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); + return 0; } static void nbd_complete_rq(struct request *req) @@ -1304,8 +1310,7 @@ static int nbd_start_device(struct nbd_device *nbd) args->index = i; queue_work(nbd->recv_workq, &args->work); } - nbd_set_size(nbd, config->bytesize, config->blksize); - return error; + return nbd_set_size(nbd, config->bytesize, config->blksize); } static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) @@ -1347,14 +1352,6 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, nbd_config_put(nbd); } -static bool nbd_is_valid_blksize(unsigned long blksize) -{ - if (!blksize || !is_power_of_2(blksize) || blksize < 512 || - blksize > PAGE_SIZE) - return false; - return true; -} - static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) { nbd->tag_set.timeout = timeout * HZ; @@ -1379,19 +1376,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SOCK: return nbd_add_socket(nbd, arg, false); case NBD_SET_BLKSIZE: - if (!arg) - arg = NBD_DEF_BLKSIZE; - if (!nbd_is_valid_blksize(arg)) - return -EINVAL; - nbd_set_size(nbd, config->bytesize, arg); - return 0; + return nbd_set_size(nbd, config->bytesize, arg); case NBD_SET_SIZE: - nbd_set_size(nbd, arg, config->blksize); - return 0; + return nbd_set_size(nbd, arg, config->blksize); case NBD_SET_SIZE_BLOCKS: - nbd_set_size(nbd, arg * config->blksize, - config->blksize); - return 0; + return nbd_set_size(nbd, arg * config->blksize, + config->blksize); case NBD_SET_TIMEOUT: nbd_set_cmd_timeout(nbd, arg); return 0; @@ -1809,18 +1799,11 @@ static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) if (info->attrs[NBD_ATTR_SIZE_BYTES]) bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); - if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { + if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); - if (!bsize) - bsize = NBD_DEF_BLKSIZE; - if (!nbd_is_valid_blksize(bsize)) { - printk(KERN_ERR "Invalid block size %llu\n", bsize); - return -EINVAL; - } - } if (bytes != config->bytesize || bsize != config->blksize) - nbd_set_size(nbd, bytes, bsize); + return nbd_set_size(nbd, bytes, bsize); return 0; } From 2ebcabf3dba50f61850efec9a331bd061c6333e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:01 +0100 Subject: [PATCH 110/395] nbd: use set_capacity_and_notify Use set_capacity_and_notify to update the disk and block device sizes and send a RESIZE uevent to userspace. Note that blktests relies on uevents being sent also for updates that did not change the device size, so the explicit kobject_uevent remains for that case. Signed-off-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7478a5e02bc1..45b0423ef2c5 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -299,8 +299,6 @@ static void nbd_size_clear(struct nbd_device *nbd) static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { - struct block_device *bdev; - if (!blksize) blksize = NBD_DEF_BLKSIZE; if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize)) @@ -320,16 +318,9 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, blk_queue_logical_block_size(nbd->disk->queue, blksize); blk_queue_physical_block_size(nbd->disk->queue, blksize); - set_capacity(nbd->disk, bytesize >> 9); - bdev = bdget_disk(nbd->disk, 0); - if (bdev) { - if (bdev->bd_disk) - bd_set_nr_sectors(bdev, bytesize >> 9); - else - set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); - bdput(bdev); - } - kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); + set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); + if (!set_capacity_and_notify(nbd->disk, bytesize >> 9)) + kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); return 0; } From 8a6f7bbf29d1d61d3ff18f0a0feead9f287c9b14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:02 +0100 Subject: [PATCH 111/395] aoe: don't call set_capacity from irq context Updating the block device size from irq context can lead to torn writes of the 64-bit value, and prevents us from using normal process context locking primitives to serialize access to the 64-bit nr_sectors value. Defer the set_capacity to the already existing workqueue handler, where it can be merged with the update of the block device size by using set_capacity_and_notify. As an extra bonus this also adds proper uevent notifications for the resize. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 313f0b946fe2..ac720bdcd983 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -890,19 +890,13 @@ void aoecmd_sleepwork(struct work_struct *work) { struct aoedev *d = container_of(work, struct aoedev, work); - struct block_device *bd; - u64 ssize; if (d->flags & DEVFL_GDALLOC) aoeblk_gdalloc(d); if (d->flags & DEVFL_NEWSIZE) { - ssize = get_capacity(d->gd); - bd = bdget_disk(d->gd, 0); - if (bd) { - bd_set_nr_sectors(bd, ssize); - bdput(bd); - } + set_capacity_and_notify(d->gd, d->ssize); + spin_lock_irq(&d->lock); d->flags |= DEVFL_UP; d->flags &= ~DEVFL_NEWSIZE; @@ -971,10 +965,9 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) d->geo.start = 0; if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) return; - if (d->gd != NULL) { - set_capacity(d->gd, ssize); + if (d->gd != NULL) d->flags |= DEVFL_NEWSIZE; - } else + else d->flags |= DEVFL_GDALLOC; schedule_work(&d->work); } From f64d9b2eacb95d4fbd17c8680cab803a5965744c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:03 +0100 Subject: [PATCH 112/395] dm: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/dm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6db395c3d28b..54739f1b579b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1974,8 +1974,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - bd_set_nr_sectors(md->bdev, size); + set_capacity_and_notify(md->disk, size); dm_table_event_callback(t, event_callback, md); From 657985f857c0027db6f17fa4af7e8818038e0b15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:04 +0100 Subject: [PATCH 113/395] pktcdvd: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ef1c1f094ea4..b8bb8ec7538d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2130,8 +2130,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) } set_capacity(pd->disk, lba << 2); - set_capacity(pd->bdev->bd_disk, lba << 2); - bd_set_nr_sectors(pd->bdev, lba << 2); + set_capacity_and_notify(pd->bdev->bd_disk, lba << 2); q = bdev_get_queue(pd->bdev); if (write) { From d17e66aadbe50b7207187d6b2293fc2ddaab2c99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:05 +0100 Subject: [PATCH 114/395] nvme: use set_capacity_and_notify in nvme_set_queue_dying Use the block layer helper to update both the disk and block device sizes. Contrary to the name no notification is sent in this case, as a size 0 is special cased. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c144e748f8c..bc89e8659c40 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -93,16 +93,6 @@ static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); -static void nvme_update_bdev_size(struct gendisk *disk) -{ - struct block_device *bdev = bdget_disk(disk, 0); - - if (bdev) { - bd_set_nr_sectors(bdev, get_capacity(disk)); - bdput(bdev); - } -} - /* * Prepare a queue for teardown. * @@ -119,8 +109,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) blk_set_queue_dying(ns->queue); blk_mq_unquiesce_queue(ns->queue); - set_capacity(ns->disk, 0); - nvme_update_bdev_size(ns->disk); + set_capacity_and_notify(ns->disk, 0); } static void nvme_queue_scan(struct nvme_ctrl *ctrl) From bc254eb44f9dfce278b53b714fb7bb963253789d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:06 +0100 Subject: [PATCH 115/395] drbd: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 65b95aef8dbc..1c8c18b2a25f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2036,8 +2036,7 @@ void drbd_set_my_capacity(struct drbd_device *device, sector_t size) { char ppb[10]; - set_capacity(device->vdisk, size); - revalidate_disk_size(device->vdisk, false); + set_capacity_and_notify(device->vdisk, size); drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); @@ -2068,8 +2067,7 @@ void drbd_device_cleanup(struct drbd_device *device) } D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL); - set_capacity(device->vdisk, 0); - revalidate_disk_size(device->vdisk, false); + set_capacity_and_notify(device->vdisk, 0); if (device->bitmap) { /* maybe never allocated. */ drbd_bm_resize(device, 0, 1); From e864e49af3a85797b51b36876087591602de2eb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:07 +0100 Subject: [PATCH 116/395] rbd: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Acked-by: Ilya Dryomov Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 671733e459cf..2ed79b09439a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4888,8 +4888,7 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev) !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; dout("setting size to %llu sectors", (unsigned long long)size); - set_capacity(rbd_dev->disk, size); - revalidate_disk_size(rbd_dev->disk, true); + set_capacity_and_notify(rbd_dev->disk, size); } } From 230272b4f809d51c8b21d46dcec99f265b0842ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:08 +0100 Subject: [PATCH 117/395] rnbd: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 8b2411ccbda9..bb13d7dd195a 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -100,8 +100,7 @@ static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", dev->nsectors, new_nsectors); dev->nsectors = new_nsectors; - set_capacity(dev->gd, dev->nsectors); - revalidate_disk_size(dev->gd, true); + set_capacity_and_notify(dev->gd, dev->nsectors); return 0; } From 6e017a3931d7722260e3656a6fc9b02de5fb3c5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:09 +0100 Subject: [PATCH 118/395] zram: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1b697208d661..6d15d51cee2b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1695,7 +1695,7 @@ static void zram_reset_device(struct zram *zram) disksize = zram->disksize; zram->disksize = 0; - set_capacity(zram->disk, 0); + set_capacity_and_notify(zram->disk, 0); part_stat_set_all(&zram->disk->part0, 0); up_write(&zram->init_lock); @@ -1741,9 +1741,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - - revalidate_disk_size(zram->disk, true); + set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; @@ -1790,7 +1788,6 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - revalidate_disk_size(zram->disk, true); bdput(bdev); mutex_lock(&bdev->bd_mutex); From dc2985a8d583abe232e5882df9c8b67ac0d523e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:10 +0100 Subject: [PATCH 119/395] dm-raid: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/dm-raid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9c1f7c4de65b..294f34d2d61b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -700,8 +700,7 @@ static void rs_set_capacity(struct raid_set *rs) { struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); - set_capacity(gendisk, rs->md.array_sectors); - revalidate_disk_size(gendisk, true); + set_capacity_and_notify(gendisk, rs->md.array_sectors); } /* From 2c247c5169b50d58f63c0e82a58f457343e49d10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:11 +0100 Subject: [PATCH 120/395] md: use set_capacity_and_notify Use set_capacity_and_notify to set the size of both the disk and block device. This also gets the uevent notifications for the resize for free. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md-cluster.c | 6 ++---- drivers/md/md-linear.c | 3 +-- drivers/md/md.c | 24 ++++++++++-------------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4aaf4820b6f6..87442dc59f6c 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -581,8 +581,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) process_metadata_update(mddev, msg); break; case CHANGE_CAPACITY: - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); break; case RESYNCING: set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); @@ -1296,8 +1295,7 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) if (ret) pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", __func__, __LINE__); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } else { /* revert to previous sectors */ ret = mddev->pers->resize(mddev, old_dev_sectors); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5ab22069b5be..98f1b4b2bdce 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -200,9 +200,8 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) "copied raid_disks doesn't match mddev->raid_disks"); rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); - revalidate_disk_size(mddev->gendisk, true); kfree_rcu(oldconf, rcu); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 3c79243e9d07..b2edf5e0f965 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5355,10 +5355,9 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) if (!err) { mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); - } + if (mddev->pers) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } mddev_unlock(mddev); return err ?: len; @@ -6108,8 +6107,7 @@ int do_md_run(struct mddev *mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); clear_bit(MD_NOT_READY, &mddev->flags); mddev->changed = 1; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); @@ -6424,10 +6422,9 @@ static int do_md_stop(struct mddev *mddev, int mode, if (rdev->raid_disk >= 0) sysfs_unlink_rdev(mddev, rdev); - set_capacity(disk, 0); + set_capacity_and_notify(disk, 0); mutex_unlock(&mddev->open_mutex); mddev->changed = 1; - revalidate_disk_size(disk, true); if (mddev->ro) mddev->ro = 0; @@ -7258,8 +7255,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); else if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } } return rv; @@ -9036,10 +9033,9 @@ void md_do_sync(struct md_thread *thread) mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); - if (!mddev_is_clustered(mddev)) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk_size(mddev->gendisk, true); - } + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); } spin_lock(&mddev->lock); From 94d91e7f8c221260790a482373d347ea85efb7b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:12 +0100 Subject: [PATCH 121/395] md: remove a spurious call to revalidate_disk_size in update_size None of the ->resize methods updates the disk size, so calling revalidate_disk_size here won't do anything. Signed-off-by: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md-cluster.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 87442dc59f6c..35e2690c1803 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1299,8 +1299,6 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) } else { /* revert to previous sectors */ ret = mddev->pers->resize(mddev, old_dev_sectors); - if (!ret) - revalidate_disk_size(mddev->gendisk, true); ret = __sendmsg(cinfo, &cmsg); if (ret) pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", From ddff331a14eb7d5af08e63579ba28c289db26e20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:13 +0100 Subject: [PATCH 122/395] virtio-blk: remove a spurious call to revalidate_disk_size revalidate_disk_size just updates the block device size from the disk size. Thus calling it from virtblk_update_cache_mode doesn't actually do anything. Signed-off-by: Christoph Hellwig Acked-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 3e812b4c32e6..145606dc52db 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -598,7 +598,6 @@ static void virtblk_update_cache_mode(struct virtio_device *vdev) struct virtio_blk *vblk = vdev->priv; blk_queue_write_cache(vblk->disk->queue, writeback, false); - revalidate_disk_size(vblk->disk, true); } static const char *const virtblk_cache_types[] = { From 5a5678ff3a495cbfccde9c734164cc8753a1ca97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:57:14 +0100 Subject: [PATCH 123/395] block: unexport revalidate_disk_size revalidate_disk_size is now only called from set_capacity_and_notify, so drop the export. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 66ebf594c97f..d8664f5c1ff6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1362,7 +1362,6 @@ void revalidate_disk_size(struct gendisk *disk, bool verbose) bdput(bdev); } } -EXPORT_SYMBOL(revalidate_disk_size); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { From 9b0072e2b2b588ad75c94f2c6e6c52c8f4bd2657 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sun, 8 Nov 2020 14:45:42 +0800 Subject: [PATCH 124/395] security/smack: remove unused varible 'rc' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This varible isn't used and can be removed to avoid a gcc warning: security/smack/smack_lsm.c:3873:6: warning: variable ‘rc’ set but not used [-Wunused-but-set-variable] Signed-off-by: Alex Shi Cc: Casey Schaufler Cc: James Morris Cc: "Serge E. Hallyn" Cc: linux-security-module@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 5c90b9fa4d40..9994fcfafd70 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -3870,7 +3870,6 @@ static struct smack_known *smack_from_netlbl(struct sock *sk, u16 family, struct netlbl_lsm_secattr secattr; struct socket_smack *ssp = NULL; struct smack_known *skp = NULL; - int rc; netlbl_secattr_init(&secattr); @@ -3880,7 +3879,7 @@ static struct smack_known *smack_from_netlbl(struct sock *sk, u16 family, if (netlbl_skbuff_getattr(skb, family, &secattr) == 0) { skp = smack_from_secattr(&secattr, ssp); if (secattr.flags & NETLBL_SECATTR_CACHEABLE) - rc = netlbl_cache_add(skb, family, &skp->smk_netlabel); + netlbl_cache_add(skb, family, &skp->smk_netlabel); } netlbl_secattr_destroy(&secattr); From e8b7db38449ac5b950a3f00519171c4be3e226ff Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:04:00 +0100 Subject: [PATCH 125/395] Drivers: hv: vmbus: Add vmbus_requestor data structure for VMBus hardening Currently, VMbus drivers use pointers into guest memory as request IDs for interactions with Hyper-V. To be more robust in the face of errors or malicious behavior from a compromised Hyper-V, avoid exposing guest memory addresses to Hyper-V. Also avoid Hyper-V giving back a bad request ID that is then treated as the address of a guest data structure with no validation. Instead, encapsulate these memory addresses and provide small integers as request IDs. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/20201109100402.8946-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel.c | 174 ++++++++++++++++++++++++++++++++++++-- drivers/hv/hyperv_vmbus.h | 3 +- drivers/hv/ring_buffer.c | 29 ++++++- include/linux/hyperv.h | 22 +++++ 4 files changed, 219 insertions(+), 9 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index fbdda9938039..6fb0c76bfbf8 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -503,6 +503,70 @@ int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer, } EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); +/** + * request_arr_init - Allocates memory for the requestor array. Each slot + * keeps track of the next available slot in the array. Initially, each + * slot points to the next one (as in a Linked List). The last slot + * does not point to anything, so its value is U64_MAX by default. + * @size The size of the array + */ +static u64 *request_arr_init(u32 size) +{ + int i; + u64 *req_arr; + + req_arr = kcalloc(size, sizeof(u64), GFP_KERNEL); + if (!req_arr) + return NULL; + + for (i = 0; i < size - 1; i++) + req_arr[i] = i + 1; + + /* Last slot (no more available slots) */ + req_arr[i] = U64_MAX; + + return req_arr; +} + +/* + * vmbus_alloc_requestor - Initializes @rqstor's fields. + * Index 0 is the first free slot + * @size: Size of the requestor array + */ +static int vmbus_alloc_requestor(struct vmbus_requestor *rqstor, u32 size) +{ + u64 *rqst_arr; + unsigned long *bitmap; + + rqst_arr = request_arr_init(size); + if (!rqst_arr) + return -ENOMEM; + + bitmap = bitmap_zalloc(size, GFP_KERNEL); + if (!bitmap) { + kfree(rqst_arr); + return -ENOMEM; + } + + rqstor->req_arr = rqst_arr; + rqstor->req_bitmap = bitmap; + rqstor->size = size; + rqstor->next_request_id = 0; + spin_lock_init(&rqstor->req_lock); + + return 0; +} + +/* + * vmbus_free_requestor - Frees memory allocated for @rqstor + * @rqstor: Pointer to the requestor struct + */ +static void vmbus_free_requestor(struct vmbus_requestor *rqstor) +{ + kfree(rqstor->req_arr); + bitmap_free(rqstor->req_bitmap); +} + static int __vmbus_open(struct vmbus_channel *newchannel, void *userdata, u32 userdatalen, void (*onchannelcallback)(void *context), void *context) @@ -523,6 +587,12 @@ static int __vmbus_open(struct vmbus_channel *newchannel, if (newchannel->state != CHANNEL_OPEN_STATE) return -EINVAL; + /* Create and init requestor */ + if (newchannel->rqstor_size) { + if (vmbus_alloc_requestor(&newchannel->requestor, newchannel->rqstor_size)) + return -ENOMEM; + } + newchannel->state = CHANNEL_OPENING_STATE; newchannel->onchannel_callback = onchannelcallback; newchannel->channel_callback_context = context; @@ -626,6 +696,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel, error_clean_ring: hv_ringbuffer_cleanup(&newchannel->outbound); hv_ringbuffer_cleanup(&newchannel->inbound); + vmbus_free_requestor(&newchannel->requestor); newchannel->state = CHANNEL_OPEN_STATE; return err; } @@ -808,6 +879,9 @@ static int vmbus_close_internal(struct vmbus_channel *channel) channel->ringbuffer_gpadlhandle = 0; } + if (!ret) + vmbus_free_requestor(&channel->requestor); + return ret; } @@ -888,7 +962,7 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, /* in 8-bytes granularity */ desc.offset8 = sizeof(struct vmpacket_descriptor) >> 3; desc.len8 = (u16)(packetlen_aligned >> 3); - desc.trans_id = requestid; + desc.trans_id = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ bufferlist[0].iov_base = &desc; bufferlist[0].iov_len = sizeof(struct vmpacket_descriptor); @@ -897,7 +971,7 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, num_vecs); + return hv_ringbuffer_write(channel, bufferlist, num_vecs, requestid); } EXPORT_SYMBOL(vmbus_sendpacket); @@ -939,7 +1013,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = requestid; + desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc.reserved = 0; desc.rangecount = pagecount; @@ -956,7 +1030,7 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); @@ -983,7 +1057,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; desc->dataoffset8 = desc_size >> 3; /* in 8-bytes granularity */ desc->length8 = (u16)(packetlen_aligned >> 3); - desc->transactionid = requestid; + desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; desc->rangecount = 1; @@ -994,7 +1068,7 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, bufferlist[2].iov_base = &aligned_data; bufferlist[2].iov_len = (packetlen_aligned - packetlen); - return hv_ringbuffer_write(channel, bufferlist, 3); + return hv_ringbuffer_write(channel, bufferlist, 3, requestid); } EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc); @@ -1042,3 +1116,91 @@ int vmbus_recvpacket_raw(struct vmbus_channel *channel, void *buffer, buffer_actual_len, requestid, true); } EXPORT_SYMBOL_GPL(vmbus_recvpacket_raw); + +/* + * vmbus_next_request_id - Returns a new request id. It is also + * the index at which the guest memory address is stored. + * Uses a spin lock to avoid race conditions. + * @rqstor: Pointer to the requestor struct + * @rqst_add: Guest memory address to be stored in the array + */ +u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr) +{ + unsigned long flags; + u64 current_id; + const struct vmbus_channel *channel = + container_of(rqstor, const struct vmbus_channel, requestor); + + /* Check rqstor has been initialized */ + if (!channel->rqstor_size) + return VMBUS_NO_RQSTOR; + + spin_lock_irqsave(&rqstor->req_lock, flags); + current_id = rqstor->next_request_id; + + /* Requestor array is full */ + if (current_id >= rqstor->size) { + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return VMBUS_RQST_ERROR; + } + + rqstor->next_request_id = rqstor->req_arr[current_id]; + rqstor->req_arr[current_id] = rqst_addr; + + /* The already held spin lock provides atomicity */ + bitmap_set(rqstor->req_bitmap, current_id, 1); + + spin_unlock_irqrestore(&rqstor->req_lock, flags); + + /* + * Cannot return an ID of 0, which is reserved for an unsolicited + * message from Hyper-V. + */ + return current_id + 1; +} +EXPORT_SYMBOL_GPL(vmbus_next_request_id); + +/* + * vmbus_request_addr - Returns the memory address stored at @trans_id + * in @rqstor. Uses a spin lock to avoid race conditions. + * @rqstor: Pointer to the requestor struct + * @trans_id: Request id sent back from Hyper-V. Becomes the requestor's + * next request id. + */ +u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id) +{ + unsigned long flags; + u64 req_addr; + const struct vmbus_channel *channel = + container_of(rqstor, const struct vmbus_channel, requestor); + + /* Check rqstor has been initialized */ + if (!channel->rqstor_size) + return VMBUS_NO_RQSTOR; + + /* Hyper-V can send an unsolicited message with ID of 0 */ + if (!trans_id) + return trans_id; + + spin_lock_irqsave(&rqstor->req_lock, flags); + + /* Data corresponding to trans_id is stored at trans_id - 1 */ + trans_id--; + + /* Invalid trans_id */ + if (trans_id >= rqstor->size || !test_bit(trans_id, rqstor->req_bitmap)) { + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return VMBUS_RQST_ERROR; + } + + req_addr = rqstor->req_arr[trans_id]; + rqstor->req_arr[trans_id] = rqstor->next_request_id; + rqstor->next_request_id = trans_id; + + /* The already held spin lock provides atomicity */ + bitmap_clear(rqstor->req_bitmap, trans_id, 1); + + spin_unlock_irqrestore(&rqstor->req_lock, flags); + return req_addr; +} +EXPORT_SYMBOL_GPL(vmbus_request_addr); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 40e2b9f91163..02f3e8988836 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -179,7 +179,8 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); int hv_ringbuffer_write(struct vmbus_channel *channel, - const struct kvec *kv_list, u32 kv_count); + const struct kvec *kv_list, u32 kv_count, + u64 requestid); int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 356e22159e83..35833d4d1a1d 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -248,7 +248,8 @@ void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info) /* Write to the ring buffer. */ int hv_ringbuffer_write(struct vmbus_channel *channel, - const struct kvec *kv_list, u32 kv_count) + const struct kvec *kv_list, u32 kv_count, + u64 requestid) { int i; u32 bytes_avail_towrite; @@ -258,6 +259,8 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, u64 prev_indices; unsigned long flags; struct hv_ring_buffer_info *outring_info = &channel->outbound; + struct vmpacket_descriptor *desc = kv_list[0].iov_base; + u64 rqst_id = VMBUS_NO_RQSTOR; if (channel->rescind) return -ENODEV; @@ -300,6 +303,23 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, kv_list[i].iov_len); } + /* + * Allocate the request ID after the data has been copied into the + * ring buffer. Once this request ID is allocated, the completion + * path could find the data and free it. + */ + + if (desc->flags == VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED) { + rqst_id = vmbus_next_request_id(&channel->requestor, requestid); + if (rqst_id == VMBUS_RQST_ERROR) { + spin_unlock_irqrestore(&outring_info->ring_lock, flags); + pr_err("No request id available\n"); + return -EAGAIN; + } + } + desc = hv_get_ring_buffer(outring_info) + old_write; + desc->trans_id = (rqst_id == VMBUS_NO_RQSTOR) ? requestid : rqst_id; + /* Set previous packet start */ prev_indices = hv_get_ring_bufferindices(outring_info); @@ -319,8 +339,13 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, hv_signal_on_write(old_write, channel); - if (channel->rescind) + if (channel->rescind) { + if (rqst_id != VMBUS_NO_RQSTOR) { + /* Reclaim request ID to avoid leak of IDs */ + vmbus_request_addr(&channel->requestor, rqst_id); + } return -ENODEV; + } return 0; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 1ce131f29f3b..5b6d5c4e3711 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -764,6 +764,22 @@ enum vmbus_device_type { HV_UNKNOWN, }; +/* + * Provides request ids for VMBus. Encapsulates guest memory + * addresses and stores the next available slot in req_arr + * to generate new ids in constant time. + */ +struct vmbus_requestor { + u64 *req_arr; + unsigned long *req_bitmap; /* is a given slot available? */ + u32 size; + u64 next_request_id; + spinlock_t req_lock; /* provides atomicity */ +}; + +#define VMBUS_NO_RQSTOR U64_MAX +#define VMBUS_RQST_ERROR (U64_MAX - 1) + struct vmbus_device { u16 dev_type; guid_t guid; @@ -988,8 +1004,14 @@ struct vmbus_channel { u32 fuzz_testing_interrupt_delay; u32 fuzz_testing_message_delay; + /* request/transaction ids for VMBus */ + struct vmbus_requestor requestor; + u32 rqstor_size; }; +u64 vmbus_next_request_id(struct vmbus_requestor *rqstor, u64 rqst_addr); +u64 vmbus_request_addr(struct vmbus_requestor *rqstor, u64 trans_id); + static inline bool is_hvsock_channel(const struct vmbus_channel *c) { return !!(c->offermsg.offer.chn_flags & From 453de21c2b8281228173a7b689120b92929743d6 Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:04:01 +0100 Subject: [PATCH 126/395] scsi: storvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening Currently, pointers to guest memory are passed to Hyper-V as transaction IDs in storvsc. In the face of errors or malicious behavior in Hyper-V, storvsc should not expose or trust the transaction IDs returned by Hyper-V to be valid guest memory addresses. Instead, use small integers generated by vmbus_requestor as requests (transaction) IDs. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: linux-scsi@vger.kernel.org Link: https://lore.kernel.org/r/20201109100402.8946-3-parri.andrea@gmail.com Acked-by: Martin K. Petersen Reviewed-by: Wei Liu Signed-off-by: Wei Liu --- drivers/scsi/storvsc_drv.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 0c65fbd41035..369a6c626672 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -399,6 +399,7 @@ static int storvsc_timeout = 180; static struct scsi_transport_template *fc_transport_template; #endif +static struct scsi_host_template scsi_driver; static void storvsc_on_channel_callback(void *context); #define STORVSC_MAX_LUNS_PER_TARGET 255 @@ -698,6 +699,12 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + /* + * The size of vmbus_requestor is an upper bound on the number of requests + * that can be in-progress at any one time across all channels. + */ + new_sc->rqstor_size = scsi_driver.can_queue; + ret = vmbus_open(new_sc, storvsc_ringbuffer_size, storvsc_ringbuffer_size, @@ -1242,9 +1249,17 @@ static void storvsc_on_channel_callback(void *context) foreach_vmbus_pkt(desc, channel) { void *packet = hv_pkt_data(desc); struct storvsc_cmd_request *request; + u64 cmd_rqst; - request = (struct storvsc_cmd_request *) - ((unsigned long)desc->trans_id); + cmd_rqst = vmbus_request_addr(&channel->requestor, + desc->trans_id); + if (cmd_rqst == VMBUS_RQST_ERROR) { + dev_err(&device->device, + "Incorrect transaction id\n"); + continue; + } + + request = (struct storvsc_cmd_request *)(unsigned long)cmd_rqst; if (request == &stor_device->init_request || request == &stor_device->reset_request) { @@ -1265,6 +1280,12 @@ static int storvsc_connect_to_vsp(struct hv_device *device, u32 ring_size, memset(&props, 0, sizeof(struct vmstorage_channel_properties)); + /* + * The size of vmbus_requestor is an upper bound on the number of requests + * that can be in-progress at any one time across all channels. + */ + device->channel->rqstor_size = scsi_driver.can_queue; + ret = vmbus_open(device->channel, ring_size, ring_size, @@ -1572,7 +1593,6 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd) struct vstor_packet *vstor_packet; int ret, t; - stor_device = get_out_stor_device(device); if (!stor_device) return FAILED; From 4d18fcc95f50950a99bd940d4e61a983f91d267a Mon Sep 17 00:00:00 2001 From: Andres Beltran Date: Mon, 9 Nov 2020 11:04:02 +0100 Subject: [PATCH 127/395] hv_netvsc: Use vmbus_requestor to generate transaction IDs for VMBus hardening Currently, pointers to guest memory are passed to Hyper-V as transaction IDs in netvsc. In the face of errors or malicious behavior in Hyper-V, netvsc should not expose or trust the transaction IDs returned by Hyper-V to be valid guest memory addresses. Instead, use small integers generated by vmbus_requestor as requests (transaction) IDs. Signed-off-by: Andres Beltran Co-developed-by: Andrea Parri (Microsoft) Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Jakub Kicinski Reviewed-by: Wei Liu Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/r/20201109100402.8946-4-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/net/hyperv/hyperv_net.h | 13 +++++++++++++ drivers/net/hyperv/netvsc.c | 22 ++++++++++++++++------ drivers/net/hyperv/rndis_filter.c | 1 + include/linux/hyperv.h | 1 + 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index a0f338cf1424..2a87cfa27ac0 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -847,6 +847,19 @@ struct nvsp_message { #define NETVSC_XDP_HDRM 256 +#define NETVSC_MIN_OUT_MSG_SIZE (sizeof(struct vmpacket_descriptor) + \ + sizeof(struct nvsp_message)) +#define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) + +/* Estimated requestor size: + * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size + */ +static inline u32 netvsc_rqstor_size(unsigned long ringbytes) +{ + return ringbytes / NETVSC_MIN_OUT_MSG_SIZE + + ringbytes / NETVSC_MIN_IN_MSG_SIZE; +} + #define NETVSC_XFER_HEADER_SIZE(rng_cnt) \ (offsetof(struct vmtransfer_page_packet_header, ranges) + \ (rng_cnt) * sizeof(struct vmtransfer_page_range)) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0c3de94b5178..4dbc0055aed0 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -50,7 +50,7 @@ void netvsc_switch_datapath(struct net_device *ndev, bool vf) vmbus_sendpacket(dev->channel, init_pkt, sizeof(struct nvsp_message), - (unsigned long)init_pkt, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); } @@ -163,7 +163,7 @@ static void netvsc_revoke_recv_buf(struct hv_device *device, ret = vmbus_sendpacket(device->channel, revoke_packet, sizeof(struct nvsp_message), - (unsigned long)revoke_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); /* If the failure is because the channel is rescinded; * ignore the failure since we cannot send on a rescinded @@ -213,7 +213,7 @@ static void netvsc_revoke_send_buf(struct hv_device *device, ret = vmbus_sendpacket(device->channel, revoke_packet, sizeof(struct nvsp_message), - (unsigned long)revoke_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); /* If the failure is because the channel is rescinded; @@ -557,7 +557,7 @@ static int negotiate_nvsp_ver(struct hv_device *device, ret = vmbus_sendpacket(device->channel, init_packet, sizeof(struct nvsp_message), - (unsigned long)init_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); return ret; @@ -614,7 +614,7 @@ static int netvsc_connect_vsp(struct hv_device *device, /* Send the init request */ ret = vmbus_sendpacket(device->channel, init_packet, sizeof(struct nvsp_message), - (unsigned long)init_packet, + VMBUS_RQST_ID_NO_RESPONSE, VM_PKT_DATA_INBAND, 0); if (ret != 0) goto cleanup; @@ -695,10 +695,19 @@ static void netvsc_send_tx_complete(struct net_device *ndev, const struct vmpacket_descriptor *desc, int budget) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long)desc->trans_id; struct net_device_context *ndev_ctx = netdev_priv(ndev); + struct sk_buff *skb; u16 q_idx = 0; int queue_sends; + u64 cmd_rqst; + + cmd_rqst = vmbus_request_addr(&channel->requestor, (u64)desc->trans_id); + if (cmd_rqst == VMBUS_RQST_ERROR) { + netdev_err(ndev, "Incorrect transaction id\n"); + return; + } + + skb = (struct sk_buff *)(unsigned long)cmd_rqst; /* Notify the layer above us */ if (likely(skb)) { @@ -1520,6 +1529,7 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device, netvsc_poll, NAPI_POLL_WEIGHT); /* Open the channel */ + device->channel->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); ret = vmbus_open(device->channel, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, net_device->chan_table); diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index b22e47bcfeca..6ae43319ece6 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1172,6 +1172,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) /* Set the channel before opening.*/ nvchan->channel = new_sc; + new_sc->rqstor_size = netvsc_rqstor_size(netvsc_ring_bytes); ret = vmbus_open(new_sc, netvsc_ring_bytes, netvsc_ring_bytes, NULL, 0, netvsc_channel_cb, nvchan); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 5b6d5c4e3711..5ddb479c4d4c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -779,6 +779,7 @@ struct vmbus_requestor { #define VMBUS_NO_RQSTOR U64_MAX #define VMBUS_RQST_ERROR (U64_MAX - 1) +#define VMBUS_RQST_ID_NO_RESPONSE (U64_MAX - 2) struct vmbus_device { u16 dev_type; From b18e3589722c864576a3dbeb742a742d9453f633 Mon Sep 17 00:00:00 2001 From: Matheus Castello Date: Sun, 15 Nov 2020 16:57:29 -0300 Subject: [PATCH 128/395] drivers: hv: Fix hyperv_record_panic_msg path on comment Fix the kernel parameter path in the comment, in the documentation the parameter is correct but if someone who is studying the code and see this first can get confused and try to access the wrong path/parameter Signed-off-by: Matheus Castello Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201115195734.8338-2-matheus@castello.eng.br Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 4fad3e6745e5..9ed7e3b1d654 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -55,7 +55,7 @@ int vmbus_interrupt; /* * Boolean to control whether to report panic messages over Hyper-V. * - * It can be set via /proc/sys/kernel/hyperv/record_panic_msg + * It can be set via /proc/sys/kernel/hyperv_record_panic_msg */ static int sysctl_record_panic_msg = 1; From f0434de41adc2c6dabfaa2f59882f1ca2d644fe9 Mon Sep 17 00:00:00 2001 From: Matheus Castello Date: Sun, 15 Nov 2020 16:57:30 -0300 Subject: [PATCH 129/395] drivers: hv: vmbus: Replace symbolic permissions by octal permissions This fixed the below checkpatch issue: WARNING: Symbolic permissions 'S_IRUGO' are not preferred. Consider using octal permissions '0444'. Signed-off-by: Matheus Castello Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201115195734.8338-3-matheus@castello.eng.br Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 9ed7e3b1d654..52c1407c1849 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1812,7 +1812,7 @@ static ssize_t channel_pending_show(struct vmbus_channel *channel, channel_pending(channel, vmbus_connection.monitor_pages[1])); } -static VMBUS_CHAN_ATTR(pending, S_IRUGO, channel_pending_show, NULL); +static VMBUS_CHAN_ATTR(pending, 0444, channel_pending_show, NULL); static ssize_t channel_latency_show(struct vmbus_channel *channel, char *buf) @@ -1821,19 +1821,19 @@ static ssize_t channel_latency_show(struct vmbus_channel *channel, channel_latency(channel, vmbus_connection.monitor_pages[1])); } -static VMBUS_CHAN_ATTR(latency, S_IRUGO, channel_latency_show, NULL); +static VMBUS_CHAN_ATTR(latency, 0444, channel_latency_show, NULL); static ssize_t channel_interrupts_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%llu\n", channel->interrupts); } -static VMBUS_CHAN_ATTR(interrupts, S_IRUGO, channel_interrupts_show, NULL); +static VMBUS_CHAN_ATTR(interrupts, 0444, channel_interrupts_show, NULL); static ssize_t channel_events_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%llu\n", channel->sig_events); } -static VMBUS_CHAN_ATTR(events, S_IRUGO, channel_events_show, NULL); +static VMBUS_CHAN_ATTR(events, 0444, channel_events_show, NULL); static ssize_t channel_intr_in_full_show(struct vmbus_channel *channel, char *buf) @@ -1872,7 +1872,7 @@ static ssize_t subchannel_monitor_id_show(struct vmbus_channel *channel, { return sprintf(buf, "%u\n", channel->offermsg.monitorid); } -static VMBUS_CHAN_ATTR(monitor_id, S_IRUGO, subchannel_monitor_id_show, NULL); +static VMBUS_CHAN_ATTR(monitor_id, 0444, subchannel_monitor_id_show, NULL); static ssize_t subchannel_id_show(struct vmbus_channel *channel, char *buf) From e4f2212e53c265ed9fb2f5b936b63cd57acb70ff Mon Sep 17 00:00:00 2001 From: Matheus Castello Date: Sun, 15 Nov 2020 16:57:31 -0300 Subject: [PATCH 130/395] drivers: hv: vmbus: Fix checkpatch LINE_SPACING Fixed checkpatch warning: Missing a blank line after declarations checkpatch(LINE_SPACING) Signed-off-by: Matheus Castello Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201115195734.8338-4-matheus@castello.eng.br Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 52c1407c1849..61d28c743263 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -156,6 +156,7 @@ static u32 channel_conn_id(struct vmbus_channel *channel, { u8 monitor_group = channel_monitor_group(channel); u8 monitor_offset = channel_monitor_offset(channel); + return monitor_page->parameter[monitor_group][monitor_offset].connectionid.u.id; } @@ -550,6 +551,7 @@ static ssize_t vendor_show(struct device *dev, char *buf) { struct hv_device *hv_dev = device_to_hv_device(dev); + return sprintf(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -559,6 +561,7 @@ static ssize_t device_show(struct device *dev, char *buf) { struct hv_device *hv_dev = device_to_hv_device(dev); + return sprintf(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); From 14c685d9eb361768bb5ca452b999e43498f15746 Mon Sep 17 00:00:00 2001 From: Matheus Castello Date: Sun, 15 Nov 2020 16:57:34 -0300 Subject: [PATCH 131/395] drivers: hv: vmbus: Fix call msleep using < 20ms Fixed checkpatch warning: MSLEEP: msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.rst Signed-off-by: Matheus Castello Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201115195734.8338-7-matheus@castello.eng.br Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 61d28c743263..0a2711aa63a1 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2380,7 +2380,7 @@ static int vmbus_bus_suspend(struct device *dev) * We wait here until the completion of any channel * offers that are currently in progress. */ - msleep(1); + usleep_range(1000, 2000); } mutex_lock(&vmbus_connection.channel_mutex); From 158c774d3c64859e84dd20e04d5fb18c8d3d318e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?= Date: Tue, 17 Nov 2020 01:55:07 +0800 Subject: [PATCH 132/395] MIPS: Ingenic: Add missing nodes for Ingenic SoCs and boards. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.Add OTG/OTG PHY/RNG nodes for JZ4780, CGU/OTG nodes for CI20. 2.Add OTG/OTG PHY/RNG/OST nodes for X1000, SSI/CGU/OST/OTG/SC16IS752 nodes for CU1000-Neo. 3.Add OTG/OTG PHY/DTRNG/OST nodes for X1830, SSI/CGU/OST/OTG/SC16IS752 nodes for CU1830-Neo. Tested-by: 周正 (Zhou Zheng) Tested by: H. Nikolaus Schaller # CI20/jz4780 Signed-off-by: 周琰杰 (Zhou Yanjie) Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/ingenic/ci20.dts | 45 ++++++++++++++- arch/mips/boot/dts/ingenic/cu1000-neo.dts | 64 ++++++++++++++++++--- arch/mips/boot/dts/ingenic/cu1830-neo.dts | 68 ++++++++++++++++++++--- arch/mips/boot/dts/ingenic/jz4780.dtsi | 45 ++++++++++++++- arch/mips/boot/dts/ingenic/x1000.dtsi | 56 ++++++++++++++++++- arch/mips/boot/dts/ingenic/x1830.dtsi | 58 ++++++++++++++++++- 6 files changed, 315 insertions(+), 21 deletions(-) diff --git a/arch/mips/boot/dts/ingenic/ci20.dts b/arch/mips/boot/dts/ingenic/ci20.dts index 75f5bfbf2c37..8877c62609de 100644 --- a/arch/mips/boot/dts/ingenic/ci20.dts +++ b/arch/mips/boot/dts/ingenic/ci20.dts @@ -69,9 +69,11 @@ led3 { eth0_power: fixedregulator@0 { compatible = "regulator-fixed"; + regulator-name = "eth0_power"; regulator-min-microvolt = <3300000>; regulator-max-microvolt = <3300000>; + gpio = <&gpb 25 GPIO_ACTIVE_LOW>; enable-active-high; }; @@ -83,16 +85,39 @@ ir: ir { wlan0_power: fixedregulator@1 { compatible = "regulator-fixed"; + regulator-name = "wlan0_power"; + gpio = <&gpb 19 GPIO_ACTIVE_LOW>; enable-active-high; }; + + otg_power: fixedregulator@2 { + compatible = "regulator-fixed"; + + regulator-name = "otg_power"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + + gpio = <&gpf 14 GPIO_ACTIVE_LOW>; + enable-active-high; + }; }; &ext { clock-frequency = <48000000>; }; +&cgu { + /* + * Use the 32.768 kHz oscillator as the parent of the RTC for a higher + * precision. + */ + assigned-clocks = <&cgu JZ4780_CLK_OTGPHY>, <&cgu JZ4780_CLK_RTC>; + assigned-clock-parents = <0>, <&cgu JZ4780_CLK_RTCLK>; + assigned-clock-rates = <48000000>; +}; + &mmc0 { status = "okay"; @@ -396,6 +421,16 @@ &bch { status = "okay"; }; +&otg_phy { + status = "okay"; + + vcc-supply = <&otg_power>; +}; + +&otg { + status = "okay"; +}; + &pinctrl { pins_uart0: uart0 { function = "uart0"; @@ -489,7 +524,11 @@ pins_mmc1: mmc1 { }; &tcu { - /* 3 MHz for the system timer and clocksource */ - assigned-clocks = <&tcu TCU_CLK_TIMER0>, <&tcu TCU_CLK_TIMER1>; - assigned-clock-rates = <3000000>, <3000000>; + /* + * 750 kHz for the system timer and 3 MHz for the clocksource, + * use channel #0 for the system timer, #1 for the clocksource. + */ + assigned-clocks = <&tcu TCU_CLK_TIMER0>, <&tcu TCU_CLK_TIMER1>, + <&tcu TCU_CLK_OST>; + assigned-clock-rates = <750000>, <3000000>, <3000000>; }; diff --git a/arch/mips/boot/dts/ingenic/cu1000-neo.dts b/arch/mips/boot/dts/ingenic/cu1000-neo.dts index 22a1066d637b..f98cf029efc3 100644 --- a/arch/mips/boot/dts/ingenic/cu1000-neo.dts +++ b/arch/mips/boot/dts/ingenic/cu1000-neo.dts @@ -3,7 +3,7 @@ #include "x1000.dtsi" #include -#include +#include #include / { @@ -31,6 +31,42 @@ led-0 { }; }; + ssi: spi-gpio { + compatible = "spi-gpio"; + #address-cells = <1>; + #size-cells = <0>; + num-chipselects = <1>; + + mosi-gpios = <&gpd 2 GPIO_ACTIVE_HIGH>; + miso-gpios = <&gpd 3 GPIO_ACTIVE_HIGH>; + sck-gpios = <&gpd 0 GPIO_ACTIVE_HIGH>; + cs-gpios = <&gpd 1 GPIO_ACTIVE_HIGH>; + + status = "okay"; + + spi-max-frequency = <50000000>; + + sc16is752: expander@0 { + compatible = "nxp,sc16is752"; + reg = <0>; /* CE0 */ + spi-max-frequency = <4000000>; + + clocks = <&exclk_sc16is752>; + + interrupt-parent = <&gpc>; + interrupts = <6 IRQ_TYPE_EDGE_FALLING>; + + gpio-controller; + #gpio-cells = <2>; + + exclk_sc16is752: sc16is752 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <48000000>; + }; + }; + }; + wlan_pwrseq: msc1-pwrseq { compatible = "mmc-pwrseq-simple"; @@ -43,13 +79,19 @@ &exclk { clock-frequency = <24000000>; }; -&tcu { - /* 1500 kHz for the system timer and clocksource */ - assigned-clocks = <&tcu TCU_CLK_TIMER0>, <&tcu TCU_CLK_TIMER2>; - assigned-clock-rates = <1500000>, <1500000>; +&cgu { + /* + * Use the 32.768 kHz oscillator as the parent of the RTC for a higher + * precision. + */ + assigned-clocks = <&cgu X1000_CLK_RTC>; + assigned-clock-parents = <&cgu X1000_CLK_RTCLK>; +}; - /* Use channel #0 for the system timer channel #2 for the clocksource */ - ingenic,pwm-channels-mask = <0xfa>; +&ost { + /* 1500 kHz for the system timer and clocksource */ + assigned-clocks = <&ost OST_CLK_PERCPU_TIMER>, <&ost OST_CLK_GLOBAL_TIMER>; + assigned-clock-rates = <1500000>, <1500000>; }; &uart2 { @@ -135,6 +177,14 @@ lan8720a: ethernet-phy@0 { }; }; +&otg_phy { + status = "okay"; +}; + +&otg { + status = "okay"; +}; + &pinctrl { pins_uart2: uart2 { function = "uart2"; diff --git a/arch/mips/boot/dts/ingenic/cu1830-neo.dts b/arch/mips/boot/dts/ingenic/cu1830-neo.dts index 640f96c00d63..cfcb40edb7d9 100644 --- a/arch/mips/boot/dts/ingenic/cu1830-neo.dts +++ b/arch/mips/boot/dts/ingenic/cu1830-neo.dts @@ -3,7 +3,7 @@ #include "x1830.dtsi" #include -#include +#include #include / { @@ -31,6 +31,42 @@ led-0 { }; }; + ssi0: spi-gpio { + compatible = "spi-gpio"; + #address-cells = <1>; + #size-cells = <0>; + num-chipselects = <1>; + + mosi-gpios = <&gpc 12 GPIO_ACTIVE_HIGH>; + miso-gpios = <&gpc 11 GPIO_ACTIVE_HIGH>; + sck-gpios = <&gpc 15 GPIO_ACTIVE_HIGH>; + cs-gpios = <&gpc 16 GPIO_ACTIVE_HIGH>; + + status = "okay"; + + spi-max-frequency = <50000000>; + + sc16is752: expander@0 { + compatible = "nxp,sc16is752"; + reg = <0>; /* CE0 */ + spi-max-frequency = <4000000>; + + clocks = <&exclk_sc16is752>; + + interrupt-parent = <&gpb>; + interrupts = <18 IRQ_TYPE_EDGE_FALLING>; + + gpio-controller; + #gpio-cells = <2>; + + exclk_sc16is752: sc16is752 { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <48000000>; + }; + }; + }; + wlan_pwrseq: msc1-pwrseq { compatible = "mmc-pwrseq-simple"; @@ -43,13 +79,19 @@ &exclk { clock-frequency = <24000000>; }; -&tcu { - /* 1500 kHz for the system timer and clocksource */ - assigned-clocks = <&tcu TCU_CLK_TIMER0>, <&tcu TCU_CLK_TIMER2>; - assigned-clock-rates = <1500000>, <1500000>; +&cgu { + /* + * Use the 32.768 kHz oscillator as the parent of the RTC for a higher + * precision. + */ + assigned-clocks = <&cgu X1830_CLK_RTC>; + assigned-clock-parents = <&cgu X1830_CLK_RTCLK>; +}; - /* Use channel #0 for the system timer channel #2 for the clocksource */ - ingenic,pwm-channels-mask = <0xfa>; +&ost { + /* 1500 kHz for the system timer and clocksource */ + assigned-clocks = <&ost OST_CLK_PERCPU_TIMER>, <&ost OST_CLK_GLOBAL_TIMER>; + assigned-clock-rates = <1500000>, <1500000>; }; &uart1 { @@ -73,6 +115,10 @@ ads7830: adc@48 { }; }; +&dtrng { + status = "okay"; +}; + &msc0 { status = "okay"; @@ -135,6 +181,14 @@ ip101gr: ethernet-phy@0 { }; }; +&otg_phy { + status = "okay"; +}; + +&otg { + status = "okay"; +}; + &pinctrl { pins_uart1: uart1 { function = "uart1"; diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi index dfb5a7e1bb21..8d01feef7ff5 100644 --- a/arch/mips/boot/dts/ingenic/jz4780.dtsi +++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi @@ -61,13 +61,34 @@ rtc: rtc { }; cgu: jz4780-cgu@10000000 { - compatible = "ingenic,jz4780-cgu"; + compatible = "ingenic,jz4780-cgu", "simple-mfd"; reg = <0x10000000 0x100>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x10000000 0x100>; + + #clock-cells = <1>; clocks = <&ext>, <&rtc>; clock-names = "ext", "rtc"; - #clock-cells = <1>; + otg_phy: usb-phy@3c { + compatible = "ingenic,jz4780-phy"; + reg = <0x3c 0x10>; + + clocks = <&cgu JZ4780_CLK_OTG1>; + + #phy-cells = <0>; + + status = "disabled"; + }; + + rng: rng@d8 { + compatible = "ingenic,jz4780-rng"; + reg = <0xd8 0x8>; + + status = "disabled"; + }; }; tcu: timer@10002000 { @@ -494,4 +515,24 @@ bch: bch@134d0000 { status = "disabled"; }; + + otg: usb@13500000 { + compatible = "ingenic,jz4780-otg", "snps,dwc2"; + reg = <0x13500000 0x40000>; + + interrupt-parent = <&intc>; + interrupts = <21>; + + clocks = <&cgu JZ4780_CLK_UHC>; + clock-names = "otg"; + + phys = <&otg_phy>; + phy-names = "usb2-phy"; + + g-rx-fifo-size = <768>; + g-np-tx-fifo-size = <256>; + g-tx-fifo-size = <256 256 256 256 256 256 256 512>; + + status = "disabled"; + }; }; diff --git a/arch/mips/boot/dts/ingenic/x1000.dtsi b/arch/mips/boot/dts/ingenic/x1000.dtsi index 1f1f896dd1f7..aac9dedaf334 100644 --- a/arch/mips/boot/dts/ingenic/x1000.dtsi +++ b/arch/mips/boot/dts/ingenic/x1000.dtsi @@ -52,13 +52,47 @@ rtclk: rtc { }; cgu: x1000-cgu@10000000 { - compatible = "ingenic,x1000-cgu"; + compatible = "ingenic,x1000-cgu", "simple-mfd"; reg = <0x10000000 0x100>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x10000000 0x100>; #clock-cells = <1>; clocks = <&exclk>, <&rtclk>; clock-names = "ext", "rtc"; + + otg_phy: usb-phy@3c { + compatible = "ingenic,x1000-phy"; + reg = <0x3c 0x10>; + + clocks = <&cgu X1000_CLK_OTGPHY>; + + #phy-cells = <0>; + + status = "disabled"; + }; + + rng: rng@d8 { + compatible = "ingenic,x1000-rng"; + reg = <0xd8 0x8>; + + status = "disabled"; + }; + }; + + ost: timer@12000000 { + compatible = "ingenic,x1000-ost"; + reg = <0x12000000 0x3c>; + + #clock-cells = <1>; + + clocks = <&cgu X1000_CLK_OST>; + clock-names = "ost"; + + interrupt-parent = <&cpuintc>; + interrupts = <3>; }; tcu: timer@10002000 { @@ -323,4 +357,24 @@ mdio: mdio { status = "disabled"; }; }; + + otg: usb@13500000 { + compatible = "ingenic,x1000-otg", "snps,dwc2"; + reg = <0x13500000 0x40000>; + + interrupt-parent = <&intc>; + interrupts = <21>; + + clocks = <&cgu X1000_CLK_OTG>; + clock-names = "otg"; + + phys = <&otg_phy>; + phy-names = "usb2-phy"; + + g-rx-fifo-size = <768>; + g-np-tx-fifo-size = <256>; + g-tx-fifo-size = <256 256 256 256 256 256 256 512>; + + status = "disabled"; + }; }; diff --git a/arch/mips/boot/dts/ingenic/x1830.dtsi b/arch/mips/boot/dts/ingenic/x1830.dtsi index b05dac3ae308..b21c93057356 100644 --- a/arch/mips/boot/dts/ingenic/x1830.dtsi +++ b/arch/mips/boot/dts/ingenic/x1830.dtsi @@ -52,13 +52,40 @@ rtclk: rtc { }; cgu: x1830-cgu@10000000 { - compatible = "ingenic,x1830-cgu"; + compatible = "ingenic,x1830-cgu", "simple-mfd"; reg = <0x10000000 0x100>; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0x0 0x10000000 0x100>; #clock-cells = <1>; clocks = <&exclk>, <&rtclk>; clock-names = "ext", "rtc"; + + otg_phy: usb-phy@3c { + compatible = "ingenic,x1830-phy"; + reg = <0x3c 0x10>; + + clocks = <&cgu X1830_CLK_OTGPHY>; + + #phy-cells = <0>; + + status = "disabled"; + }; + }; + + ost: timer@12000000 { + compatible = "ingenic,x1830-ost", "ingenic,x1000-ost"; + reg = <0x12000000 0x3c>; + + #clock-cells = <1>; + + clocks = <&cgu X1830_CLK_OST>; + clock-names = "ost"; + + interrupt-parent = <&cpuintc>; + interrupts = <4>; }; tcu: timer@10002000 { @@ -236,6 +263,15 @@ i2c2: i2c-controller@10052000 { status = "disabled"; }; + dtrng: trng@10072000 { + compatible = "ingenic,x1830-dtrng"; + reg = <0x10072000 0xc>; + + clocks = <&cgu X1830_CLK_DTRNG>; + + status = "disabled"; + }; + pdma: dma-controller@13420000 { compatible = "ingenic,x1830-dma"; reg = <0x13420000 0x400 @@ -311,4 +347,24 @@ mdio: mdio { status = "disabled"; }; }; + + otg: usb@13500000 { + compatible = "ingenic,x1830-otg", "snps,dwc2"; + reg = <0x13500000 0x40000>; + + interrupt-parent = <&intc>; + interrupts = <21>; + + clocks = <&cgu X1830_CLK_OTG>; + clock-names = "otg"; + + phys = <&otg_phy>; + phy-names = "usb2-phy"; + + g-rx-fifo-size = <768>; + g-np-tx-fifo-size = <256>; + g-tx-fifo-size = <256 256 256 256 256 256 256 512>; + + status = "disabled"; + }; }; From e5dab78f1f3c8d6cd1f0717c27580ddab5176bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?= Date: Tue, 17 Nov 2020 01:55:08 +0800 Subject: [PATCH 133/395] MIPS: Ingenic: Refresh defconfig for Ingenic SoCs based boards. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.Refresh defconfig of CI20 to support OTG and RNG. 2.Refresh defconfig of CU1000-Neo to support OTG/RNG/OST/SC16IS752. 3.Refresh defconfig of CU1830-Neo to support OTG/DTRNG/OST/SC16IS752. Tested-by: 周正 (Zhou Zheng) Tested by: H. Nikolaus Schaller # CI20/jz4780 Signed-off-by: 周琰杰 (Zhou Yanjie) Signed-off-by: Thomas Bogendoerfer --- arch/mips/configs/ci20_defconfig | 15 ++++++++++-- arch/mips/configs/cu1000-neo_defconfig | 28 ++++++++++++++++++---- arch/mips/configs/cu1830-neo_defconfig | 32 ++++++++++++++++++++------ 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig index 052c5ad0f2b1..ab7ebb066834 100644 --- a/arch/mips/configs/ci20_defconfig +++ b/arch/mips/configs/ci20_defconfig @@ -49,6 +49,8 @@ CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_JZ4780=y CONFIG_MTD_UBI=y CONFIG_MTD_UBI_FASTMAP=y +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y CONFIG_NETDEVICES=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_BROADCOM is not set @@ -77,7 +79,6 @@ CONFIG_SERIAL_8250_NR_UARTS=5 CONFIG_SERIAL_8250_RUNTIME_UARTS=5 CONFIG_SERIAL_8250_INGENIC=y CONFIG_SERIAL_OF_PLATFORM=y -# CONFIG_HW_RANDOM is not set CONFIG_I2C=y CONFIG_I2C_JZ4780=y CONFIG_SPI=y @@ -99,7 +100,12 @@ CONFIG_IR_GPIO_TX=m CONFIG_MEDIA_SUPPORT=m # CONFIG_VGA_CONSOLE is not set # CONFIG_HID is not set -# CONFIG_USB_SUPPORT is not set +CONFIG_USB=y +CONFIG_USB_STORAGE=y +CONFIG_USB_DWC2=y +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CH341=y +CONFIG_USB_GADGET=y CONFIG_MMC=y CONFIG_MMC_JZ4740=y CONFIG_NEW_LEDS=y @@ -131,8 +137,13 @@ CONFIG_MEMORY=y CONFIG_JZ4780_NEMC=y CONFIG_PWM=y CONFIG_PWM_JZ4740=m +CONFIG_JZ4780_EFUSE=y +CONFIG_JZ4770_PHY=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set +CONFIG_AUTOFS_FS=y +CONFIG_VFAT_FS=y +CONFIG_FAT_DEFAULT_UTF8=y CONFIG_PROC_KCORE=y # CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y diff --git a/arch/mips/configs/cu1000-neo_defconfig b/arch/mips/configs/cu1000-neo_defconfig index 55d0690a3ffe..9d75f5b77d5d 100644 --- a/arch/mips/configs/cu1000-neo_defconfig +++ b/arch/mips/configs/cu1000-neo_defconfig @@ -25,6 +25,7 @@ CONFIG_HIGHMEM=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set # CONFIG_SUSPEND is not set +CONFIG_MODULES=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set CONFIG_CMA=y @@ -32,15 +33,17 @@ CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y -CONFIG_CFG80211=y +CONFIG_CFG80211=m CONFIG_UEVENT_HELPER=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y # CONFIG_ALLOW_DEV_COREDUMP is not set +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y CONFIG_NETDEVICES=y CONFIG_STMMAC_ETH=y CONFIG_SMSC_PHY=y -CONFIG_BRCMFMAC=y +CONFIG_BRCMFMAC=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -52,16 +55,25 @@ CONFIG_SERIAL_8250_NR_UARTS=3 CONFIG_SERIAL_8250_RUNTIME_UARTS=3 CONFIG_SERIAL_8250_INGENIC=y CONFIG_SERIAL_OF_PLATFORM=y -# CONFIG_HW_RANDOM is not set +CONFIG_SERIAL_SC16IS7XX=y +# CONFIG_SERIAL_SC16IS7XX_I2C is not set +CONFIG_SERIAL_SC16IS7XX_SPI=y CONFIG_I2C=y CONFIG_I2C_JZ4780=y +CONFIG_SPI=y +CONFIG_SPI_GPIO=y CONFIG_GPIO_SYSFS=y -CONFIG_SENSORS_ADS7828=y +CONFIG_SENSORS_ADS7828=m CONFIG_WATCHDOG=y CONFIG_JZ4740_WDT=y # CONFIG_VGA_CONSOLE is not set # CONFIG_HID is not set -# CONFIG_USB_SUPPORT is not set +CONFIG_USB=y +CONFIG_USB_STORAGE=y +CONFIG_USB_DWC2=y +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CH341=y +CONFIG_USB_GADGET=y CONFIG_MMC=y CONFIG_MMC_JZ4740=y CONFIG_NEW_LEDS=y @@ -72,16 +84,22 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_JZ4740=y CONFIG_DMADEVICES=y CONFIG_DMA_JZ4780=y +# CONFIG_INGENIC_TIMER is not set +CONFIG_INGENIC_SYSOST=y # CONFIG_IOMMU_SUPPORT is not set +CONFIG_JZ4770_PHY=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set CONFIG_AUTOFS_FS=y +CONFIG_VFAT_FS=y +CONFIG_FAT_DEFAULT_UTF8=y CONFIG_PROC_KCORE=y # CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_NFS_FS=y CONFIG_NLS=y +CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_936=y CONFIG_NLS_CODEPAGE_950=y CONFIG_NLS_ASCII=y diff --git a/arch/mips/configs/cu1830-neo_defconfig b/arch/mips/configs/cu1830-neo_defconfig index e7064851a47a..29decd0003c6 100644 --- a/arch/mips/configs/cu1830-neo_defconfig +++ b/arch/mips/configs/cu1830-neo_defconfig @@ -25,6 +25,7 @@ CONFIG_HIGHMEM=y CONFIG_HZ_100=y # CONFIG_SECCOMP is not set # CONFIG_SUSPEND is not set +CONFIG_MODULES=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_COMPACTION is not set CONFIG_CMA=y @@ -32,18 +33,20 @@ CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y -CONFIG_CFG80211=y +CONFIG_CFG80211=m CONFIG_UEVENT_HELPER=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y # CONFIG_ALLOW_DEV_COREDUMP is not set +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_MD=m +CONFIG_BLK_DEV_DM=m CONFIG_NETDEVICES=y CONFIG_STMMAC_ETH=y CONFIG_ICPLUS_PHY=y -CONFIG_BRCMFMAC=y +CONFIG_BRCMFMAC=m # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set @@ -55,16 +58,25 @@ CONFIG_SERIAL_8250_NR_UARTS=2 CONFIG_SERIAL_8250_RUNTIME_UARTS=2 CONFIG_SERIAL_8250_INGENIC=y CONFIG_SERIAL_OF_PLATFORM=y -# CONFIG_HW_RANDOM is not set +CONFIG_SERIAL_SC16IS7XX=y +# CONFIG_SERIAL_SC16IS7XX_I2C is not set +CONFIG_SERIAL_SC16IS7XX_SPI=y CONFIG_I2C=y CONFIG_I2C_JZ4780=y +CONFIG_SPI=y +CONFIG_SPI_GPIO=y CONFIG_GPIO_SYSFS=y -CONFIG_SENSORS_ADS7828=y +CONFIG_SENSORS_ADS7828=m CONFIG_WATCHDOG=y CONFIG_JZ4740_WDT=y # CONFIG_VGA_CONSOLE is not set # CONFIG_HID is not set -# CONFIG_USB_SUPPORT is not set +CONFIG_USB=y +CONFIG_USB_STORAGE=y +CONFIG_USB_DWC2=y +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CH341=y +CONFIG_USB_GADGET=y CONFIG_MMC=y CONFIG_MMC_JZ4740=y CONFIG_NEW_LEDS=y @@ -75,16 +87,22 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_JZ4740=y CONFIG_DMADEVICES=y CONFIG_DMA_JZ4780=y +# CONFIG_INGENIC_TIMER is not set +CONFIG_INGENIC_SYSOST=y # CONFIG_IOMMU_SUPPORT is not set +CONFIG_JZ4770_PHY=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set CONFIG_AUTOFS_FS=y +CONFIG_VFAT_FS=y +CONFIG_FAT_DEFAULT_UTF8=y CONFIG_PROC_KCORE=y # CONFIG_PROC_PAGE_MONITOR is not set CONFIG_TMPFS=y CONFIG_CONFIGFS_FS=y CONFIG_NFS_FS=y CONFIG_NLS=y +CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_936=y CONFIG_NLS_CODEPAGE_950=y CONFIG_NLS_ASCII=y From 29906e1aac11bf9907e26608216dc7970e73a70e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:33 +0200 Subject: [PATCH 134/395] mips: bmips: select ARCH_HAS_RESET_CONTROLLER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to add reset controllers support. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ddaff19a9580..0f638bfaf63a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -249,6 +249,7 @@ config ATH79 config BMIPS_GENERIC bool "Broadcom Generic BMIPS kernel" + select ARCH_HAS_RESET_CONTROLLER select ARCH_HAS_SYNC_DMA_FOR_CPU_ALL select ARCH_HAS_PHYS_TO_DMA select BOOT_RAW From 10c1e714a68b45b124157aa02d80abe244a2a61a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:34 +0200 Subject: [PATCH 135/395] dt-bindings: reset: add BCM6345 reset controller bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add device tree binding documentation for BCM6345 reset controller. Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- .../bindings/reset/brcm,bcm6345-reset.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 Documentation/devicetree/bindings/reset/brcm,bcm6345-reset.yaml diff --git a/Documentation/devicetree/bindings/reset/brcm,bcm6345-reset.yaml b/Documentation/devicetree/bindings/reset/brcm,bcm6345-reset.yaml new file mode 100644 index 000000000000..560cf6522cb8 --- /dev/null +++ b/Documentation/devicetree/bindings/reset/brcm,bcm6345-reset.yaml @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/reset/brcm,bcm6345-reset.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: BCM6345 reset controller + +description: This document describes the BCM6345 reset controller. + +maintainers: + - Álvaro Fernández Rojas + +properties: + compatible: + const: brcm,bcm6345-reset + + reg: + maxItems: 1 + + "#reset-cells": + const: 1 + +required: + - compatible + - reg + - "#reset-cells" + +additionalProperties: false + +examples: + - | + reset-controller@10000010 { + compatible = "brcm,bcm6345-reset"; + reg = <0x10000010 0x4>; + #reset-cells = <1>; + }; From aac025437f14c1647dc6054b95daeebed34f6971 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:35 +0200 Subject: [PATCH 136/395] reset: add BCM6345 reset controller driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for resetting blocks through the Linux reset controller subsystem for BCM63xx SoCs. Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Florian Fainelli Reviewed-by: Philipp Zabel Signed-off-by: Thomas Bogendoerfer --- drivers/reset/Kconfig | 7 ++ drivers/reset/Makefile | 1 + drivers/reset/reset-bcm6345.c | 135 ++++++++++++++++++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 drivers/reset/reset-bcm6345.c diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index 07d162b179fc..dceec715e745 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -35,6 +35,13 @@ config RESET_AXS10X help This enables the reset controller driver for AXS10x. +config RESET_BCM6345 + bool "BCM6345 Reset Controller" + depends on BMIPS_GENERIC || COMPILE_TEST + default BMIPS_GENERIC + help + This enables the reset controller driver for BCM6345 SoCs. + config RESET_BERLIN bool "Berlin Reset Driver" if COMPILE_TEST default ARCH_BERLIN diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile index 16947610cc3b..1054123fd187 100644 --- a/drivers/reset/Makefile +++ b/drivers/reset/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_ARCH_TEGRA) += tegra/ obj-$(CONFIG_RESET_A10SR) += reset-a10sr.o obj-$(CONFIG_RESET_ATH79) += reset-ath79.o obj-$(CONFIG_RESET_AXS10X) += reset-axs10x.o +obj-$(CONFIG_RESET_BCM6345) += reset-bcm6345.o obj-$(CONFIG_RESET_BERLIN) += reset-berlin.o obj-$(CONFIG_RESET_BRCMSTB) += reset-brcmstb.o obj-$(CONFIG_RESET_BRCMSTB_RESCAL) += reset-brcmstb-rescal.o diff --git a/drivers/reset/reset-bcm6345.c b/drivers/reset/reset-bcm6345.c new file mode 100644 index 000000000000..737e4e81f6b7 --- /dev/null +++ b/drivers/reset/reset-bcm6345.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BCM6345 Reset Controller Driver + * + * Copyright (C) 2020 Álvaro Fernández Rojas + */ + +#include +#include +#include +#include +#include +#include + +#define BCM6345_RESET_NUM 32 +#define BCM6345_RESET_SLEEP_MIN_US 10000 +#define BCM6345_RESET_SLEEP_MAX_US 20000 + +struct bcm6345_reset { + struct reset_controller_dev rcdev; + void __iomem *base; + spinlock_t lock; +}; + +static inline struct bcm6345_reset * +to_bcm6345_reset(struct reset_controller_dev *rcdev) +{ + return container_of(rcdev, struct bcm6345_reset, rcdev); +} + +static int bcm6345_reset_update(struct reset_controller_dev *rcdev, + unsigned long id, bool assert) +{ + struct bcm6345_reset *bcm6345_reset = to_bcm6345_reset(rcdev); + unsigned long flags; + uint32_t val; + + spin_lock_irqsave(&bcm6345_reset->lock, flags); + val = __raw_readl(bcm6345_reset->base); + if (assert) + val &= ~BIT(id); + else + val |= BIT(id); + __raw_writel(val, bcm6345_reset->base); + spin_unlock_irqrestore(&bcm6345_reset->lock, flags); + + return 0; +} + +static int bcm6345_reset_assert(struct reset_controller_dev *rcdev, + unsigned long id) +{ + return bcm6345_reset_update(rcdev, id, true); +} + +static int bcm6345_reset_deassert(struct reset_controller_dev *rcdev, + unsigned long id) +{ + return bcm6345_reset_update(rcdev, id, false); +} + +static int bcm6345_reset_reset(struct reset_controller_dev *rcdev, + unsigned long id) +{ + bcm6345_reset_update(rcdev, id, true); + usleep_range(BCM6345_RESET_SLEEP_MIN_US, + BCM6345_RESET_SLEEP_MAX_US); + + bcm6345_reset_update(rcdev, id, false); + /* + * Ensure component is taken out reset state by sleeping also after + * deasserting the reset. Otherwise, the component may not be ready + * for operation. + */ + usleep_range(BCM6345_RESET_SLEEP_MIN_US, + BCM6345_RESET_SLEEP_MAX_US); + + return 0; +} + +static int bcm6345_reset_status(struct reset_controller_dev *rcdev, + unsigned long id) +{ + struct bcm6345_reset *bcm6345_reset = to_bcm6345_reset(rcdev); + + return !(__raw_readl(bcm6345_reset->base) & BIT(id)); +} + +static struct reset_control_ops bcm6345_reset_ops = { + .assert = bcm6345_reset_assert, + .deassert = bcm6345_reset_deassert, + .reset = bcm6345_reset_reset, + .status = bcm6345_reset_status, +}; + +static int bcm6345_reset_probe(struct platform_device *pdev) +{ + struct bcm6345_reset *bcm6345_reset; + + bcm6345_reset = devm_kzalloc(&pdev->dev, + sizeof(*bcm6345_reset), GFP_KERNEL); + if (!bcm6345_reset) + return -ENOMEM; + + platform_set_drvdata(pdev, bcm6345_reset); + + bcm6345_reset->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(bcm6345_reset->base)) + return PTR_ERR(bcm6345_reset->base); + + spin_lock_init(&bcm6345_reset->lock); + bcm6345_reset->rcdev.ops = &bcm6345_reset_ops; + bcm6345_reset->rcdev.owner = THIS_MODULE; + bcm6345_reset->rcdev.of_node = pdev->dev.of_node; + bcm6345_reset->rcdev.of_reset_n_cells = 1; + bcm6345_reset->rcdev.nr_resets = BCM6345_RESET_NUM; + + return devm_reset_controller_register(&pdev->dev, + &bcm6345_reset->rcdev); +} + +static const struct of_device_id bcm6345_reset_of_match[] = { + { .compatible = "brcm,bcm6345-reset" }, + { /* sentinel */ }, +}; + +static struct platform_driver bcm6345_reset_driver = { + .probe = bcm6345_reset_probe, + .driver = { + .name = "bcm6345-reset", + .of_match_table = bcm6345_reset_of_match, + .suppress_bind_attrs = true, + }, +}; +builtin_platform_driver(bcm6345_reset_driver); From 83f865d7e32e40b4903b1f83537c63fc5cdf1eb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:36 +0200 Subject: [PATCH 137/395] mips: bmips: dts: add BCM6328 reset controller support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM6328 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/brcm/bcm6328.dtsi | 6 ++++++ include/dt-bindings/reset/bcm6328-reset.h | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 include/dt-bindings/reset/bcm6328-reset.h diff --git a/arch/mips/boot/dts/brcm/bcm6328.dtsi b/arch/mips/boot/dts/brcm/bcm6328.dtsi index 1f9edd710392..9dc558763c46 100644 --- a/arch/mips/boot/dts/brcm/bcm6328.dtsi +++ b/arch/mips/boot/dts/brcm/bcm6328.dtsi @@ -57,6 +57,12 @@ clkctl: clock-controller@10000004 { #clock-cells = <1>; }; + periph_rst: reset-controller@10000010 { + compatible = "brcm,bcm6345-reset"; + reg = <0x10000010 0x4>; + #reset-cells = <1>; + }; + periph_intc: interrupt-controller@10000020 { compatible = "brcm,bcm6345-l1-intc"; reg = <0x10000020 0x10>, diff --git a/include/dt-bindings/reset/bcm6328-reset.h b/include/dt-bindings/reset/bcm6328-reset.h new file mode 100644 index 000000000000..0f3df87d47af --- /dev/null +++ b/include/dt-bindings/reset/bcm6328-reset.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM6328_H +#define __DT_BINDINGS_RESET_BCM6328_H + +#define BCM6328_RST_SPI 0 +#define BCM6328_RST_EPHY 1 +#define BCM6328_RST_SAR 2 +#define BCM6328_RST_ENETSW 3 +#define BCM6328_RST_USBS 4 +#define BCM6328_RST_USBH 5 +#define BCM6328_RST_PCM 6 +#define BCM6328_RST_PCIE_CORE 7 +#define BCM6328_RST_PCIE 8 +#define BCM6328_RST_PCIE_EXT 9 +#define BCM6328_RST_PCIE_HARD 10 + +#endif /* __DT_BINDINGS_RESET_BCM6328_H */ From 8079cfba4c7b8cae900c27104b4512fa5ed1f021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:37 +0200 Subject: [PATCH 138/395] mips: bmips: dts: add BCM6358 reset controller support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM6358 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/brcm/bcm6358.dtsi | 6 ++++++ include/dt-bindings/reset/bcm6358-reset.h | 15 +++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 include/dt-bindings/reset/bcm6358-reset.h diff --git a/arch/mips/boot/dts/brcm/bcm6358.dtsi b/arch/mips/boot/dts/brcm/bcm6358.dtsi index f21176cac038..9d93e7f5e6fc 100644 --- a/arch/mips/boot/dts/brcm/bcm6358.dtsi +++ b/arch/mips/boot/dts/brcm/bcm6358.dtsi @@ -82,6 +82,12 @@ periph_intc: interrupt-controller@fffe000c { interrupts = <2>, <3>; }; + periph_rst: reset-controller@fffe0034 { + compatible = "brcm,bcm6345-reset"; + reg = <0xfffe0034 0x4>; + #reset-cells = <1>; + }; + leds0: led-controller@fffe00d0 { #address-cells = <1>; #size-cells = <0>; diff --git a/include/dt-bindings/reset/bcm6358-reset.h b/include/dt-bindings/reset/bcm6358-reset.h new file mode 100644 index 000000000000..bda62ef84f5a --- /dev/null +++ b/include/dt-bindings/reset/bcm6358-reset.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM6358_H +#define __DT_BINDINGS_RESET_BCM6358_H + +#define BCM6358_RST_SPI 0 +#define BCM6358_RST_ENET 2 +#define BCM6358_RST_MPI 3 +#define BCM6358_RST_EPHY 6 +#define BCM6358_RST_SAR 7 +#define BCM6358_RST_USBH 12 +#define BCM6358_RST_PCM 13 +#define BCM6358_RST_ADSL 14 + +#endif /* __DT_BINDINGS_RESET_BCM6358_H */ From 226383600be58dcf2e070e4ac8a371640024fe54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:38 +0200 Subject: [PATCH 139/395] mips: bmips: dts: add BCM6362 reset controller support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM6362 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/brcm/bcm6362.dtsi | 6 ++++++ include/dt-bindings/reset/bcm6362-reset.h | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 include/dt-bindings/reset/bcm6362-reset.h diff --git a/arch/mips/boot/dts/brcm/bcm6362.dtsi b/arch/mips/boot/dts/brcm/bcm6362.dtsi index c98f9111e3c8..eb10341b75ba 100644 --- a/arch/mips/boot/dts/brcm/bcm6362.dtsi +++ b/arch/mips/boot/dts/brcm/bcm6362.dtsi @@ -70,6 +70,12 @@ reboot: syscon-reboot@10000008 { mask = <0x1>; }; + periph_rst: reset-controller@10000010 { + compatible = "brcm,bcm6345-reset"; + reg = <0x10000010 0x4>; + #reset-cells = <1>; + }; + periph_intc: interrupt-controller@10000020 { compatible = "brcm,bcm6345-l1-intc"; reg = <0x10000020 0x10>, diff --git a/include/dt-bindings/reset/bcm6362-reset.h b/include/dt-bindings/reset/bcm6362-reset.h new file mode 100644 index 000000000000..7ebb0546e0ab --- /dev/null +++ b/include/dt-bindings/reset/bcm6362-reset.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM6362_H +#define __DT_BINDINGS_RESET_BCM6362_H + +#define BCM6362_RST_SPI 0 +#define BCM6362_RST_IPSEC 1 +#define BCM6362_RST_EPHY 2 +#define BCM6362_RST_SAR 3 +#define BCM6362_RST_ENETSW 4 +#define BCM6362_RST_USBD 5 +#define BCM6362_RST_USBH 6 +#define BCM6362_RST_PCM 7 +#define BCM6362_RST_PCIE_CORE 8 +#define BCM6362_RST_PCIE 9 +#define BCM6362_RST_PCIE_EXT 10 +#define BCM6362_RST_WLAN_SHIM 11 +#define BCM6362_RST_DDR_PHY 12 +#define BCM6362_RST_FAP 13 +#define BCM6362_RST_WLAN_UBUS 14 + +#endif /* __DT_BINDINGS_RESET_BCM6362_H */ From 7acf84e87857721d66a1ba800c2c50669089f43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:39 +0200 Subject: [PATCH 140/395] mips: bmips: dts: add BCM6368 reset controller support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM6368 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/brcm/bcm6368.dtsi | 6 ++++++ include/dt-bindings/reset/bcm6368-reset.h | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 include/dt-bindings/reset/bcm6368-reset.h diff --git a/arch/mips/boot/dts/brcm/bcm6368.dtsi b/arch/mips/boot/dts/brcm/bcm6368.dtsi index 449c167dd892..52c19f40b9cc 100644 --- a/arch/mips/boot/dts/brcm/bcm6368.dtsi +++ b/arch/mips/boot/dts/brcm/bcm6368.dtsi @@ -70,6 +70,12 @@ reboot: syscon-reboot@10000008 { mask = <0x1>; }; + periph_rst: reset-controller@10000010 { + compatible = "brcm,bcm6345-reset"; + reg = <0x10000010 0x4>; + #reset-cells = <1>; + }; + periph_intc: interrupt-controller@10000020 { compatible = "brcm,bcm6345-l1-intc"; reg = <0x10000020 0x10>, diff --git a/include/dt-bindings/reset/bcm6368-reset.h b/include/dt-bindings/reset/bcm6368-reset.h new file mode 100644 index 000000000000..c81d8eb6d173 --- /dev/null +++ b/include/dt-bindings/reset/bcm6368-reset.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM6368_H +#define __DT_BINDINGS_RESET_BCM6368_H + +#define BCM6368_RST_SPI 0 +#define BCM6368_RST_MPI 3 +#define BCM6368_RST_IPSEC 4 +#define BCM6368_RST_EPHY 6 +#define BCM6368_RST_SAR 7 +#define BCM6368_RST_SWITCH 10 +#define BCM6368_RST_USBD 11 +#define BCM6368_RST_USBH 12 +#define BCM6368_RST_PCM 13 + +#endif /* __DT_BINDINGS_RESET_BCM6368_H */ From b7aa228813bdf014d6ad173ca3abfced30f1ed37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:40 +0200 Subject: [PATCH 141/395] mips: bmips: dts: add BCM63268 reset controller support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM63268 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/brcm/bcm63268.dtsi | 6 +++++ include/dt-bindings/reset/bcm63268-reset.h | 26 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 include/dt-bindings/reset/bcm63268-reset.h diff --git a/arch/mips/boot/dts/brcm/bcm63268.dtsi b/arch/mips/boot/dts/brcm/bcm63268.dtsi index 5acb49b61867..e0021ff9f144 100644 --- a/arch/mips/boot/dts/brcm/bcm63268.dtsi +++ b/arch/mips/boot/dts/brcm/bcm63268.dtsi @@ -70,6 +70,12 @@ reboot: syscon-reboot@10000008 { mask = <0x1>; }; + periph_rst: reset-controller@10000010 { + compatible = "brcm,bcm6345-reset"; + reg = <0x10000010 0x4>; + #reset-cells = <1>; + }; + periph_intc: interrupt-controller@10000020 { compatible = "brcm,bcm6345-l1-intc"; reg = <0x10000020 0x20>, diff --git a/include/dt-bindings/reset/bcm63268-reset.h b/include/dt-bindings/reset/bcm63268-reset.h new file mode 100644 index 000000000000..6a6403a4c2d5 --- /dev/null +++ b/include/dt-bindings/reset/bcm63268-reset.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM63268_H +#define __DT_BINDINGS_RESET_BCM63268_H + +#define BCM63268_RST_SPI 0 +#define BCM63268_RST_IPSEC 1 +#define BCM63268_RST_EPHY 2 +#define BCM63268_RST_SAR 3 +#define BCM63268_RST_ENETSW 4 +#define BCM63268_RST_USBS 5 +#define BCM63268_RST_USBH 6 +#define BCM63268_RST_PCM 7 +#define BCM63268_RST_PCIE_CORE 8 +#define BCM63268_RST_PCIE 9 +#define BCM63268_RST_PCIE_EXT 10 +#define BCM63268_RST_WLAN_SHIM 11 +#define BCM63268_RST_DDR_PHY 12 +#define BCM63268_RST_FAP0 13 +#define BCM63268_RST_WLAN_UBUS 14 +#define BCM63268_RST_DECT 15 +#define BCM63268_RST_FAP1 16 +#define BCM63268_RST_PCIE_HARD 17 +#define BCM63268_RST_GPHY 18 + +#endif /* __DT_BINDINGS_RESET_BCM63268_H */ From 8c9e8b0a28225c46f2cca0a09a3a111bb043e874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= Date: Wed, 17 Jun 2020 12:50:41 +0200 Subject: [PATCH 142/395] mips: bmips: add BCM6318 reset controller definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM6318 SoCs have a reset controller for certain components. Signed-off-by: Álvaro Fernández Rojas Acked-by: Florian Fainelli Reviewed-by: Rob Herring Signed-off-by: Thomas Bogendoerfer --- include/dt-bindings/reset/bcm6318-reset.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/dt-bindings/reset/bcm6318-reset.h diff --git a/include/dt-bindings/reset/bcm6318-reset.h b/include/dt-bindings/reset/bcm6318-reset.h new file mode 100644 index 000000000000..f882662505ea --- /dev/null +++ b/include/dt-bindings/reset/bcm6318-reset.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __DT_BINDINGS_RESET_BCM6318_H +#define __DT_BINDINGS_RESET_BCM6318_H + +#define BCM6318_RST_SPI 0 +#define BCM6318_RST_EPHY 1 +#define BCM6318_RST_SAR 2 +#define BCM6318_RST_ENETSW 3 +#define BCM6318_RST_USBD 4 +#define BCM6318_RST_USBH 5 +#define BCM6318_RST_PCIE_CORE 6 +#define BCM6318_RST_PCIE 7 +#define BCM6318_RST_PCIE_EXT 8 +#define BCM6318_RST_PCIE_HARD 9 +#define BCM6318_RST_ADSL 10 +#define BCM6318_RST_PHYMIPS 11 +#define BCM6318_RST_HOSTMIPS 12 + +#endif /* __DT_BINDINGS_RESET_BCM6318_H */ From 724d554a117a0552c2c982f0b5cd1d685274d678 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 13 Nov 2020 12:09:49 +0100 Subject: [PATCH 143/395] MIPS: vdso: Use vma page protection for remapping MIPS protection bits are setup during runtime so using defines like PAGE_READONLY ignores these runtime changes. To fix this we simply use the page protection of the setup vma. Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/vdso.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 242dc5e83847..7d0b91ad2581 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -161,7 +161,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) gic_pfn = virt_to_phys(mips_gic_base + MIPS_GIC_USER_OFS) >> PAGE_SHIFT; ret = io_remap_pfn_range(vma, base, gic_pfn, gic_size, - pgprot_noncached(PAGE_READONLY)); + pgprot_noncached(vma->vm_page_prot)); if (ret) goto out; } @@ -169,7 +169,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) /* Map data page. */ ret = remap_pfn_range(vma, data_addr, virt_to_phys(vdso_data) >> PAGE_SHIFT, - PAGE_SIZE, PAGE_READONLY); + PAGE_SIZE, vma->vm_page_prot); if (ret) goto out; From 411406a8c758d9ad6f908fab3a6cf1d3d89e1d08 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 13 Nov 2020 12:09:50 +0100 Subject: [PATCH 144/395] MIPS: kvm: Use vm_get_page_prot to get protection bits MIPS protection bits are setup during runtime so using defines like PAGE_SHARED ignores this runtime changes. Using vm_get_page_prot to get correct page protection fixes this. Signed-off-by: Thomas Bogendoerfer --- arch/mips/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 28c366d307e7..3dabeda82458 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -1074,6 +1074,7 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, { kvm_pfn_t pfn; pte_t *ptep; + pgprot_t prot; ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr); if (!ptep) { @@ -1083,7 +1084,8 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); /* Also set valid and dirty, so refill handler doesn't have to */ - *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED))); + prot = vm_get_page_prot(VM_READ|VM_WRITE|VM_SHARED); + *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, prot))); /* Invalidate this entry in the TLB, guest kernel ASID only */ kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); From ed2adb74217a4054a92e0a0746e31ec6f5e466c8 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 13 Nov 2020 12:09:51 +0100 Subject: [PATCH 145/395] MIPS: mm: shorten lines by using macro Introduce helper macro to make lines shorter. Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/cache.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 3e81ba000096..f66a8bfc030e 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -155,26 +155,28 @@ void __update_cache(unsigned long address, pte_t pte) unsigned long _page_cachable_default; EXPORT_SYMBOL(_page_cachable_default); +#define PM(p) __pgprot(_page_cachable_default | (p)) + static inline void setup_protection_map(void) { if (cpu_has_rixi) { - protection_map[0] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[1] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[2] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[3] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[5] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[6] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[7] = __pgprot(_page_cachable_default | _PAGE_PRESENT); + protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[4] = PM(_PAGE_PRESENT); + protection_map[5] = PM(_PAGE_PRESENT); + protection_map[6] = PM(_PAGE_PRESENT); + protection_map[7] = PM(_PAGE_PRESENT); - protection_map[8] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[9] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[10] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); - protection_map[11] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[13] = __pgprot(_page_cachable_default | _PAGE_PRESENT); - protection_map[14] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_WRITE); - protection_map[15] = __pgprot(_page_cachable_default | _PAGE_PRESENT | _PAGE_WRITE); + protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); + protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); + protection_map[12] = PM(_PAGE_PRESENT); + protection_map[13] = PM(_PAGE_PRESENT); + protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); + protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); } else { protection_map[0] = PAGE_NONE; @@ -196,6 +198,8 @@ static inline void setup_protection_map(void) } } +#undef PM + void cpu_cache_init(void) { if (cpu_has_3k_cache) { From 0df162e1377a585ced8adb932f7d6e4164e91ccf Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 13 Nov 2020 12:09:52 +0100 Subject: [PATCH 146/395] MIPS: mm: Clean up setup of protection map Protection map difference between RIXI and non RIXI cpus is _PAGE_NO_EXEC and _PAGE_NO_READ usage. Both already take care of cpu_has_rixi while setting up the page bits. So we just need one setup of protection map and can drop the now unused (and broken for RIXI) PAGE_* defines. Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/pgtable.h | 8 ----- arch/mips/mm/cache.c | 53 +++++++++++---------------------- 2 files changed, 17 insertions(+), 44 deletions(-) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index e5ef0fdd4838..158ba3aa3bf1 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -25,14 +25,6 @@ struct mm_struct; struct vm_area_struct; -#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_NO_READ | \ - _page_cachable_default) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_WRITE | \ - _page_cachable_default) -#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_NO_EXEC | \ - _page_cachable_default) -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | \ - _page_cachable_default) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _page_cachable_default) #define PAGE_KERNEL_NC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index f66a8bfc030e..36bcf4e955e8 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -159,43 +159,24 @@ EXPORT_SYMBOL(_page_cachable_default); static inline void setup_protection_map(void) { - if (cpu_has_rixi) { - protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[4] = PM(_PAGE_PRESENT); - protection_map[5] = PM(_PAGE_PRESENT); - protection_map[6] = PM(_PAGE_PRESENT); - protection_map[7] = PM(_PAGE_PRESENT); + protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[1] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[2] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[3] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[4] = PM(_PAGE_PRESENT); + protection_map[5] = PM(_PAGE_PRESENT); + protection_map[6] = PM(_PAGE_PRESENT); + protection_map[7] = PM(_PAGE_PRESENT); - protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); - protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); - protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | _PAGE_NO_READ); - protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); - protection_map[12] = PM(_PAGE_PRESENT); - protection_map[13] = PM(_PAGE_PRESENT); - protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); - protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); - - } else { - protection_map[0] = PAGE_NONE; - protection_map[1] = PAGE_READONLY; - protection_map[2] = PAGE_COPY; - protection_map[3] = PAGE_COPY; - protection_map[4] = PAGE_READONLY; - protection_map[5] = PAGE_READONLY; - protection_map[6] = PAGE_COPY; - protection_map[7] = PAGE_COPY; - protection_map[8] = PAGE_NONE; - protection_map[9] = PAGE_READONLY; - protection_map[10] = PAGE_SHARED; - protection_map[11] = PAGE_SHARED; - protection_map[12] = PAGE_READONLY; - protection_map[13] = PAGE_READONLY; - protection_map[14] = PAGE_SHARED; - protection_map[15] = PAGE_SHARED; - } + protection_map[8] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); + protection_map[9] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC); + protection_map[10] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE | + _PAGE_NO_READ); + protection_map[11] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE); + protection_map[12] = PM(_PAGE_PRESENT); + protection_map[13] = PM(_PAGE_PRESENT); + protection_map[14] = PM(_PAGE_PRESENT | _PAGE_WRITE); + protection_map[15] = PM(_PAGE_PRESENT | _PAGE_WRITE); } #undef PM From 6ce91ba8589ab08143939f9d6a58993e36773e75 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Nov 2020 15:53:00 +0800 Subject: [PATCH 147/395] MIPS: Remove cpu_has_6k_cache and cpu_has_8k_cache in cpu_cache_init() Since commit 02cf2119684e ("Cleanup the mess in cpu_cache_init."), cpu_has_6k_cache and cpu_has_8k_cache have no user, r6k_cache_init() and r8k_cache_init() are not defined for over 15 years, just remove them. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/cpu-features.h | 2 -- arch/mips/mm/cache.c | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index f2e216eef7da..336e02b3b3ce 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -115,8 +115,6 @@ #ifndef cpu_has_3k_cache #define cpu_has_3k_cache __isa_lt_and_opt(1, MIPS_CPU_3K_CACHE) #endif -#define cpu_has_6k_cache 0 -#define cpu_has_8k_cache 0 #ifndef cpu_has_4k_cache #define cpu_has_4k_cache __isa_ge_or_opt(1, MIPS_CPU_4K_CACHE) #endif diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 36bcf4e955e8..23b16bfd97b2 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -188,21 +188,11 @@ void cpu_cache_init(void) r3k_cache_init(); } - if (cpu_has_6k_cache) { - extern void __weak r6k_cache_init(void); - - r6k_cache_init(); - } if (cpu_has_4k_cache) { extern void __weak r4k_cache_init(void); r4k_cache_init(); } - if (cpu_has_8k_cache) { - extern void __weak r8k_cache_init(void); - - r8k_cache_init(); - } if (cpu_has_tx39_cache) { extern void __weak tx39_cache_init(void); From 91c7a7e0656de077911332f2acdb60f6fd4a134f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 19 Nov 2020 15:53:01 +0800 Subject: [PATCH 148/395] MIPS: Loongson64: Fix wrong scache size when execute lscpu As the user manual and code comment said, Loongson-3 has 4-scache banks, while Loongson-2K has only 2 banks, so we should multiply the number of scache banks, this multiply operation should be done by c->scache.sets instead of scache_size, otherwise we will get the wrong scache size when execute lscpu. For example, the scache size should be 8192K instead of 2048K on the Loongson 3A3000 and 3A4000 platform, we can see the related info in the following boot message: [loongson@linux ~]$ dmesg | grep "Unified secondary cache" [ 0.000000] Unified secondary cache 8192kB 16-way, linesize 64 bytes. [ 4.061909] Unified secondary cache 8192kB 16-way, linesize 64 bytes. [ 4.125629] Unified secondary cache 8192kB 16-way, linesize 64 bytes. [ 4.188379] Unified secondary cache 8192kB 16-way, linesize 64 bytes. E.g. without this patch: [loongson@linux ~]$ cat /sys/devices/system/cpu/cpu*/cache/index2/size 2048K 2048K 2048K 2048K [loongson@linux ~]$ lscpu | grep "L2 cache" L2 cache: 2048K With this patch: [loongson@linux ~]$ cat /sys/devices/system/cpu/cpu*/cache/index2/size 8192K 8192K 8192K 8192K [loongson@linux ~]$ lscpu | grep "L2 cache" L2 cache: 8192K Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/c-r4k.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 9cede7ce37e6..99521764c75b 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1623,15 +1623,13 @@ static void __init loongson3_sc_init(void) c->scache.sets = 64 << ((config2 >> 8) & 15); c->scache.ways = 1 + (config2 & 15); - scache_size = c->scache.sets * - c->scache.ways * - c->scache.linesz; - /* Loongson-3 has 4-Scache banks, while Loongson-2K have only 2 banks */ if ((c->processor_id & PRID_IMP_MASK) == PRID_IMP_LOONGSON_64R) - scache_size *= 2; + c->scache.sets *= 2; else - scache_size *= 4; + c->scache.sets *= 4; + + scache_size = c->scache.sets * c->scache.ways * c->scache.linesz; c->scache.waybit = 0; c->scache.waysize = scache_size / c->scache.ways; From dea87d0889dd663bd32e86824a0b35cd617ae1d0 Mon Sep 17 00:00:00 2001 From: Lakshmi Ramasubramanian Date: Thu, 12 Nov 2020 12:39:59 -0800 Subject: [PATCH 149/395] ima: select ima-buf template for buffer measurement The default IMA template used for all policy rules is the value set for CONFIG_IMA_DEFAULT_TEMPLATE if the policy rule does not specify a template. The default IMA template for buffer measurements should be 'ima-buf' - so that the measured buffer is correctly included in the IMA measurement log entry. With the default template format, buffer measurements are added to the measurement list, but do not include the buffer data, making it difficult, if not impossible, to validate. Including 'ima-buf' template records in the measurement list by default, should not impact existing attestation servers without 'ima-buf' template support. Initialize a global 'ima-buf' template and select that template, by default, for buffer measurements. Signed-off-by: Lakshmi Ramasubramanian Signed-off-by: Mimi Zohar --- security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_main.c | 24 +++++++++--------------- security/integrity/ima/ima_policy.c | 2 +- security/integrity/ima/ima_template.c | 26 ++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 16 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 6ebefec616e4..8e8b1e3cb847 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -156,6 +156,7 @@ int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields); struct ima_template_desc *ima_template_desc_current(void); +struct ima_template_desc *ima_template_desc_buf(void); struct ima_template_desc *lookup_template_desc(const char *name); bool ima_template_has_modsig(const struct ima_template_desc *ima_template); int ima_restore_measurement_entry(struct ima_template_entry *entry); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index a962b23e0429..68956e884403 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -413,7 +413,7 @@ int ima_file_mmap(struct file *file, unsigned long prot) */ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) { - struct ima_template_desc *template; + struct ima_template_desc *template = NULL; struct file *file = vma->vm_file; char filename[NAME_MAX]; char *pathbuf = NULL; @@ -802,7 +802,7 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size, .filename = eventname, .buf = buf, .buf_len = size}; - struct ima_template_desc *template = NULL; + struct ima_template_desc *template; struct { struct ima_digest_data hdr; char digest[IMA_MAX_DIGEST_SIZE]; @@ -814,6 +814,13 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size, if (!ima_policy_flag) return; + template = ima_template_desc_buf(); + if (!template) { + ret = -EINVAL; + audit_cause = "ima_template_desc_buf"; + goto out; + } + /* * Both LSM hooks and auxilary based buffer measurements are * based on policy. To avoid code duplication, differentiate @@ -832,19 +839,6 @@ void process_buffer_measurement(struct inode *inode, const void *buf, int size, if (!pcr) pcr = CONFIG_IMA_MEASURE_PCR_IDX; - if (!template) { - template = lookup_template_desc("ima-buf"); - ret = template_desc_init_fields(template->fmt, - &(template->fields), - &(template->num_fields)); - if (ret < 0) { - pr_err("template %s init failed, result: %d\n", - (strlen(template->name) ? - template->name : template->fmt), ret); - return; - } - } - iint.ima_hash = &hash.hdr; iint.ima_hash->algo = ima_hash_algo; iint.ima_hash->length = hash_digest_size[ima_hash_algo]; diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 9b5adeaa47fc..823a0c1379cb 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -628,7 +628,7 @@ int ima_match_policy(struct inode *inode, const struct cred *cred, u32 secid, struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); - if (template_desc) + if (template_desc && !*template_desc) *template_desc = ima_template_desc_current(); rcu_read_lock(); diff --git a/security/integrity/ima/ima_template.c b/security/integrity/ima/ima_template.c index 1e89e2d3851f..e22e510ae92d 100644 --- a/security/integrity/ima/ima_template.c +++ b/security/integrity/ima/ima_template.c @@ -55,6 +55,7 @@ static const struct ima_template_field supported_fields[] = { #define MAX_TEMPLATE_NAME_LEN sizeof("d-ng|n-ng|sig|buf|d-modisg|modsig") static struct ima_template_desc *ima_template; +static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. @@ -252,11 +253,36 @@ struct ima_template_desc *ima_template_desc_current(void) return ima_template; } +struct ima_template_desc *ima_template_desc_buf(void) +{ + if (!ima_buf_template) { + ima_init_template_list(); + ima_buf_template = lookup_template_desc("ima-buf"); + } + return ima_buf_template; +} + int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; + result = template_desc_init_fields(template->fmt, + &(template->fields), + &(template->num_fields)); + if (result < 0) { + pr_err("template %s init failed, result: %d\n", + (strlen(template->name) ? + template->name : template->fmt), result); + return result; + } + + template = ima_template_desc_buf(); + if (!template) { + pr_err("Failed to get ima-buf template\n"); + return -EINVAL; + } + result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); From f9d480b6ffbeb336bf7f6ce44825c00f61b3abae Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Sun, 11 Oct 2020 10:47:42 -0500 Subject: [PATCH 150/395] seccomp/cache: Lookup syscall allowlist bitmap for fast path The overhead of running Seccomp filters has been part of some past discussions [1][2][3]. Oftentimes, the filters have a large number of instructions that check syscall numbers one by one and jump based on that. Some users chain BPF filters which further enlarge the overhead. A recent work [6] comprehensively measures the Seccomp overhead and shows that the overhead is non-negligible and has a non-trivial impact on application performance. We observed some common filters, such as docker's [4] or systemd's [5], will make most decisions based only on the syscall numbers, and as past discussions considered, a bitmap where each bit represents a syscall makes most sense for these filters. The fast (common) path for seccomp should be that the filter permits the syscall to pass through, and failing seccomp is expected to be an exceptional case; it is not expected for userspace to call a denylisted syscall over and over. When it can be concluded that an allow must occur for the given architecture and syscall pair (this determination is introduced in the next commit), seccomp will immediately allow the syscall, bypassing further BPF execution. Each architecture number has its own bitmap. The architecture number in seccomp_data is checked against the defined architecture number constant before proceeding to test the bit against the bitmap with the syscall number as the index of the bit in the bitmap, and if the bit is set, seccomp returns allow. The bitmaps are all clear in this patch and will be initialized in the next commit. When only one architecture exists, the check against architecture number is skipped, suggested by Kees Cook [7]. [1] https://lore.kernel.org/linux-security-module/c22a6c3cefc2412cad00ae14c1371711@huawei.com/T/ [2] https://lore.kernel.org/lkml/202005181120.971232B7B@keescook/T/ [3] https://github.com/seccomp/libseccomp/issues/116 [4] https://github.com/moby/moby/blob/ae0ef82b90356ac613f329a8ef5ee42ca923417d/profiles/seccomp/default.json [5] https://github.com/systemd/systemd/blob/6743a1caf4037f03dc51a1277855018e4ab61957/src/shared/seccomp-util.c#L270 [6] Draco: Architectural and Operating System Support for System Call Security https://tianyin.github.io/pub/draco.pdf, MICRO-53, Oct. 2020 [7] https://lore.kernel.org/bpf/202010091614.8BB0EB64@keescook/ Co-developed-by: Dimitrios Skarlatos Signed-off-by: Dimitrios Skarlatos Signed-off-by: YiFei Zhu Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/10f91a367ec4fcdea7fc3f086de3f5f13a4a7436.1602431034.git.yifeifz2@illinois.edu --- kernel/seccomp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..fe35f4f38949 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,6 +143,34 @@ struct notification { struct list_head notifications; }; +#ifdef SECCOMP_ARCH_NATIVE +/** + * struct action_cache - per-filter cache of seccomp actions per + * arch/syscall pair + * + * @allow_native: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * native architecture. + * @allow_compat: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * compat architecture. + */ +struct action_cache { + DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); +#ifdef SECCOMP_ARCH_COMPAT + DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); +#endif +}; +#else +struct action_cache { }; + +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + return false; +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * struct seccomp_filter - container for seccomp BPF programs * @@ -298,6 +326,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) return 0; } +#ifdef SECCOMP_ARCH_NATIVE +static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, + size_t bitmap_size, + int syscall_nr) +{ + if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) + return false; + syscall_nr = array_index_nospec(syscall_nr, bitmap_size); + + return test_bit(syscall_nr, bitmap); +} + +/** + * seccomp_cache_check_allow - lookup seccomp cache + * @sfilter: The seccomp filter + * @sd: The seccomp data to lookup the cache with + * + * Returns true if the seccomp_data is cached and allowed. + */ +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + int syscall_nr = sd->nr; + const struct action_cache *cache = &sfilter->cache; + +#ifndef SECCOMP_ARCH_COMPAT + /* A native-only architecture doesn't need to check sd->arch. */ + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); +#else + if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); + if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) + return seccomp_cache_check_allow_bitmap(cache->allow_compat, + SECCOMP_ARCH_COMPAT_NR, + syscall_nr); +#endif /* SECCOMP_ARCH_COMPAT */ + + WARN_ON_ONCE(true); + return false; +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters @@ -320,6 +394,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; + if (seccomp_cache_check_allow(f, sd)) + return SECCOMP_RET_ALLOW; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). From 8e01b51a31a1e08e2c3e8fcc0ef6790441be2f61 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Sun, 11 Oct 2020 10:47:43 -0500 Subject: [PATCH 151/395] seccomp/cache: Add "emulator" to check if filter is constant allow SECCOMP_CACHE will only operate on syscalls that do not access any syscall arguments or instruction pointer. To facilitate this we need a static analyser to know whether a filter will return allow regardless of syscall arguments for a given architecture number / syscall number pair. This is implemented here with a pseudo-emulator, and stored in a per-filter bitmap. In order to build this bitmap at filter attach time, each filter is emulated for every syscall (under each possible architecture), and checked for any accesses of struct seccomp_data that are not the "arch" nor "nr" (syscall) members. If only "arch" and "nr" are examined, and the program returns allow, then we can be sure that the filter must return allow independent from syscall arguments. Nearly all seccomp filters are built from these cBPF instructions: BPF_LD | BPF_W | BPF_ABS BPF_JMP | BPF_JEQ | BPF_K BPF_JMP | BPF_JGE | BPF_K BPF_JMP | BPF_JGT | BPF_K BPF_JMP | BPF_JSET | BPF_K BPF_JMP | BPF_JA BPF_RET | BPF_K BPF_ALU | BPF_AND | BPF_K Each of these instructions are emulated. Any weirdness or loading from a syscall argument will cause the emulator to bail. The emulation is also halted if it reaches a return. In that case, if it returns an SECCOMP_RET_ALLOW, the syscall is marked as good. Emulator structure and comments are from Kees [1] and Jann [2]. Emulation is done at attach time. If a filter depends on more filters, and if the dependee does not guarantee to allow the syscall, then we skip the emulation of this syscall. [1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@chromium.org/ [2] https://lore.kernel.org/lkml/CAG48ez1p=dR_2ikKq=xVxkoGg0fYpTBpkhJSv1w-6BG=76PAvw@mail.gmail.com/ Suggested-by: Jann Horn Signed-off-by: YiFei Zhu Reviewed-by: Jann Horn Co-developed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/71c7be2db5ee08905f41c3be5c1ad6e2601ce88f.1602431034.git.yifeifz2@illinois.edu --- kernel/seccomp.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fe35f4f38949..d8cf468dbe1e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -169,6 +169,10 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte { return false; } + +static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ +} #endif /* SECCOMP_ARCH_NATIVE */ /** @@ -187,6 +191,7 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. + * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -208,6 +213,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; @@ -621,7 +627,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) + true; +#else + false; +#endif if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -687,6 +698,148 @@ seccomp_prepare_user_filter(const char __user *user_filter) return filter; } +#ifdef SECCOMP_ARCH_NATIVE +/** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs + * @sd: The seccomp data to check against, only syscall number and arch + * number are considered constant. + */ +static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + struct seccomp_data *sd) +{ + unsigned int reg_value = 0; + unsigned int pc; + bool op_res; + + if (WARN_ON_ONCE(!fprog)) + return false; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; + u16 code = insn->code; + u32 k = insn->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + switch (k) { + case offsetof(struct seccomp_data, nr): + reg_value = sd->nr; + break; + case offsetof(struct seccomp_data, arch): + reg_value = sd->arch; + break; + default: + /* can't optimize (non-constant value load) */ + return false; + } + break; + case BPF_RET | BPF_K: + /* reached return with constant values only, check allow */ + return k == SECCOMP_RET_ALLOW; + case BPF_JMP | BPF_JA: + pc += insn->k; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + switch (BPF_OP(code)) { + case BPF_JEQ: + op_res = reg_value == k; + break; + case BPF_JGE: + op_res = reg_value >= k; + break; + case BPF_JGT: + op_res = reg_value > k; + break; + case BPF_JSET: + op_res = !!(reg_value & k); + break; + default: + /* can't optimize (unknown jump) */ + return false; + } + + pc += op_res ? insn->jt : insn->jf; + break; + case BPF_ALU | BPF_AND | BPF_K: + reg_value &= k; + break; + default: + /* can't optimize (unknown insn) */ + return false; + } + } + + /* ran off the end of the filter?! */ + WARN_ON(1); + return false; +} + +static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, + void *bitmap, const void *bitmap_prev, + size_t bitmap_size, int arch) +{ + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; + struct seccomp_data sd; + int nr; + + if (bitmap_prev) { + /* The new filter must be as restrictive as the last. */ + bitmap_copy(bitmap, bitmap_prev, bitmap_size); + } else { + /* Before any filters, all syscalls are always allowed. */ + bitmap_fill(bitmap, bitmap_size); + } + + for (nr = 0; nr < bitmap_size; nr++) { + /* No bitmap change: not a cacheable action. */ + if (!test_bit(nr, bitmap)) + continue; + + sd.nr = nr; + sd.arch = arch; + + /* No bitmap change: continue to always allow. */ + if (seccomp_is_const_allow(fprog, &sd)) + continue; + + /* + * Not a cacheable action: always run filters. + * atomic clear_bit() not needed, filter not visible yet. + */ + __clear_bit(nr, bitmap); + } +} + +/** + * seccomp_cache_prepare - emulate the filter to find cachable syscalls + * @sfilter: The seccomp filter + * + * Returns 0 if successful or -errno if error occurred. + */ +static void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + struct action_cache *cache = &sfilter->cache; + const struct action_cache *cache_prev = + sfilter->prev ? &sfilter->prev->cache : NULL; + + seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, + cache_prev ? cache_prev->allow_native : NULL, + SECCOMP_ARCH_NATIVE_NR, + SECCOMP_ARCH_NATIVE); + +#ifdef SECCOMP_ARCH_COMPAT + seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, + cache_prev ? cache_prev->allow_compat : NULL, + SECCOMP_ARCH_COMPAT_NR, + SECCOMP_ARCH_COMPAT); +#endif /* SECCOMP_ARCH_COMPAT */ +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -736,6 +889,7 @@ static long seccomp_attach_filter(unsigned int flags, * task reference. */ filter->prev = current->seccomp.filter; + seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(¤t->seccomp.filter_count); From 25db91209a910a0ccf8b093743088d0f4bf5659f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 11 Oct 2020 10:47:44 -0500 Subject: [PATCH 152/395] x86: Enable seccomp architecture tracking Provide seccomp internals with the details to calculate which syscall table the running kernel is expecting to deal with. This allows for efficient architecture pinning and paves the way for constant-action bitmaps. Co-developed-by: YiFei Zhu Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/da58c3733d95c4f2115dd94225dfbe2573ba4d87.1602431034.git.yifeifz2@illinois.edu --- arch/x86/include/asm/seccomp.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 2bd1338de236..fef16e398161 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -16,6 +16,26 @@ #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn #endif +#ifdef CONFIG_X86_64 +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_X86_64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "x86_64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_I386 +# define SECCOMP_ARCH_COMPAT_NR IA32_NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ia32" +# endif +/* + * x32 will have __X32_SYSCALL_BIT set in syscall number. We don't support + * caching them and they are treated as out of range syscalls, which will + * always pass through the BPF filter. + */ +#else /* !CONFIG_X86_64 */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_I386 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ia32" +#endif + #include #endif /* _ASM_X86_SECCOMP_H */ From 192cf32243ce39af65bd095625aec374b38c03df Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 11 Oct 2020 10:47:45 -0500 Subject: [PATCH 153/395] selftests/seccomp: Compare bitmap vs filter overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the seccomp benchmarking, include the expectations with regard to the timing behavior of the constant action bitmaps, and report inconsistencies better. Example output with constant action bitmaps on x86: $ sudo ./seccomp_benchmark 100000000 Current BPF sysctl settings: net.core.bpf_jit_enable = 1 net.core.bpf_jit_harden = 0 Benchmarking 200000000 syscalls... 129.359381409 - 0.008724424 = 129350656985 (129.4s) getpid native: 646 ns 264.385890006 - 129.360453229 = 135025436777 (135.0s) getpid RET_ALLOW 1 filter (bitmap): 675 ns 399.400511893 - 264.387045901 = 135013465992 (135.0s) getpid RET_ALLOW 2 filters (bitmap): 675 ns 545.872866260 - 399.401718327 = 146471147933 (146.5s) getpid RET_ALLOW 3 filters (full): 732 ns 696.337101319 - 545.874097681 = 150463003638 (150.5s) getpid RET_ALLOW 4 filters (full): 752 ns Estimated total seccomp overhead for 1 bitmapped filter: 29 ns Estimated total seccomp overhead for 2 bitmapped filters: 29 ns Estimated total seccomp overhead for 3 full filters: 86 ns Estimated total seccomp overhead for 4 full filters: 106 ns Estimated seccomp entry overhead: 29 ns Estimated seccomp per-filter overhead (last 2 diff): 20 ns Estimated seccomp per-filter overhead (filters / 4): 19 ns Expectations: native ≤ 1 bitmap (646 ≤ 675): ✔️ native ≤ 1 filter (646 ≤ 732): ✔️ per-filter (last 2 diff) ≈ per-filter (filters / 4) (20 ≈ 19): ✔️ 1 bitmapped ≈ 2 bitmapped (29 ≈ 29): ✔️ entry ≈ 1 bitmapped (29 ≈ 29): ✔️ entry ≈ 2 bitmapped (29 ≈ 29): ✔️ native + entry + (per filter * 4) ≈ 4 filters total (755 ≈ 752): ✔️ [YiFei: Changed commit message to show stats for this patch series] Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/1b61df3db85c5f7f1b9202722c45e7b39df73ef2.1602431034.git.yifeifz2@illinois.edu --- .../selftests/seccomp/seccomp_benchmark.c | 147 +++++++++++++++--- tools/testing/selftests/seccomp/settings | 2 +- 2 files changed, 128 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 91f5a89cadac..fcc806585266 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -4,12 +4,16 @@ */ #define _GNU_SOURCE #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include #include @@ -70,18 +74,74 @@ unsigned long long calibrate(void) return samples * seconds; } +bool approx(int i_one, int i_two) +{ + double one = i_one, one_bump = one * 0.01; + double two = i_two, two_bump = two * 0.01; + + one_bump = one + MAX(one_bump, 2.0); + two_bump = two + MAX(two_bump, 2.0); + + /* Equal to, or within 1% or 2 digits */ + if (one == two || + (one > two && one <= two_bump) || + (two > one && two <= one_bump)) + return true; + return false; +} + +bool le(int i_one, int i_two) +{ + if (i_one <= i_two) + return true; + return false; +} + +long compare(const char *name_one, const char *name_eval, const char *name_two, + unsigned long long one, bool (*eval)(int, int), unsigned long long two) +{ + bool good; + + printf("\t%s %s %s (%lld %s %lld): ", name_one, name_eval, name_two, + (long long)one, name_eval, (long long)two); + if (one > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)one); + return 1; + } + if (two > INT_MAX) { + printf("Miscalculation! Measurement went negative: %lld\n", (long long)two); + return 1; + } + + good = eval(one, two); + printf("%s\n", good ? "✔️" : "❌"); + + return good ? 0 : 1; +} + int main(int argc, char *argv[]) { + struct sock_filter bitmap_filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog bitmap_prog = { + .len = (unsigned short)ARRAY_SIZE(bitmap_filter), + .filter = bitmap_filter, + }; struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, args[0])), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { .len = (unsigned short)ARRAY_SIZE(filter), .filter = filter, }; - long ret; - unsigned long long samples; - unsigned long long native, filter1, filter2; + + long ret, bits; + unsigned long long samples, calc; + unsigned long long native, filter1, filter2, bitmap1, bitmap2; + unsigned long long entry, per_filter1, per_filter2; printf("Current BPF sysctl settings:\n"); system("sysctl net.core.bpf_jit_enable"); @@ -101,35 +161,82 @@ int main(int argc, char *argv[]) ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); assert(ret == 0); - /* One filter */ + /* One filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); + + bitmap1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 1 filter (bitmap): %llu ns\n", bitmap1); + + /* Second filter resulting in a bitmap */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); + assert(ret == 0); + + bitmap2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 2 filters (bitmap): %llu ns\n", bitmap2); + + /* Third filter, can no longer be converted to bitmap */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); + printf("getpid RET_ALLOW 3 filters (full): %llu ns\n", filter1); - if (filter1 == native) - printf("No overhead measured!? Try running again with more samples.\n"); - - /* Two filters */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + /* Fourth filter, can not be converted to bitmap because of filter 3 */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bitmap_prog); assert(ret == 0); filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); + printf("getpid RET_ALLOW 4 filters (full): %llu ns\n", filter2); - /* Calculations */ - printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", - filter1 - native); + /* Estimations */ +#define ESTIMATE(fmt, var, what) do { \ + var = (what); \ + printf("Estimated " fmt ": %llu ns\n", var); \ + if (var > INT_MAX) \ + goto more_samples; \ + } while (0) - printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", - filter2 - native); + ESTIMATE("total seccomp overhead for 1 bitmapped filter", calc, + bitmap1 - native); + ESTIMATE("total seccomp overhead for 2 bitmapped filters", calc, + bitmap2 - native); + ESTIMATE("total seccomp overhead for 3 full filters", calc, + filter1 - native); + ESTIMATE("total seccomp overhead for 4 full filters", calc, + filter2 - native); + ESTIMATE("seccomp entry overhead", entry, + bitmap1 - native - (bitmap2 - bitmap1)); + ESTIMATE("seccomp per-filter overhead (last 2 diff)", per_filter1, + filter2 - filter1); + ESTIMATE("seccomp per-filter overhead (filters / 4)", per_filter2, + (filter2 - native - entry) / 4); - printf("Estimated seccomp per-filter overhead: %llu ns\n", - filter2 - filter1); + printf("Expectations:\n"); + ret |= compare("native", "≤", "1 bitmap", native, le, bitmap1); + bits = compare("native", "≤", "1 filter", native, le, filter1); + if (bits) + goto more_samples; - printf("Estimated seccomp entry overhead: %llu ns\n", - filter1 - native - (filter2 - filter1)); + ret |= compare("per-filter (last 2 diff)", "≈", "per-filter (filters / 4)", + per_filter1, approx, per_filter2); + bits = compare("1 bitmapped", "≈", "2 bitmapped", + bitmap1 - native, approx, bitmap2 - native); + if (bits) { + printf("Skipping constant action bitmap expectations: they appear unsupported.\n"); + goto out; + } + + ret |= compare("entry", "≈", "1 bitmapped", entry, approx, bitmap1 - native); + ret |= compare("entry", "≈", "2 bitmapped", entry, approx, bitmap2 - native); + ret |= compare("native + entry + (per filter * 4)", "≈", "4 filters total", + entry + (per_filter1 * 4) + native, approx, filter2); + if (ret == 0) + goto out; + +more_samples: + printf("Saw unexpected benchmark result. Try running again with more samples?\n"); +out: return 0; } diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings index ba4d85f74cd6..6091b45d226b 100644 --- a/tools/testing/selftests/seccomp/settings +++ b/tools/testing/selftests/seccomp/settings @@ -1 +1 @@ -timeout=90 +timeout=120 From ffde703470b03b1000017ed35c4f90a90caa22cf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 27 Oct 2020 12:23:19 -0700 Subject: [PATCH 154/395] arm64: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for arm64. Signed-off-by: Kees Cook --- arch/arm64/include/asm/seccomp.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/seccomp.h b/arch/arm64/include/asm/seccomp.h index c36387170936..30256233788b 100644 --- a/arch/arm64/include/asm/seccomp.h +++ b/arch/arm64/include/asm/seccomp.h @@ -19,4 +19,13 @@ #include +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_AARCH64 +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "aarch64" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_ARM +# define SECCOMP_ARCH_COMPAT_NR __NR_compat_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "arm" +#endif + #endif /* _ASM_SECCOMP_H */ From 424c9102fa7b2a5c15afe47fd14278c849f4eefb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 27 Oct 2020 12:26:58 -0700 Subject: [PATCH 155/395] arm: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for arm. Signed-off-by: Kees Cook --- arch/arm/include/asm/Kbuild | 1 - arch/arm/include/asm/seccomp.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 arch/arm/include/asm/seccomp.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 383635b68763..4a0848aef207 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -4,7 +4,6 @@ generic-y += extable.h generic-y += flat.h generic-y += local64.h generic-y += parport.h -generic-y += seccomp.h generated-y += mach-types.h generated-y += unistd-nr.h diff --git a/arch/arm/include/asm/seccomp.h b/arch/arm/include/asm/seccomp.h new file mode 100644 index 000000000000..e9ad0f37d2ba --- /dev/null +++ b/arch/arm/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_ARM +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "arm" + +#endif /* _ASM_SECCOMP_H */ From 6e9ae6f98809e0d123ff4d769ba2e6f652119138 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:47 -0600 Subject: [PATCH 156/395] csky: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for csky. Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/f9219026d4803b22f3e57e3768b4e42e004ef236.1605101222.git.yifeifz2@illinois.edu --- arch/csky/include/asm/Kbuild | 1 - arch/csky/include/asm/seccomp.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 arch/csky/include/asm/seccomp.h diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 64876e59e2ef..93372255984d 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -4,6 +4,5 @@ generic-y += gpio.h generic-y += kvm_para.h generic-y += local64.h generic-y += qrwlock.h -generic-y += seccomp.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/csky/include/asm/seccomp.h b/arch/csky/include/asm/seccomp.h new file mode 100644 index 000000000000..d33e758126fb --- /dev/null +++ b/arch/csky/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_CSKY +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "csky" + +#endif /* _ASM_SECCOMP_H */ From 6aa7923c8737d1f8fd2a06154155d68dec646464 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:48 -0600 Subject: [PATCH 157/395] parisc: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for parisc. Signed-off-by: YiFei Zhu Acked-by: Helge Deller Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/9bb86c546eda753adf5270425e7353202dbce87c.1605101222.git.yifeifz2@illinois.edu --- arch/parisc/include/asm/Kbuild | 1 - arch/parisc/include/asm/seccomp.h | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 arch/parisc/include/asm/seccomp.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index e3ee5c0bfe80..f16c4db80116 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -5,5 +5,4 @@ generated-y += syscall_table_c32.h generic-y += kvm_para.h generic-y += local64.h generic-y += mcs_spinlock.h -generic-y += seccomp.h generic-y += user.h diff --git a/arch/parisc/include/asm/seccomp.h b/arch/parisc/include/asm/seccomp.h new file mode 100644 index 000000000000..b058b2220322 --- /dev/null +++ b/arch/parisc/include/asm/seccomp.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include + +#ifdef CONFIG_64BIT +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "parisc64" +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_PARISC +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "parisc" +# endif +#else /* !CONFIG_64BIT */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_PARISC +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "parisc" +#endif + +#endif /* _ASM_SECCOMP_H */ From e7bcb4622ddf4473da6c03fa8423919a568c57dc Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:49 -0600 Subject: [PATCH 158/395] powerpc: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for powerpc. __LITTLE_ENDIAN__ is used here instead of CONFIG_CPU_LITTLE_ENDIAN to keep it consistent with asm/syscall.h. Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/0b64925362671cdaa26d01bfe50b3ba5e164adfd.1605101222.git.yifeifz2@illinois.edu --- arch/powerpc/include/asm/seccomp.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h index 51209f6071c5..ac2033f134f0 100644 --- a/arch/powerpc/include/asm/seccomp.h +++ b/arch/powerpc/include/asm/seccomp.h @@ -8,4 +8,27 @@ #include +#ifdef __LITTLE_ENDIAN__ +#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE +#define __SECCOMP_ARCH_LE_NAME "le" +#else +#define __SECCOMP_ARCH_LE 0 +#define __SECCOMP_ARCH_LE_NAME +#endif + +#ifdef CONFIG_PPC64 +# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC64 | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ppc64" __SECCOMP_ARCH_LE_NAME +# ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "ppc" __SECCOMP_ARCH_LE_NAME +# endif +#else /* !CONFIG_PPC64 */ +# define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_PPC | __SECCOMP_ARCH_LE) +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "ppc" __SECCOMP_ARCH_LE_NAME +#endif + #endif /* _ASM_POWERPC_SECCOMP_H */ From 673a11a7e4152b101bad6851c4e4c34c7c6d6dde Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:50 -0600 Subject: [PATCH 159/395] riscv: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for riscv. Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/58ef925d00505cbb77478fa6bd2b48ab2d902460.1605101222.git.yifeifz2@illinois.edu --- arch/riscv/include/asm/seccomp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h index bf7744ee3b3d..c7ee6a3507be 100644 --- a/arch/riscv/include/asm/seccomp.h +++ b/arch/riscv/include/asm/seccomp.h @@ -7,4 +7,14 @@ #include +#ifdef CONFIG_64BIT +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV64 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "riscv64" +#else /* !CONFIG_64BIT */ +# define SECCOMP_ARCH_NATIVE AUDIT_ARCH_RISCV32 +# define SECCOMP_ARCH_NATIVE_NR NR_syscalls +# define SECCOMP_ARCH_NATIVE_NAME "riscv32" +#endif + #endif /* _ASM_SECCOMP_H */ From c09058eda2654c37fd7ac28c2004c3aae8b988e9 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:51 -0600 Subject: [PATCH 160/395] s390: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for s390. Signed-off-by: YiFei Zhu Acked-by: Heiko Carstens Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/a381b10aa2c5b1e583642f3cd46ced842d9d4ce5.1605101222.git.yifeifz2@illinois.edu --- arch/s390/include/asm/seccomp.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h index 795bbe0d7ca6..71d46f0ba97b 100644 --- a/arch/s390/include/asm/seccomp.h +++ b/arch/s390/include/asm/seccomp.h @@ -16,4 +16,13 @@ #include +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_S390X +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "s390x" +#ifdef CONFIG_COMPAT +# define SECCOMP_ARCH_COMPAT AUDIT_ARCH_S390 +# define SECCOMP_ARCH_COMPAT_NR NR_syscalls +# define SECCOMP_ARCH_COMPAT_NAME "s390" +#endif + #endif /* _ASM_S390_SECCOMP_H */ From 4c18bc054bffe415bec9e0edaa9ff1a84c1a6973 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:52 -0600 Subject: [PATCH 161/395] sh: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for sh. Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/61ae084cd4783b9b50860d9dedb4a348cf1b7b6f.1605101222.git.yifeifz2@illinois.edu --- arch/sh/include/asm/seccomp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/sh/include/asm/seccomp.h b/arch/sh/include/asm/seccomp.h index 54111e4d32b8..d4578395fd66 100644 --- a/arch/sh/include/asm/seccomp.h +++ b/arch/sh/include/asm/seccomp.h @@ -8,4 +8,14 @@ #define __NR_seccomp_exit __NR_exit #define __NR_seccomp_sigreturn __NR_rt_sigreturn +#ifdef CONFIG_CPU_LITTLE_ENDIAN +#define __SECCOMP_ARCH_LE __AUDIT_ARCH_LE +#else +#define __SECCOMP_ARCH_LE 0 +#endif + +#define SECCOMP_ARCH_NATIVE (AUDIT_ARCH_SH | __SECCOMP_ARCH_LE) +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "sh" + #endif /* __ASM_SECCOMP_H */ From 445247b02342a05b7d528bba6d85d2d418875b69 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:53 -0600 Subject: [PATCH 162/395] xtensa: Enable seccomp architecture tracking To enable seccomp constant action bitmaps, we need to have a static mapping to the audit architecture and system call table size. Add these for xtensa. Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/79669648ba167d668ea6ffb4884250abcd5ed254.1605101222.git.yifeifz2@illinois.edu --- arch/xtensa/include/asm/Kbuild | 1 - arch/xtensa/include/asm/seccomp.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 arch/xtensa/include/asm/seccomp.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index c59c42a1221a..9718e9593564 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -7,5 +7,4 @@ generic-y += mcs_spinlock.h generic-y += param.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += seccomp.h generic-y += user.h diff --git a/arch/xtensa/include/asm/seccomp.h b/arch/xtensa/include/asm/seccomp.h new file mode 100644 index 000000000000..f1cb6b0a9e1f --- /dev/null +++ b/arch/xtensa/include/asm/seccomp.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_SECCOMP_H +#define _ASM_SECCOMP_H + +#include + +#define SECCOMP_ARCH_NATIVE AUDIT_ARCH_XTENSA +#define SECCOMP_ARCH_NATIVE_NR NR_syscalls +#define SECCOMP_ARCH_NATIVE_NAME "xtensa" + +#endif /* _ASM_SECCOMP_H */ From 0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:54 -0600 Subject: [PATCH 163/395] seccomp/cache: Report cache data through /proc/pid/seccomp_cache Currently the kernel does not provide an infrastructure to translate architecture numbers to a human-readable name. Translating syscall numbers to syscall names is possible through FTRACE_SYSCALL infrastructure but it does not provide support for compat syscalls. This will create a file for each PID as /proc/pid/seccomp_cache. The file will be empty when no seccomp filters are loaded, or be in the format of: where ALLOW means the cache is guaranteed to allow the syscall, and filter means the cache will pass the syscall to the BPF filter. For the docker default profile on x86_64 it looks like: x86_64 0 ALLOW x86_64 1 ALLOW x86_64 2 ALLOW x86_64 3 ALLOW [...] x86_64 132 ALLOW x86_64 133 ALLOW x86_64 134 FILTER x86_64 135 FILTER x86_64 136 FILTER x86_64 137 ALLOW x86_64 138 ALLOW x86_64 139 FILTER x86_64 140 ALLOW x86_64 141 ALLOW [...] This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default of N because I think certain users of seccomp might not want the application to know which syscalls are definitely usable. For the same reason, it is also guarded by CAP_SYS_ADMIN. Suggested-by: Jann Horn Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/ Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/94e663fa53136f5a11f432c661794d1ee7060779.1605101222.git.yifeifz2@illinois.edu --- arch/Kconfig | 17 ++++++++++++ fs/proc/base.c | 6 +++++ include/linux/seccomp.h | 7 +++++ kernel/seccomp.c | 59 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..35c9463b7d10 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER - secure_computing return value is checked and a return value of -1 results in the system call being skipped immediately. - seccomp syscall wired up + - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE, + SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If + COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too. config SECCOMP prompt "Enable seccomp to safely execute untrusted bytecode" @@ -514,6 +517,20 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config SECCOMP_CACHE_DEBUG + bool "Show seccomp filter cache status in /proc/pid/seccomp_cache" + depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR + depends on PROC_FS + help + This enables the /proc/pid/seccomp_cache interface to monitor + seccomp cache data. The file format is subject to change. Reading + the file requires CAP_SYS_ADMIN. + + This option is for debugging only. Enabling presents the risk that + an adversary may be able to infer the seccomp filter logic. + + If unsure, say N. + config HAVE_ARCH_STACKLEAK bool help diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..8a7d682ba881 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..76963ec4641a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task, return -EINVAL; } #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +struct seq_file; + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +#endif #endif /* _LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8cf468dbe1e..76f524e320b1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -553,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; + /* We are effectively holding the siglock by not having any sighand. */ + WARN_ON(tsk->sighand != NULL); + /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; __seccomp_filter_release(orig); @@ -2335,3 +2338,59 @@ static int __init seccomp_sysctl_init(void) device_initcall(seccomp_sysctl_init) #endif /* CONFIG_SYSCTL */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ +static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, + const void *bitmap, size_t bitmap_size) +{ + int nr; + + for (nr = 0; nr < bitmap_size; nr++) { + bool cached = test_bit(nr, bitmap); + char *status = cached ? "ALLOW" : "FILTER"; + + seq_printf(m, "%s %d %s\n", name, nr, status); + } +} + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct seccomp_filter *f; + unsigned long flags; + + /* + * We don't want some sandboxed process to know what their seccomp + * filters consist of. + */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + + if (!lock_task_sighand(task, &flags)) + return -ESRCH; + + f = READ_ONCE(task->seccomp.filter); + if (!f) { + unlock_task_sighand(task, &flags); + return 0; + } + + /* prevent filter from being freed while we are printing it */ + __get_seccomp_filter(f); + unlock_task_sighand(task, &flags); + + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, + f->cache.allow_native, + SECCOMP_ARCH_NATIVE_NR); + +#ifdef SECCOMP_ARCH_COMPAT + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, + f->cache.allow_compat, + SECCOMP_ARCH_COMPAT_NR); +#endif /* SECCOMP_ARCH_COMPAT */ + + __put_seccomp_filter(f); + return 0; +} +#endif /* CONFIG_SECCOMP_CACHE_DEBUG */ From fab686eb0307121e7a2890b6d6c57edd2457863d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Nov 2020 18:05:45 +0100 Subject: [PATCH 164/395] seccomp: Remove bogus __user annotations Buffers that are passed to read_actions_logged() and write_actions_logged() are in kernel memory; the sysctl core takes care of copying from/to userspace. Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Reviewed-by: Tyler Hicks Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201120170545.1419332-1-jannh@google.com --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 76f524e320b1..0e0e369d2fcb 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2202,7 +2202,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int read_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { char names[sizeof(seccomp_actions_avail)]; @@ -2220,7 +2220,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, return proc_dostring(&table, 0, buffer, lenp, ppos); } -static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int write_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; From ce59fc69b1c2da555706f6b0e77fc099f80e9d0e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 2 Sep 2020 13:28:09 -0600 Subject: [PATCH 165/395] io_uring: allow SQPOLL with CAP_SYS_NICE privileges CAP_SYS_ADMIN is too restrictive for a lot of uses cases, allow CAP_SYS_NICE based on the premise that such users are already allowed to raise the priority of tasks. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a8c136a1cf4e..3cc1e59dd789 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7783,7 +7783,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_sq_data *sqd; ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) goto err; sqd = io_get_sq_data(p); From b713c195d59332277a31a59c91f755e53b5b302b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:13:35 -0600 Subject: [PATCH 166/395] net: provide __sys_shutdown_sock() that takes a socket No functional changes in this patch, needed to provide io_uring support for shutdown(2). Cc: netdev@vger.kernel.org Cc: David S. Miller Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/socket.h | 1 + net/socket.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index e9cb30d8cbfb..385894b4a8bb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -436,6 +436,7 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); +extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); extern struct ns_common *get_net_ns(struct ns_common *ns); diff --git a/net/socket.c b/net/socket.c index 6e6cccc2104f..4b615c719765 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2192,6 +2192,17 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, * Shutdown a socket. */ +int __sys_shutdown_sock(struct socket *sock, int how) +{ + int err; + + err = security_socket_shutdown(sock, how); + if (!err) + err = sock->ops->shutdown(sock, how); + + return err; +} + int __sys_shutdown(int fd, int how) { int err, fput_needed; @@ -2199,9 +2210,7 @@ int __sys_shutdown(int fd, int how) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { - err = security_socket_shutdown(sock, how); - if (!err) - err = sock->ops->shutdown(sock, how); + err = __sys_shutdown_sock(sock, how); fput_light(sock->file, fput_needed); } return err; From 36f4fa6886a81266d7c82b1c90a65205e73a7c85 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:14:22 -0600 Subject: [PATCH 167/395] io_uring: add support for shutdown(2) This adds support for the shutdown(2) system call, which is useful for dealing with sockets. shutdown(2) may block, so we have to punt it to async context. Suggested-by: Norman Maurer Signed-off-by: Jens Axboe --- fs/io_uring.c | 52 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 53 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3cc1e59dd789..d17198733f6a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,11 @@ struct io_statx { struct statx __user *buffer; }; +struct io_shutdown { + struct file *file; + int how; +}; + struct io_completion { struct file *file; struct list_head list; @@ -667,6 +672,7 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; + struct io_shutdown shutdown; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -934,6 +940,9 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + }, }; enum io_mem_account { @@ -3591,6 +3600,44 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, return ret; } +static int io_shutdown_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_NET) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index) + return -EINVAL; + + req->shutdown.how = READ_ONCE(sqe->len); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_shutdown(struct io_kiocb *req, bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (force_nonblock) + return -EAGAIN; + + sock = sock_from_file(req->file, &ret); + if (unlikely(!sock)) + return ret; + + ret = __sys_shutdown_sock(sock, req->shutdown.how); + io_req_complete(req, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5775,6 +5822,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_remove_buffers_prep(req, sqe); case IORING_OP_TEE: return io_tee_prep(req, sqe); + case IORING_OP_SHUTDOWN: + return io_shutdown_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6018,6 +6067,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_TEE: ret = io_tee(req, force_nonblock); break; + case IORING_OP_SHUTDOWN: + ret = io_shutdown(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 98d8e06dea22..e943bf07c959 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -132,6 +132,7 @@ enum { IORING_OP_PROVIDE_BUFFERS, IORING_OP_REMOVE_BUFFERS, IORING_OP_TEE, + IORING_OP_SHUTDOWN, /* this goes last, obviously */ IORING_OP_LAST, From b2d99bcb27225fe420a8923b21861aef2bb43d9b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:32:26 -0600 Subject: [PATCH 168/395] selinux: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Paul Moore --- security/selinux/hooks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ca5c5623b46d..943c2693cec7 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4036,6 +4036,7 @@ static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents) switch (id) { case LOADING_MODULE: rc = selinux_kernel_module_from_file(NULL); + break; default: break; } From 3df98d79215ace13d1e91ddfc5a67a0f5acbd83f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sun, 27 Sep 2020 22:38:26 -0400 Subject: [PATCH 169/395] lsm,selinux: pass flowi_common instead of flowi to the LSM hooks As pointed out by Herbert in a recent related patch, the LSM hooks do not have the necessary address family information to use the flowi struct safely. As none of the LSMs currently use any of the protocol specific flowi information, replace the flowi pointers with pointers to the address family independent flowi_common struct. Reported-by: Herbert Xu Acked-by: James Morris Signed-off-by: Paul Moore --- .../chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- drivers/net/wireguard/socket.c | 4 ++-- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/lsm_hooks.h | 2 +- include/linux/security.h | 23 +++++++++++-------- include/net/flow.h | 10 ++++++++ include/net/route.h | 6 ++--- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 6 ++--- net/ipv4/icmp.c | 4 ++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/syncookies.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/icmp.c | 6 ++--- net/ipv6/inet6_connection_sock.c | 4 ++-- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- net/ipv6/udp.c | 2 +- net/l2tp/l2tp_ip6.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/xfrm/xfrm_state.c | 6 +++-- security/security.c | 17 +++++++------- security/selinux/hooks.c | 4 ++-- security/selinux/include/xfrm.h | 2 +- security/selinux/xfrm.c | 13 ++++++----- 33 files changed, 85 insertions(+), 66 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index ec4f79049a06..42e4e43cdd91 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1148,7 +1148,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, fl6.daddr = ip6h->saddr; fl6.fl6_dport = inet_rsk(oreq)->ir_rmt_port; fl6.fl6_sport = htons(inet_rsk(oreq)->ir_num); - security_req_classify_flow(oreq, flowi6_to_flowi(&fl6)); + security_req_classify_flow(oreq, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(lsk), lsk, &fl6, NULL); if (IS_ERR(dst)) goto free_sk; diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index c33e2c81635f..410b318e57fb 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -49,7 +49,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, rt = dst_cache_get_ip4(cache, &fl.saddr); if (!rt) { - security_sk_classify_flow(sock, flowi4_to_flowi(&fl)); + security_sk_classify_flow(sock, flowi4_to_flowi_common(&fl)); if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; @@ -129,7 +129,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, dst = dst_cache_get_ip6(cache, &fl.saddr); if (!dst) { - security_sk_classify_flow(sock, flowi6_to_flowi(&fl)); + security_sk_classify_flow(sock, flowi6_to_flowi_common(&fl)); if (unlikely(!ipv6_addr_any(&fl.saddr) && !ipv6_chk_addr(sock_net(sock), &fl.saddr, NULL, 0))) { endpoint->src6 = fl.saddr = in6addr_any; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 32a940117e7a..f70984c8318b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -311,7 +311,7 @@ LSM_HOOK(int, 0, secmark_relabel_packet, u32 secid) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, - struct flowi *fl) + struct flowi_common *flic) LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) LSM_HOOK(int, 0, tun_dev_create, void) @@ -351,7 +351,7 @@ LSM_HOOK(int, 0, xfrm_state_delete_security, struct xfrm_state *x) LSM_HOOK(int, 0, xfrm_policy_lookup, struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) LSM_HOOK(int, 1, xfrm_state_pol_flow_match, struct xfrm_state *x, - struct xfrm_policy *xp, const struct flowi *fl) + struct xfrm_policy *xp, const struct flowi_common *flic) LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, int ckall) #endif /* CONFIG_SECURITY_NETWORK_XFRM */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index c503f7ab8afb..a19adef1f088 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1105,7 +1105,7 @@ * @xfrm_state_pol_flow_match: * @x contains the state to match. * @xp contains the policy to check for a match. - * @fl contains the flow to check for a match. + * @flic contains the flowi_common struct to check for a match. * Return 1 if there is a match. * @xfrm_decode_session: * @skb points to skb to decode. diff --git a/include/linux/security.h b/include/linux/security.h index bc2725491560..469273e8b46d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -167,7 +167,7 @@ struct sk_buff; struct sock; struct sockaddr; struct socket; -struct flowi; +struct flowi_common; struct dst_entry; struct xfrm_selector; struct xfrm_policy; @@ -1355,8 +1355,9 @@ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u int security_sk_alloc(struct sock *sk, int family, gfp_t priority); void security_sk_free(struct sock *sk); void security_sk_clone(const struct sock *sk, struct sock *newsk); -void security_sk_classify_flow(struct sock *sk, struct flowi *fl); -void security_req_classify_flow(const struct request_sock *req, struct flowi *fl); +void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic); +void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic); void security_sock_graft(struct sock*sk, struct socket *parent); int security_inet_conn_request(struct sock *sk, struct sk_buff *skb, struct request_sock *req); @@ -1507,11 +1508,13 @@ static inline void security_sk_clone(const struct sock *sk, struct sock *newsk) { } -static inline void security_sk_classify_flow(struct sock *sk, struct flowi *fl) +static inline void security_sk_classify_flow(struct sock *sk, + struct flowi_common *flic) { } -static inline void security_req_classify_flow(const struct request_sock *req, struct flowi *fl) +static inline void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic) { } @@ -1638,9 +1641,9 @@ void security_xfrm_state_free(struct xfrm_state *x); int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl); + const struct flowi_common *flic); int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid); -void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl); +void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic); #else /* CONFIG_SECURITY_NETWORK_XFRM */ @@ -1692,7 +1695,8 @@ static inline int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_s } static inline int security_xfrm_state_pol_flow_match(struct xfrm_state *x, - struct xfrm_policy *xp, const struct flowi *fl) + struct xfrm_policy *xp, + const struct flowi_common *flic) { return 1; } @@ -1702,7 +1706,8 @@ static inline int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) return 0; } -static inline void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl) +static inline void security_skb_classify_flow(struct sk_buff *skb, + struct flowi_common *flic) { } diff --git a/include/net/flow.h b/include/net/flow.h index b2531df3f65f..39d0cedcddee 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -195,11 +195,21 @@ static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) return container_of(fl4, struct flowi, u.ip4); } +static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) +{ + return &(flowi4_to_flowi(fl4)->u.__fl_common); +} + static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } +static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) +{ + return &(flowi6_to_flowi(fl6)->u.__fl_common); +} + static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) { return container_of(fldn, struct flowi, u.dn); diff --git a/include/net/route.h b/include/net/route.h index ff021cab657e..2e6c0e153e3a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -165,7 +165,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } @@ -322,7 +322,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, ip_rt_put(rt); flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); } - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } @@ -338,7 +338,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable flowi4_update_output(fl4, sk->sk_bound_dev_if, RT_CONN_FLAGS(sk), fl4->daddr, fl4->saddr); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } return rt; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bb3d70664dde..3d0b58d7bf3f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -464,7 +464,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .fl4_dport = dccp_hdr(skb)->dccph_sport, }; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ef4ab28cfde0..179dcf02a572 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -203,7 +203,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req fl6.flowi6_oif = ireq->ir_iif; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); @@ -279,7 +279,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) fl6.flowi6_oif = inet6_iif(rxskb); fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. */ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); @@ -907,7 +907,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 005faea415a4..396b492c804f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -447,7 +447,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; @@ -503,7 +503,7 @@ static struct rtable *icmp_route_lookup(struct net *net, route_lookup_dev = icmp_get_route_lookup_dev(skb_in); fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); - security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); + security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..1b7801371f7d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -602,7 +602,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; @@ -640,7 +640,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, htons(ireq->ir_num), sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 879b76ae4435..89fff5f59eea 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1700,7 +1700,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 248856b301c4..8b943f85fff9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -778,7 +778,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7d26e0f8bdae..50a73178d63a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -640,7 +640,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto done; } - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30..36b3b9b72b75 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -418,7 +418,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { reqsk_free(req); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..594632ff094f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1197,7 +1197,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr, saddr, dport, inet->inet_sport, sk->sk_uid); - security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e648fbebb167..b8e491d3acf0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -819,7 +819,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc8ad7ddecda..206f66310a88 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -60,7 +60,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) fl6->flowi6_oif = np->mcast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ec448b71bf9a..91d9136b6e9c 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -567,7 +567,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); @@ -749,7 +749,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); @@ -1002,7 +1002,7 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, fl6->fl6_icmp_type = type; fl6->fl6_icmp_code = 0; fl6->flowi6_oif = oif; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } static void __net_exit icmpv6_sk_exit(struct net *net) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index e315526fa244..5a9f4d722f35 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -46,7 +46,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); fl6->flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) @@ -95,7 +95,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; fl6->flowi6_uid = sk->sk_uid; - security_sk_classify_flow(sk, flowi6_to_flowi(fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 4aef6baaa55e..bf95513736c9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -179,7 +179,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); - security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 6caa062f68e7..6ac88fe24a8e 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -111,7 +111,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_uid = sk->sk_uid; fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); ipcm6_init_sk(&ipc6, np); ipc6.sockc.mark = sk->sk_mark; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 6e4ab80a3b94..1f56d9aae589 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -915,7 +915,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308..0351e2ffc707 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -233,7 +233,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; - security_req_classify_flow(req, flowi6_to_flowi(&fl6)); + security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8db59f4e5f13..e5f75bbcc66f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -278,7 +278,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { @@ -954,7 +954,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); - security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 29d9691359b9..724942afc035 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,7 +1496,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index e5e5036257b0..96f975777438 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -606,7 +606,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (ipc6.tclass < 0) ipc6.tclass = np->tclass; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 9cca35d22927..fb64e8515f7d 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -849,7 +849,7 @@ synproxy_send_tcp_ipv6(struct net *net, fl6.fl6_sport = nth->source; fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, - flowi6_to_flowi(&fl6)); + flowi6_to_flowi_common(&fl6)); err = nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (err) { goto free_nskb; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bbd4643d7e82..ac25b0c2d87e 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1021,7 +1021,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || - !security_xfrm_state_pol_flow_match(x, pol, fl)) + !security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) return; if (!*best || @@ -1036,7 +1037,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && - security_xfrm_state_pol_flow_match(x, pol, fl)) + security_xfrm_state_pol_flow_match(x, pol, + &fl->u.__fl_common)) *error = -ESRCH; } } diff --git a/security/security.c b/security/security.c index a28045dc9e7f..c08e0eec8b9e 100644 --- a/security/security.c +++ b/security/security.c @@ -2207,15 +2207,16 @@ void security_sk_clone(const struct sock *sk, struct sock *newsk) } EXPORT_SYMBOL(security_sk_clone); -void security_sk_classify_flow(struct sock *sk, struct flowi *fl) +void security_sk_classify_flow(struct sock *sk, struct flowi_common *flic) { - call_void_hook(sk_getsecid, sk, &fl->flowi_secid); + call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); -void security_req_classify_flow(const struct request_sock *req, struct flowi *fl) +void security_req_classify_flow(const struct request_sock *req, + struct flowi_common *flic) { - call_void_hook(req_classify_flow, req, fl); + call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); @@ -2407,7 +2408,7 @@ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl) + const struct flowi_common *flic) { struct security_hook_list *hp; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); @@ -2423,7 +2424,7 @@ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, */ hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match, list) { - rc = hp->hook.xfrm_state_pol_flow_match(x, xp, fl); + rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; @@ -2434,9 +2435,9 @@ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) return call_int_hook(xfrm_decode_session, 0, skb, secid, 1); } -void security_skb_classify_flow(struct sk_buff *skb, struct flowi *fl) +void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { - int rc = call_int_hook(xfrm_decode_session, 0, skb, &fl->flowi_secid, + int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid, 0); BUG_ON(rc); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 943c2693cec7..a515abdf115b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5437,9 +5437,9 @@ static void selinux_secmark_refcount_dec(void) } static void selinux_req_classify_flow(const struct request_sock *req, - struct flowi *fl) + struct flowi_common *flic) { - fl->flowi_secid = req->secid; + flic->flowic_secid = req->secid; } static int selinux_tun_dev_alloc_security(void **security) diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index a0b465316292..0a6f34a7a971 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -26,7 +26,7 @@ int selinux_xfrm_state_delete(struct xfrm_state *x); int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir); int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl); + const struct flowi_common *flic); #ifdef CONFIG_SECURITY_NETWORK_XFRM extern atomic_t selinux_xfrm_refcount; diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index 7314196185d1..c367d36965d4 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -175,9 +175,10 @@ int selinux_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid, u8 dir) */ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, - const struct flowi *fl) + const struct flowi_common *flic) { u32 state_sid; + u32 flic_sid; if (!xp->security) if (x->security) @@ -196,17 +197,17 @@ int selinux_xfrm_state_pol_flow_match(struct xfrm_state *x, return 0; state_sid = x->security->ctx_sid; + flic_sid = flic->flowic_secid; - if (fl->flowi_secid != state_sid) + if (flic_sid != state_sid) return 0; /* We don't need a separate SA Vs. policy polmatch check since the SA * is now of the same label as the flow and a flow Vs. policy polmatch * check had already happened in selinux_xfrm_policy_lookup() above. */ - return (avc_has_perm(&selinux_state, - fl->flowi_secid, state_sid, - SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, - NULL) ? 0 : 1); + return (avc_has_perm(&selinux_state, flic_sid, state_sid, + SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, + NULL) ? 0 : 1); } static u32 selinux_xfrm_skb_sid_egress(struct sk_buff *skb) From 206ad34d52a2f1205c84d08c12fc116aad0eb407 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Wed, 18 Nov 2020 16:33:10 +0100 Subject: [PATCH 170/395] hv_netvsc: Validate number of allocated sub-channels Lack of validation could lead to out-of-bound reads and information leaks (cf. usage of nvdev->chan_table[]). Check that the number of allocated sub-channels fits into the expected range. Suggested-by: Saruhan Karademir Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Haiyang Zhang Acked-by: Jakub Kicinski Cc: "David S. Miller" Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/r/20201118153310.112404-1-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/net/hyperv/rndis_filter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 6ae43319ece6..9a51fa3003a8 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1224,6 +1224,11 @@ int rndis_set_subchannel(struct net_device *ndev, return -EIO; } + /* Check that number of allocated sub channel is within the expected range */ + if (init_packet->msg.v5_msg.subchn_comp.num_subchannels > nvdev->num_chn - 1) { + netdev_err(ndev, "invalid number of allocated sub channel\n"); + return -EINVAL; + } nvdev->num_chn = 1 + init_packet->msg.v5_msg.subchn_comp.num_subchannels; From ba59eae723857257a791618092d8022ad82efaa4 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 6 Nov 2020 16:31:22 +0800 Subject: [PATCH 171/395] audit: fix macros warnings Some unused macros could cause gcc warning: kernel/audit.c:68:0: warning: macro "AUDIT_UNINITIALIZED" is not used [-Wunused-macros] kernel/auditsc.c:104:0: warning: macro "AUDIT_AUX_IPCPERM" is not used [-Wunused-macros] kernel/auditsc.c:82:0: warning: macro "AUDITSC_INVALID" is not used [-Wunused-macros] AUDIT_UNINITIALIZED and AUDITSC_INVALID are still meaningful and should be in incorporated. Just remove AUDIT_AUX_IPCPERM. Thanks comments from Richard Guy Briggs and Paul Moore. Signed-off-by: Alex Shi Cc: Paul Moore Cc: Richard Guy Briggs Cc: Eric Paris Cc: linux-audit@redhat.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/audit.c b/kernel/audit.c index ac0aeaa99937..e22f22bdc000 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -67,7 +67,7 @@ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 -static int audit_initialized; +static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 183d79cc2e12..9cbe6d5437be 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -102,8 +102,6 @@ struct audit_aux_data { int type; }; -#define AUDIT_AUX_IPCPERM 0 - /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 @@ -552,11 +550,11 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_EXIT: - if (ctx && ctx->return_valid) + if (ctx && ctx->return_valid != AUDITSC_INVALID) result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) { + if (ctx && ctx->return_valid != AUDITSC_INVALID) { if (f->val) result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else @@ -930,6 +928,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); context->fds[0] = -1; + context->return_valid = AUDITSC_INVALID; return context; } @@ -1488,7 +1487,7 @@ static void audit_log_exit(void) context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); - if (context->return_valid) + if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); @@ -1625,7 +1624,7 @@ void __audit_free(struct task_struct *tsk) * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy && context->in_syscall) { - context->return_valid = 0; + context->return_valid = AUDITSC_INVALID; context->return_code = 0; audit_filter_syscall(tsk, context, From 63653368c25ff0b1b1aaf045c97ea87bd8c16123 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Wed, 25 Nov 2020 14:58:17 +0800 Subject: [PATCH 172/395] block: remove unused BIO_SPLIT_ENTRIES Since commit 4b1faf931650 ("block: Kill bio_pair_split()"), there's no user of BIO_SPLIT_ENTRIES anymore. Signed-off-by: Jeffle Xu Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index c6d765382926..ecf67108f091 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -711,12 +711,6 @@ static inline bool bioset_initialized(struct bio_set *bs) return bs->bio_slab != NULL; } -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 - #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_for_each_vec(bvl, bip, iter) \ From 05cdf457477d6603b207d91873f0a3d4c7f8c1cd Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 26 Nov 2020 14:32:25 +0100 Subject: [PATCH 173/395] microblaze: Remove noMMU code This configuration is obsolete and likely none is really using it. That's why remove it to simplify code. Note about CONFIG_MMU in hw_exception_handler.S is left intentionally for better comment understanding. Cc: Mike Rapoport Cc: Arnd Bergmann Signed-off-by: Michal Simek Acked-by: Mike Rapoport Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/43486cab370e0c0a79860120b71e0caac75a7e44.1606397528.git.michal.simek@xilinx.com --- arch/microblaze/Kconfig | 53 +- arch/microblaze/Makefile | 11 +- arch/microblaze/configs/mmu_defconfig | 1 - arch/microblaze/configs/nommu_defconfig | 90 --- arch/microblaze/include/asm/dma.h | 6 - arch/microblaze/include/asm/exceptions.h | 5 - arch/microblaze/include/asm/io.h | 3 - arch/microblaze/include/asm/mmu.h | 4 - arch/microblaze/include/asm/mmu_context.h | 4 - arch/microblaze/include/asm/page.h | 59 -- arch/microblaze/include/asm/pgalloc.h | 4 - arch/microblaze/include/asm/pgtable.h | 43 -- arch/microblaze/include/asm/processor.h | 37 -- arch/microblaze/include/asm/registers.h | 2 - arch/microblaze/include/asm/setup.h | 2 - arch/microblaze/include/asm/tlbflush.h | 14 - arch/microblaze/include/asm/uaccess.h | 27 - arch/microblaze/kernel/Makefile | 4 +- arch/microblaze/kernel/asm-offsets.c | 2 - arch/microblaze/kernel/entry-nommu.S | 622 ------------------ arch/microblaze/kernel/exceptions.c | 5 - arch/microblaze/kernel/head.S | 12 - arch/microblaze/kernel/hw_exception_handler.S | 130 +--- arch/microblaze/kernel/microblaze_ksyms.c | 2 - arch/microblaze/kernel/process.c | 10 - arch/microblaze/kernel/setup.c | 2 - arch/microblaze/kernel/signal.c | 7 - arch/microblaze/kernel/unwind.c | 19 - arch/microblaze/mm/Makefile | 2 +- arch/microblaze/mm/consistent.c | 29 - arch/microblaze/mm/init.c | 49 -- arch/microblaze/pci/pci-common.c | 2 - 32 files changed, 11 insertions(+), 1251 deletions(-) delete mode 100644 arch/microblaze/configs/nommu_defconfig delete mode 100644 arch/microblaze/kernel/entry-nommu.S diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 33925ffed68f..32739ab2af25 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -3,19 +3,17 @@ config MICROBLAZE def_bool y select ARCH_32BIT_OFF_T select ARCH_NO_SWAP - select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_DMA_SET_UNCACHED if !MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT select TIMER_OF select CLONE_BACKWARDS3 select COMMON_CLK - select DMA_DIRECT_REMAP if MMU + select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES @@ -45,7 +43,7 @@ config MICROBLAZE select TRACING_SUPPORT select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS - select MMU_GATHER_NO_RANGE if MMU + select MMU_GATHER_NO_RANGE select SPARSE_IRQ select SET_FS @@ -96,8 +94,7 @@ menu "Processor type and features" source "kernel/Kconfig.hz" config MMU - bool "MMU support" - default n + def_bool y comment "Boot options" @@ -143,18 +140,8 @@ config ADVANCED_OPTIONS comment "Default settings for advanced configuration options are used" depends on !ADVANCED_OPTIONS -config XILINX_UNCACHED_SHADOW - bool "Are you using uncached shadow for RAM ?" - depends on ADVANCED_OPTIONS && !MMU - default n - help - This is needed to be able to allocate uncachable memory regions. - The feature requires the design to define the RAM memory controller - window to be twice as large as the actual physical memory. - config HIGHMEM bool "High memory support" - depends on MMU help The address space of Microblaze processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address @@ -167,7 +154,7 @@ config HIGHMEM config LOWMEM_SIZE_BOOL bool "Set maximum low memory" - depends on ADVANCED_OPTIONS && MMU + depends on ADVANCED_OPTIONS help This option allows you to set the maximum amount of memory which will be used as "low memory", that is, memory which the kernel can @@ -205,12 +192,11 @@ config KERNEL_START_BOOL config KERNEL_START hex "Virtual address of kernel base" if KERNEL_START_BOOL - default "0xc0000000" if MMU - default KERNEL_BASE_ADDR if !MMU + default "0xc0000000" config TASK_SIZE_BOOL bool "Set custom user task size" - depends on ADVANCED_OPTIONS && MMU + depends on ADVANCED_OPTIONS help This option allows you to set the amount of virtual address space allocated to user tasks. This can be useful in optimizing the @@ -222,33 +208,6 @@ config TASK_SIZE hex "Size of user task space" if TASK_SIZE_BOOL default "0x80000000" -choice - prompt "Page size" - default MICROBLAZE_4K_PAGES - depends on ADVANCED_OPTIONS && !MMU - help - Select the kernel logical page size. Increasing the page size - will reduce software overhead at each page boundary, allow - hardware prefetch mechanisms to be more effective, and allow - larger dma transfers increasing IO efficiency and reducing - overhead. However the utilization of memory will increase. - For example, each cached file will using a multiple of the - page size to hold its contents and the difference between the - end of file and the end of page is wasted. - - If unsure, choose 4K_PAGES. - -config MICROBLAZE_4K_PAGES - bool "4k page size" - -config MICROBLAZE_16K_PAGES - bool "16k page size" - -config MICROBLAZE_64K_PAGES - bool "64k page size" - -endchoice - endmenu menu "Bus Options" diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile index 7b340a35b194..bb980891816d 100644 --- a/arch/microblaze/Makefile +++ b/arch/microblaze/Makefile @@ -1,11 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 KBUILD_DEFCONFIG := mmu_defconfig -ifeq ($(CONFIG_MMU),y) UTS_SYSNAME = -DUTS_SYSNAME=\"Linux\" -else -UTS_SYSNAME = -DUTS_SYSNAME=\"uClinux\" -endif # What CPU vesion are we building for, and crack it open # as major.minor.rev @@ -67,12 +63,7 @@ DTB:=$(subst simpleImage.,,$(filter simpleImage.%, $(MAKECMDGOALS))) core-y += $(boot)/dts/ -# defines filename extension depending memory management type -ifeq ($(CONFIG_MMU),) -MMU := -nommu -endif - -export MMU DTB +export DTB all: linux.bin diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index 9b8a50f30662..51337fffb947 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -16,7 +16,6 @@ CONFIG_XILINX_MICROBLAZE0_USE_DIV=1 CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL=2 CONFIG_XILINX_MICROBLAZE0_USE_FPU=2 CONFIG_HZ_100=y -CONFIG_MMU=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE_FORCE=y CONFIG_HIGHMEM=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig deleted file mode 100644 index 8c420782d6e4..000000000000 --- a/arch/microblaze/configs/nommu_defconfig +++ /dev/null @@ -1,90 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_AUDIT=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_SYSFS_DEPRECATED=y -CONFIG_SYSFS_DEPRECATED_V2=y -# CONFIG_BASE_FULL is not set -CONFIG_KALLSYMS_ALL=y -CONFIG_EMBEDDED=y -CONFIG_SLAB=y -CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR=1 -CONFIG_XILINX_MICROBLAZE0_USE_PCMP_INSTR=1 -CONFIG_XILINX_MICROBLAZE0_USE_BARREL=1 -CONFIG_XILINX_MICROBLAZE0_USE_DIV=1 -CONFIG_XILINX_MICROBLAZE0_USE_HW_MUL=2 -CONFIG_XILINX_MICROBLAZE0_USE_FPU=2 -CONFIG_HZ_100=y -CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE_FORCE=y -CONFIG_PCI_XILINX=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y -# CONFIG_EFI_PARTITION is not set -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_INET=y -# CONFIG_IPV6 is not set -CONFIG_PCI=y -CONFIG_MTD=y -CONFIG_MTD_CMDLINE_PARTS=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_CFI=y -CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_CFI_AMDSTD=y -CONFIG_MTD_RAM=y -CONFIG_MTD_UCLINUX=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_NETDEVICES=y -CONFIG_XILINX_EMACLITE=y -CONFIG_XILINX_LL_TEMAC=y -# CONFIG_INPUT is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_SERIAL_UARTLITE=y -CONFIG_SERIAL_UARTLITE_CONSOLE=y -# CONFIG_HW_RANDOM is not set -CONFIG_XILINX_HWICAP=y -CONFIG_I2C=y -CONFIG_I2C_XILINX=y -CONFIG_SPI=y -CONFIG_SPI_XILINX=y -CONFIG_GPIOLIB=y -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_XILINX=y -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_GPIO_RESTART=y -# CONFIG_HWMON is not set -CONFIG_WATCHDOG=y -CONFIG_XILINX_WATCHDOG=y -CONFIG_FB=y -CONFIG_FB_XILINX=y -# CONFIG_USB_SUPPORT is not set -CONFIG_EXT3_FS=y -# CONFIG_DNOTIFY is not set -CONFIG_CRAMFS=y -CONFIG_ROMFS_FS=y -CONFIG_NFS_FS=y -CONFIG_NFS_V3_ACL=y -CONFIG_NLS=y -CONFIG_KEYS=y -CONFIG_ENCRYPTED_KEYS=y -CONFIG_CRYPTO_ECB=y -CONFIG_CRYPTO_MD4=y -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_ARC4=y -CONFIG_CRYPTO_DES=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_SLAB=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEBUG_SPINLOCK=y diff --git a/arch/microblaze/include/asm/dma.h b/arch/microblaze/include/asm/dma.h index e6cb6d0725af..f801582be912 100644 --- a/arch/microblaze/include/asm/dma.h +++ b/arch/microblaze/include/asm/dma.h @@ -6,14 +6,8 @@ #ifndef _ASM_MICROBLAZE_DMA_H #define _ASM_MICROBLAZE_DMA_H -#ifndef CONFIG_MMU -/* we don't have dma address limit. define it as zero to be - * unlimited. */ -#define MAX_DMA_ADDRESS (0) -#else /* Virtual address corresponding to last available physical memory address. */ #define MAX_DMA_ADDRESS (CONFIG_KERNEL_START + memory_size - 1) -#endif #ifdef CONFIG_PCI extern int isa_dma_bridge_buggy; diff --git a/arch/microblaze/include/asm/exceptions.h b/arch/microblaze/include/asm/exceptions.h index d67e65b72215..967f175173e1 100644 --- a/arch/microblaze/include/asm/exceptions.h +++ b/arch/microblaze/include/asm/exceptions.h @@ -11,11 +11,6 @@ #define _ASM_MICROBLAZE_EXCEPTIONS_H #ifdef __KERNEL__ - -#ifndef CONFIG_MMU -#define EX_HANDLER_STACK_SIZ (4*19) -#endif - #ifndef __ASSEMBLY__ /* Macros to enable and disable HW exceptions in the MSR */ diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index 1dd6fae41897..b6a57f8468f0 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -30,15 +30,12 @@ extern resource_size_t isa_mem_base; #define PCI_IOBASE ((void __iomem *)_IO_BASE) #define IO_SPACE_LIMIT (0xFFFFFFFF) -#ifdef CONFIG_MMU #define page_to_bus(page) (page_to_phys(page)) extern void iounmap(volatile void __iomem *addr); extern void __iomem *ioremap(phys_addr_t address, unsigned long size); -#endif /* CONFIG_MMU */ - /* Big Endian */ #define out_be32(a, v) __raw_writel((v), (void __iomem __force *)(a)) #define out_be16(a, v) __raw_writew((v), (a)) diff --git a/arch/microblaze/include/asm/mmu.h b/arch/microblaze/include/asm/mmu.h index 97f1243101cc..b928a87c0076 100644 --- a/arch/microblaze/include/asm/mmu.h +++ b/arch/microblaze/include/asm/mmu.h @@ -8,9 +8,6 @@ #ifndef _ASM_MICROBLAZE_MMU_H #define _ASM_MICROBLAZE_MMU_H -# ifndef CONFIG_MMU -# include -# else /* CONFIG_MMU */ # ifdef __KERNEL__ # ifndef __ASSEMBLY__ @@ -119,5 +116,4 @@ extern u32 tlb_skip; # define TLB_G 0x00000001 /* Memory is guarded from prefetch */ # endif /* __KERNEL__ */ -# endif /* CONFIG_MMU */ #endif /* _ASM_MICROBLAZE_MMU_H */ diff --git a/arch/microblaze/include/asm/mmu_context.h b/arch/microblaze/include/asm/mmu_context.h index f74f9da07fdc..866e52da5eb9 100644 --- a/arch/microblaze/include/asm/mmu_context.h +++ b/arch/microblaze/include/asm/mmu_context.h @@ -1,6 +1,2 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_MMU # include -#else -# include -#endif diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index b13463d39b38..bf681f272f72 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -20,13 +20,7 @@ #ifdef __KERNEL__ /* PAGE_SHIFT determines the page size */ -#if defined(CONFIG_MICROBLAZE_64K_PAGES) -#define PAGE_SHIFT 16 -#elif defined(CONFIG_MICROBLAZE_16K_PAGES) -#define PAGE_SHIFT 14 -#else #define PAGE_SHIFT 12 -#endif #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) @@ -44,17 +38,6 @@ #define PAGE_UP(addr) (((addr)+((PAGE_SIZE)-1))&(~((PAGE_SIZE)-1))) #define PAGE_DOWN(addr) ((addr)&(~((PAGE_SIZE)-1))) -#ifndef CONFIG_MMU -/* - * PAGE_OFFSET -- the first address of the first page of memory. When not - * using MMU this corresponds to the first free page in physical memory (aligned - * on a page boundary). - */ -extern unsigned int __page_offset; -#define PAGE_OFFSET __page_offset - -#else /* CONFIG_MMU */ - /* * PAGE_OFFSET -- the first address of the first page of memory. With MMU * it is set to the kernel start address (aligned on a page boundary). @@ -70,8 +53,6 @@ extern unsigned int __page_offset; typedef unsigned long pte_basic_t; #define PTE_FMT "%.8lx" -#endif /* CONFIG_MMU */ - # define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) # define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) @@ -86,25 +67,12 @@ typedef struct page *pgtable_t; typedef struct { unsigned long pte; } pte_t; typedef struct { unsigned long pgprot; } pgprot_t; /* FIXME this can depend on linux kernel version */ -# ifdef CONFIG_MMU typedef struct { unsigned long pgd; } pgd_t; -# else /* CONFIG_MMU */ -typedef struct { unsigned long ste[64]; } pmd_t; -typedef struct { pmd_t pue[1]; } pud_t; -typedef struct { pud_t p4e[1]; } p4d_t; -typedef struct { p4d_t pge[1]; } pgd_t; -# endif /* CONFIG_MMU */ # define pte_val(x) ((x).pte) # define pgprot_val(x) ((x).pgprot) -# ifdef CONFIG_MMU # define pgd_val(x) ((x).pgd) -# else /* CONFIG_MMU */ -# define pmd_val(x) ((x).ste[0]) -# define pud_val(x) ((x).pue[0]) -# define pgd_val(x) ((x).pge[0]) -# endif /* CONFIG_MMU */ # define __pte(x) ((pte_t) { (x) }) # define __pgd(x) ((pgd_t) { (x) }) @@ -142,28 +110,12 @@ extern int page_is_ram(unsigned long pfn); # define virt_to_pfn(vaddr) (phys_to_pfn((__pa(vaddr)))) # define pfn_to_virt(pfn) __va(pfn_to_phys((pfn))) -# ifdef CONFIG_MMU - # define virt_to_page(kaddr) (pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)) # define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -# else /* CONFIG_MMU */ -# define virt_to_page(vaddr) (pfn_to_page(virt_to_pfn(vaddr))) -# define page_to_virt(page) (pfn_to_virt(page_to_pfn(page))) -# define page_to_phys(page) (pfn_to_phys(page_to_pfn(page))) -# define page_to_bus(page) (page_to_phys(page)) -# define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) -# endif /* CONFIG_MMU */ - -# ifndef CONFIG_MMU -# define pfn_valid(pfn) (((pfn) >= min_low_pfn) && \ - ((pfn) <= (min_low_pfn + max_mapnr))) -# define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) -# else /* CONFIG_MMU */ # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) # define pfn_valid(pfn) ((pfn) < (max_mapnr + ARCH_PFN_OFFSET)) -# endif /* CONFIG_MMU */ # endif /* __ASSEMBLY__ */ @@ -174,12 +126,6 @@ extern int page_is_ram(unsigned long pfn); /* Convert between virtual and physical address for MMU. */ /* Handle MicroBlaze processor with virtual memory. */ -#ifndef CONFIG_MMU -#define __virt_to_phys(addr) addr -#define __phys_to_virt(addr) addr -#define tophys(rd, rs) addik rd, rs, 0 -#define tovirt(rd, rs) addik rd, rs, 0 -#else #define __virt_to_phys(addr) \ ((addr) + CONFIG_KERNEL_BASE_ADDR - CONFIG_KERNEL_START) #define __phys_to_virt(addr) \ @@ -188,14 +134,9 @@ extern int page_is_ram(unsigned long pfn); addik rd, rs, (CONFIG_KERNEL_BASE_ADDR - CONFIG_KERNEL_START) #define tovirt(rd, rs) \ addik rd, rs, (CONFIG_KERNEL_START - CONFIG_KERNEL_BASE_ADDR) -#endif /* CONFIG_MMU */ #define TOPHYS(addr) __virt_to_phys(addr) -#ifdef CONFIG_MMU - -#endif /* CONFIG_MMU */ - #endif /* __KERNEL__ */ #include diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 8839ce00ea05..d56b9f670ad1 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -8,8 +8,6 @@ #ifndef _ASM_MICROBLAZE_PGALLOC_H #define _ASM_MICROBLAZE_PGALLOC_H -#ifdef CONFIG_MMU - #include /* For min/max macros */ #include #include @@ -42,6 +40,4 @@ extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); #define pmd_populate_kernel(mm, pmd, pte) \ (pmd_val(*(pmd)) = (unsigned long) (pte)) -#endif /* CONFIG_MMU */ - #endif /* _ASM_MICROBLAZE_PGALLOC_H */ diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 3fa1df90925e..9ae8d2c17dd5 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -14,47 +14,6 @@ extern int mem_init_done; #endif -#ifndef CONFIG_MMU - -#define pgd_present(pgd) (1) /* pages are always present on non MMU */ -#define pgd_none(pgd) (0) -#define pgd_bad(pgd) (0) -#define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) - -#define PAGE_NONE __pgprot(0) /* these mean nothing to non MMU */ -#define PAGE_SHARED __pgprot(0) /* these mean nothing to non MMU */ -#define PAGE_COPY __pgprot(0) /* these mean nothing to non MMU */ -#define PAGE_READONLY __pgprot(0) /* these mean nothing to non MMU */ -#define PAGE_KERNEL __pgprot(0) /* these mean nothing to non MMU */ - -#define pgprot_noncached(x) (x) -#define pgprot_writecombine pgprot_noncached -#define pgprot_device pgprot_noncached - -#define __swp_type(x) (0) -#define __swp_offset(x) (0) -#define __swp_entry(typ, off) ((swp_entry_t) { ((typ) | ((off) << 7)) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#define ZERO_PAGE(vaddr) ({ BUG(); NULL; }) - -#define swapper_pg_dir ((pgd_t *) NULL) - -#define arch_enter_lazy_cpu_mode() do {} while (0) - -#define pgprot_noncached_wc(prot) prot - -/* - * All 32bit addresses are effectively valid for vmalloc... - * Sort of meaningless for non-VM targets. - */ -#define VMALLOC_START 0 -#define VMALLOC_END 0xffffffff - -#else /* CONFIG_MMU */ - #include #ifdef __KERNEL__ @@ -491,8 +450,6 @@ void __init *early_get_page(void); #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ -#endif /* CONFIG_MMU */ - #ifndef __ASSEMBLY__ extern unsigned long ioremap_bot, ioremap_base; diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index 1ff5a82b76b6..f4f3048cde4c 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -31,42 +31,6 @@ extern void ret_from_kernel_thread(void); # endif /* __ASSEMBLY__ */ -# ifndef CONFIG_MMU -/* - * User space process size: memory size - * - * TASK_SIZE on MMU cpu is usually 1GB. However, on no-MMU arch, both - * user processes and the kernel is on the same memory region. They - * both share the memory space and that is limited by the amount of - * physical memory. thus, we set TASK_SIZE == amount of total memory. - */ -# define TASK_SIZE (0x81000000 - 0x80000000) - -/* - * This decides where the kernel will search for a free chunk of vm - * space during mmap's. We won't be using it - */ -# define TASK_UNMAPPED_BASE 0 - -/* definition in include/linux/sched.h */ -struct task_struct; - -/* thread_struct is gone. use thread_info instead. */ -struct thread_struct { }; -# define INIT_THREAD { } - -/* Free all resources held by a thread. */ -static inline void release_thread(struct task_struct *dead_task) -{ -} - -extern unsigned long get_wchan(struct task_struct *p); - -# define KSTK_EIP(tsk) (0) -# define KSTK_ESP(tsk) (0) - -# else /* CONFIG_MMU */ - /* * This is used to define STACK_TOP, and with MMU it must be below * kernel base to select the correct PGD when handling MMU exceptions. @@ -133,5 +97,4 @@ extern struct dentry *of_debugfs_root; #endif # endif /* __ASSEMBLY__ */ -# endif /* CONFIG_MMU */ #endif /* _ASM_MICROBLAZE_PROCESSOR_H */ diff --git a/arch/microblaze/include/asm/registers.h b/arch/microblaze/include/asm/registers.h index ee81e1cba008..6b36693fc621 100644 --- a/arch/microblaze/include/asm/registers.h +++ b/arch/microblaze/include/asm/registers.h @@ -27,7 +27,6 @@ #define FSR_UF (1<<1) /* Underflow */ #define FSR_DO (1<<0) /* Denormalized operand error */ -# ifdef CONFIG_MMU /* Machine State Register (MSR) Fields */ # define MSR_UM (1<<11) /* User Mode */ # define MSR_UMS (1<<12) /* User Mode Save */ @@ -43,5 +42,4 @@ # define ESR_DIZ (1<<11) /* Zone Protection */ # define ESR_S (1<<10) /* Store instruction */ -# endif /* CONFIG_MMU */ #endif /* _ASM_MICROBLAZE_REGISTERS_H */ diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h index be10da9d87cb..a06cc1f97aa9 100644 --- a/arch/microblaze/include/asm/setup.h +++ b/arch/microblaze/include/asm/setup.h @@ -14,9 +14,7 @@ extern char cmd_line[COMMAND_LINE_SIZE]; extern char *klimit; -# ifdef CONFIG_MMU extern void mmu_reset(void); -# endif /* CONFIG_MMU */ void time_init(void); void init_IRQ(void); diff --git a/arch/microblaze/include/asm/tlbflush.h b/arch/microblaze/include/asm/tlbflush.h index 1200e2bf14bb..2038168ed128 100644 --- a/arch/microblaze/include/asm/tlbflush.h +++ b/arch/microblaze/include/asm/tlbflush.h @@ -8,8 +8,6 @@ #ifndef _ASM_MICROBLAZE_TLBFLUSH_H #define _ASM_MICROBLAZE_TLBFLUSH_H -#ifdef CONFIG_MMU - #include #include #include /* For TASK_SIZE */ @@ -50,16 +48,4 @@ static inline void local_flush_tlb_range(struct vm_area_struct *vma, static inline void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) { } -#else /* CONFIG_MMU */ - -#define flush_tlb() BUG() -#define flush_tlb_all() BUG() -#define flush_tlb_mm(mm) BUG() -#define flush_tlb_page(vma, addr) BUG() -#define flush_tlb_range(mm, start, end) BUG() -#define flush_tlb_pgtables(mm, start, end) BUG() -#define flush_tlb_kernel_range(start, end) BUG() - -#endif /* CONFIG_MMU */ - #endif /* _ASM_MICROBLAZE_TLBFLUSH_H */ diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 304b04ffea2f..c44b59470e45 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -30,35 +30,14 @@ */ # define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -# ifndef CONFIG_MMU -# define KERNEL_DS MAKE_MM_SEG(0) -# define USER_DS KERNEL_DS -# else # define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) # define USER_DS MAKE_MM_SEG(TASK_SIZE - 1) -# endif # define get_fs() (current_thread_info()->addr_limit) # define set_fs(val) (current_thread_info()->addr_limit = (val)) # define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) -#ifndef CONFIG_MMU - -/* Check against bounds of physical memory */ -static inline int ___range_ok(unsigned long addr, unsigned long size) -{ - return ((addr < memory_start) || - ((addr + size - 1) > (memory_start + memory_size - 1))); -} - -#define __range_ok(addr, size) \ - ___range_ok((unsigned long)(addr), (unsigned long)(size)) - -#define access_ok(addr, size) (__range_ok((addr), (size)) == 0) - -#else - static inline int access_ok(const void __user *addr, unsigned long size) { if (!size) @@ -77,15 +56,9 @@ static inline int access_ok(const void __user *addr, unsigned long size) (u32)get_fs().seg); return 1; } -#endif -#ifdef CONFIG_MMU # define __FIXUP_SECTION ".section .fixup,\"ax\"\n" # define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n" -#else -# define __FIXUP_SECTION ".section .discard,\"ax\"\n" -# define __EX_TABLE_SECTION ".section .discard,\"ax\"\n" -#endif extern unsigned long __copy_tofrom_user(void __user *to, const void __user *from, unsigned long size); diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile index dd71637437f4..15a20eb814ce 100644 --- a/arch/microblaze/kernel/Makefile +++ b/arch/microblaze/kernel/Makefile @@ -22,9 +22,9 @@ obj-y += dma.o exceptions.o \ obj-y += cpu/ obj-$(CONFIG_MODULES) += microblaze_ksyms.o module.o -obj-$(CONFIG_MMU) += misc.o +obj-y += misc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o mcount.o obj-$(CONFIG_KGDB) += kgdb.o -obj-y += entry$(MMU).o +obj-y += entry.o diff --git a/arch/microblaze/kernel/asm-offsets.c b/arch/microblaze/kernel/asm-offsets.c index c1b459c97571..6c69ce7be2e8 100644 --- a/arch/microblaze/kernel/asm-offsets.c +++ b/arch/microblaze/kernel/asm-offsets.c @@ -70,7 +70,6 @@ int main(int argc, char *argv[]) /* struct task_struct */ DEFINE(TS_THREAD_INFO, offsetof(struct task_struct, stack)); -#ifdef CONFIG_MMU DEFINE(TASK_STATE, offsetof(struct task_struct, state)); DEFINE(TASK_FLAGS, offsetof(struct task_struct, flags)); DEFINE(TASK_PTRACE, offsetof(struct task_struct, ptrace)); @@ -84,7 +83,6 @@ int main(int argc, char *argv[]) DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); BLANK(); -#endif /* struct thread_info */ DEFINE(TI_TASK, offsetof(struct thread_info, task)); diff --git a/arch/microblaze/kernel/entry-nommu.S b/arch/microblaze/kernel/entry-nommu.S deleted file mode 100644 index 7e394fc2c439..000000000000 --- a/arch/microblaze/kernel/entry-nommu.S +++ /dev/null @@ -1,622 +0,0 @@ -/* - * Copyright (C) 2007-2009 Michal Simek - * Copyright (C) 2007-2009 PetaLogix - * Copyright (C) 2006 Atmark Techno, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR - .macro disable_irq - msrclr r0, MSR_IE - .endm - - .macro enable_irq - msrset r0, MSR_IE - .endm - - .macro clear_bip - msrclr r0, MSR_BIP - .endm -#else - .macro disable_irq - mfs r11, rmsr - andi r11, r11, ~MSR_IE - mts rmsr, r11 - .endm - - .macro enable_irq - mfs r11, rmsr - ori r11, r11, MSR_IE - mts rmsr, r11 - .endm - - .macro clear_bip - mfs r11, rmsr - andi r11, r11, ~MSR_BIP - mts rmsr, r11 - .endm -#endif - -ENTRY(_interrupt) - swi r1, r0, PER_CPU(ENTRY_SP) /* save the current sp */ - swi r11, r0, PER_CPU(R11_SAVE) /* temporarily save r11 */ - lwi r11, r0, PER_CPU(KM) /* load mode indicator */ - beqid r11, 1f - nop - brid 2f /* jump over */ - addik r1, r1, (-PT_SIZE) /* room for pt_regs (delay slot) */ -1: /* switch to kernel stack */ - lwi r1, r0, PER_CPU(CURRENT_SAVE) /* get the saved current */ - lwi r1, r1, TS_THREAD_INFO /* get the thread info */ - /* calculate kernel stack pointer */ - addik r1, r1, THREAD_SIZE - PT_SIZE -2: - swi r11, r1, PT_MODE /* store the mode */ - lwi r11, r0, PER_CPU(R11_SAVE) /* reload r11 */ - swi r2, r1, PT_R2 - swi r3, r1, PT_R3 - swi r4, r1, PT_R4 - swi r5, r1, PT_R5 - swi r6, r1, PT_R6 - swi r7, r1, PT_R7 - swi r8, r1, PT_R8 - swi r9, r1, PT_R9 - swi r10, r1, PT_R10 - swi r11, r1, PT_R11 - swi r12, r1, PT_R12 - swi r13, r1, PT_R13 - swi r14, r1, PT_R14 - swi r14, r1, PT_PC - swi r15, r1, PT_R15 - swi r16, r1, PT_R16 - swi r17, r1, PT_R17 - swi r18, r1, PT_R18 - swi r19, r1, PT_R19 - swi r20, r1, PT_R20 - swi r21, r1, PT_R21 - swi r22, r1, PT_R22 - swi r23, r1, PT_R23 - swi r24, r1, PT_R24 - swi r25, r1, PT_R25 - swi r26, r1, PT_R26 - swi r27, r1, PT_R27 - swi r28, r1, PT_R28 - swi r29, r1, PT_R29 - swi r30, r1, PT_R30 - swi r31, r1, PT_R31 - /* special purpose registers */ - mfs r11, rmsr - swi r11, r1, PT_MSR - mfs r11, rear - swi r11, r1, PT_EAR - mfs r11, resr - swi r11, r1, PT_ESR - mfs r11, rfsr - swi r11, r1, PT_FSR - /* reload original stack pointer and save it */ - lwi r11, r0, PER_CPU(ENTRY_SP) - swi r11, r1, PT_R1 - /* update mode indicator we are in kernel mode */ - addik r11, r0, 1 - swi r11, r0, PER_CPU(KM) - /* restore r31 */ - lwi r31, r0, PER_CPU(CURRENT_SAVE) - /* prepare the link register, the argument and jump */ - addik r15, r0, ret_from_intr - 8 - addk r6, r0, r15 - braid do_IRQ - add r5, r0, r1 - -ret_from_intr: - lwi r11, r1, PT_MODE - bneid r11, no_intr_resched - -3: - lwi r6, r31, TS_THREAD_INFO /* get thread info */ - lwi r19, r6, TI_FLAGS /* get flags in thread info */ - /* do an extra work if any bits are set */ - - andi r11, r19, _TIF_NEED_RESCHED - beqi r11, 1f - bralid r15, schedule - nop - bri 3b -1: andi r11, r19, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME - beqid r11, no_intr_resched - addk r5, r1, r0 - bralid r15, do_notify_resume - addk r6, r0, r0 - bri 3b - -no_intr_resched: - /* Disable interrupts, we are now committed to the state restore */ - disable_irq - - /* save mode indicator */ - lwi r11, r1, PT_MODE - swi r11, r0, PER_CPU(KM) - - /* save r31 */ - swi r31, r0, PER_CPU(CURRENT_SAVE) -restore_context: - /* special purpose registers */ - lwi r11, r1, PT_FSR - mts rfsr, r11 - lwi r11, r1, PT_ESR - mts resr, r11 - lwi r11, r1, PT_EAR - mts rear, r11 - lwi r11, r1, PT_MSR - mts rmsr, r11 - - lwi r31, r1, PT_R31 - lwi r30, r1, PT_R30 - lwi r29, r1, PT_R29 - lwi r28, r1, PT_R28 - lwi r27, r1, PT_R27 - lwi r26, r1, PT_R26 - lwi r25, r1, PT_R25 - lwi r24, r1, PT_R24 - lwi r23, r1, PT_R23 - lwi r22, r1, PT_R22 - lwi r21, r1, PT_R21 - lwi r20, r1, PT_R20 - lwi r19, r1, PT_R19 - lwi r18, r1, PT_R18 - lwi r17, r1, PT_R17 - lwi r16, r1, PT_R16 - lwi r15, r1, PT_R15 - lwi r14, r1, PT_PC - lwi r13, r1, PT_R13 - lwi r12, r1, PT_R12 - lwi r11, r1, PT_R11 - lwi r10, r1, PT_R10 - lwi r9, r1, PT_R9 - lwi r8, r1, PT_R8 - lwi r7, r1, PT_R7 - lwi r6, r1, PT_R6 - lwi r5, r1, PT_R5 - lwi r4, r1, PT_R4 - lwi r3, r1, PT_R3 - lwi r2, r1, PT_R2 - lwi r1, r1, PT_R1 - rtid r14, 0 - nop - -ENTRY(_reset) - brai 0; - -ENTRY(_user_exception) - swi r1, r0, PER_CPU(ENTRY_SP) /* save the current sp */ - swi r11, r0, PER_CPU(R11_SAVE) /* temporarily save r11 */ - lwi r11, r0, PER_CPU(KM) /* load mode indicator */ - beqid r11, 1f /* Already in kernel mode? */ - nop - brid 2f /* jump over */ - addik r1, r1, (-PT_SIZE) /* Room for pt_regs (delay slot) */ -1: /* Switch to kernel stack */ - lwi r1, r0, PER_CPU(CURRENT_SAVE) /* get the saved current */ - lwi r1, r1, TS_THREAD_INFO /* get the thread info */ - /* calculate kernel stack pointer */ - addik r1, r1, THREAD_SIZE - PT_SIZE -2: - swi r11, r1, PT_MODE /* store the mode */ - lwi r11, r0, PER_CPU(R11_SAVE) /* reload r11 */ - /* save them on stack */ - swi r2, r1, PT_R2 - swi r3, r1, PT_R3 /* r3: _always_ in clobber list; see unistd.h */ - swi r4, r1, PT_R4 /* r4: _always_ in clobber list; see unistd.h */ - swi r5, r1, PT_R5 - swi r6, r1, PT_R6 - swi r7, r1, PT_R7 - swi r8, r1, PT_R8 - swi r9, r1, PT_R9 - swi r10, r1, PT_R10 - swi r11, r1, PT_R11 - /* r12: _always_ in clobber list; see unistd.h */ - swi r12, r1, PT_R12 - swi r13, r1, PT_R13 - /* r14: _always_ in clobber list; see unistd.h */ - swi r14, r1, PT_R14 - /* but we want to return to the next inst. */ - addik r14, r14, 0x4 - swi r14, r1, PT_PC /* increment by 4 and store in pc */ - swi r15, r1, PT_R15 - swi r16, r1, PT_R16 - swi r17, r1, PT_R17 - swi r18, r1, PT_R18 - swi r19, r1, PT_R19 - swi r20, r1, PT_R20 - swi r21, r1, PT_R21 - swi r22, r1, PT_R22 - swi r23, r1, PT_R23 - swi r24, r1, PT_R24 - swi r25, r1, PT_R25 - swi r26, r1, PT_R26 - swi r27, r1, PT_R27 - swi r28, r1, PT_R28 - swi r29, r1, PT_R29 - swi r30, r1, PT_R30 - swi r31, r1, PT_R31 - - disable_irq - nop /* make sure IE bit is in effect */ - clear_bip /* once IE is in effect it is safe to clear BIP */ - nop - - /* special purpose registers */ - mfs r11, rmsr - swi r11, r1, PT_MSR - mfs r11, rear - swi r11, r1, PT_EAR - mfs r11, resr - swi r11, r1, PT_ESR - mfs r11, rfsr - swi r11, r1, PT_FSR - /* reload original stack pointer and save it */ - lwi r11, r0, PER_CPU(ENTRY_SP) - swi r11, r1, PT_R1 - /* update mode indicator we are in kernel mode */ - addik r11, r0, 1 - swi r11, r0, PER_CPU(KM) - /* restore r31 */ - lwi r31, r0, PER_CPU(CURRENT_SAVE) - /* re-enable interrupts now we are in kernel mode */ - enable_irq - - /* See if the system call number is valid. */ - addi r11, r12, -__NR_syscalls - bgei r11, 1f /* return to user if not valid */ - /* Figure out which function to use for this system call. */ - /* Note Microblaze barrel shift is optional, so don't rely on it */ - add r12, r12, r12 /* convert num -> ptr */ - addik r30, r0, 1 /* restarts allowed */ - add r12, r12, r12 - lwi r12, r12, sys_call_table /* Get function pointer */ - addik r15, r0, ret_to_user-8 /* set return address */ - bra r12 /* Make the system call. */ - bri 0 /* won't reach here */ -1: - brid ret_to_user /* jump to syscall epilogue */ - addi r3, r0, -ENOSYS /* set errno in delay slot */ - -/* - * Debug traps are like a system call, but entered via brki r14, 0x60 - * All we need to do is send the SIGTRAP signal to current, ptrace and - * do_notify_resume will handle the rest - */ -ENTRY(_debug_exception) - swi r1, r0, PER_CPU(ENTRY_SP) /* save the current sp */ - lwi r1, r0, PER_CPU(CURRENT_SAVE) /* get the saved current */ - lwi r1, r1, TS_THREAD_INFO /* get the thread info */ - addik r1, r1, THREAD_SIZE - PT_SIZE /* get the kernel stack */ - swi r11, r0, PER_CPU(R11_SAVE) /* temporarily save r11 */ - lwi r11, r0, PER_CPU(KM) /* load mode indicator */ -//save_context: - swi r11, r1, PT_MODE /* store the mode */ - lwi r11, r0, PER_CPU(R11_SAVE) /* reload r11 */ - /* save them on stack */ - swi r2, r1, PT_R2 - swi r3, r1, PT_R3 /* r3: _always_ in clobber list; see unistd.h */ - swi r4, r1, PT_R4 /* r4: _always_ in clobber list; see unistd.h */ - swi r5, r1, PT_R5 - swi r6, r1, PT_R6 - swi r7, r1, PT_R7 - swi r8, r1, PT_R8 - swi r9, r1, PT_R9 - swi r10, r1, PT_R10 - swi r11, r1, PT_R11 - /* r12: _always_ in clobber list; see unistd.h */ - swi r12, r1, PT_R12 - swi r13, r1, PT_R13 - /* r14: _always_ in clobber list; see unistd.h */ - swi r14, r1, PT_R14 - swi r14, r1, PT_PC /* Will return to interrupted instruction */ - swi r15, r1, PT_R15 - swi r16, r1, PT_R16 - swi r17, r1, PT_R17 - swi r18, r1, PT_R18 - swi r19, r1, PT_R19 - swi r20, r1, PT_R20 - swi r21, r1, PT_R21 - swi r22, r1, PT_R22 - swi r23, r1, PT_R23 - swi r24, r1, PT_R24 - swi r25, r1, PT_R25 - swi r26, r1, PT_R26 - swi r27, r1, PT_R27 - swi r28, r1, PT_R28 - swi r29, r1, PT_R29 - swi r30, r1, PT_R30 - swi r31, r1, PT_R31 - - disable_irq - nop /* make sure IE bit is in effect */ - clear_bip /* once IE is in effect it is safe to clear BIP */ - nop - - /* special purpose registers */ - mfs r11, rmsr - swi r11, r1, PT_MSR - mfs r11, rear - swi r11, r1, PT_EAR - mfs r11, resr - swi r11, r1, PT_ESR - mfs r11, rfsr - swi r11, r1, PT_FSR - /* reload original stack pointer and save it */ - lwi r11, r0, PER_CPU(ENTRY_SP) - swi r11, r1, PT_R1 - /* update mode indicator we are in kernel mode */ - addik r11, r0, 1 - swi r11, r0, PER_CPU(KM) - /* restore r31 */ - lwi r31, r0, PER_CPU(CURRENT_SAVE) - /* re-enable interrupts now we are in kernel mode */ - enable_irq - - addi r5, r0, SIGTRAP /* sending the trap signal */ - add r6, r0, r31 /* to current */ - bralid r15, send_sig - add r7, r0, r0 /* 3rd param zero */ - - addik r30, r0, 1 /* restarts allowed ??? */ - /* Restore r3/r4 to work around how ret_to_user works */ - lwi r3, r1, PT_R3 - lwi r4, r1, PT_R4 - bri ret_to_user - -ENTRY(_break) - bri 0 - -/* struct task_struct *_switch_to(struct thread_info *prev, - struct thread_info *next); */ -ENTRY(_switch_to) - /* prepare return value */ - addk r3, r0, r31 - - /* save registers in cpu_context */ - /* use r11 and r12, volatile registers, as temp register */ - addik r11, r5, TI_CPU_CONTEXT - swi r1, r11, CC_R1 - swi r2, r11, CC_R2 - /* skip volatile registers. - * they are saved on stack when we jumped to _switch_to() */ - /* dedicated registers */ - swi r13, r11, CC_R13 - swi r14, r11, CC_R14 - swi r15, r11, CC_R15 - swi r16, r11, CC_R16 - swi r17, r11, CC_R17 - swi r18, r11, CC_R18 - /* save non-volatile registers */ - swi r19, r11, CC_R19 - swi r20, r11, CC_R20 - swi r21, r11, CC_R21 - swi r22, r11, CC_R22 - swi r23, r11, CC_R23 - swi r24, r11, CC_R24 - swi r25, r11, CC_R25 - swi r26, r11, CC_R26 - swi r27, r11, CC_R27 - swi r28, r11, CC_R28 - swi r29, r11, CC_R29 - swi r30, r11, CC_R30 - /* special purpose registers */ - mfs r12, rmsr - swi r12, r11, CC_MSR - mfs r12, rear - swi r12, r11, CC_EAR - mfs r12, resr - swi r12, r11, CC_ESR - mfs r12, rfsr - swi r12, r11, CC_FSR - - /* update r31, the current */ - lwi r31, r6, TI_TASK - swi r31, r0, PER_CPU(CURRENT_SAVE) - - /* get new process' cpu context and restore */ - addik r11, r6, TI_CPU_CONTEXT - - /* special purpose registers */ - lwi r12, r11, CC_FSR - mts rfsr, r12 - lwi r12, r11, CC_ESR - mts resr, r12 - lwi r12, r11, CC_EAR - mts rear, r12 - lwi r12, r11, CC_MSR - mts rmsr, r12 - /* non-volatile registers */ - lwi r30, r11, CC_R30 - lwi r29, r11, CC_R29 - lwi r28, r11, CC_R28 - lwi r27, r11, CC_R27 - lwi r26, r11, CC_R26 - lwi r25, r11, CC_R25 - lwi r24, r11, CC_R24 - lwi r23, r11, CC_R23 - lwi r22, r11, CC_R22 - lwi r21, r11, CC_R21 - lwi r20, r11, CC_R20 - lwi r19, r11, CC_R19 - /* dedicated registers */ - lwi r18, r11, CC_R18 - lwi r17, r11, CC_R17 - lwi r16, r11, CC_R16 - lwi r15, r11, CC_R15 - lwi r14, r11, CC_R14 - lwi r13, r11, CC_R13 - /* skip volatile registers */ - lwi r2, r11, CC_R2 - lwi r1, r11, CC_R1 - - rtsd r15, 8 - nop - -ENTRY(ret_from_fork) - addk r5, r0, r3 - brlid r15, schedule_tail - nop - swi r31, r1, PT_R31 /* save r31 in user context. */ - /* will soon be restored to r31 in ret_to_user */ - addk r3, r0, r0 - brid ret_to_user - nop - -ENTRY(ret_from_kernel_thread) - brlid r15, schedule_tail - addk r5, r0, r3 - brald r15, r20 - addk r5, r0, r19 - brid ret_to_user - addk r3, r0, r0 - -work_pending: - lwi r11, r1, PT_MODE - bneid r11, 2f -3: - enable_irq - andi r11, r19, _TIF_NEED_RESCHED - beqi r11, 1f - bralid r15, schedule - nop - bri 4f -1: andi r11, r19, _TIF_SIGPENDING | _TIF_NOTIFY_RESUME - beqi r11, no_work_pending - addk r5, r30, r0 - bralid r15, do_notify_resume - addik r6, r0, 1 - addk r30, r0, r0 /* no restarts from now on */ -4: - disable_irq - lwi r6, r31, TS_THREAD_INFO /* get thread info */ - lwi r19, r6, TI_FLAGS /* get flags in thread info */ - bri 3b - -ENTRY(ret_to_user) - disable_irq - - swi r4, r1, PT_R4 /* return val */ - swi r3, r1, PT_R3 /* return val */ - - lwi r6, r31, TS_THREAD_INFO /* get thread info */ - lwi r19, r6, TI_FLAGS /* get flags in thread info */ - bnei r19, work_pending /* do an extra work if any bits are set */ -no_work_pending: - disable_irq - -2: - /* save r31 */ - swi r31, r0, PER_CPU(CURRENT_SAVE) - /* save mode indicator */ - lwi r18, r1, PT_MODE - swi r18, r0, PER_CPU(KM) -//restore_context: - /* special purpose registers */ - lwi r18, r1, PT_FSR - mts rfsr, r18 - lwi r18, r1, PT_ESR - mts resr, r18 - lwi r18, r1, PT_EAR - mts rear, r18 - lwi r18, r1, PT_MSR - mts rmsr, r18 - - lwi r31, r1, PT_R31 - lwi r30, r1, PT_R30 - lwi r29, r1, PT_R29 - lwi r28, r1, PT_R28 - lwi r27, r1, PT_R27 - lwi r26, r1, PT_R26 - lwi r25, r1, PT_R25 - lwi r24, r1, PT_R24 - lwi r23, r1, PT_R23 - lwi r22, r1, PT_R22 - lwi r21, r1, PT_R21 - lwi r20, r1, PT_R20 - lwi r19, r1, PT_R19 - lwi r18, r1, PT_R18 - lwi r17, r1, PT_R17 - lwi r16, r1, PT_R16 - lwi r15, r1, PT_R15 - lwi r14, r1, PT_PC - lwi r13, r1, PT_R13 - lwi r12, r1, PT_R12 - lwi r11, r1, PT_R11 - lwi r10, r1, PT_R10 - lwi r9, r1, PT_R9 - lwi r8, r1, PT_R8 - lwi r7, r1, PT_R7 - lwi r6, r1, PT_R6 - lwi r5, r1, PT_R5 - lwi r4, r1, PT_R4 /* return val */ - lwi r3, r1, PT_R3 /* return val */ - lwi r2, r1, PT_R2 - lwi r1, r1, PT_R1 - - rtid r14, 0 - nop - -sys_rt_sigreturn_wrapper: - addk r30, r0, r0 /* no restarts for this one */ - brid sys_rt_sigreturn - addk r5, r1, r0 - - /* Interrupt vector table */ - .section .init.ivt, "ax" - .org 0x0 - brai _reset - brai _user_exception - brai _interrupt - brai _break - brai _hw_exception_handler - .org 0x60 - brai _debug_exception - -.section .rodata,"a" -#include "syscall_table.S" - -syscall_table_size=(.-sys_call_table) - -type_SYSCALL: - .ascii "SYSCALL\0" -type_IRQ: - .ascii "IRQ\0" -type_IRQ_PREEMPT: - .ascii "IRQ (PREEMPTED)\0" -type_SYSCALL_PREEMPT: - .ascii " SYSCALL (PREEMPTED)\0" - - /* - * Trap decoding for stack unwinder - * Tuples are (start addr, end addr, string) - * If return address lies on [start addr, end addr], - * unwinder displays 'string' - */ - - .align 4 -.global microblaze_trap_handlers -microblaze_trap_handlers: - /* Exact matches come first */ - .word ret_to_user ; .word ret_to_user ; .word type_SYSCALL - .word ret_from_intr; .word ret_from_intr ; .word type_IRQ - /* Fuzzy matches go here */ - .word ret_from_intr; .word no_intr_resched; .word type_IRQ_PREEMPT - .word work_pending ; .word no_work_pending; .word type_SYSCALL_PREEMPT - /* End of table */ - .word 0 ; .word 0 ; .word 0 diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c index cf99c411503e..908788497b28 100644 --- a/arch/microblaze/kernel/exceptions.c +++ b/arch/microblaze/kernel/exceptions.c @@ -69,9 +69,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) asmlinkage void full_exception(struct pt_regs *regs, unsigned int type, int fsr, int addr) { -#ifdef CONFIG_MMU addr = regs->pc; -#endif #if 0 pr_warn("Exception %02x in %s mode, FSR=%08x PC=%08x ESR=%08x\n", @@ -132,13 +130,10 @@ asmlinkage void full_exception(struct pt_regs *regs, unsigned int type, fsr = FPE_FLTRES; _exception(SIGFPE, regs, fsr, addr); break; - -#ifdef CONFIG_MMU case MICROBLAZE_PRIVILEGED_EXCEPTION: pr_debug("Privileged exception\n"); _exception(SIGILL, regs, ILL_PRVOPC, addr); break; -#endif default: /* FIXME what to do in unexpected exception */ pr_warn("Unexpected exception %02x PC=%08x in %s mode\n", diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S index 14b276406153..ec2fcb545e64 100644 --- a/arch/microblaze/kernel/head.S +++ b/arch/microblaze/kernel/head.S @@ -34,7 +34,6 @@ #include #include /* for OF_DT_HEADER */ -#ifdef CONFIG_MMU #include /* COMMAND_LINE_SIZE */ #include #include @@ -48,8 +47,6 @@ empty_zero_page: swapper_pg_dir: .space PAGE_SIZE -#endif /* CONFIG_MMU */ - .section .rodata .align 4 endian_check: @@ -108,8 +105,6 @@ _copy_fdt: addik r3, r3, -4 /* descrement loop */ no_fdt_arg: -#ifdef CONFIG_MMU - #ifndef CONFIG_CMDLINE_BOOL /* * handling command line @@ -329,7 +324,6 @@ turn_on_mmu: nop start_here: -#endif /* CONFIG_MMU */ /* Initialize small data anchors */ addik r13, r0, _KERNEL_SDA_BASE_ @@ -345,11 +339,6 @@ start_here: brald r15, r11 nop -#ifndef CONFIG_MMU - addik r15, r0, machine_halt - braid start_kernel - nop -#else /* * Initialize the MMU. */ @@ -383,4 +372,3 @@ kernel_load_context: nop rted r17, 0 /* enable MMU and jump to start_kernel */ nop -#endif /* CONFIG_MMU */ diff --git a/arch/microblaze/kernel/hw_exception_handler.S b/arch/microblaze/kernel/hw_exception_handler.S index 54411de22fa6..07ea23965f81 100644 --- a/arch/microblaze/kernel/hw_exception_handler.S +++ b/arch/microblaze/kernel/hw_exception_handler.S @@ -80,7 +80,6 @@ /* Helpful Macros */ #define NUM_TO_REG(num) r ## num -#ifdef CONFIG_MMU #define RESTORE_STATE \ lwi r5, r1, 0; \ mts rmsr, r5; \ @@ -92,7 +91,6 @@ lwi r11, r1, PT_R11; \ lwi r31, r1, PT_R31; \ lwi r1, r1, PT_R1; -#endif /* CONFIG_MMU */ #define LWREG_NOP \ bri ex_handler_unhandled; \ @@ -102,10 +100,6 @@ bri ex_handler_unhandled; \ nop; -/* FIXME this is weird - for noMMU kernel is not possible to use brid - * instruction which can shorten executed time - */ - /* r3 is the source */ #define R3_TO_LWREG_V(regnum) \ swi r3, r1, 4 * regnum; \ @@ -126,7 +120,6 @@ or r3, r0, NUM_TO_REG (regnum); \ bri ex_sw_tail; -#ifdef CONFIG_MMU #define R3_TO_LWREG_VM_V(regnum) \ brid ex_lw_end_vm; \ swi r3, r7, 4 * regnum; @@ -193,7 +186,6 @@ .endm #endif -#endif /* CONFIG_MMU */ .extern other_exception_handler /* Defined in exception.c */ @@ -251,7 +243,6 @@ */ /* wrappers to restore state before coming to entry.S */ -#ifdef CONFIG_MMU .section .data .align 4 pt_pool_space: @@ -316,31 +307,24 @@ _MB_HW_ExceptionVectorTable: .long TOPHYS(ex_handler_unhandled) .long TOPHYS(ex_handler_unhandled) .long TOPHYS(ex_handler_unhandled) -#endif .global _hw_exception_handler .section .text .align 4 .ent _hw_exception_handler _hw_exception_handler: -#ifndef CONFIG_MMU - addik r1, r1, -(EX_HANDLER_STACK_SIZ); /* Create stack frame */ -#else swi r1, r0, TOPHYS(pt_pool_space + PT_R1); /* GET_SP */ /* Save date to kernel memory. Here is the problem * when you came from user space */ ori r1, r0, TOPHYS(pt_pool_space); -#endif swi r3, r1, PT_R3 swi r4, r1, PT_R4 swi r5, r1, PT_R5 swi r6, r1, PT_R6 -#ifdef CONFIG_MMU swi r11, r1, PT_R11 swi r31, r1, PT_R31 lwi r31, r0, TOPHYS(PER_CPU(CURRENT_SAVE)) /* get saved current */ -#endif mfs r5, rmsr; nop @@ -350,18 +334,8 @@ _hw_exception_handler: mfs r3, rear; nop -#ifndef CONFIG_MMU - andi r5, r4, 0x1000; /* Check ESR[DS] */ - beqi r5, not_in_delay_slot; /* Branch if ESR[DS] not set */ - mfs r17, rbtr; /* ESR[DS] set - return address in BTR */ - nop -not_in_delay_slot: - swi r17, r1, PT_R17 -#endif - andi r5, r4, 0x1F; /* Extract ESR[EXC] */ -#ifdef CONFIG_MMU /* Calculate exception vector offset = r5 << 2 */ addk r6, r5, r5; /* << 1 */ addk r6, r6, r6; /* << 2 */ @@ -383,73 +357,6 @@ not_in_delay_slot: full_exception_trapw: RESTORE_STATE bri full_exception_trap -#else - /* Exceptions enabled here. This will allow nested exceptions */ - mfs r6, rmsr; - nop - swi r6, r1, 0; /* RMSR_OFFSET */ - ori r6, r6, 0x100; /* Turn ON the EE bit */ - andi r6, r6, ~2; /* Disable interrupts */ - mts rmsr, r6; - nop - - xori r6, r5, 1; /* 00001 = Unaligned Exception */ - /* Jump to unalignment exception handler */ - beqi r6, handle_unaligned_ex; - -handle_other_ex: /* Handle Other exceptions here */ - /* Save other volatiles before we make procedure calls below */ - swi r7, r1, PT_R7 - swi r8, r1, PT_R8 - swi r9, r1, PT_R9 - swi r10, r1, PT_R10 - swi r11, r1, PT_R11 - swi r12, r1, PT_R12 - swi r14, r1, PT_R14 - swi r15, r1, PT_R15 - swi r18, r1, PT_R18 - - or r5, r1, r0 - andi r6, r4, 0x1F; /* Load ESR[EC] */ - lwi r7, r0, PER_CPU(KM) /* MS: saving current kernel mode to regs */ - swi r7, r1, PT_MODE - mfs r7, rfsr - nop - addk r8, r17, r0; /* Load exception address */ - bralid r15, full_exception; /* Branch to the handler */ - nop; - mts rfsr, r0; /* Clear sticky fsr */ - nop - - /* - * Trigger execution of the signal handler by enabling - * interrupts and calling an invalid syscall. - */ - mfs r5, rmsr; - nop - ori r5, r5, 2; - mts rmsr, r5; /* enable interrupt */ - nop - addi r12, r0, __NR_syscalls; - brki r14, 0x08; - mfs r5, rmsr; /* disable interrupt */ - nop - andi r5, r5, ~2; - mts rmsr, r5; - nop - - lwi r7, r1, PT_R7 - lwi r8, r1, PT_R8 - lwi r9, r1, PT_R9 - lwi r10, r1, PT_R10 - lwi r11, r1, PT_R11 - lwi r12, r1, PT_R12 - lwi r14, r1, PT_R14 - lwi r15, r1, PT_R15 - lwi r18, r1, PT_R18 - - bri ex_handler_done; /* Complete exception handling */ -#endif /* 0x01 - Unaligned data access exception * This occurs when a word access is not aligned on a word boundary, @@ -463,7 +370,6 @@ handle_unaligned_ex: * R4 = ESR * R3 = EAR */ -#ifdef CONFIG_MMU andi r6, r4, 0x1000 /* Check ESR[DS] */ beqi r6, _no_delayslot /* Branch if ESR[DS] not set */ mfs r17, rbtr; /* ESR[DS] set - return address in BTR */ @@ -472,7 +378,7 @@ _no_delayslot: /* jump to high level unaligned handler */ RESTORE_STATE; bri unaligned_data_trap -#endif + andi r6, r4, 0x3E0; /* Mask and extract the register operand */ srl r6, r6; /* r6 >> 5 */ srl r6, r6; @@ -558,25 +464,10 @@ ex_shw: ex_sw_end: /* Exception handling of store word, ends. */ ex_handler_done: -#ifndef CONFIG_MMU - lwi r5, r1, 0 /* RMSR */ - mts rmsr, r5 - nop - lwi r3, r1, PT_R3 - lwi r4, r1, PT_R4 - lwi r5, r1, PT_R5 - lwi r6, r1, PT_R6 - lwi r17, r1, PT_R17 - - rted r17, 0 - addik r1, r1, (EX_HANDLER_STACK_SIZ); /* Restore stack frame */ -#else RESTORE_STATE; rted r17, 0 nop -#endif -#ifdef CONFIG_MMU /* Exception vector entry code. This code runs with address translation * turned off (i.e. using physical addresses). */ @@ -882,13 +773,7 @@ ex_handler_done: * bits 20 and 21 are zero. */ andi r3, r3, PAGE_MASK -#ifdef CONFIG_MICROBLAZE_64K_PAGES - ori r3, r3, TLB_VALID | TLB_PAGESZ(PAGESZ_64K) -#elif CONFIG_MICROBLAZE_16K_PAGES - ori r3, r3, TLB_VALID | TLB_PAGESZ(PAGESZ_16K) -#else ori r3, r3, TLB_VALID | TLB_PAGESZ(PAGESZ_4K) -#endif mts rtlbhi, r3 /* Load TLB HI */ nop @@ -926,10 +811,8 @@ ex_handler_done: rtsd r15,8 nop -#endif .end _hw_exception_handler -#ifdef CONFIG_MMU /* Unaligned data access exception last on a 4k page for MMU. * When this is called, we are in virtual mode with exceptions enabled * and registers 1-13,15,17,18 saved. @@ -1044,7 +927,6 @@ ex_unaligned_fixup: .word store6,ex_unaligned_fixup; .previous; .end _unaligned_data_exception -#endif /* CONFIG_MMU */ .global ex_handler_unhandled ex_handler_unhandled: @@ -1093,11 +975,7 @@ lw_r27: R3_TO_LWREG (27); lw_r28: R3_TO_LWREG (28); lw_r29: R3_TO_LWREG (29); lw_r30: R3_TO_LWREG (30); -#ifdef CONFIG_MMU lw_r31: R3_TO_LWREG_V (31); -#else -lw_r31: R3_TO_LWREG (31); -#endif sw_table: sw_r0: SWREG_TO_R3 (0); @@ -1131,13 +1009,8 @@ sw_r27: SWREG_TO_R3 (27); sw_r28: SWREG_TO_R3 (28); sw_r29: SWREG_TO_R3 (29); sw_r30: SWREG_TO_R3 (30); -#ifdef CONFIG_MMU sw_r31: SWREG_TO_R3_V (31); -#else -sw_r31: SWREG_TO_R3 (31); -#endif -#ifdef CONFIG_MMU lw_table_vm: lw_r0_vm: R3_TO_LWREG_VM (0); lw_r1_vm: R3_TO_LWREG_VM_V (1); @@ -1205,7 +1078,6 @@ sw_r28_vm: SWREG_TO_R3_VM_V (28); sw_r29_vm: SWREG_TO_R3_VM_V (29); sw_r30_vm: SWREG_TO_R3_VM_V (30); sw_r31_vm: SWREG_TO_R3_VM_V (31); -#endif /* CONFIG_MMU */ /* Temporary data structures used in the handler */ .section .data diff --git a/arch/microblaze/kernel/microblaze_ksyms.c b/arch/microblaze/kernel/microblaze_ksyms.c index 51c43ee5e380..303aaf13573b 100644 --- a/arch/microblaze/kernel/microblaze_ksyms.c +++ b/arch/microblaze/kernel/microblaze_ksyms.c @@ -33,9 +33,7 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); #endif -#ifdef CONFIG_MMU EXPORT_SYMBOL(empty_zero_page); -#endif EXPORT_SYMBOL(mbc); diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a9e46e525cd0..3afda1823730 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -69,9 +69,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, ti->cpu_context.r19 = (unsigned long)arg; childregs->pt_mode = 1; local_save_flags(childregs->msr); -#ifdef CONFIG_MMU ti->cpu_context.msr = childregs->msr & ~MSR_IE; -#endif ti->cpu_context.r15 = (unsigned long)ret_from_kernel_thread - 8; return 0; } @@ -81,9 +79,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, memset(&ti->cpu_context, 0, sizeof(struct cpu_context)); ti->cpu_context.r1 = (unsigned long)childregs; -#ifndef CONFIG_MMU - ti->cpu_context.msr = (unsigned long)childregs->msr; -#else childregs->msr |= MSR_UMS; /* we should consider the fact that childregs is a copy of the parent @@ -105,7 +100,6 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, ti->cpu_context.msr = (childregs->msr|MSR_VM); ti->cpu_context.msr &= ~MSR_UMS; /* switch_to to kernel mode */ ti->cpu_context.msr &= ~MSR_IE; -#endif ti->cpu_context.r15 = (unsigned long)ret_from_fork - 8; /* @@ -130,13 +124,10 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long usp) regs->pc = pc; regs->r1 = usp; regs->pt_mode = 0; -#ifdef CONFIG_MMU regs->msr |= MSR_UMS; regs->msr &= ~MSR_VM; -#endif } -#ifdef CONFIG_MMU #include /* * Set up a thread for executing a new program @@ -145,7 +136,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpregs) { return 0; /* MicroBlaze has no separate FPU registers */ } -#endif /* CONFIG_MMU */ void arch_cpu_idle(void) { diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 7fcf5279ad15..f417333eccae 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -190,12 +190,10 @@ static int microblaze_debugfs_init(void) } arch_initcall(microblaze_debugfs_init); -# ifdef CONFIG_MMU static int __init debugfs_tlb(void) { debugfs_create_u32("tlb_skip", S_IRUGO, of_debugfs_root, &tlb_skip); return 0; } device_initcall(debugfs_tlb); -# endif #endif diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 5a8d173d7b75..fc61eb0eb8dd 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -157,10 +157,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct rt_sigframe __user *frame; int err = 0, sig = ksig->sig; unsigned long address = 0; -#ifdef CONFIG_MMU pmd_t *pmdp; pte_t *ptep; -#endif frame = get_sigframe(ksig, regs, sizeof(*frame)); @@ -192,7 +190,6 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, regs->r15 = ((unsigned long)frame->tramp)-8; address = ((unsigned long)frame->tramp); -#ifdef CONFIG_MMU pmdp = pmd_off(current->mm, address); preempt_disable(); @@ -208,10 +205,6 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, } pte_unmap(ptep); preempt_enable(); -#else - flush_icache_range(address, address + 8); - flush_dcache_range(address, address + 8); -#endif if (err) return -EFAULT; diff --git a/arch/microblaze/kernel/unwind.c b/arch/microblaze/kernel/unwind.c index 778a761af0a7..a530a7a6be7d 100644 --- a/arch/microblaze/kernel/unwind.c +++ b/arch/microblaze/kernel/unwind.c @@ -161,22 +161,12 @@ static void microblaze_unwind_inner(struct task_struct *task, * unwind_trap - Unwind through a system trap, that stored previous state * on the stack. */ -#ifdef CONFIG_MMU static inline void unwind_trap(struct task_struct *task, unsigned long pc, unsigned long fp, struct stack_trace *trace, const char *loglvl) { /* To be implemented */ } -#else -static inline void unwind_trap(struct task_struct *task, unsigned long pc, - unsigned long fp, struct stack_trace *trace, - const char *loglvl) -{ - const struct pt_regs *regs = (const struct pt_regs *) fp; - microblaze_unwind_inner(task, regs->pc, regs->r1, regs->r15, trace, loglvl); -} -#endif /** * microblaze_unwind_inner - Unwind the stack from the specified point @@ -215,16 +205,7 @@ static void microblaze_unwind_inner(struct task_struct *task, * HW exception handler doesn't save all registers, * so we open-code a special case of unwind_trap() */ -#ifndef CONFIG_MMU - const struct pt_regs *regs = - (const struct pt_regs *) fp; -#endif printk("%sHW EXCEPTION\n", loglvl); -#ifndef CONFIG_MMU - microblaze_unwind_inner(task, regs->r17 - 4, - fp + EX_HANDLER_STACK_SIZ, - regs->r15, trace, loglvl); -#endif return; } diff --git a/arch/microblaze/mm/Makefile b/arch/microblaze/mm/Makefile index 1b16875cea70..cd8a844bf29e 100644 --- a/arch/microblaze/mm/Makefile +++ b/arch/microblaze/mm/Makefile @@ -5,5 +5,5 @@ obj-y := consistent.o init.o -obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o +obj-y += pgtable.o mmu_context.o fault.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/microblaze/mm/consistent.c b/arch/microblaze/mm/consistent.c index 81dffe43b18c..b7ad4a98636d 100644 --- a/arch/microblaze/mm/consistent.c +++ b/arch/microblaze/mm/consistent.c @@ -21,32 +21,3 @@ void arch_dma_prep_coherent(struct page *page, size_t size) flush_dcache_range(paddr, paddr + size); } - -#ifndef CONFIG_MMU -/* - * Consistent memory allocators. Used for DMA devices that want to share - * uncached memory with the processor core. My crufty no-MMU approach is - * simple. In the HW platform we can optionally mirror the DDR up above the - * processor cacheable region. So, memory accessed in this mirror region will - * not be cached. It's alloced from the same pool as normal memory, but the - * handle we return is shifted up into the uncached region. This will no doubt - * cause big problems if memory allocated here is not also freed properly. -- JW - * - * I have to use dcache values because I can't relate on ram size: - */ -#ifdef CONFIG_XILINX_UNCACHED_SHADOW -#define UNCACHED_SHADOW_MASK (cpuinfo.dcache_high - cpuinfo.dcache_base + 1) -#else -#define UNCACHED_SHADOW_MASK 0 -#endif /* CONFIG_XILINX_UNCACHED_SHADOW */ - -void *arch_dma_set_uncached(void *ptr, size_t size) -{ - unsigned long addr = (unsigned long)ptr; - - addr |= UNCACHED_SHADOW_MASK; - if (addr > cpuinfo.dcache_base && addr < cpuinfo.dcache_high) - pr_warn("ERROR: Your cache coherent area is CACHED!!!\n"); - return (void *)addr; -} -#endif /* CONFIG_MMU */ diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 45da639bd22c..7129a20881ea 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -29,11 +29,6 @@ /* Use for MMU and noMMU because of PCI generic code */ int mem_init_done; -#ifndef CONFIG_MMU -unsigned int __page_offset; -EXPORT_SYMBOL(__page_offset); -#endif /* CONFIG_MMU */ - char *klimit = _end; /* @@ -82,13 +77,11 @@ static void highmem_setup(void) static void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES]; -#ifdef CONFIG_MMU int idx; /* Setup fixmaps */ for (idx = 0; idx < __end_of_fixed_addresses; idx++) clear_fixmap(idx); -#endif /* Clean every zones */ memset(zones_size, 0, sizeof(zones_size)); @@ -108,40 +101,6 @@ static void __init paging_init(void) void __init setup_memory(void) { -#ifndef CONFIG_MMU - u32 kernel_align_start, kernel_align_size; - phys_addr_t start, end; - u64 i; - - /* Find main memory where is the kernel */ - for_each_mem_range(i, &start, &end) { - memory_start = start; - lowmem_size = end - start; - if ((memory_start <= (u32)_text) && - ((u32)_text <= (memory_start + lowmem_size - 1))) { - memory_size = lowmem_size; - PAGE_OFFSET = memory_start; - pr_info("%s: Main mem: 0x%x, size 0x%08x\n", - __func__, (u32) memory_start, - (u32) memory_size); - break; - } - } - - if (!memory_start || !memory_size) { - panic("%s: Missing memory setting 0x%08x, size=0x%08x\n", - __func__, (u32) memory_start, (u32) memory_size); - } - - /* reservation of region where is the kernel */ - kernel_align_start = PAGE_DOWN((u32)_text); - /* ALIGN can be remove because _end in vmlinux.lds.S is align */ - kernel_align_size = PAGE_UP((u32)klimit) - kernel_align_start; - pr_info("%s: kernel addr:0x%08x-0x%08x size=0x%08x\n", - __func__, kernel_align_start, kernel_align_start - + kernel_align_size, kernel_align_size); - memblock_reserve(kernel_align_start, kernel_align_size); -#endif /* * Kernel: * start: base phys address of kernel - page align @@ -181,12 +140,6 @@ void __init mem_init(void) mem_init_done = 1; } -#ifndef CONFIG_MMU -int page_is_ram(unsigned long pfn) -{ - return __range_ok(pfn, 0); -} -#else int page_is_ram(unsigned long pfn) { return pfn < max_low_pfn; @@ -330,8 +283,6 @@ void __init *early_get_page(void) NUMA_NO_NODE); } -#endif /* CONFIG_MMU */ - void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 60a58c0015f2..557585f1be41 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -325,12 +325,10 @@ int pci_mmap_legacy_page_range(struct pci_bus *bus, * memory, effectively behaving just like /dev/zero */ if ((offset + size) > hose->isa_mem_size) { -#ifdef CONFIG_MMU pr_debug("Process %s (pid:%d) mapped non-existing PCI", current->comm, current->pid); pr_debug("legacy memory for 0%04x:%02x\n", pci_domain_nr(bus), bus->number); -#endif if (vma->vm_flags & VM_SHARED) return shmem_zero_setup(vma); return 0; From 74a2810b7c1fcd60c87a8c47f95660628e00e97c Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Wed, 25 Nov 2020 18:07:18 +0800 Subject: [PATCH 174/395] MIPS: KASLR: Correct valid bits in apply_r_mips_26_rel() Apply_r_mips_26_rel() relocates instructions like j, jal and etc. These instructions consist of 6bits function field and 26bits address field. The value of target_addr as follows, ================================================================= | high 4bits | low 28bits | ================================================================= |the high 4bits of this PC | the low 26bits of instructions << 2| ================================================================= Thus, loc_orig and log_new both need high 4bits rather than high 6bits. Signed-off-by: Jinyang He Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/relocate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c index 3d80a51256de..8561c7ac94dc 100644 --- a/arch/mips/kernel/relocate.c +++ b/arch/mips/kernel/relocate.c @@ -95,7 +95,7 @@ static int __init apply_r_mips_26_rel(u32 *loc_orig, u32 *loc_new, long offset) /* Original target address */ target_addr <<= 2; - target_addr += (unsigned long)loc_orig & ~0x03ffffff; + target_addr += (unsigned long)loc_orig & 0xf0000000; /* Get the new target address */ target_addr += offset; @@ -105,7 +105,7 @@ static int __init apply_r_mips_26_rel(u32 *loc_orig, u32 *loc_new, long offset) return -ENOEXEC; } - target_addr -= (unsigned long)loc_new & ~0x03ffffff; + target_addr -= (unsigned long)loc_new & 0xf0000000; target_addr >>= 2; *loc_new = (*loc_new & ~0x03ffffff) | (target_addr & 0x03ffffff); From a307a4ce9ecd2e23c71318201330d9d648b3f818 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Wed, 25 Nov 2020 18:07:46 +0800 Subject: [PATCH 175/395] MIPS: Loongson64: Add KASLR support Provide a weak plat_get_fdt() in relocate.c in case some platform enable USE_OF while plat_get_fdt() is useless. 1MB RELOCATION_TABLE_SIZE is small for Loongson64 because too many instructions should be relocated. 2MB is enough in present. Add KASLR support for Loongson64. KASLR(kernel address space layout randomization) To enable KASLR on Loongson64: First, make loongson3_defconfig. Then, enable CONFIG_RELOCATABLE and CONFIG_RANDOMIZE_BASE. Finally, compile the kernel. To test KASLR on Loongson64: Start machine with KASLR kernel. The first time: # cat /proc/iomem 00200000-0effffff : System RAM 02f30000-03895e9f : Kernel code 03895ea0-03bc7fff : Kernel data 03e30000-04f43f7f : Kernel bss The second time: # cat /proc/iomem 00200000-0effffff : System RAM 022f0000-02c55e9f : Kernel code 02c55ea0-02f87fff : Kernel data 031f0000-04303f7f : Kernel bss We see that code, data and bss sections become randomize. Signed-off-by: Jinyang He Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 5 ++++- arch/mips/kernel/relocate.c | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 0f638bfaf63a..44a47adb1275 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -488,6 +488,7 @@ config MACH_LOONGSON64 select SYS_SUPPORTS_HIGHMEM select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_SUPPORTS_ZBOOT + select SYS_SUPPORTS_RELOCATABLE select ZONE_DMA32 select NUMA select SMP @@ -2778,7 +2779,8 @@ config RELOCATABLE depends on CPU_MIPS32_R2 || CPU_MIPS64_R2 || \ CPU_MIPS32_R5 || CPU_MIPS64_R5 || \ CPU_MIPS32_R6 || CPU_MIPS64_R6 || \ - CPU_P5600 || CAVIUM_OCTEON_SOC + CPU_P5600 || CAVIUM_OCTEON_SOC || \ + CPU_LOONGSON64 help This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. @@ -2789,6 +2791,7 @@ config RELOCATION_TABLE_SIZE hex "Relocation table size" depends on RELOCATABLE range 0x0 0x01000000 + default "0x00200000" if CPU_LOONGSON64 default "0x00100000" help A table of relocation data will be appended to the kernel binary diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c index 8561c7ac94dc..57bdd2761c78 100644 --- a/arch/mips/kernel/relocate.c +++ b/arch/mips/kernel/relocate.c @@ -294,6 +294,13 @@ static inline int __init relocation_addr_valid(void *loc_new) return 1; } +#if defined(CONFIG_USE_OF) +void __weak *plat_get_fdt(void) +{ + return NULL; +} +#endif + void *__init relocate_kernel(void) { void *loc_new; From f0e82242b16826077a2775eacfe201d803bb7a22 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Fri, 20 Nov 2020 15:48:47 +0800 Subject: [PATCH 176/395] mips: cdmm: fix use-after-free in mips_cdmm_bus_discover kfree(dev) has been called inside put_device so anther kfree would cause a use-after-free bug/ Fixes: 8286ae03308c ("MIPS: Add CDMM bus support") Reported-by: Hulk Robot Signed-off-by: Qinglang Miao Acked-by: Serge Semin Signed-off-by: Thomas Bogendoerfer --- drivers/bus/mips_cdmm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/bus/mips_cdmm.c b/drivers/bus/mips_cdmm.c index 9f7ed1fcd428..626dedd110cb 100644 --- a/drivers/bus/mips_cdmm.c +++ b/drivers/bus/mips_cdmm.c @@ -559,10 +559,8 @@ static void mips_cdmm_bus_discover(struct mips_cdmm_bus *bus) dev_set_name(&dev->dev, "cdmm%u-%u", cpu, id); ++id; ret = device_register(&dev->dev); - if (ret) { + if (ret) put_device(&dev->dev); - kfree(dev); - } } } From cbab54d9c2b2a73abe541790df28add14b2385bd Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 25 Nov 2020 18:11:30 +0800 Subject: [PATCH 177/395] MIPS: No need to check CPU 0 in {loongson3,bmips,octeon}_cpu_disable() After commit 9cce844abf07 ("MIPS: CPU#0 is not hotpluggable"), c->hotpluggable is 0 for CPU 0 and it will not generate a control file in sysfs for this CPU: [root@linux loongson]# cat /sys/devices/system/cpu/cpu0/online cat: /sys/devices/system/cpu/cpu0/online: No such file or directory [root@linux loongson]# echo 0 > /sys/devices/system/cpu/cpu0/online bash: /sys/devices/system/cpu/cpu0/online: Permission denied So no need to check CPU 0 in {loongson3,bmips,octeon}_cpu_disable(), just remove them. Signed-off-by: Tiezhu Yang Acked-by: Florian Fainelli Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/smp.c | 3 --- arch/mips/kernel/smp-bmips.c | 3 --- arch/mips/loongson64/smp.c | 3 --- 3 files changed, 9 deletions(-) diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index 076db9a06b5e..66ce5527da54 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -290,9 +290,6 @@ static int octeon_cpu_disable(void) { unsigned int cpu = smp_processor_id(); - if (cpu == 0) - return -EBUSY; - if (!octeon_bootloader_entry_addr) return -ENOTSUPP; diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index 1dbfb5aadffd..359b176b665f 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -362,9 +362,6 @@ static int bmips_cpu_disable(void) { unsigned int cpu = smp_processor_id(); - if (cpu == 0) - return -EBUSY; - pr_info("SMP: CPU%d is offline\n", cpu); set_cpu_online(cpu, false); diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c index aa0cd723e61d..b8c1fc3158fd 100644 --- a/arch/mips/loongson64/smp.c +++ b/arch/mips/loongson64/smp.c @@ -544,9 +544,6 @@ static int loongson3_cpu_disable(void) unsigned long flags; unsigned int cpu = smp_processor_id(); - if (cpu == 0) - return -EBUSY; - set_cpu_online(cpu, false); calculate_cpu_foreign_map(); local_irq_save(flags); From 915d8aac69d32bf4272a015bf7bf3516deeaad5e Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 27 Nov 2020 10:53:32 +0100 Subject: [PATCH 178/395] MIPS: mm: Remove unused is_aligned_hugepage_range Function is_aligned_hugepage_range is no longer needed. Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/hugetlbpage.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 77ffece9c270..b9f76f433617 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -58,18 +58,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - int pmd_huge(pmd_t pmd) { return (pmd_val(pmd) & _PAGE_HUGE) != 0; From 1dac4585f585d3aa1a5af4821128ea2642700e48 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Wed, 25 Nov 2020 18:11:02 +0800 Subject: [PATCH 179/395] MIPS: Loongson64: Fix up reserving kernel memory range Reserve memory from &_text to &_end. Otherwise if kernel address was modified, the memory range of start_pfn to kernel_start_pfn would be reserved. Then we could not use this range. Signed-off-by: Jinyang He Signed-off-by: Thomas Bogendoerfer --- arch/mips/loongson64/numa.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 509b360e4d28..c6f0c48384f8 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -151,6 +151,9 @@ static void __init node_mem_init(unsigned int node) NODE_DATA(node)->node_spanned_pages = end_pfn - start_pfn; if (node == 0) { + /* kernel start address */ + unsigned long kernel_start_pfn = PFN_DOWN(__pa_symbol(&_text)); + /* kernel end address */ unsigned long kernel_end_pfn = PFN_UP(__pa_symbol(&_end)); @@ -158,8 +161,8 @@ static void __init node_mem_init(unsigned int node) max_low_pfn = end_pfn; /* Reserve the kernel text/data/bss */ - memblock_reserve(start_pfn << PAGE_SHIFT, - ((kernel_end_pfn - start_pfn) << PAGE_SHIFT)); + memblock_reserve(kernel_start_pfn << PAGE_SHIFT, + ((kernel_end_pfn - kernel_start_pfn) << PAGE_SHIFT)); /* Reserve 0xfe000000~0xffffffff for RS780E integrated GPU */ if (node_end_pfn(0) >= (0xffffffff >> PAGE_SHIFT)) From 207cdd565dfc95a0a5185263a567817b7ebf5467 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Thu, 26 Nov 2020 11:34:56 +0100 Subject: [PATCH 180/395] ima: Don't modify file descriptor mode on the fly Commit a408e4a86b36b ("ima: open a new file instance if no read permissions") already introduced a second open to measure a file when the original file descriptor does not allow it. However, it didn't remove the existing method of changing the mode of the original file descriptor, which is still necessary if the current process does not have enough privileges to open a new one. Changing the mode isn't really an option, as the filesystem might need to do preliminary steps to make the read possible. Thus, this patch removes the code and keeps the second open as the only option to measure a file when it is unreadable with the original file descriptor. Cc: # 4.20.x: 0014cc04e8ec0 ima: Set file->f_mode Fixes: 2fe5d6def1672 ("ima: integrity appraisal extension") Signed-off-by: Roberto Sassu Reviewed-by: Christoph Hellwig Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_crypto.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 21989fa0c107..f6a7e9643b54 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -537,7 +537,7 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) loff_t i_size; int rc; struct file *f = file; - bool new_file_instance = false, modified_mode = false; + bool new_file_instance = false; /* * For consistency, fail file's opened with the O_DIRECT flag on @@ -555,18 +555,10 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) O_TRUNC | O_CREAT | O_NOCTTY | O_EXCL); flags |= O_RDONLY; f = dentry_open(&file->f_path, flags, file->f_cred); - if (IS_ERR(f)) { - /* - * Cannot open the file again, lets modify f_mode - * of original and continue - */ - pr_info_ratelimited("Unable to reopen file for reading.\n"); - f = file; - f->f_mode |= FMODE_READ; - modified_mode = true; - } else { - new_file_instance = true; - } + if (IS_ERR(f)) + return PTR_ERR(f); + + new_file_instance = true; } i_size = i_size_read(file_inode(f)); @@ -581,8 +573,6 @@ int ima_calc_file_hash(struct file *file, struct ima_digest_data *hash) out: if (new_file_instance) fput(f); - else if (modified_mode) - f->f_mode &= ~FMODE_READ; return rc; } From c731b84b51bf7fe83448bea8f56a6d55006b0615 Mon Sep 17 00:00:00 2001 From: "Dae R. Jeong" Date: Thu, 22 Oct 2020 10:21:28 +0900 Subject: [PATCH 181/395] md: fix a warning caused by a race between concurrent md_ioctl()s Syzkaller reports a warning as belows. WARNING: CPU: 0 PID: 9647 at drivers/md/md.c:7169 ... Call Trace: ... RIP: 0010:md_ioctl+0x4017/0x5980 drivers/md/md.c:7169 RSP: 0018:ffff888096027950 EFLAGS: 00010293 RAX: ffff88809322c380 RBX: 0000000000000932 RCX: ffffffff84e266f2 RDX: 0000000000000000 RSI: ffffffff84e299f7 RDI: 0000000000000007 RBP: ffff888096027bc0 R08: ffff88809322c380 R09: ffffed101341a482 R10: ffff888096027940 R11: ffff88809a0d240f R12: 0000000000000932 R13: ffff8880a2c14100 R14: ffff88809a0d2268 R15: ffff88809a0d2408 __blkdev_driver_ioctl block/ioctl.c:304 [inline] blkdev_ioctl+0xece/0x1c10 block/ioctl.c:606 block_ioctl+0xee/0x130 fs/block_dev.c:1930 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe This is caused by a race between two concurrenct md_ioctl()s closing the array. CPU1 (md_ioctl()) CPU2 (md_ioctl()) ------ ------ set_bit(MD_CLOSING, &mddev->flags); did_set_md_closing = true; WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); if(did_set_md_closing) clear_bit(MD_CLOSING, &mddev->flags); Fix the warning by returning immediately if the MD_CLOSING bit is set in &mddev->flags which indicates that the array is being closed. Fixes: 065e519e71b2 ("md: MD_CLOSING needs to be cleared after called md_set_readonly or do_md_stop") Reported-by: syzbot+1e46a0864c1a6e9bd3d8@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Dae R. Jeong Signed-off-by: Song Liu --- drivers/md/md.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3c79243e9d07..16a97dc385b4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7589,8 +7589,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); - set_bit(MD_CLOSING, &mddev->flags); + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); From 93decc563637c4288380912eac0eb42fb246cc04 Mon Sep 17 00:00:00 2001 From: Kevin Vigor Date: Fri, 6 Nov 2020 14:20:34 -0800 Subject: [PATCH 182/395] md/raid10: initialize r10_bio->read_slot before use. In __make_request() a new r10bio is allocated and passed to raid10_read_request(). The read_slot member of the bio is not initialized, and the raid10_read_request() uses it to index an array. This leads to occasional panics. Fix by initializing the field to invalid value and checking for valid value in raid10_read_request(). Cc: stable@vger.kernel.org Signed-off-by: Kevin Vigor Signed-off-by: Song Liu --- drivers/md/raid10.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..3153183b7772 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1127,7 +1127,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - if (r10_bio->devs[slot].rdev) { + if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, @@ -1508,6 +1508,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + r10_bio->read_slot = -1; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); if (bio_data_dir(bio) == READ) From 81ba3c24628c14eb869d81652dbaf50640d8cc24 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Wed, 11 Nov 2020 06:16:56 +0100 Subject: [PATCH 183/395] md: improve variable names in md_flush_request() This patch improves readability by using better variable names in flush request coalescing logic. Signed-off-by: Pankaj Gupta Reviewed-by: Paul Menzel Signed-off-by: Song Liu --- drivers/md/md.c | 8 ++++---- drivers/md/md.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 16a97dc385b4..3992e8c5c0d6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -639,7 +639,7 @@ static void md_submit_flush_data(struct work_struct *ws) * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ - mddev->last_flush = mddev->start_flush; + mddev->prev_flush_start = mddev->start_flush; mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); @@ -660,13 +660,13 @@ static void md_submit_flush_data(struct work_struct *ws) */ bool md_flush_request(struct mddev *mddev, struct bio *bio) { - ktime_t start = ktime_get_boottime(); + ktime_t req_start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || - ktime_after(mddev->last_flush, start), + ktime_after(mddev->prev_flush_start, req_start), mddev->lock); - if (!ktime_after(mddev->last_flush, start)) { + if (!ktime_after(mddev->prev_flush_start, req_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; bio = NULL; diff --git a/drivers/md/md.h b/drivers/md/md.h index ccfb69868c2e..2292c847f9dd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -495,9 +495,9 @@ struct mddev { */ struct bio *flush_bio; atomic_t flush_pending; - ktime_t start_flush, last_flush; /* last_flush is when the last completed - * flush was started. - */ + ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed + * flush was started. + */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ mempool_t *serial_info_pool; From 204d1a6434158ac655fc4037f29742b9b6103f0e Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Wed, 11 Nov 2020 06:16:57 +0100 Subject: [PATCH 184/395] md: add comments in md_flush_request() Request coalescing logic is dependent on flush time update in other context. This patch adds comments to understand the code flow better. Signed-off-by: Pankaj Gupta Signed-off-by: Song Liu --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3992e8c5c0d6..a0998ad6388c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -662,10 +662,14 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t req_start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); + /* flush requests wait until ongoing flush completes, + * hence coalescing all the pending requests. + */ wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || ktime_after(mddev->prev_flush_start, req_start), mddev->lock); + /* new request after previous flush is completed */ if (!ktime_after(mddev->prev_flush_start, req_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; From a23f2aae8498d8c8bb6ff5301bda02db8093cb09 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Wed, 11 Nov 2020 06:16:58 +0100 Subject: [PATCH 185/395] md: use current request time as base for ktime comparisons Request coalescing logic uses 'prev_flush_start' as base to compare the current request start time. 'prev_flush_start' is updated in other context. This patch changes this by using ktime comparison base to 'req_start' for better readability of code. Signed-off-by: Pankaj Gupta Signed-off-by: Song Liu --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a0998ad6388c..1a3a6150123e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -667,10 +667,10 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio) */ wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio || - ktime_after(mddev->prev_flush_start, req_start), + ktime_before(req_start, mddev->prev_flush_start), mddev->lock); /* new request after previous flush is completed */ - if (!ktime_after(mddev->prev_flush_start, req_start)) { + if (ktime_after(req_start, mddev->prev_flush_start)) { WARN_ON(mddev->flush_bio); mddev->flush_bio = bio; bio = NULL; From a8da01f79c89755fad55ed0ea96e8d2103242a72 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Thu, 19 Nov 2020 19:41:33 +0800 Subject: [PATCH 186/395] md/cluster: block reshape with remote resync job Reshape request should be blocked with ongoing resync job. In cluster env, a node can start resync job even if the resync cmd isn't executed on it, e.g., user executes "mdadm --grow" on node A, sometimes node B will start resync job. However, current update_raid_disks() only check local recovery status, which is incomplete. As a result, we see user will execute "mdadm --grow" successfully on local, while the remote node deny to do reshape job when it doing resync job. The inconsistent handling cause array enter unexpected status. If user doesn't observe this issue and continue executing mdadm cmd, the array doesn't work at last. Fix this issue by blocking reshape request. When node executes "--grow" and detects ongoing resync, it should stop and report error to user. The following script reproduces the issue with ~100% probability. (two nodes share 3 iSCSI luns: sdg/sdh/sdi. Each lun size is 1GB) ``` # on node1, node2 is the remote node. ssh root@node2 "mdadm -S --scan" mdadm -S --scan for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \ count=20; done mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh" sleep 5 mdadm --manage --add /dev/md0 /dev/sdi mdadm --wait /dev/md0 mdadm --grow --raid-devices=3 /dev/md0 mdadm /dev/md0 --fail /dev/sdg mdadm /dev/md0 --remove /dev/sdg mdadm --grow --raid-devices=2 /dev/md0 ``` Cc: stable@vger.kernel.org Signed-off-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1a3a6150123e..90faec048f2c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7283,6 +7283,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return -EINVAL; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; @@ -9667,8 +9668,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } } - if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) - update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { + ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (ret) + pr_warn("md: updating array disks failed. %d\n", ret); + } /* * Since mddev->delta_disks has already updated in update_raid_disks, From bca5b0658020be90b6b504ca514fd80110204f71 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Thu, 19 Nov 2020 19:41:34 +0800 Subject: [PATCH 187/395] md/cluster: fix deadlock when node is doing resync job md-cluster uses MD_CLUSTER_SEND_LOCK to make node can exclusively send msg. During sending msg, node can concurrently receive msg from another node. When node does resync job, grab token_lockres:EX may trigger a deadlock: ``` nodeA nodeB -------------------- -------------------- a. send METADATA_UPDATED held token_lockres:EX b. md_do_sync resync_info_update send RESYNCING + set MD_CLUSTER_SEND_LOCK + wait for holding token_lockres:EX c. mdadm /dev/md0 --remove /dev/sdg + held reconfig_mutex + send REMOVE + wait_event(MD_CLUSTER_SEND_LOCK) d. recv_daemon //METADATA_UPDATED from A process_metadata_update + (mddev_trylock(mddev) || MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD) //this time, both return false forever ``` Explaination: a. A send METADATA_UPDATED This will block another node to send msg b. B does sync jobs, which will send RESYNCING at intervals. This will be block for holding token_lockres:EX lock. c. B do "mdadm --remove", which will send REMOVE. This will be blocked by step : MD_CLUSTER_SEND_LOCK is 1. d. B recv METADATA_UPDATED msg, which send from A in step . This will be blocked by step : holding mddev lock, it makes wait_event can't hold mddev lock. (btw, MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD keep ZERO in this scenario.) There is a similar deadlock in commit 0ba959774e93 ("md-cluster: use sync way to handle METADATA_UPDATED msg") In that commit, step c is "update sb". This patch step c is "mdadm --remove". For fixing this issue, we can refer the solution of function: metadata_update_start. Which does the same grab lock_token action. lock_comm can use the same steps to avoid deadlock. By moving MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD from lock_token to lock_comm. It enlarge a little bit window of MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, but it is safe & can break deadlock. Repro steps (I only triggered 3 times with hundreds tests): two nodes share 3 iSCSI luns: sdg/sdh/sdi. Each lun size is 1GB. ``` ssh root@node2 "mdadm -S --scan" mdadm -S --scan for i in {g,h,i};do dd if=/dev/zero of=/dev/sd$i oflag=direct bs=1M \ count=20; done mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdg /dev/sdh \ --bitmap-chunk=1M ssh root@node2 "mdadm -A /dev/md0 /dev/sdg /dev/sdh" sleep 5 mkfs.xfs /dev/md0 mdadm --manage --add /dev/md0 /dev/sdi mdadm --wait /dev/md0 mdadm --grow --raid-devices=3 /dev/md0 mdadm /dev/md0 --fail /dev/sdg mdadm /dev/md0 --remove /dev/sdg mdadm --grow --raid-devices=2 /dev/md0 ``` test script will hung when executing "mdadm --remove". ``` # dump stacks by "echo t > /proc/sysrq-trigger" md0_cluster_rec D 0 5329 2 0x80004000 Call Trace: __schedule+0x1f6/0x560 ? _cond_resched+0x2d/0x40 ? schedule+0x4a/0xb0 ? process_metadata_update.isra.0+0xdb/0x140 [md_cluster] ? wait_woken+0x80/0x80 ? process_recvd_msg+0x113/0x1d0 [md_cluster] ? recv_daemon+0x9e/0x120 [md_cluster] ? md_thread+0x94/0x160 [md_mod] ? wait_woken+0x80/0x80 ? md_congested+0x30/0x30 [md_mod] ? kthread+0x115/0x140 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x40 mdadm D 0 5423 1 0x00004004 Call Trace: __schedule+0x1f6/0x560 ? __schedule+0x1fe/0x560 ? schedule+0x4a/0xb0 ? lock_comm.isra.0+0x7b/0xb0 [md_cluster] ? wait_woken+0x80/0x80 ? remove_disk+0x4f/0x90 [md_cluster] ? hot_remove_disk+0xb1/0x1b0 [md_mod] ? md_ioctl+0x50c/0xba0 [md_mod] ? wait_woken+0x80/0x80 ? blkdev_ioctl+0xa2/0x2a0 ? block_ioctl+0x39/0x40 ? ksys_ioctl+0x82/0xc0 ? __x64_sys_ioctl+0x16/0x20 ? do_syscall_64+0x5f/0x150 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 md0_resync D 0 5425 2 0x80004000 Call Trace: __schedule+0x1f6/0x560 ? schedule+0x4a/0xb0 ? dlm_lock_sync+0xa1/0xd0 [md_cluster] ? wait_woken+0x80/0x80 ? lock_token+0x2d/0x90 [md_cluster] ? resync_info_update+0x95/0x100 [md_cluster] ? raid1_sync_request+0x7d3/0xa40 [raid1] ? md_do_sync.cold+0x737/0xc8f [md_mod] ? md_thread+0x94/0x160 [md_mod] ? md_congested+0x30/0x30 [md_mod] ? kthread+0x115/0x140 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x40 ``` At last, thanks for Xiao's solution. Cc: stable@vger.kernel.org Signed-off-by: Zhao Heming Suggested-by: Xiao Ni Reviewed-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 67 +++++++++++++++++++++++------------------ drivers/md/md.c | 6 ++-- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4aaf4820b6f6..f0e64e76fd79 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -664,9 +664,27 @@ static void recv_daemon(struct md_thread *thread) * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) +static int lock_token(struct md_cluster_info *cinfo) { - int error, set_bit = 0; + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; struct mddev *mddev = cinfo->mddev; /* @@ -677,34 +695,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)) { - error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - WARN_ON_ONCE(error); + WARN_ON_ONCE(rv); md_wakeup_thread(mddev->thread); set_bit = 1; } - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); - if (set_bit) - clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - if (error) - pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", - __func__, __LINE__, error); - - /* Lock the receive sequence */ - mutex_lock(&cinfo->recv_mutex); - return error; -} - -/* lock_comm() - * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. - */ -static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) -{ wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - - return lock_token(cinfo, mddev_locked); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; } static void unlock_comm(struct md_cluster_info *cinfo) @@ -784,9 +787,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, { int ret; - lock_comm(cinfo, mddev_locked); - ret = __sendmsg(cinfo, cmsg); - unlock_comm(cinfo); + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } return ret; } @@ -1061,7 +1066,7 @@ static int metadata_update_start(struct mddev *mddev) return 0; } - ret = lock_token(cinfo, 1); + ret = lock_token(cinfo); clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return ret; } @@ -1255,7 +1260,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) int raid_slot = -1; md_update_sb(mddev, 1); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); @@ -1407,7 +1415,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) + return -EAGAIN; ret = __sendmsg(cinfo, &cmsg); if (ret) { unlock_comm(cinfo); diff --git a/drivers/md/md.c b/drivers/md/md.c index 90faec048f2c..c42af46d366a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6953,8 +6953,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); From 5a20d073ec54a72d9a732fa44bfe14954eb6332f Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Mon, 30 Nov 2020 10:20:52 +0800 Subject: [PATCH 188/395] block: wbt: Remove unnecessary invoking of wbt_update_limits in wbt_init It's unnecessary to call wbt_update_limits explicitly within wbt_init, because it will be called in the following function wbt_queue_depth_changed. Signed-off-by: Lei Chen Signed-off-by: Jens Axboe --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index fd410086fe1d..0321ca83e73f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -835,7 +835,6 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; - wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. From cbf82e35031b135928f36e72c6d166e935530b6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2020 15:20:39 +0200 Subject: [PATCH 189/395] pstore/zone: cap the maximum device size Introduce an abritrary 128MiB cap to avoid malloc failures when using a larger block device. Signed-off-by: Christoph Hellwig Reviewed-by: WeiXiong Liao Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201016132047.3068029-2-hch@lst.de --- fs/pstore/zone.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 3ce89216670c..5266ccbec007 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -1299,6 +1299,10 @@ int register_pstore_zone(struct pstore_zone_info *info) pr_warn("total_size must be >= 4096\n"); return -EINVAL; } + if (info->total_size > SZ_128M) { + pr_warn("capping size to 128MiB\n"); + info->total_size = SZ_128M; + } if (!info->kmsg_size && !info->pmsg_size && !info->console_size && !info->ftrace_size) { From 45a8af4412b1143760cbc7255b8c53271df89ed7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2020 15:20:40 +0200 Subject: [PATCH 190/395] pstore/blk: update the command line example Use the human readable device name instead of the device number, and add the required best_effort parameter. Signed-off-by: Christoph Hellwig Reviewed-by: WeiXiong Liao Reviewed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201016132047.3068029-3-hch@lst.de --- Documentation/admin-guide/pstore-blk.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index 6898aba9fb5c..83543cb3daa1 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -35,7 +35,7 @@ module parameters have priority over Kconfig. Here is an example for module parameters:: - pstore_blk.blkdev=179:7 pstore_blk.kmsg_size=64 + pstore_blk.blkdev=/dev/mmcblk0p7 pstore_blk.kmsg_size=64 best_effort=y The detail of each configurations may be of interest to you. From 03d99e5d63dabe2c0cea0d8fe1cb89bde33f7939 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:28:38 -0700 Subject: [PATCH 191/395] nvme-fcloop: add sysfs attribute to inject command drop Add sysfs attribute to specify parameters for dropping a command. The attribute takes a string of: :: Opcode is formatted as lower 8 bits are opcode. If a fabrics opcode, a bit above bits 7:0 will be set. Once set, each sqe is looked at. If the opcode matches the running instance count is updated. If the instance count is in the range of where to drop (based on starting and # of times), then drop the command by not passing it to the target layer. Signed-off-by: James Smart --- drivers/nvme/target/fcloop.c | 81 +++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 3da067a8311e..733d9363900e 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -564,6 +564,50 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq, fcloop_tfcp_req_put(tfcp_req); } +static bool drop_fabric_opcode; +#define DROP_OPCODE_MASK 0x00FF +/* fabrics opcode will have a bit set above 1st byte */ +static int drop_opcode = -1; +static int drop_instance; +static int drop_amount; +static int drop_current_cnt; + +/* + * Routine to parse io and determine if the io is to be dropped. + * Returns: + * 0 if io is not obstructed + * 1 if io was dropped + */ +static int check_for_drop(struct fcloop_fcpreq *tfcp_req) +{ + struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvme_fc_cmd_iu *cmdiu = fcpreq->cmdaddr; + struct nvme_command *sqe = &cmdiu->sqe; + + if (drop_opcode == -1) + return 0; + + pr_info("%s: seq opcd x%02x fctype x%02x: drop F %s op x%02x " + "inst %d start %d amt %d\n", + __func__, sqe->common.opcode, sqe->fabrics.fctype, + drop_fabric_opcode ? "y" : "n", + drop_opcode, drop_current_cnt, drop_instance, drop_amount); + + if ((drop_fabric_opcode && + (sqe->common.opcode != nvme_fabrics_command || + sqe->fabrics.fctype != drop_opcode)) || + (!drop_fabric_opcode && sqe->common.opcode != drop_opcode)) + return 0; + + if (++drop_current_cnt >= drop_instance) { + if (drop_current_cnt >= drop_instance + drop_amount) + drop_opcode = -1; + return 1; + } + + return 0; +} + static void fcloop_fcp_recv_work(struct work_struct *work) { @@ -590,10 +634,14 @@ fcloop_fcp_recv_work(struct work_struct *work) if (unlikely(aborted)) ret = -ECANCELED; - else - ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, + else { + if (likely(!check_for_drop(tfcp_req))) + ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req, fcpreq->cmdaddr, fcpreq->cmdlen); + else + pr_info("%s: dropped command ********\n", __func__); + } if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); @@ -1449,6 +1497,33 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, return ret ? ret : count; } +static ssize_t +fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int opcode, starting, amount; + + if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) + return -EBADRQC; + + drop_current_cnt = 0; + drop_fabric_opcode = (opcode & ~DROP_OPCODE_MASK) ? true : false; + drop_opcode = (opcode & DROP_OPCODE_MASK); + drop_instance = starting; + /* the check to drop routine uses instance + count to know when + * to end. Thus, if dropping 1 instance, count should be 0. + * so subtract 1 from the count. + */ + drop_amount = amount - 1; + + pr_info("%s: DROP: Starting at instance %d of%s opcode x%x drop +%d " + "instances\n", + __func__, drop_instance, drop_fabric_opcode ? " fabric" : "", + drop_opcode, drop_amount); + + return count; +} + static DEVICE_ATTR(add_local_port, 0200, NULL, fcloop_create_local_port); static DEVICE_ATTR(del_local_port, 0200, NULL, fcloop_delete_local_port); @@ -1456,6 +1531,7 @@ static DEVICE_ATTR(add_remote_port, 0200, NULL, fcloop_create_remote_port); static DEVICE_ATTR(del_remote_port, 0200, NULL, fcloop_delete_remote_port); static DEVICE_ATTR(add_target_port, 0200, NULL, fcloop_create_target_port); static DEVICE_ATTR(del_target_port, 0200, NULL, fcloop_delete_target_port); +static DEVICE_ATTR(set_cmd_drop, 0200, NULL, fcloop_set_cmd_drop); static struct attribute *fcloop_dev_attrs[] = { &dev_attr_add_local_port.attr, @@ -1464,6 +1540,7 @@ static struct attribute *fcloop_dev_attrs[] = { &dev_attr_del_remote_port.attr, &dev_attr_add_target_port.attr, &dev_attr_del_target_port.attr, + &dev_attr_set_cmd_drop.attr, NULL }; From 84115d6d80c809d65c42f9383f22c10b91a4eb1c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 27 Oct 2020 16:15:16 +0800 Subject: [PATCH 192/395] nvme: simplify nvme_req_qid() Use the request's '->mq_hctx->queue_num' directly to simplify the nvme_req_qid() function. Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bc330bf0d3bd..87867e93c7d3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -178,7 +178,8 @@ static inline u16 nvme_req_qid(struct request *req) { if (!req->q->queuedata) return 0; - return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; + + return req->mq_hctx->queue_num + 1; } /* The below value is the specific amount of delay needed before checking From 0d2e7c840b178bf9a47bd0de89d8f9182fa71d86 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:42 -0800 Subject: [PATCH 193/395] nvme: centralize setting the timeout in nvme_alloc_request The function nvme_alloc_request() is called from different context (I/O and Admin queue) where callers do not consider the I/O timeout when called from I/O queue context. Update nvme_alloc_request() to set the default I/O and Admin timeout value based on whether the queuedata is set or not. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 11 +++++++++-- drivers/nvme/host/lightnvm.c | 3 ++- drivers/nvme/host/pci.c | 2 -- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b01afcb7777..97348b1ecfd6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -533,6 +533,11 @@ struct request *nvme_alloc_request(struct request_queue *q, if (IS_ERR(req)) return req; + if (req->q->queuedata) + req->timeout = NVME_IO_TIMEOUT; + else /* no queuedata implies admin queue */ + req->timeout = ADMIN_TIMEOUT; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; @@ -901,7 +906,8 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); @@ -1071,7 +1077,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + req->timeout = timeout; nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8e562d0f2c30..88a7c8eac455 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -774,7 +774,8 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, goto err_cmd; } - rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; + if (timeout) + rq->timeout = timeout; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0578ff253c47..76465d335924 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1310,7 +1310,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } - abort_req->timeout = ADMIN_TIMEOUT; abort_req->end_io_data = NULL; blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); @@ -2223,7 +2222,6 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) if (IS_ERR(req)) return PTR_ERR(req); - req->timeout = ADMIN_TIMEOUT; req->end_io_data = nvmeq; init_completion(&nvmeq->delete_done); From dc96f93874c63e126087e1adf1973c9fecfdaa0c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:45 -0800 Subject: [PATCH 194/395] nvme: use consistent macro name for timeout This is purely a clenaup patch, add prefix NVME to the ADMIN_TIMEOUT to make consistent with NVME_IO_TIMEOUT. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/fc.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/nvme/target/loop.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 97348b1ecfd6..98bea150e5dc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -536,7 +536,7 @@ struct request *nvme_alloc_request(struct request_queue *q, if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ - req->timeout = ADMIN_TIMEOUT; + req->timeout = NVME_ADMIN_TIMEOUT; req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); @@ -2268,8 +2268,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); cmd.common.cdw11 = cpu_to_le32(len); - return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, - ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false); + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + NVME_QID_ANY, 1, 0, false); } EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f4c246462658..38373a0e86ef 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3479,7 +3479,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->lport->ops->fcprqst_priv_sz); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 87867e93c7d3..824776a8ba13 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -24,7 +24,7 @@ extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) extern unsigned int admin_timeout; -#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 76465d335924..6123040ff872 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1606,7 +1606,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.nr_hw_queues = 1; dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; - dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev->ctrl.numa_node; dev->admin_tagset.cmd_size = sizeof(struct nvme_iod); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; @@ -2237,7 +2237,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) unsigned long timeout; retry: - timeout = ADMIN_TIMEOUT; + timeout = NVME_ADMIN_TIMEOUT; while (nr_queues > 0) { if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) break; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 65e3d0ef36e1..df9f6f4549f1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -797,7 +797,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, NVME_RDMA_DATA_SGL_SIZE; set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c0c33320fe65..1ba659927442 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1568,7 +1568,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->cmd_size = sizeof(struct nvme_tcp_request); set->driver_data = ctrl; set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; + set->timeout = NVME_ADMIN_TIMEOUT; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f6d81239be21..76d8c0a9a87d 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) NVME_INLINE_SG_CNT * sizeof(struct scatterlist); ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; - ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT; ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; From a2f6a2b8ce43db608357a490e028166f9e4bab0d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:43 -0800 Subject: [PATCH 195/395] nvmet: add passthru admin timeout value attr NVMeOF controller in the passsthru mode is capable of handling wide set of admin commands including vender specific passhtru admin comands. The vendor specific admin commands are used to read the large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core:admin_timeout). Add a configfs attribute so that user can set the admin timeout values. In case if this configfs value is not set nvme_alloc_request() will set the ADMIN_TIMEOUT value when request queuedata is NULL. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 6 ++++++ 3 files changed, 27 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 37e1d7784e17..781157a654e9 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -736,9 +736,29 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, enable); +static ssize_t nvmet_passthru_admin_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->admin_timeout); +} + +static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->admin_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, + &nvmet_passthru_attr_admin_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 559a15ccc322..a0c80e5179a2 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -249,6 +249,7 @@ struct nvmet_subsys { struct nvme_ctrl *passthru_ctrl; char *passthru_ctrl_path; struct config_group passthru_group; + unsigned int admin_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 8ee94f056898..b496682ccf85 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,6 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; + unsigned int timeout = 0; u32 effects; u16 status; int ret; @@ -242,6 +243,8 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + } else { + timeout = req->sq->ctrl->subsys->admin_timeout; } rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); @@ -250,6 +253,9 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) goto out_put_ns; } + if (timeout) + rq->timeout = timeout; + if (req->sg_cnt) { ret = nvmet_passthru_map_sg(req, rq); if (unlikely(ret)) { From 47e9730c26a4a5d4eab2124d6bbeb94693e44b46 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 16:33:44 -0800 Subject: [PATCH 196/395] nvmet: add passthru io timeout value attr NVMeOF controller in the passsthru mode is capable of handling wide set of I/O commands including vender specific passhtru io comands. The vendor specific I/O commands are used to read the large drive logs and can take longer than default NVMe commands, i.e. for passthru requests the timeout value may differ from the passthru controller's default timeout values (nvme-core:io_timeout). Add a configfs attribute so that user can set the io timeout values. In case if this configfs value is not set nvme_alloc_request() will set the NVME_IO_TIMEOUT value when request queuedata is NULL. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 20 ++++++++++++++++++++ drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 3 ++- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 781157a654e9..c61ffd767062 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -755,10 +755,30 @@ static ssize_t nvmet_passthru_admin_timeout_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_passthru_, admin_timeout); +static ssize_t nvmet_passthru_io_timeout_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%u\n", to_subsys(item->ci_parent)->io_timeout); +} + +static ssize_t nvmet_passthru_io_timeout_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item->ci_parent); + unsigned int timeout; + + if (kstrtouint(page, 0, &timeout)) + return -EINVAL; + subsys->io_timeout = timeout; + return count; +} +CONFIGFS_ATTR(nvmet_passthru_, io_timeout); + static struct configfs_attribute *nvmet_passthru_attrs[] = { &nvmet_passthru_attr_device_path, &nvmet_passthru_attr_enable, &nvmet_passthru_attr_admin_timeout, + &nvmet_passthru_attr_io_timeout, NULL, }; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index a0c80e5179a2..2f9635273629 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -250,6 +250,7 @@ struct nvmet_subsys { char *passthru_ctrl_path; struct config_group passthru_group; unsigned int admin_timeout; + unsigned int io_timeout; #endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index b496682ccf85..a062398305a7 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -227,7 +227,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; - unsigned int timeout = 0; + unsigned int timeout; u32 effects; u16 status; int ret; @@ -243,6 +243,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } q = ns->queue; + timeout = req->sq->ctrl->subsys->io_timeout; } else { timeout = req->sq->ctrl->subsys->admin_timeout; } From 53ffabfd4ddb3a24c5603ae82eefb5537ecb5c20 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:03 -0800 Subject: [PATCH 197/395] block: move blk_rq_bio_prep() to linux/blk-mq.h This is a preparation patch to have minimal block layer request bio append functionality in the context of the NVMeOF Passthru driver which falls in the fast path and doesn't need calls from blk_rq_append_bio(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- block/blk.h | 12 ------------ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk.h b/block/blk.h index dfab98465db9..e05507a8d1e3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -91,18 +91,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); - - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; -} - #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..e7482e6ad3ec 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -593,6 +593,18 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif From 39dfe84451b4526a8054cc5a127337bca980dfa3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:00 -0800 Subject: [PATCH 198/395] nvme: split nvme_alloc_request() Right now nvme_alloc_request() allocates a request from block layer based on the value of the qid. When qid set to NVME_QID_ANY it used blk_mq_alloc_request() else blk_mq_alloc_request_hctx(). The function nvme_alloc_request() is called from different context, The only place where it uses non NVME_QID_ANY value is for fabrics connect commands :- nvme_submit_sync_cmd() NVME_QID_ANY nvme_features() NVME_QID_ANY nvme_sec_submit() NVME_QID_ANY nvmf_reg_read32() NVME_QID_ANY nvmf_reg_read64() NVME_QID_ANY nvmf_reg_write32() NVME_QID_ANY nvmf_connect_admin_queue() NVME_QID_ANY nvme_submit_user_cmd() NVME_QID_ANY nvme_alloc_request() nvme_keep_alive() NVME_QID_ANY nvme_alloc_request() nvme_timeout() NVME_QID_ANY nvme_alloc_request() nvme_delete_queue() NVME_QID_ANY nvme_alloc_request() nvmet_passthru_execute_cmd() NVME_QID_ANY nvme_alloc_request() nvmf_connect_io_queue() QID __nvme_submit_sync_cmd() nvme_alloc_request() With passthru nvme_alloc_request() now falls into the I/O fast path such that blk_mq_alloc_request_hctx() is never gets called and that adds additional branch check in fast path. Split the nvme_alloc_request() into nvme_alloc_request() and nvme_alloc_request_qid(). Replace each call of the nvme_alloc_request() with NVME_QID_ANY param with a call to newly added nvme_alloc_request() without NVME_QID_ANY. Replace a call to nvme_alloc_request() with QID param with a call to newly added nvme_alloc_request() and nvme_alloc_request_qid() based on the qid value set in the __nvme_submit_sync_cmd(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 52 +++++++++++++++++++++++----------- drivers/nvme/host/lightnvm.c | 5 ++-- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 4 +-- drivers/nvme/target/passthru.c | 2 +- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 98bea150e5dc..fff90200497c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -518,21 +518,14 @@ static inline void nvme_clear_nvme_request(struct request *req) } } -struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +static inline unsigned int nvme_req_op(struct nvme_command *cmd) { - unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; - struct request *req; - - if (qid == NVME_QID_ANY) { - req = blk_mq_alloc_request(q, op, flags); - } else { - req = blk_mq_alloc_request_hctx(q, op, flags, - qid ? qid - 1 : 0); - } - if (IS_ERR(req)) - return req; + return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; +} +static inline void nvme_init_request(struct request *req, + struct nvme_command *cmd) +{ if (req->q->queuedata) req->timeout = NVME_IO_TIMEOUT; else /* no queuedata implies admin queue */ @@ -541,11 +534,33 @@ struct request *nvme_alloc_request(struct request_queue *q, req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); nvme_req(req)->cmd = cmd; +} +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags) +{ + struct request *req; + + req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); return req; } EXPORT_SYMBOL_GPL(nvme_alloc_request); +struct request *nvme_alloc_request_qid(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) +{ + struct request *req; + + req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, + qid ? qid - 1 : 0); + if (!IS_ERR(req)) + nvme_init_request(req, cmd); + return req; +} +EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { struct nvme_command c; @@ -902,7 +917,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, struct request *req; int ret; - req = nvme_alloc_request(q, cmd, flags, qid); + if (qid == NVME_QID_ANY) + req = nvme_alloc_request(q, cmd, flags); + else + req = nvme_alloc_request_qid(q, cmd, flags, qid); if (IS_ERR(req)) return PTR_ERR(req); @@ -1073,7 +1091,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, void *meta = NULL; int ret; - req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY); + req = nvme_alloc_request(q, cmd, 0); if (IS_ERR(req)) return PTR_ERR(req); @@ -1148,8 +1166,8 @@ static int nvme_keep_alive(struct nvme_ctrl *ctrl) { struct request *rq; - rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, - NVME_QID_ANY); + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, + BLK_MQ_REQ_RESERVED); if (IS_ERR(rq)) return PTR_ERR(rq); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 88a7c8eac455..470cef3abec3 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -653,7 +653,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, nvme_nvm_rqtocmd(rqd, ns, cmd); - rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0); if (IS_ERR(rq)) return rq; @@ -767,8 +767,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, DECLARE_COMPLETION_ONSTACK(wait); int ret = 0; - rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0, - NVME_QID_ANY); + rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0); if (IS_ERR(rq)) { ret = -ENOMEM; goto err_cmd; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 824776a8ba13..83fb30e317e0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -611,6 +611,8 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, blk_mq_req_flags_t flags); +struct request *nvme_alloc_request_qid(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6123040ff872..5e6365dd0c8e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1304,7 +1304,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, - BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + BLK_MQ_REQ_NOWAIT); if (IS_ERR(abort_req)) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; @@ -2218,7 +2218,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) cmd.delete_queue.opcode = opcode; cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index a062398305a7..be8ae59dcb71 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -248,7 +248,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) timeout = req->sq->ctrl->subsys->admin_timeout; } - rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); + rq = nvme_alloc_request(q, req->cmd, 0); if (IS_ERR(rq)) { status = NVME_SC_INTERNAL; goto out_put_ns; From 06b3bec8204b4c6433ccb2f6ec60fedb77b34cb3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:02 -0800 Subject: [PATCH 199/395] nvmet: remove op_flags for passthru commands For passthru commands setting op_flags has no meaning. Remove the code that sets the op flags in nvmet_passthru_map_sg(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index be8ae59dcb71..1c84dadfb38f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -188,21 +188,15 @@ static void nvmet_passthru_req_done(struct request *rq, static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; - int op_flags = 0; struct bio *bio; int i, ret; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - if (req->cmd->common.opcode == nvme_cmd_flush) - op_flags = REQ_FUA; - else if (nvme_is_write(req->cmd)) - op_flags = REQ_SYNC | REQ_IDLE; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); bio->bi_end_io = bio_put; - bio->bi_opf = req_op(rq) | op_flags; + bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, From a4fe2d3afe3ce77edeadb567c0d0a8d102c6b159 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:04 -0800 Subject: [PATCH 200/395] nvmet: use blk_rq_bio_prep instead of blk_rq_append_bio The function blk_rq_append_bio() is a genereric API written for all types driver (having bounce buffers) and different context (where request is already having a bio i.e. rq->bio != NULL). It does mainly three things: calculating the segments, bounce queue and if req->bio == NULL call blk_rq_bio_prep() or handle low level merge() case. The NVMe PCIe and fabrics transports currently does not use queue bounce mechanism. In order to find this for each request processing in the passthru blk_rq_append_bio() does extra work in the fast path for each request. When I ran I/Os with different block sizes on the passthru controller I found that we can reuse the req->sg_cnt instead of iterating over the bvecs to find out nr_segs in blk_rq_append_bio(). This calculation in blk_rq_append_bio() is a duplication of work given that we have the value in req->sg_cnt. (correct me here if I'm wrong). With NVMe passthru request based driver we allocate fresh request each time, so every call to blk_rq_append_bio() rq->bio will be NULL i.e. we don't really need the second condition in the blk_rq_append_bio() and the resulting error condition in the caller of blk_rq_append_bio(). So for NVMeOF passthru driver recalculating the segments, bounce check and ll_back_merge code is not needed such that we can get away with the minimal version of the blk_rq_append_bio() which removes the error check in the fast path along with extra variable in nvmet_passthru_map_sg(). This patch updates the nvmet_passthru_map_sg() such that it does only appending the bio to the request in the context of the NVMeOF Passthru driver. Following are perf numbers :- With current implementation (blk_rq_append_bio()) :- ---------------------------------------------------- + 5.80% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.88% 0.00% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.44% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.86% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.17% 0.00% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd With this patch using blk_rq_bio_prep() :- ---------------------------------------------------- + 3.14% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 3.26% 0.01% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 5.37% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 5.18% 0.02% kworker/0:2-eve [nvmet] [k] nvmet_passthru_execute_cmd + 4.84% 0.02% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd + 4.87% 0.01% kworker/0:2-mm_ [nvmet] [k] nvmet_passthru_execute_cmd Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 1c84dadfb38f..2b24205ee79d 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -189,7 +189,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; struct bio *bio; - int i, ret; + int i; if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; @@ -206,11 +206,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) } } - ret = blk_rq_append_bio(rq, &bio); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + blk_rq_bio_prep(rq, bio, req->sg_cnt); return 0; } From dab3902b19a0dd1668d0cc3e8e4b976b1ee8638c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 9 Nov 2020 18:24:05 -0800 Subject: [PATCH 201/395] nvmet: use inline bio for passthru fast path In nvmet_passthru_execute_cmd() which is a high frequency function it uses bio_alloc() which leads to memory allocation from the fs pool for each I/O. For NVMeoF nvmet_req we already have inline_bvec allocated as a part of request allocation that can be used with preallocated bio when we already know the size of request before bio allocation with bio_alloc(), which we already do. Introduce a bio member for the nvmet_req passthru anon union. In the fast path, check if we can get away with inline bvec and bio from nvmet_req with bio_init() call before actually allocating from the bio_alloc(). This will be useful to avoid any new memory allocation under high memory pressure situation and get rid of any extra work of allocation (bio_alloc()) vs initialization (bio_init()) when transfer len is < NVMET_MAX_INLINE_DATA_LEN that user can configure at compile time. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 1 + drivers/nvme/target/passthru.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 2f9635273629..e89ec280e91a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -332,6 +332,7 @@ struct nvmet_req { struct work_struct work; } f; struct { + struct bio inline_bio; struct request *rq; struct work_struct work; bool use_workqueue; diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 2b24205ee79d..b9776fc8f08f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -194,14 +194,20 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) if (req->sg_cnt > BIO_MAX_PAGES) return -EINVAL; - bio = bio_alloc(GFP_KERNEL, req->sg_cnt); - bio->bi_end_io = bio_put; + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { + bio = &req->p.inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); + } else { + bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + } bio->bi_opf = req_op(rq); for_each_sg(req->sg, sg, req->sg_cnt, i) { if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, sg->offset) < sg->length) { - bio_put(bio); + if (bio != &req->p.inline_bio) + bio_put(bio); return -EINVAL; } } From ff4e5fbad06f762b8551da56e8fd64ad14c8aa3e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:01 +0100 Subject: [PATCH 202/395] nvme-pci: drop min() from nr_io_queues assignment in nvme_setup_io_queues() the number of I/O queues is set to either 1 in case of a quirky Apple device or to the min of nvme_max_io_queues() or dev->nr_allocated_queues - 1. This is unnecessarily complicated as dev->nr_allocated_queues is only assigned once and is nvme_max_io_queues() + 1. Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5e6365dd0c8e..90b338435021 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2113,8 +2113,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) nr_io_queues = 1; else - nr_io_queues = min(nvme_max_io_queues(dev), - dev->nr_allocated_queues - 1); + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) From e3aef0950a30ecbf475be52509ca178907410709 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 12 Nov 2020 09:23:02 +0100 Subject: [PATCH 203/395] nvme-pci: don't allocate unused I/O queues currently the NVME_QUIRK_SHARED_TAGS quirk for Apple devices is handled during the assignment of nr_io_queues in nvme_setup_io_queues(). This however means that for these devices nvme_max_io_queues() will actually not return the supported maximum which is confusing and unexpected and also means that in nvme_probe() we are allocating for I/O queues that will never be used. Fix this by moving the quirk handling into nvme_max_io_queues(). Signed-off-by: Niklas Schnelle Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 90b338435021..2c072f33a577 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2088,6 +2088,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + return 1; return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues; } @@ -2106,15 +2112,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; - /* - * If tags are shared with admin queue (Apple bug), then - * make sure we only use one IO queue. - */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - nr_io_queues = 1; - else - nr_io_queues = dev->nr_allocated_queues - 1; - + nr_io_queues = dev->nr_allocated_queues - 1; result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; From 6d65aeab7bf6e83e75f53cfdbdb84603e52e1182 Mon Sep 17 00:00:00 2001 From: Amit Date: Sun, 15 Nov 2020 14:19:51 +0200 Subject: [PATCH 204/395] nvmet: remove unused ctrl->cqs remove unused cqs from nvmet_ctrl struct this will reduce the allocated memory. Signed-off-by: Amit Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 15 ++------------- drivers/nvme/target/nvmet.h | 1 - 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 957b39a82431..8ce4d59cc9e7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -757,8 +757,6 @@ void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, { cq->qid = qid; cq->size = size; - - ctrl->cqs[qid] = cq; } void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, @@ -1344,20 +1342,14 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->changed_ns_list) goto out_free_ctrl; - ctrl->cqs = kcalloc(subsys->max_qid + 1, - sizeof(struct nvmet_cq *), - GFP_KERNEL); - if (!ctrl->cqs) - goto out_free_changed_ns_list; - ctrl->sqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_sq *), GFP_KERNEL); if (!ctrl->sqs) - goto out_free_cqs; + goto out_free_changed_ns_list; if (subsys->cntlid_min > subsys->cntlid_max) - goto out_free_cqs; + goto out_free_changed_ns_list; ret = ida_simple_get(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, @@ -1395,8 +1387,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, out_free_sqs: kfree(ctrl->sqs); -out_free_cqs: - kfree(ctrl->cqs); out_free_changed_ns_list: kfree(ctrl->changed_ns_list); out_free_ctrl: @@ -1426,7 +1416,6 @@ static void nvmet_ctrl_free(struct kref *ref) nvmet_async_events_free(ctrl); kfree(ctrl->sqs); - kfree(ctrl->cqs); kfree(ctrl->changed_ns_list); kfree(ctrl); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e89ec280e91a..592763732065 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -164,7 +164,6 @@ static inline struct nvmet_port *ana_groups_to_port( struct nvmet_ctrl { struct nvmet_subsys *subsys; - struct nvmet_cq **cqs; struct nvmet_sq **sqs; bool cmd_seen; From 0068a7b010533872b6e71a376771dc310d90fa1c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 25 Nov 2020 12:27:36 +0000 Subject: [PATCH 205/395] nvmet: make sure discovery change log event is protected Generation counter is protected by nvmet_config_sem. Make sure the callers that call functions that might change it, are calling it properly. Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/discovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index f40c05c33c3a..682854e0e079 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -69,6 +69,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, struct nvmet_port *port; struct nvmet_subsys_link *s; + lockdep_assert_held(&nvmet_config_sem); nvmet_genctr++; list_for_each_entry(port, nvmet_ports, global_entry) From 9f20599c4821d1f7281a3efb3ef94ff3cfdd5e10 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 26 Nov 2020 22:40:29 +0000 Subject: [PATCH 206/395] nvmet: fix a spelling mistake "incuding" -> "including" in Kconfig There is a spelling mistake in the Kconfig help text. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 8056955e652c..4be2ececbc45 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -24,7 +24,7 @@ config NVME_TARGET_PASSTHRU This enables target side NVMe passthru controller support for the NVMe Over Fabrics protocol. It allows for hosts to manage and directly access an actual NVMe controller residing on the target - side, incuding executing Vendor Unique Commands. + side, including executing Vendor Unique Commands. If unsure, say N. From 8c4dfea97f15b80097b3f882ca428fb2751ec30c Mon Sep 17 00:00:00 2001 From: Victor Gladkov Date: Tue, 24 Nov 2020 18:34:59 +0000 Subject: [PATCH 207/395] nvme-fabrics: reject I/O to offline device Commands get stuck while Host NVMe-oF controller is in reconnect state. The controller enters into reconnect state when it loses connection with the target. It tries to reconnect every 10 seconds (default) until a successful reconnect or until the reconnect time-out is reached. The default reconnect time out is 10 minutes. Applications are expecting commands to complete with success or error within a certain timeout (30 seconds by default). The NVMe host is enforcing that timeout while it is connected, but during reconnect the timeout is not enforced and commands may get stuck for a long period or even forever. To fix this long delay due to the default timeout, introduce new "fast_io_fail_tmo" session parameter. The timeout is measured in seconds from the controller reconnect and any command beyond that timeout is rejected. The new parameter value may be passed during 'connect'. The default value of -1 means no timeout (similar to current behavior). Signed-off-by: Victor Gladkov Signed-off-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 46 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 25 ++++++++++++++++--- drivers/nvme/host/fabrics.h | 5 ++++ drivers/nvme/host/multipath.c | 2 ++ drivers/nvme/host/nvme.h | 3 +++ 5 files changed, 77 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fff90200497c..9c1645f28a7a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -148,6 +148,38 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_try_sched_reset); +static void nvme_failfast_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvme_ctrl, failfast_work); + + if (ctrl->state != NVME_CTRL_CONNECTING) + return; + + set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); + dev_info(ctrl->device, "failfast expired\n"); + nvme_kick_requeue_lists(ctrl); +} + +static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1) + return; + + schedule_delayed_work(&ctrl->failfast_work, + ctrl->opts->fast_io_fail_tmo * HZ); +} + +static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl) +{ + if (!ctrl->opts) + return; + + cancel_delayed_work_sync(&ctrl->failfast_work); + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); +} + + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -433,8 +465,17 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } spin_unlock_irqrestore(&ctrl->lock, flags); - if (changed && ctrl->state == NVME_CTRL_LIVE) + if (!changed) + return false; + + if (ctrl->state == NVME_CTRL_LIVE) { + if (old_state == NVME_CTRL_CONNECTING) + nvme_stop_failfast_work(ctrl); nvme_kick_requeue_lists(ctrl); + } else if (ctrl->state == NVME_CTRL_CONNECTING && + old_state == NVME_CTRL_RESETTING) { + nvme_start_failfast_work(ctrl); + } return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -4372,6 +4413,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); nvme_stop_keep_alive(ctrl); + nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fw_act_work); } @@ -4437,6 +4479,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, int ret; ctrl->state = NVME_CTRL_NEW; + clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); @@ -4453,6 +4496,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, init_waitqueue_head(&ctrl->state_wq); INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8575724734e0..72ac00173500 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, { if (ctrl->state != NVME_CTRL_DELETING_NOIO && ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) return BLK_STS_RESOURCE; @@ -615,6 +616,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, { NVMF_OPT_TOS, "tos=%d" }, + { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -634,6 +636,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; opts->duplicate_connect = false; + opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO; opts->hdr_digest = false; opts->data_digest = false; opts->tos = -1; /* < 0 == use transport default */ @@ -754,6 +757,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n"); ctrl_loss_tmo = token; break; + case NVMF_OPT_FAIL_FAST_TMO: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + + if (token >= 0) + pr_warn("I/O fail on reconnect controller after %d sec\n", + token); + opts->fast_io_fail_tmo = token; + break; case NVMF_OPT_HOSTNQN: if (opts->host) { pr_err("hostnqn already user-assigned: %s\n", @@ -884,11 +898,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_poll_queues = 0; opts->duplicate_connect = true; } - if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) { opts->max_reconnects = -1; - else + } else { opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, opts->reconnect_delay); + if (ctrl_loss_tmo < opts->fast_io_fail_tmo) + pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n", + opts->fast_io_fail_tmo, ctrl_loss_tmo); + } if (!opts->host) { kref_get(&nvmf_default_host->ref); @@ -988,7 +1006,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ - NVMF_OPT_DISABLE_SQFLOW) + NVMF_OPT_DISABLE_SQFLOW |\ + NVMF_OPT_FAIL_FAST_TMO) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a9c1e3b4585e..733010d2eafd 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -15,6 +15,8 @@ #define NVMF_DEF_RECONNECT_DELAY 10 /* default to 600 seconds of reconnect attempts before giving up */ #define NVMF_DEF_CTRL_LOSS_TMO 600 +/* default is -1: the fail fast mechanism is disabled */ +#define NVMF_DEF_FAIL_FAST_TMO -1 /* * Define a host as seen by the target. We allocate one at boot, but also @@ -56,6 +58,7 @@ enum { NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, NVMF_OPT_TOS = 1 << 19, + NVMF_OPT_FAIL_FAST_TMO = 1 << 20, }; /** @@ -89,6 +92,7 @@ enum { * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O * @tos: type of service + * @fast_io_fail_tmo: Fast I/O fail timeout in seconds */ struct nvmf_ctrl_options { unsigned mask; @@ -111,6 +115,7 @@ struct nvmf_ctrl_options { unsigned int nr_write_queues; unsigned int nr_poll_queues; int tos; + int fast_io_fail_tmo; }; /* diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c17..71696819c228 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -279,6 +279,8 @@ static bool nvme_available_path(struct nvme_ns_head *head) struct nvme_ns *ns; list_for_each_entry_rcu(ns, &head->list, siblings) { + if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) + continue; switch (ns->ctrl->state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 83fb30e317e0..ae017f727798 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -305,6 +305,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct delayed_work failfast_work; struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; @@ -338,6 +339,8 @@ struct nvme_ctrl { u16 icdoff; u16 maxcmd; int nr_reconnects; + unsigned long flags; +#define NVME_CTRL_FAILFAST_EXPIRED 0 struct nvmf_ctrl_options *opts; struct page *discard_page; From aa9d729592316e121110daa81604f71f82663167 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:46 +0900 Subject: [PATCH 208/395] nvme: improve an error message on Identify failure Add the namespace ID to the error message when the Identify command used to retrieve the Namespace Identification Descriptor list fails. This avoids rather useless and duplicative messages like the following: [ 1.321031] nvme nvme0: Identify Descriptors failed (16386) [ 1.321948] nvme nvme0: Identify Descriptors failed (16386) [ 1.322872] nvme nvme0: Identify Descriptors failed (16386) [ 1.323775] nvme nvme0: Identify Descriptors failed (16386) [ 1.324687] nvme nvme0: Identify Descriptors failed (16386) ... Also, print the nvme status code in hexadecimal rather than decimal format rather for better readability. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c1645f28a7a..73c6684aaee9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1368,7 +1368,8 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, NVME_IDENTIFY_DATA_SIZE); if (status) { dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", status); + "Identify Descriptors failed (nsid=%u, status=0x%x)\n", + nsid, status); goto free_data; } From f781f3dd6a165d860c29eeb092af8584284e50f3 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 30 Nov 2020 21:47:47 +0900 Subject: [PATCH 209/395] nvme: print a warning for when listing active namespaces fails During the scan_work, an Identify command is issued to figure out which namespaces are active. If this command fails, the nvme driver falls back to scanning namespaces sequentially. In this situation, we don't see any warnings and don't even know whether list-ns command has been failed or not easiliy. Printa warning when the Identify command executin fail: [ 1.108399] nvme nvme0: Identify NS List failed (status=0x400b) [ 1.109583] nvme0n1: detected capacity change from 0 to 1048576 [ 1.112186] nvme nvme0: Identify Descriptors failed (nsid=2, status=0x4002) [ 1.113929] nvme nvme0: Identify Descriptors failed (nsid=3, status=0x4002) [ 1.116537] nvme nvme0: Identify Descriptors failed (nsid=4, status=0x4002) ... Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 73c6684aaee9..279f4aea861b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4110,8 +4110,11 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list, NVME_IDENTIFY_DATA_SIZE); - if (ret) + if (ret) { + dev_warn(ctrl->device, + "Identify NS List failed (status=0x%x)\n", ret); goto free; + } for (i = 0; i < nr_entries; i++) { u32 nsid = le32_to_cpu(ns_list[i]); From e1aaf5cacba9d994d825a87a33bdd33343477f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:07 +0100 Subject: [PATCH 210/395] nvme: remove unnecessary return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup unnecessary ret values that are not checked or used in nvme_alloc_ns(). Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 279f4aea861b..6d0c902034af 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3872,7 +3872,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret; + int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -3896,8 +3896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, ns->ctrl = ctrl; kref_init(&ns->kref); - ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED); - if (ret) + if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; nvme_set_disk_name(disk_name, ns, ctrl, &flags); @@ -3916,8 +3915,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - ret = nvme_nvm_register(ns, disk_name, node); - if (ret) { + if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } From f68abd9cc00cce58c5dbe5953ac190d25f1e4f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:08 +0100 Subject: [PATCH 211/395] nvme: rename controller base dev_t char device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename controller base dev_t char device in preparation for adding a namespace char device. Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d0c902034af..5aaaae7884e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -85,7 +85,7 @@ static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); -static dev_t nvme_chr_devt; +static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; @@ -4517,7 +4517,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), + ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -4726,7 +4727,8 @@ static int __init nvme_core_init(void) if (!nvme_delete_wq) goto destroy_reset_wq; - result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); + result = alloc_chrdev_region(&nvme_ctrl_base_chr_devt, 0, + NVME_MINORS, "nvme"); if (result < 0) goto destroy_delete_wq; @@ -4747,7 +4749,7 @@ static int __init nvme_core_init(void) destroy_class: class_destroy(nvme_class); unregister_chrdev: - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_delete_wq: destroy_workqueue(nvme_delete_wq); destroy_reset_wq: @@ -4762,7 +4764,7 @@ static void __exit nvme_core_exit(void) { class_destroy(nvme_subsys_class); class_destroy(nvme_class); - unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); From ba4fb3205680ade6c29c80102e86b88641709561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:56:09 +0100 Subject: [PATCH 212/395] nvme: rename bdev operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remane block device operations in preparation to add char device file operations. Signed-off-by: Javier González Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5aaaae7884e5..1520773d545f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2334,7 +2334,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, EXPORT_SYMBOL_GPL(nvme_sec_submit); #endif /* CONFIG_BLK_SED_OPAL */ -static const struct block_device_operations nvme_fops = { +static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, @@ -3342,7 +3342,7 @@ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); - if (disk->fops == &nvme_fops) + if (disk->fops == &nvme_bdev_ops) return nvme_get_ns_from_dev(dev)->head; else return disk->private_data; @@ -3451,7 +3451,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */ + if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ return 0; if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; @@ -3904,7 +3904,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!disk) goto out_unlink_ns; - disk->fops = &nvme_fops; + disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; disk->flags = flags; From 2f4c9ba23b887e7a69a474e9d53f38b5833a2119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 1 Dec 2020 13:02:21 +0100 Subject: [PATCH 213/395] nvme: export zoned namespaces without Zone Append support read-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow ZNS NVMe SSDs to present a read-only namespace when append is not supported, instead of rejecting the namespace directly. This allows (i) the namespace to be used in read-only mode, which is not a problem as the append command only affects the write path, and (ii) to use standard management tools such as nvme-cli to choose a different format or firmware slot that is compatible with the Linux zoned block device. Signed-off-by: Javier González Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/zns.c | 13 +++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1520773d545f..99f91efe3824 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2125,7 +2125,8 @@ static void nvme_update_disk_info(struct gendisk *disk, nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); - if (id->nsattr & NVME_NS_ATTR_RO) + if ((id->nsattr & NVME_NS_ATTR_RO) || + test_bit(NVME_NS_FORCE_RO, &ns->flags)) set_disk_ro(disk, true); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae017f727798..bfcedfa4b057 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -452,6 +452,7 @@ struct nvme_ns { #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 +#define NVME_NS_FORCE_RO 3 struct nvme_fault_inject fault_inject; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 67e87e9f306f..1dfe9a3500e3 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -55,12 +55,17 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int status; /* Driver requires zone append support */ - if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + if ((le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & NVME_CMD_EFFECTS_CSUPP)) { + if (test_and_clear_bit(NVME_NS_FORCE_RO, &ns->flags)) + dev_warn(ns->ctrl->device, + "Zone Append supported for zoned namespace:%d. Remove read-only mode\n", + ns->head->ns_id); + } else { + set_bit(NVME_NS_FORCE_RO, &ns->flags); dev_warn(ns->ctrl->device, - "append not supported for zoned namespace:%d\n", - ns->head->ns_id); - return -EINVAL; + "Zone Append not supported for zoned namespace:%d. Forcing to read-only mode\n", + ns->head->ns_id); } /* Lazily query controller append limit for the first zoned namespace */ From b6f8ed33ab2bbc58e40fb1e2fb0f9c90cab04baf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Oct 2020 15:20:41 +0200 Subject: [PATCH 214/395] pstore/blk: remove {un,}register_pstore_blk This interface is entirely unused, so remove them and various bits of unreachable code. Signed-off-by: Christoph Hellwig Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201016132047.3068029-4-hch@lst.de --- Documentation/admin-guide/pstore-blk.rst | 5 +- fs/pstore/blk.c | 83 +++--------------------- include/linux/pstore_blk.h | 42 ------------ 3 files changed, 11 insertions(+), 119 deletions(-) diff --git a/Documentation/admin-guide/pstore-blk.rst b/Documentation/admin-guide/pstore-blk.rst index 83543cb3daa1..49d8149f8d32 100644 --- a/Documentation/admin-guide/pstore-blk.rst +++ b/Documentation/admin-guide/pstore-blk.rst @@ -151,10 +151,7 @@ otherwise KMSG_DUMP_MAX. Configurations for driver ------------------------- -Only a block device driver cares about these configurations. A block device -driver uses ``register_pstore_blk`` to register to pstore/blk. - -A non-block device driver uses ``register_pstore_device`` with +A device driver uses ``register_pstore_device`` with ``struct pstore_device_info`` to register to pstore/blk. .. kernel-doc:: fs/pstore/blk.c diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..882d0bc0cfd7 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -90,7 +90,6 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage"); static DEFINE_MUTEX(pstore_blk_lock); static struct block_device *psblk_bdev; static struct pstore_zone_info *pstore_zone_info; -static pstore_blk_panic_write_op blkdev_panic_write; struct bdev_info { dev_t devt; @@ -341,24 +340,11 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, return ret; } -static ssize_t psblk_blk_panic_write(const char *buf, size_t size, - loff_t off) -{ - int ret; - - if (!blkdev_panic_write) - return -EOPNOTSUPP; - - /* size and off must align to SECTOR_SIZE for block device */ - ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, - size >> SECTOR_SHIFT); - /* try next zone */ - if (ret == -ENOMSG) - return ret; - return ret ? -EIO : size; -} - -static int __register_pstore_blk(struct pstore_blk_info *info) +/* + * This takes its configuration only from the module parameters now. + * See psblk_get_bdev() and blkdev. + */ +static int __register_pstore_blk(void) { char bdev_name[BDEVNAME_SIZE]; struct block_device *bdev; @@ -378,68 +364,34 @@ static int __register_pstore_blk(struct pstore_blk_info *info) } /* only allow driver matching the @blkdev */ - if (!binfo.devt || (!best_effort && - MAJOR(binfo.devt) != info->major)) { - pr_debug("invalid major %u (expect %u)\n", - info->major, MAJOR(binfo.devt)); + if (!binfo.devt) { + pr_debug("no major\n"); ret = -ENODEV; goto err_put_bdev; } /* psblk_bdev must be assigned before register to pstore/blk */ psblk_bdev = bdev; - blkdev_panic_write = info->panic_write; - - /* Copy back block device details. */ - info->devt = binfo.devt; - info->nr_sects = binfo.nr_sects; - info->start_sect = binfo.start_sect; memset(&dev, 0, sizeof(dev)); - dev.total_size = info->nr_sects << SECTOR_SHIFT; - dev.flags = info->flags; + dev.total_size = binfo.nr_sects << SECTOR_SHIFT; dev.read = psblk_generic_blk_read; dev.write = psblk_generic_blk_write; - dev.erase = NULL; - dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; ret = __register_pstore_device(&dev); if (ret) goto err_put_bdev; bdevname(bdev, bdev_name); - pr_info("attached %s%s\n", bdev_name, - info->panic_write ? "" : " (no dedicated panic_write!)"); + pr_info("attached %s (no dedicated panic_write!)\n", bdev_name); return 0; err_put_bdev: psblk_bdev = NULL; - blkdev_panic_write = NULL; psblk_put_bdev(bdev, holder); return ret; } -/** - * register_pstore_blk() - register block device to pstore/blk - * - * @info: details on the desired block device interface - * - * Return: - * * 0 - OK - * * Others - something error. - */ -int register_pstore_blk(struct pstore_blk_info *info) -{ - int ret; - - mutex_lock(&pstore_blk_lock); - ret = __register_pstore_blk(info); - mutex_unlock(&pstore_blk_lock); - - return ret; -} -EXPORT_SYMBOL_GPL(register_pstore_blk); - static void __unregister_pstore_blk(unsigned int major) { struct pstore_device_info dev = { .read = psblk_generic_blk_read }; @@ -449,24 +401,10 @@ static void __unregister_pstore_blk(unsigned int major) if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { __unregister_pstore_device(&dev); psblk_put_bdev(psblk_bdev, holder); - blkdev_panic_write = NULL; psblk_bdev = NULL; } } -/** - * unregister_pstore_blk() - unregister block device from pstore/blk - * - * @major: the major device number of device - */ -void unregister_pstore_blk(unsigned int major) -{ - mutex_lock(&pstore_blk_lock); - __unregister_pstore_blk(major); - mutex_unlock(&pstore_blk_lock); -} -EXPORT_SYMBOL_GPL(unregister_pstore_blk); - /* get information of pstore/blk */ int pstore_blk_get_config(struct pstore_blk_config *info) { @@ -483,12 +421,11 @@ EXPORT_SYMBOL_GPL(pstore_blk_get_config); static int __init pstore_blk_init(void) { - struct pstore_blk_info info = { }; int ret = 0; mutex_lock(&pstore_blk_lock); if (!pstore_zone_info && best_effort && blkdev[0]) - ret = __register_pstore_blk(&info); + ret = __register_pstore_blk(); mutex_unlock(&pstore_blk_lock); return ret; diff --git a/include/linux/pstore_blk.h b/include/linux/pstore_blk.h index 61e914522b01..99564f93d774 100644 --- a/include/linux/pstore_blk.h +++ b/include/linux/pstore_blk.h @@ -7,48 +7,6 @@ #include #include -/** - * typedef pstore_blk_panic_write_op - panic write operation to block device - * - * @buf: the data to write - * @start_sect: start sector to block device - * @sects: sectors count on buf - * - * Return: On success, zero should be returned. Others excluding -ENOMSG - * mean error. -ENOMSG means to try next zone. - * - * Panic write to block device must be aligned to SECTOR_SIZE. - */ -typedef int (*pstore_blk_panic_write_op)(const char *buf, sector_t start_sect, - sector_t sects); - -/** - * struct pstore_blk_info - pstore/blk registration details - * - * @major: Which major device number to support with pstore/blk - * @flags: The supported PSTORE_FLAGS_* from linux/pstore.h. - * @panic_write:The write operation only used for the panic case. - * This can be NULL, but is recommended to avoid losing - * crash data if the kernel's IO path or work queues are - * broken during a panic. - * @devt: The dev_t that pstore/blk has attached to. - * @nr_sects: Number of sectors on @devt. - * @start_sect: Starting sector on @devt. - */ -struct pstore_blk_info { - unsigned int major; - unsigned int flags; - pstore_blk_panic_write_op panic_write; - - /* Filled in by pstore/blk after registration. */ - dev_t devt; - sector_t nr_sects; - sector_t start_sect; -}; - -int register_pstore_blk(struct pstore_blk_info *info); -void unregister_pstore_blk(unsigned int major); - /** * struct pstore_device_info - back-end pstore/blk driver structure. * From 26fecbf7602dd69b649914e61526bd67c557fece Mon Sep 17 00:00:00 2001 From: Vasile-Laurentiu Stanimir Date: Wed, 4 Nov 2020 15:05:32 +0200 Subject: [PATCH 215/395] pstore: Move kmsg_bytes default into Kconfig While kmsg_bytes can be set for pstore via mount, if a crash occurs before the mount only partial console log will be stored as kmsg_bytes defaults to a potentially different hardcoded value in the kernel (PSTORE_DEFAULT_KMSG_BYTES). This makes it impossible to analyze valuable post-mortem data especially on the embedded development or in the process of bringing up new boards. Change this value to be a Kconfig option with the default of old PSTORE_DEFAULT_KMSG_BYTES Signed-off-by: Vasile-Laurentiu Stanimir Signed-off-by: Kees Cook --- fs/pstore/Kconfig | 8 ++++++++ fs/pstore/inode.c | 2 +- fs/pstore/internal.h | 1 - fs/pstore/platform.c | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index e16a49ebfe54..8adabde685f1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -14,6 +14,14 @@ config PSTORE If you don't have a platform persistent store driver, say N. +config PSTORE_DEFAULT_KMSG_BYTES + int "Default kernel log storage space" if EXPERT + depends on PSTORE + default "10240" + help + Defines default size of pstore kernel log storage. + Can be enlarged if needed, not recommended to shrink it. + config PSTORE_DEFLATE_COMPRESS tristate "DEFLATE (ZLIB) compression" default y diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index c331efe8de95..93a217e4f563 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -266,7 +266,7 @@ static void parse_options(char *options) */ static int pstore_show_options(struct seq_file *m, struct dentry *root) { - if (kmsg_bytes != PSTORE_DEFAULT_KMSG_BYTES) + if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7fb219042f13..801d6c0b170c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,6 @@ #include #include -#define PSTORE_DEFAULT_KMSG_BYTES 10240 extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 36714df37d5d..32f64abc277c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -101,7 +101,7 @@ static char *big_oops_buf; static size_t big_oops_buf_sz; /* How much of the console log to snapshot */ -unsigned long kmsg_bytes = PSTORE_DEFAULT_KMSG_BYTES; +unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; void pstore_set_kmsg_bytes(int bytes) { From 5df1a6726973ee2444e11f16daa013971dc52e8b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 14:33:37 +0100 Subject: [PATCH 216/395] filemap: consistently use ->f_mapping over ->i_mapping Use file->f_mapping in all remaining places that have a struct file available to properly handle the case where inode->i_mapping != file_inode(file)->i_mapping. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- mm/filemap.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d5e7c2029d16..4f583489aa3c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2886,14 +2886,14 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret = VM_FAULT_LOCKED; - sb_start_pagefault(inode->i_sb); + sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping) { + if (page->mapping != mapping) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; @@ -2906,7 +2906,7 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) set_page_dirty(page); wait_for_stable_page(page); out: - sb_end_pagefault(inode->i_sb); + sb_end_pagefault(mapping->host->i_sb); return ret; } @@ -3149,10 +3149,9 @@ void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; - struct inode *inode = file_inode(filp); char *path; - errseq_set(&inode->i_mapping->wb_err, -EIO); + errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) @@ -3179,7 +3178,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If there are pages to writeback, return */ - if (filemap_range_has_page(inode->i_mapping, pos, + if (filemap_range_has_page(file->f_mapping, pos, pos + write_len - 1)) return -EAGAIN; } else { From 60b498852bf219c0bf2b0864c69972840978ca43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:21:18 +0100 Subject: [PATCH 217/395] fs: remove get_super_thawed and get_super_exclusive_thawed Just open code the wait in the only caller of both functions. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/internal.h | 2 ++ fs/quota/quota.c | 31 +++++++++++++++++++++------- fs/super.c | 51 ++-------------------------------------------- include/linux/fs.h | 4 +--- 4 files changed, 29 insertions(+), 59 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index a7cd0f64faa4..47be21dfeebe 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -114,7 +114,9 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); +struct super_block *__get_super(struct block_device *bdev, bool excl); extern struct super_block *user_get_super(dev_t); +void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); /* diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 9af95c7a0bbe..f3d32b0d9008 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -20,6 +20,7 @@ #include #include #include "compat.h" +#include "../internal.h" static int check_quotactl_permission(struct super_block *sb, int type, int cmd, qid_t id) @@ -868,6 +869,7 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); + bool excl = false, thawed = false; if (IS_ERR(tmp)) return ERR_CAST(tmp); @@ -875,17 +877,32 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - if (quotactl_cmd_onoff(cmd)) - sb = get_super_exclusive_thawed(bdev); - else if (quotactl_cmd_write(cmd)) - sb = get_super_thawed(bdev); - else - sb = get_super(bdev); + + if (quotactl_cmd_onoff(cmd)) { + excl = true; + thawed = true; + } else if (quotactl_cmd_write(cmd)) { + thawed = true; + } + +retry: + sb = __get_super(bdev, excl); + if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) { + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen == SB_UNFROZEN); + put_super(sb); + goto retry; + } + bdput(bdev); if (!sb) return ERR_PTR(-ENODEV); - return sb; + #else return ERR_PTR(-ENODEV); #endif diff --git a/fs/super.c b/fs/super.c index 98bb0629ee10..343e5c1e538d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -307,7 +307,7 @@ static void __put_super(struct super_block *s) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -740,7 +740,7 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -static struct super_block *__get_super(struct block_device *bdev, bool excl) +struct super_block *__get_super(struct block_device *bdev, bool excl) { struct super_block *sb; @@ -789,53 +789,6 @@ struct super_block *get_super(struct block_device *bdev) } EXPORT_SYMBOL(get_super); -static struct super_block *__get_super_thawed(struct block_device *bdev, - bool excl) -{ - while (1) { - struct super_block *s = __get_super(bdev, excl); - if (!s || s->s_writers.frozen == SB_UNFROZEN) - return s; - if (!excl) - up_read(&s->s_umount); - else - up_write(&s->s_umount); - wait_event(s->s_writers.wait_unfrozen, - s->s_writers.frozen == SB_UNFROZEN); - put_super(s); - } -} - -/** - * get_super_thawed - get thawed superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen). %NULL is returned if no match - * is found. - */ -struct super_block *get_super_thawed(struct block_device *bdev) -{ - return __get_super_thawed(bdev, false); -} -EXPORT_SYMBOL(get_super_thawed); - -/** - * get_super_exclusive_thawed - get thawed superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen) and s_umount semaphore is held - * in exclusive mode. %NULL is returned if no match is found. - */ -struct super_block *get_super_exclusive_thawed(struct block_device *bdev) -{ - return __get_super_thawed(bdev, true); -} -EXPORT_SYMBOL(get_super_exclusive_thawed); - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..a61df0dd4f19 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1409,7 +1409,7 @@ enum { struct sb_writers { int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + wait_queue_head_t wait_unfrozen; /* wait for thaw */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -3132,8 +3132,6 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); -extern struct super_block *get_super_thawed(struct block_device *); -extern struct super_block *get_super_exclusive_thawed(struct block_device *bdev); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); From 040f04bd2e825f1d80b14a0e0ac3d830339eb779 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 11:54:06 +0100 Subject: [PATCH 218/395] fs: simplify freeze_bdev/thaw_bdev Store the frozen superblock in struct block_device to avoid the awkward interface that can return a sb only used a cookie, an ERR_PTR or NULL. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 5 ----- drivers/md/dm.c | 20 ++++++------------- fs/block_dev.c | 41 ++++++++++++++++----------------------- fs/buffer.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/f2fs/file.c | 14 +++++-------- fs/xfs/xfs_fsops.c | 7 ++----- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- 9 files changed, 35 insertions(+), 61 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index d522093cb39d..aace147effca 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -96,11 +96,6 @@ struct mapped_device { */ struct workqueue_struct *wq; - /* - * freeze/thaw support require holding onto a super block - */ - struct super_block *frozen_sb; - /* forced geometry settings */ struct hd_geometry geometry; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 54739f1b579b..50541d336c71 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2392,27 +2392,19 @@ static int lock_fs(struct mapped_device *md) { int r; - WARN_ON(md->frozen_sb); + WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - md->frozen_sb = freeze_bdev(md->bdev); - if (IS_ERR(md->frozen_sb)) { - r = PTR_ERR(md->frozen_sb); - md->frozen_sb = NULL; - return r; - } - - set_bit(DMF_FROZEN, &md->flags); - - return 0; + r = freeze_bdev(md->bdev); + if (!r) + set_bit(DMF_FROZEN, &md->flags); + return r; } static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - - thaw_bdev(md->bdev, md->frozen_sb); - md->frozen_sb = NULL; + thaw_bdev(md->bdev); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/fs/block_dev.c b/fs/block_dev.c index d8664f5c1ff6..33c29106c989 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -548,55 +548,47 @@ EXPORT_SYMBOL(fsync_bdev); * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze * actually. */ -struct super_block *freeze_bdev(struct block_device *bdev) +int freeze_bdev(struct block_device *bdev) { struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) { - /* - * We don't even need to grab a reference - the first call - * to freeze_bdev grab an active reference and only the last - * thaw_bdev drops it. - */ - sb = get_super(bdev); - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } + if (++bdev->bd_fsfreeze_count > 1) + goto done; sb = get_active_super(bdev); if (!sb) - goto out; + goto sync; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb); else error = freeze_super(sb); - if (error) { - deactivate_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } deactivate_super(sb); - out: + + if (error) { + bdev->bd_fsfreeze_count--; + goto done; + } + bdev->bd_fsfreeze_sb = sb; + +sync: sync_blockdev(bdev); +done: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; /* thaw_bdev releases s->s_umount */ + return error; } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock - * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) +int thaw_bdev(struct block_device *bdev) { + struct super_block *sb; int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); @@ -607,6 +599,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) if (--bdev->bd_fsfreeze_count > 0) goto out; + sb = bdev->bd_fsfreeze_sb; if (!sb) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..a7595ada9400 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -523,7 +523,7 @@ static int osync_buffers_list(spinlock_t *lock, struct list_head *list) void emergency_thaw_bdev(struct super_block *sb) { - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f0381876a7e5..524e13432447 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -624,7 +624,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) case EXT4_GOING_FLAGS_DEFAULT: freeze_bdev(sb->s_bdev); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ee861c6d9ff0..a9fc482a0e60 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2230,16 +2230,12 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); + ret = freeze_bdev(sb->s_bdev); + if (ret) goto out; - } - if (sb) { - f2fs_stop_checkpoint(sbi, false); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); - thaw_bdev(sb->s_bdev, sb); - } + f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ef1d5bb88b93..b7c5783a031c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -433,13 +433,10 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); - - if (sb && !IS_ERR(sb)) { + if (!freeze_bdev(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(sb->s_bdev, sb); + thaw_bdev(mp->m_super->s_bdev); } - break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d9b69bbde5cc..ebfb4e7c1fd1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -46,6 +46,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + struct super_block *bd_fsfreeze_sb; } __randomize_layout; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 05b346a68c2e..12810a19edeb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2020,7 +2020,7 @@ static inline int sync_blockdev(struct block_device *bdev) #endif int fsync_bdev(struct block_device *bdev); -struct super_block *freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev, struct super_block *sb); +int freeze_bdev(struct block_device *bdev); +int thaw_bdev(struct block_device *bdev); #endif /* _LINUX_BLKDEV_H */ From a6419fd810c6b3b060f75b69b09d25ea2ac1f200 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 30 Jun 2020 16:33:54 +0200 Subject: [PATCH 219/395] mtip32xx: remove the call to fsync_bdev on removal del_gendisk already calls fsync_bdev for every partition, no need to do this twice. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 15 --------------- drivers/block/mtip32xx/mtip32xx.h | 2 -- 2 files changed, 17 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 153e2cdecb4d..53ac59d19ae5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3687,7 +3687,6 @@ static int mtip_block_initialize(struct driver_data *dd) /* Enable the block device and add it to /dev */ device_add_disk(&dd->pdev->dev, dd->disk, NULL); - dd->bdev = bdget_disk(dd->disk, 0); /* * Now that the disk is active, initialize any sysfs attributes * managed by the protocol layer. @@ -3721,9 +3720,6 @@ static int mtip_block_initialize(struct driver_data *dd) return rv; kthread_run_error: - bdput(dd->bdev); - dd->bdev = NULL; - /* Delete our gendisk. This also removes the device from /dev */ del_gendisk(dd->disk); @@ -3804,14 +3800,6 @@ static int mtip_block_remove(struct driver_data *dd) blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); blk_mq_unquiesce_queue(dd->queue); - /* - * Delete our gendisk structure. This also removes the device - * from /dev - */ - if (dd->bdev) { - bdput(dd->bdev); - dd->bdev = NULL; - } if (dd->disk) { if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) del_gendisk(dd->disk); @@ -4206,9 +4194,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) } while (atomic_read(&dd->irq_workers_active) != 0 && time_before(jiffies, to)); - if (!dd->sr) - fsync_bdev(dd->bdev); - if (atomic_read(&dd->irq_workers_active) != 0) { dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index e22a7f0523bf..88f4206310e4 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -463,8 +463,6 @@ struct driver_data { int isr_binding; - struct block_device *bdev; - struct list_head online_list; /* linkage for online list */ struct list_head remove_list; /* linkage for removing list */ From ee763e2143e79fa41d2818e620e1e8ff69af87bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:43:46 +0100 Subject: [PATCH 220/395] zram: do not call set_blocksize set_blocksize is used by file systems to use their preferred buffer cache block size. Block drivers should not set it. Signed-off-by: Christoph Hellwig Acked-by: Minchan Kim Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 11 +---------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6d15d51cee2b..b5f68951c9d2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -403,13 +403,10 @@ static void reset_bdev(struct zram *zram) return; bdev = zram->bdev; - if (zram->old_block_size) - set_blocksize(bdev, zram->old_block_size); blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); /* hope filp_close flush all of IO */ filp_close(zram->backing_dev, NULL); zram->backing_dev = NULL; - zram->old_block_size = 0; zram->bdev = NULL; zram->disk->fops = &zram_devops; kvfree(zram->bitmap); @@ -454,7 +451,7 @@ static ssize_t backing_dev_store(struct device *dev, struct file *backing_dev = NULL; struct inode *inode; struct address_space *mapping; - unsigned int bitmap_sz, old_block_size = 0; + unsigned int bitmap_sz; unsigned long nr_pages, *bitmap = NULL; struct block_device *bdev = NULL; int err; @@ -509,14 +506,8 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - old_block_size = block_size(bdev); - err = set_blocksize(bdev, PAGE_SIZE); - if (err) - goto out; - reset_bdev(zram); - zram->old_block_size = old_block_size; zram->bdev = bdev; zram->backing_dev = backing_dev; zram->bitmap = bitmap; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index f2fd46daa760..712354a4207c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -118,7 +118,6 @@ struct zram { bool wb_limit_enable; u64 bd_wb_limit; struct block_device *bdev; - unsigned int old_block_size; unsigned long *bitmap; unsigned long nr_pages; #endif From f46f2a3198017cff1f3f8f71de74ff7abee3aa16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:38:30 +0100 Subject: [PATCH 221/395] loop: do not call set_blocksize set_blocksize is used by file systems to use their preferred buffer cache block size. Block drivers should not set it. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9a27d4f1c08a..b42c728620c9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1164,9 +1164,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, size = get_loop_size(lo, file); loop_set_size(lo, size); - set_blocksize(bdev, S_ISBLK(inode->i_mode) ? - block_size(inode->i_bdev) : PAGE_SIZE); - lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; From 47d951023a242bb159534573a4a76fef9a31dc9b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Nov 2020 09:30:05 +0100 Subject: [PATCH 222/395] dm: simplify flush_bio initialization in __send_empty_flush We don't really need the struct block_device to initialize a bio. So switch from using bio_set_dev to manually setting up bi_disk (bi_partno will always be zero and has been cleared by bio_init already). Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 50541d336c71..ab0a8335f098 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1422,18 +1422,12 @@ static int __send_empty_flush(struct clone_info *ci) */ bio_init(&flush_bio, NULL, 0); flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + flush_bio.bi_disk = ci->io->md->disk; + bio_associate_blkg(&flush_bio); + ci->bio = &flush_bio; ci->sector_count = 0; - /* - * Empty flush uses a statically initialized bio, as the base for - * cloning. However, blkg association requires that a bdev is - * associated with a gendisk, which doesn't happen until the bdev is - * opened. So, blkg association is done at issue time of the flush - * rather than when the device is created in alloc_dev(). - */ - bio_set_dev(ci->bio, ci->io->md->bdev); - BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); From b0519b542303bc167d22bf11dadd3f18d37dbfe2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:21:42 +0100 Subject: [PATCH 223/395] dm: remove the block_device reference in struct mapped_device Get rid of the long-lasting struct block_device reference in struct mapped_device. The only remaining user is the freeze code, where we can trivially look up the block device at freeze time and release the reference at thaw time. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 2 -- drivers/md/dm.c | 25 ++++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index aace147effca..086d293c2b03 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -102,8 +102,6 @@ struct mapped_device { /* kobject and completion */ struct dm_kobject_holder kobj_holder; - struct block_device *bdev; - struct dm_stats stats; /* for blk-mq request-based DM support */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ab0a8335f098..48051db006f3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1744,11 +1744,6 @@ static void cleanup_mapped_device(struct mapped_device *md) cleanup_srcu_struct(&md->io_barrier); - if (md->bdev) { - bdput(md->bdev); - md->bdev = NULL; - } - mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); @@ -1840,10 +1835,6 @@ static struct mapped_device *alloc_dev(int minor) if (!md->wq) goto bad; - md->bdev = bdget_disk(md->disk, 0); - if (!md->bdev) - goto bad; - dm_stats_init(&md->stats); /* Populate the mapping, nobody knows we exist yet */ @@ -2384,11 +2375,16 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) */ static int lock_fs(struct mapped_device *md) { + struct block_device *bdev; int r; WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - r = freeze_bdev(md->bdev); + bdev = bdget_disk(md->disk, 0); + if (!bdev) + return -ENOMEM; + r = freeze_bdev(bdev); + bdput(bdev); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; @@ -2396,9 +2392,16 @@ static int lock_fs(struct mapped_device *md) static void unlock_fs(struct mapped_device *md) { + struct block_device *bdev; + if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->bdev); + + bdev = bdget_disk(md->disk, 0); + if (!bdev) + return; + thaw_bdev(bdev); + bdput(bdev); clear_bit(DMF_FROZEN, &md->flags); } From b601d148a16ea16dfbaf3600be35ee175847a09b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:21:52 +0100 Subject: [PATCH 224/395] block: remove a duplicate __disk_get_part prototype Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 46553d6d6025..22f5b9fd96f8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -250,7 +250,6 @@ static inline dev_t part_devt(struct hd_struct *part) return part_to_dev(part)->devt; } -extern struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); static inline void disk_put_part(struct hd_struct *part) From 3f50b95e0edd22824b2650eb65466bf7060f7488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:43:52 +0100 Subject: [PATCH 225/395] block: remove a superflous check in blkpg_do_ioctl sector_t is now always a u64, so this check is not needed. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/ioctl.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 6b785181344f..0c09bb7a6ff3 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -35,15 +35,6 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) < sizeof(long long)) { - long pstart = start, plength = length; - - if (pstart != start || plength != length || pstart < 0 || - plength < 0 || p.pno > 65535) - return -EINVAL; - } - switch (op) { case BLKPG_ADD_PARTITION: /* check if partition is aligned to blocksize */ From 8d65269fe8065fee889bca5b204d711b0695a8f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Nov 2020 08:18:55 +0100 Subject: [PATCH 226/395] block: add a bdev_kobj helper Add a little helper to find the kobject for a struct block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Acked-by: David Sterba [btrfs] Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 7 ++----- drivers/md/md.c | 4 +--- fs/block_dev.c | 6 +++--- fs/btrfs/sysfs.c | 15 +++------------ include/linux/blk_types.h | 3 +++ 5 files changed, 12 insertions(+), 23 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46a00134a36a..a6a5e21e4fd1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1447,8 +1447,7 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; err = "error creating kobject"; - if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, - "bcache")) + if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache")) goto err; if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) goto err; @@ -2342,9 +2341,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; } - if (kobject_add(&ca->kobj, - &part_to_dev(bdev->bd_part)->kobj, - "bcache")) { + if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index b2edf5e0f965..7ce6047c856e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2414,7 +2414,6 @@ EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; - struct kobject *ko; int err; /* prevent duplicates */ @@ -2477,9 +2476,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - ko = &part_to_dev(rdev->bdev->bd_part)->kobj; /* failure here is OK */ - err = sysfs_create_link(&rdev->kobj, ko, "block"); + err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 33c29106c989..c5755150c6be 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1242,7 +1242,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) holder->disk = disk; holder->refcnt = 1; - ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); if (ret) goto out_free; @@ -1259,7 +1259,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) goto out_unlock; out_del: - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); out_free: kfree(holder); out_unlock: @@ -1287,7 +1287,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); kobject_put(bdev->bd_part->holder_dir); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 279d9262b676..24b6c6dc6900 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1232,8 +1232,6 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, void btrfs_sysfs_remove_device(struct btrfs_device *device) { - struct hd_struct *disk; - struct kobject *disk_kobj; struct kobject *devices_kobj; /* @@ -1243,11 +1241,8 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device) devices_kobj = device->fs_info->fs_devices->devices_kobj; ASSERT(devices_kobj); - if (device->bdev) { - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(devices_kobj, disk_kobj->name); - } + if (device->bdev) + sysfs_remove_link(devices_kobj, bdev_kobj(device->bdev)->name); if (device->devid_kobj.state_initialized) { kobject_del(&device->devid_kobj); @@ -1353,11 +1348,7 @@ int btrfs_sysfs_add_device(struct btrfs_device *device) nofs_flag = memalloc_nofs_save(); if (device->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; - - disk = device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + struct kobject *disk_kobj = bdev_kobj(device->bdev); ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); if (ret) { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ebfb4e7c1fd1..9698f459cc65 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,6 +49,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; } __randomize_layout; +#define bdev_kobj(_bdev) \ + (&part_to_dev((_bdev)->bd_part)->kobj) + /* * Block error status values. See block/blk-core:blk_errors for the details. * Alpha cannot write a byte atomically, so we need to use 32-bit value. From e79319af6d8cfd7311fef1bfbb1c59c94e6e10a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 06:48:53 +0100 Subject: [PATCH 227/395] block: use disk_part_iter_exit in disk_part_iter_next Call disk_part_iter_exit in disk_part_iter_next instead of duplicating the functionality. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 4e039524f92b..0bd9c41dd4cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -227,8 +227,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) int inc, end; /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; + disk_part_iter_exit(piter); /* get part_tbl */ rcu_read_lock(); From efdc41c8d49fc1ff9bbef8f68f1cf1d8d59164a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 07:25:37 +0100 Subject: [PATCH 228/395] block: use put_device in put_disk Use put_device to put the device instead of poking into the internals and using kobject_put. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 0bd9c41dd4cb..f46e89226fdf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1803,7 +1803,7 @@ EXPORT_SYMBOL(__alloc_disk_node); void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk_to_dev(disk)->kobj); + put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); From 612c6aa7817f1c89b6a92fc724331aa7c9d77f6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Nov 2020 17:52:02 +0100 Subject: [PATCH 229/395] block: change the hash used for looking up block devices Adding the minor to the major creates tons of pointless conflicts. Just use the dev_t itself, which is 32-bits and thus is guaranteed to fit into ino_t. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index c5755150c6be..d707ab376da8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -863,35 +863,12 @@ void __init bdev_cache_init(void) blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -/* - * Most likely _very_ bad one - but then it's hardly critical for small - * /dev and can be fixed when somebody will need really large one. - * Keep in mind that it will be fed through icache hash function too. - */ -static inline unsigned long hash(dev_t dev) -{ - return MAJOR(dev)+MINOR(dev); -} - -static int bdev_test(struct inode *inode, void *data) -{ - return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; -} - -static int bdev_set(struct inode *inode, void *data) -{ - BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; - return 0; -} - static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; - inode = iget5_locked(blockdev_superblock, hash(dev), - bdev_test, bdev_set, &dev); - + inode = iget_locked(blockdev_superblock, dev); if (!inode) return NULL; @@ -903,6 +880,7 @@ static struct block_device *bdget(dev_t dev) bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; + bdev->bd_dev = dev; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; From 3a4174e68684e43ecdcb59126a441b29d5e94f7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:45:27 +0100 Subject: [PATCH 230/395] block: switch bdgrab to use igrab All of the current callers already have a reference, but to prepare for additional users ensure bdgrab returns NULL if the block device is being freed. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- fs/block_dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index d707ab376da8..962fabe8a67b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -894,10 +894,14 @@ static struct block_device *bdget(dev_t dev) /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. + * + * Returns the block_device with an additional reference when successful, + * or NULL if the inode is already beeing freed. */ struct block_device *bdgrab(struct block_device *bdev) { - ihold(bdev->bd_inode); + if (!igrab(bdev->bd_inode)) + return NULL; return bdev; } EXPORT_SYMBOL(bdgrab); From c2637e80a09e0d6c698d2771d7230f59c2138122 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 19:19:57 +0100 Subject: [PATCH 231/395] init: refactor name_to_dev_t Split each case into a self-contained helper, and move the block dependent code entirely under the pre-existing #ifdef CONFIG_BLOCK. This allows to remove the blk_lookup_devt stub in genhd.h. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 7 +- init/do_mounts.c | 183 +++++++++++++++++++++--------------------- 2 files changed, 91 insertions(+), 99 deletions(-) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 22f5b9fd96f8..ca5e356084c3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -388,18 +388,13 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +dev_t blk_lookup_devt(const char *name, int partno); #ifdef CONFIG_BLOCK void printk_all_partitions(void); -dev_t blk_lookup_devt(const char *name, int partno); #else /* CONFIG_BLOCK */ static inline void printk_all_partitions(void) { } -static inline dev_t blk_lookup_devt(const char *name, int partno) -{ - dev_t devt = MKDEV(0, 0); - return devt; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_GENHD_H */ diff --git a/init/do_mounts.c b/init/do_mounts.c index b5f9604d0c98..aef2f24461c7 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -90,7 +90,6 @@ static int match_dev_by_uuid(struct device *dev, const void *data) return 0; } - /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID * @uuid_str: char array containing ascii UUID @@ -186,7 +185,83 @@ static int match_dev_by_label(struct device *dev, const void *data) return 0; } -#endif + +static dev_t devt_from_partlabel(const char *label) +{ + struct device *dev; + dev_t devt = 0; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (dev) { + devt = dev->devt; + put_device(dev); + } + + return devt; +} + +static dev_t devt_from_devname(const char *name) +{ + dev_t devt = 0; + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return 0; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + devt = blk_lookup_devt(s, 0); + if (devt) + return devt; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return 0; + + /* try disk name without */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + devt = blk_lookup_devt(s, part); + if (devt) + return devt; + + /* try disk name without p */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return 0; + p[-1] = '\0'; + return blk_lookup_devt(s, part); +} +#endif /* CONFIG_BLOCK */ + +static dev_t devt_from_devnum(const char *name) +{ + unsigned maj, min, offset; + dev_t devt = 0; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + devt = MKDEV(maj, min); + if (maj != MAJOR(devt) || min != MINOR(devt)) + return 0; + } else { + devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return 0; + } + + return devt; +} /* * Convert a name into device number. We accept the following variants: @@ -218,101 +293,23 @@ static int match_dev_by_label(struct device *dev, const void *data) * name contains slashes, the device name has them replaced with * bangs. */ - dev_t name_to_dev_t(const char *name) { - char s[32]; - char *p; - dev_t res = 0; - int part; - + if (strcmp(name, "/dev/nfs") == 0) + return Root_NFS; + if (strcmp(name, "/dev/cifs") == 0) + return Root_CIFS; + if (strcmp(name, "/dev/ram") == 0) + return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) { - name += 9; - res = devt_from_partuuid(name); - if (!res) - goto fail; - goto done; - } else if (strncmp(name, "PARTLABEL=", 10) == 0) { - struct device *dev; - - dev = class_find_device(&block_class, NULL, name + 10, - &match_dev_by_label); - if (!dev) - goto fail; - - res = dev->devt; - put_device(dev); - goto done; - } + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5); #endif - - if (strncmp(name, "/dev/", 5) != 0) { - unsigned maj, min, offset; - char dummy; - - if ((sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) || - (sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3)) { - res = MKDEV(maj, min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto fail; - } else { - res = new_decode_dev(simple_strtoul(name, &p, 16)); - if (*p) - goto fail; - } - goto done; - } - - name += 5; - res = Root_NFS; - if (strcmp(name, "nfs") == 0) - goto done; - res = Root_CIFS; - if (strcmp(name, "cifs") == 0) - goto done; - res = Root_RAM0; - if (strcmp(name, "ram") == 0) - goto done; - - if (strlen(name) > 31) - goto fail; - strcpy(s, name); - for (p = s; *p; p++) - if (*p == '/') - *p = '!'; - res = blk_lookup_devt(s, 0); - if (res) - goto done; - - /* - * try non-existent, but valid partition, which may only exist - * after revalidating the disk, like partitioned md devices - */ - while (p > s && isdigit(p[-1])) - p--; - if (p == s || !*p || *p == '0') - goto fail; - - /* try disk name without */ - part = simple_strtoul(p, NULL, 10); - *p = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - - /* try disk name without p */ - if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - goto fail; - p[-1] = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - -fail: - return 0; -done: - return res; + return devt_from_devnum(name); } EXPORT_SYMBOL_GPL(name_to_dev_t); From e036bb8e0cdf9dbac3b76fb0a576100eaa81f0be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 19:35:28 +0100 Subject: [PATCH 232/395] init: refactor devt_from_partuuid The code in devt_from_partuuid is very convoluted. Refactor a bit by sanitizing the goto and variable name usage. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- init/do_mounts.c | 68 ++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index aef2f24461c7..afa26a4028d2 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -105,13 +105,10 @@ static int match_dev_by_uuid(struct device *dev, const void *data) */ static dev_t devt_from_partuuid(const char *uuid_str) { - dev_t res = 0; struct uuidcmp cmp; struct device *dev = NULL; - struct gendisk *disk; - struct hd_struct *part; + dev_t devt = 0; int offset = 0; - bool clear_root_wait = false; char *slash; cmp.uuid = uuid_str; @@ -120,52 +117,49 @@ static dev_t devt_from_partuuid(const char *uuid_str) /* Check for optional partition number offset attributes. */ if (slash) { char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(slash + 1, - "PARTNROFF=%d%c", &offset, &c) != 1) { - clear_root_wait = true; - goto done; - } + if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) + goto clear_root_wait; cmp.len = slash - uuid_str; } else { cmp.len = strlen(uuid_str); } - if (!cmp.len) { - clear_root_wait = true; - goto done; - } + if (!cmp.len) + goto clear_root_wait; - dev = class_find_device(&block_class, NULL, &cmp, - &match_dev_by_uuid); + dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); if (!dev) - goto done; + return 0; - res = dev->devt; + if (offset) { + /* + * Attempt to find the requested partition by adding an offset + * to the partition number found by UUID. + */ + struct hd_struct *part; - /* Attempt to find the partition by offset. */ - if (!offset) - goto no_offset; - - res = 0; - disk = part_to_disk(dev_to_part(dev)); - part = disk_get_part(disk, dev_to_part(dev)->partno + offset); - if (part) { - res = part_devt(part); - put_device(part_to_dev(part)); + part = disk_get_part(dev_to_disk(dev), + dev_to_part(dev)->partno + offset); + if (part) { + devt = part_devt(part); + put_device(part_to_dev(part)); + } + } else { + devt = dev->devt; } -no_offset: put_device(dev); -done: - if (clear_root_wait) { - pr_err("VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - if (root_wait) - pr_err("Disabling rootwait; root= is invalid.\n"); - root_wait = 0; - } - return res; + return devt; + +clear_root_wait: + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + return 0; } /** From 013b0e96ae2225a649b48a2f8fc4f87429483cb1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 19:41:48 +0100 Subject: [PATCH 233/395] init: cleanup match_dev_by_uuid and match_dev_by_label Avoid a totally pointless goto label, and use the same style of comparism for both helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- init/do_mounts.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index afa26a4028d2..5879edf083b3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -79,15 +79,10 @@ static int match_dev_by_uuid(struct device *dev, const void *data) const struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info) - goto no_match; - - if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) - goto no_match; - + if (!part->info || + strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + return 0; return 1; -no_match: - return 0; } /** @@ -174,10 +169,9 @@ static int match_dev_by_label(struct device *dev, const void *data) const char *label = data; struct hd_struct *part = dev_to_part(dev); - if (part->info && !strcmp(label, part->info->volname)) - return 1; - - return 0; + if (!part->info || strcmp(label, part->info->volname)) + return 0; + return 1; } static dev_t devt_from_partlabel(const char *label) From ec5d451438a2f24c9b9c33c195bc2c39dcd3d3f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 17 Nov 2020 12:01:26 +0100 Subject: [PATCH 234/395] block: refactor __blkdev_put Reorder the code to have one big section for the last close, and to use bdev_is_partition. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 962fabe8a67b..6016777b6483 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1742,22 +1742,22 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); - bdev_write_inode(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) + + if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); - } - if (!bdev->bd_openers) { + disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - if (bdev != bdev->bd_contains) + if (bdev_is_partition(bdev)) victim = bdev->bd_contains; bdev->bd_contains = NULL; put_disk_and_module(disk); + } else { + if (!bdev_is_partition(bdev) && disk->fops->release) + disk->fops->release(disk, mode); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); From 5b56b6ed574b583b07da9d824c1eca6d67c1074e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 10:19:22 +0100 Subject: [PATCH 235/395] block: refactor blkdev_get Move more code that is only run on the outer open but not the open of the underlying whole device when opening a partition into blkdev_get, which leads to a much easier to follow structure. This allows to simplify the disk and module refcounting so that one reference is held for each open, similar to what we do with normal file operations. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 185 +++++++++++++++++++++++-------------------------- 1 file changed, 86 insertions(+), 99 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6016777b6483..0c533ac92e24 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1407,46 +1407,12 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ - -static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, - int for_part) +static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, + int partno, fmode_t mode) { - struct block_device *whole = NULL, *claiming = NULL; - struct gendisk *disk; int ret; - int partno; - bool first_open = false, unblock_events = true, need_restart; - restart: - need_restart = false; - ret = -ENXIO; - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - goto out; - - if (partno) { - whole = bdget_disk(disk, 0); - if (!whole) { - ret = -ENOMEM; - goto out_put_disk; - } - } - - if (!for_part && (mode & FMODE_EXCL)) { - WARN_ON_ONCE(!holder); - if (whole) - claiming = whole; - else - claiming = bdev; - ret = bd_prepare_to_claim(bdev, claiming, holder); - if (ret) - goto out_put_whole; - } - - disk_block_events(disk); - mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { - first_open = true; bdev->bd_disk = disk; bdev->bd_contains = bdev; bdev->bd_partno = partno; @@ -1458,15 +1424,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, goto out_clear; ret = 0; - if (disk->fops->open) { + if (disk->fops->open) ret = disk->fops->open(bdev, mode); - /* - * If we lost a race with 'disk' being deleted, - * try again. See md.c - */ - if (ret == -ERESTARTSYS) - need_restart = true; - } if (!ret) { bd_set_nr_sectors(bdev, get_capacity(disk)); @@ -1486,14 +1445,23 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, if (ret) goto out_clear; } else { - BUG_ON(for_part); - ret = __blkdev_get(whole, mode, NULL, 1); - if (ret) + struct block_device *whole = bdget_disk(disk, 0); + + mutex_lock_nested(&whole->bd_mutex, 1); + ret = __blkdev_get(whole, disk, 0, mode); + if (ret) { + mutex_unlock(&whole->bd_mutex); + bdput(whole); goto out_clear; - bdev->bd_contains = bdgrab(whole); + } + whole->bd_part_count++; + mutex_unlock(&whole->bd_mutex); + + bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { + __blkdev_put(whole, mode, 1); ret = -ENXIO; goto out_clear; } @@ -1513,58 +1481,17 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) - goto out_unlock_bdev; + return ret; } } bdev->bd_openers++; - if (for_part) - bdev->bd_part_count++; - if (claiming) - bd_finish_claiming(bdev, claiming, holder); - - /* - * Block event polling for write claims if requested. Any write holder - * makes the write_holder state stick until all are released. This is - * good enough and tracking individual writeable reference is too - * fragile given the way @mode is used in blkdev_get/put(). - */ - if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; - unblock_events = false; - } - mutex_unlock(&bdev->bd_mutex); - - if (unblock_events) - disk_unblock_events(disk); - - /* only one opener holds refs to the module and disk */ - if (!first_open) - put_disk_and_module(disk); - if (whole) - bdput(whole); return 0; out_clear: disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; - out_unlock_bdev: - if (claiming) - bd_abort_claiming(bdev, claiming, holder); - mutex_unlock(&bdev->bd_mutex); - disk_unblock_events(disk); - out_put_whole: - if (whole) - bdput(whole); - out_put_disk: - put_disk_and_module(disk); - if (need_restart) - goto restart; - out: return ret; } @@ -1589,7 +1516,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, */ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - int ret, perm = 0; + struct block_device *claiming; + bool unblock_events = true; + struct gendisk *disk; + int perm = 0; + int partno; + int ret; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1599,13 +1531,67 @@ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) if (ret) goto bdput; - ret =__blkdev_get(bdev, mode, holder, 0); - if (ret) + /* + * If we lost a race with 'disk' being deleted, try again. See md.c. + */ +retry: + ret = -ENXIO; + disk = bdev_get_gendisk(bdev, &partno); + if (!disk) goto bdput; - return 0; + if (mode & FMODE_EXCL) { + WARN_ON_ONCE(!holder); + + ret = -ENOMEM; + claiming = bdget_disk(disk, 0); + if (!claiming) + goto put_disk; + ret = bd_prepare_to_claim(bdev, claiming, holder); + if (ret) + goto put_claiming; + } + + disk_block_events(disk); + + mutex_lock(&bdev->bd_mutex); + ret =__blkdev_get(bdev, disk, partno, mode); + if (!(mode & FMODE_EXCL)) { + ; /* nothing to do here */ + } else if (ret) { + bd_abort_claiming(bdev, claiming, holder); + } else { + bd_finish_claiming(bdev, claiming, holder); + + /* + * Block event polling for write claims if requested. Any write + * holder makes the write_holder state stick until all are + * released. This is good enough and tracking individual + * writeable reference is too fragile given the way @mode is + * used in blkdev_get/put(). + */ + if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && + (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + unblock_events = false; + } + } + mutex_unlock(&bdev->bd_mutex); + + if (unblock_events) + disk_unblock_events(disk); + +put_claiming: + if (mode & FMODE_EXCL) + bdput(claiming); +put_disk: + if (ret) + put_disk_and_module(disk); + if (ret == -ERESTARTSYS) + goto retry; bdput: - bdput(bdev); + if (ret) + bdput(bdev); return ret; } @@ -1753,8 +1739,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (bdev_is_partition(bdev)) victim = bdev->bd_contains; bdev->bd_contains = NULL; - - put_disk_and_module(disk); } else { if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); @@ -1767,6 +1751,8 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) void blkdev_put(struct block_device *bdev, fmode_t mode) { + struct gendisk *disk = bdev->bd_disk; + mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { @@ -1795,7 +1781,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * unblock evpoll if it was a write holder. */ if (bdev_free && bdev->bd_write_holder) { - disk_unblock_events(bdev->bd_disk); + disk_unblock_events(disk); bdev->bd_write_holder = false; } } @@ -1805,11 +1791,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ - disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); + disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); + put_disk_and_module(disk); } EXPORT_SYMBOL(blkdev_put); From 63d9932caecee8b0a295c608d083280b45885d10 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 09:22:18 +0100 Subject: [PATCH 236/395] block: move bdput() to the callers of __blkdev_get This will allow for a more symmetric calling convention going forward. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 0c533ac92e24..a2d5050c97ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1462,6 +1462,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { __blkdev_put(whole, mode, 1); + bdput(whole); ret = -ENXIO; goto out_clear; } @@ -1744,9 +1745,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk->fops->release(disk, mode); } mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - if (victim) + if (victim) { __blkdev_put(victim, mode, 1); + bdput(victim); + } } void blkdev_put(struct block_device *bdev, fmode_t mode) @@ -1796,6 +1798,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); + bdput(bdev); put_disk_and_module(disk); } EXPORT_SYMBOL(blkdev_put); From 7918f0f6fdafa1e52c2d77c537cb55ef25fb69a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:44:44 +0100 Subject: [PATCH 237/395] block: opencode devcgroup_inode_permission Just call devcgroup_check_permission to avoid various superflous checks and a double conversion of the access flags. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/block_dev.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a2d5050c97ee..2b8c0586314f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1520,15 +1520,13 @@ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) struct block_device *claiming; bool unblock_events = true; struct gendisk *disk; - int perm = 0; int partno; int ret; - if (mode & FMODE_READ) - perm |= MAY_READ; - if (mode & FMODE_WRITE) - perm |= MAY_WRITE; - ret = devcgroup_inode_permission(bdev->bd_inode, perm); + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + imajor(bdev->bd_inode), iminor(bdev->bd_inode), + ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | + ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) goto bdput; From 4e7b5671c6a883d94b5428e1a9c141bbd56cb2a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:38:40 +0100 Subject: [PATCH 238/395] block: remove i_bdev Switch the block device lookup interfaces to directly work with a dev_t so that struct block_device references are only acquired by the blkdev_get variants (and the blk-cgroup special case). This means that we now don't need an extra reference in the inode and can generally simplify handling of struct block_device to keep the lookups contained in the core block layer code. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Signed-off-by: Jens Axboe --- block/ioctl.c | 3 +- drivers/block/loop.c | 8 +- drivers/md/bcache/super.c | 20 +- drivers/md/dm-table.c | 9 +- drivers/mtd/mtdsuper.c | 17 +- drivers/target/target_core_file.c | 6 +- drivers/usb/gadget/function/storage_common.c | 8 +- fs/block_dev.c | 196 +++++-------------- fs/btrfs/volumes.c | 13 +- fs/inode.c | 3 - fs/internal.h | 7 +- fs/io_uring.c | 10 +- fs/pipe.c | 5 +- fs/quota/quota.c | 19 +- fs/statfs.c | 2 +- fs/super.c | 44 ++--- include/linux/blkdev.h | 2 +- include/linux/fs.h | 1 - 18 files changed, 121 insertions(+), 252 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 0c09bb7a6ff3..a6d8171221c7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -590,8 +590,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; loff_t size; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b42c728620c9..26c7aafba7c5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -675,10 +675,10 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) while (is_loop_device(f)) { struct loop_device *l; - if (f->f_mapping->host->i_bdev == bdev) + if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; - l = f->f_mapping->host->i_bdev->bd_disk->private_data; + l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->lo_state != Lo_bound) { return -EINVAL; } @@ -885,9 +885,7 @@ static void loop_config_discard(struct loop_device *lo) * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { - struct request_queue *backingq; - - backingq = bdev_get_queue(inode->i_bdev); + struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; granularity = backingq->limits.discard_granularity ?: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a6a5e21e4fd1..c55d3c58a7ef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2380,38 +2380,38 @@ kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); -static bool bch_is_open_backing(struct block_device *bdev) +static bool bch_is_open_backing(dev_t dev) { struct cache_set *c, *tc; struct cached_dev *dc, *t; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; return false; } -static bool bch_is_open_cache(struct block_device *bdev) +static bool bch_is_open_cache(dev_t dev) { struct cache_set *c, *tc; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { struct cache *ca = c->cache; - if (ca->bdev == bdev) + if (ca->bdev->bd_dev == dev) return true; } return false; } -static bool bch_is_open(struct block_device *bdev) +static bool bch_is_open(dev_t dev) { - return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); + return bch_is_open_cache(dev) || bch_is_open_backing(dev); } struct async_reg_args { @@ -2535,9 +2535,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, sb); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(strim(path)); + dev_t dev; + mutex_lock(&bch_register_lock); - if (!IS_ERR(bdev) && bch_is_open(bdev)) + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) err = "device already registered"; else err = "device busy"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ce543b761be7..dea677721710 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -348,16 +348,9 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, dev_t dm_get_dev_t(const char *path) { dev_t dev; - struct block_device *bdev; - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) + if (lookup_bdev(path, &dev)) dev = name_to_dev_t(path); - else { - dev = bdev->bd_dev; - bdput(bdev); - } - return dev; } EXPORT_SYMBOL_GPL(dm_get_dev_t); diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index c3e2098372f2..38b6aa849c63 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -120,8 +120,8 @@ int get_tree_mtd(struct fs_context *fc, struct fs_context *fc)) { #ifdef CONFIG_BLOCK - struct block_device *bdev; - int ret, major; + dev_t dev; + int ret; #endif int mtdnr; @@ -169,20 +169,15 @@ int get_tree_mtd(struct fs_context *fc, /* try the old way - the hack where we allowed users to mount * /dev/mtdblock$(n) but didn't actually _use_ the blockdev */ - bdev = lookup_bdev(fc->source); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = lookup_bdev(fc->source, &dev); + if (ret) { errorf(fc, "MTD: Couldn't look up '%s': %d", fc->source, ret); return ret; } pr_debug("MTDSB: lookup_bdev() returned 0\n"); - major = MAJOR(bdev->bd_dev); - mtdnr = MINOR(bdev->bd_dev); - bdput(bdev); - - if (major == MTD_BLOCK_MAJOR) - return mtd_get_sb_by_nr(fc, mtdnr, fill_super); + if (MAJOR(dev) == MTD_BLOCK_MAJOR) + return mtd_get_sb_by_nr(fc, MINOR(dev), fill_super); #endif /* CONFIG_BLOCK */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7143d03f0e02..b0cb5b95e892 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -133,10 +133,10 @@ static int fd_configure_device(struct se_device *dev) */ inode = file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { - struct request_queue *q = bdev_get_queue(inode->i_bdev); + struct request_queue *q = bdev_get_queue(I_BDEV(inode)); unsigned long long dev_size; - fd_dev->fd_block_size = bdev_logical_block_size(inode->i_bdev); + fd_dev->fd_block_size = bdev_logical_block_size(I_BDEV(inode)); /* * Determine the number of bytes from i_size_read() minus * one (1) logical sector from underlying struct block_device @@ -559,7 +559,7 @@ fd_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) if (S_ISBLK(inode->i_mode)) { /* The backend is block device, use discard */ - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(inode); struct se_device *dev = cmd->se_dev; ret = blkdev_issue_discard(bdev, diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index f7e6c42558eb..b859a158a414 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -204,7 +204,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_WRITE)) ro = 1; - inode = file_inode(filp); + inode = filp->f_mapping->host; if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { LINFO(curlun, "invalid file type: %s\n", filename); goto out; @@ -221,7 +221,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_CAN_WRITE)) ro = 1; - size = i_size_read(inode->i_mapping->host); + size = i_size_read(inode); if (size < 0) { LINFO(curlun, "unable to find file size: %s\n", filename); rc = (int) size; @@ -231,8 +231,8 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (curlun->cdrom) { blksize = 2048; blkbits = 11; - } else if (inode->i_bdev) { - blksize = bdev_logical_block_size(inode->i_bdev); + } else if (S_ISBLK(inode->i_mode)) { + blksize = bdev_logical_block_size(I_BDEV(inode)); blkbits = blksize_bits(blksize); } else { blksize = 512; diff --git a/fs/block_dev.c b/fs/block_dev.c index 2b8c0586314f..6d6e4d50834c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -883,7 +883,6 @@ static struct block_device *bdget(dev_t dev) bdev->bd_dev = dev; inode->i_mode = S_IFBLK; inode->i_rdev = dev; - inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); @@ -928,67 +927,8 @@ void bdput(struct block_device *bdev) { iput(bdev->bd_inode); } - EXPORT_SYMBOL(bdput); -static struct block_device *bd_acquire(struct inode *inode) -{ - struct block_device *bdev; - - spin_lock(&bdev_lock); - bdev = inode->i_bdev; - if (bdev && !inode_unhashed(bdev->bd_inode)) { - bdgrab(bdev); - spin_unlock(&bdev_lock); - return bdev; - } - spin_unlock(&bdev_lock); - - /* - * i_bdev references block device inode that was already shut down - * (corresponding device got removed). Remove the reference and look - * up block device inode again just in case new device got - * reestablished under the same device number. - */ - if (bdev) - bd_forget(inode); - - bdev = bdget(inode->i_rdev); - if (bdev) { - spin_lock(&bdev_lock); - if (!inode->i_bdev) { - /* - * We take an additional reference to bd_inode, - * and it's released in clear_inode() of inode. - * So, we can access it via ->i_mapping always - * without igrab(). - */ - bdgrab(bdev); - inode->i_bdev = bdev; - inode->i_mapping = bdev->bd_inode->i_mapping; - } - spin_unlock(&bdev_lock); - } - return bdev; -} - -/* Call when you free inode */ - -void bd_forget(struct inode *inode) -{ - struct block_device *bdev = NULL; - - spin_lock(&bdev_lock); - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - inode->i_bdev = NULL; - inode->i_mapping = &inode->i_data; - spin_unlock(&bdev_lock); - - if (bdev) - bdput(bdev); -} - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1497,38 +1437,45 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, } /** - * blkdev_get - open a block device - * @bdev: block_device to open + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is - * open with exclusive access. Specifying %FMODE_EXCL with %NULL - * @holder is invalid. Exclusive opens may nest for the same @holder. + * Open the block device described by device number @dev. If @mode includes + * %FMODE_EXCL, the block device is opened with exclusive access. Specifying + * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for + * the same @holder. * - * On success, the reference count of @bdev is unchanged. On failure, - * @bdev is put. + * Use this interface ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a device + * number. Everything else should use blkdev_get_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * 0 on success, -errno on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { struct block_device *claiming; bool unblock_events = true; + struct block_device *bdev; struct gendisk *disk; int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - imajor(bdev->bd_inode), iminor(bdev->bd_inode), + MAJOR(dev), MINOR(dev), ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - goto bdput; + return ERR_PTR(ret); + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); /* * If we lost a race with 'disk' being deleted, try again. See md.c. @@ -1589,10 +1536,13 @@ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) if (ret == -ERESTARTSYS) goto retry; bdput: - if (ret) + if (ret) { bdput(bdev); - return ret; + return ERR_PTR(ret); + } + return bdev; } +EXPORT_SYMBOL(blkdev_get_by_dev); /** * blkdev_get_by_path - open a block device by name @@ -1600,32 +1550,30 @@ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open the blockdevice described by the device file at @path. @mode - * and @holder are identical to blkdev_get(). - * - * On success, the returned block_device has reference count of one. + * Open the block device described by the device file at @path. If @mode + * includes %FMODE_EXCL, the block device is opened with exclusive access. + * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may + * nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; - int err; + dev_t dev; + int error; - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); - - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + bdev = blkdev_get_by_dev(dev, mode, holder); + if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); } @@ -1634,45 +1582,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -/** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the blockdevice described by device number @dev. @mode and - * @holder are identical to blkdev_get(). - * - * Use it ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a - * device number. _Never_ to be used for internal purposes. If you - * ever need it - reconsider your API. - * - * On success, the returned block_device has reference count of one. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int err; - - bdev = bdget(dev); - if (!bdev) - return ERR_PTR(-ENOMEM); - - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_dev); - static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1694,14 +1603,12 @@ static int blkdev_open(struct inode * inode, struct file * filp) if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = bd_acquire(inode); - if (bdev == NULL) - return -ENOMEM; - + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - - return blkdev_get(bdev, filp->f_mode, filp); + return 0; } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -2010,37 +1917,32 @@ const struct file_operations def_blk_fops = { * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ -struct block_device *lookup_bdev(const char *pathname) +int lookup_bdev(const char *pathname, dev_t *dev) { - struct block_device *bdev; struct inode *inode; struct path path; int error; if (!pathname || !*pathname) - return ERR_PTR(-EINVAL); + return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) - return ERR_PTR(error); + return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) - goto fail; + goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) - goto fail; - error = -ENOMEM; - bdev = bd_acquire(inode); - if (!bdev) - goto fail; -out: + goto out_path_put; + + *dev = inode->i_rdev; + error = 0; +out_path_put: path_put(&path); - return bdev; -fail: - bdev = ERR_PTR(error); - goto out; + return error; } EXPORT_SYMBOL(lookup_bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6406b3b8c2b..fbc4b58228f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -929,16 +929,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, * make sure it's the same device if the device is mounted */ if (device->bdev) { - struct block_device *path_bdev; + int error; + dev_t path_dev; - path_bdev = lookup_bdev(path); - if (IS_ERR(path_bdev)) { + error = lookup_bdev(path, &path_dev); + if (error) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_CAST(path_bdev); + return ERR_PTR(error); } - if (device->bdev != path_bdev) { - bdput(path_bdev); + if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", @@ -947,7 +947,6 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..cb008acf0efd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -155,7 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; - inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; @@ -580,8 +579,6 @@ static void evict(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } - if (S_ISBLK(inode->i_mode) && inode->i_bdev) - bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); diff --git a/fs/internal.h b/fs/internal.h index 47be21dfeebe..53f890446e75 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,7 +25,6 @@ extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); -void bd_forget(struct inode *inode); #else static inline void bdev_cache_init(void) { @@ -43,9 +42,6 @@ static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; } -static inline void bd_forget(struct inode *inode) -{ -} #endif /* CONFIG_BLOCK */ /* @@ -114,8 +110,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -struct super_block *__get_super(struct block_device *bdev, bool excl); -extern struct super_block *user_get_super(dev_t); +struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ead291b2976..8f13c0417f94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2716,11 +2716,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) static bool io_bdev_nowait(struct block_device *bdev) { -#ifdef CONFIG_BLOCK return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); -#else - return true; -#endif } /* @@ -2733,14 +2729,16 @@ static bool io_file_supports_async(struct file *file, int rw) umode_t mode = file_inode(file)->i_mode; if (S_ISBLK(mode)) { - if (io_bdev_nowait(file->f_inode->i_bdev)) + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(I_BDEV(file->f_mapping->host))) return true; return false; } if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { - if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(file->f_inode->i_sb->s_bdev) && file->f_op != &io_uring_fops) return true; return false; diff --git a/fs/pipe.c b/fs/pipe.c index 0ac197658a2d..c5989cfd564d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1342,9 +1342,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) } /* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. + * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is + * not enough to verify that this is a pipe. */ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f3d32b0d9008..6d16b2be5ac4 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -866,17 +866,18 @@ static bool quotactl_cmd_onoff(int cmd) static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK - struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); bool excl = false, thawed = false; + int error; + dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); - bdev = lookup_bdev(tmp->name); + error = lookup_bdev(tmp->name, &dev); putname(tmp); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + if (error) + return ERR_PTR(error); if (quotactl_cmd_onoff(cmd)) { excl = true; @@ -886,8 +887,10 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) } retry: - sb = __get_super(bdev, excl); - if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) { + sb = user_get_super(dev, excl); + if (!sb) + return ERR_PTR(-ENODEV); + if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { if (excl) up_write(&sb->s_umount); else @@ -897,10 +900,6 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) put_super(sb); goto retry; } - - bdput(bdev); - if (!sb) - return ERR_PTR(-ENODEV); return sb; #else diff --git a/fs/statfs.c b/fs/statfs.c index 59f33752c131..68cb07788750 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -235,7 +235,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { - struct super_block *s = user_get_super(dev); + struct super_block *s = user_get_super(dev, false); int err; if (!s) return -EINVAL; diff --git a/fs/super.c b/fs/super.c index 343e5c1e538d..2c6cdea2ab2d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -740,7 +740,14 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -struct super_block *__get_super(struct block_device *bdev, bool excl) +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -755,17 +762,11 @@ struct super_block *__get_super(struct block_device *bdev, bool excl) if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - if (!excl) - down_read(&sb->s_umount); - else - down_write(&sb->s_umount); + down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - if (!excl) - up_read(&sb->s_umount); - else - up_write(&sb->s_umount); + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -776,19 +777,6 @@ struct super_block *__get_super(struct block_device *bdev, bool excl) return NULL; } -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - return __get_super(bdev, false); -} -EXPORT_SYMBOL(get_super); - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for @@ -820,7 +808,7 @@ struct super_block *get_active_super(struct block_device *bdev) return NULL; } -struct super_block *user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; @@ -832,11 +820,17 @@ struct super_block *user_get_super(dev_t dev) if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - up_read(&sb->s_umount); + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12810a19edeb..bdd7339bcda4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1973,7 +1973,7 @@ int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size); const char *bdevname(struct block_device *bdev, char *buffer); -struct block_device *lookup_bdev(const char *); +int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); diff --git a/include/linux/fs.h b/include/linux/fs.h index a61df0dd4f19..b0b358309657 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -696,7 +696,6 @@ struct inode { struct list_head i_devices; union { struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; From 22ae8ce8b89241c94ac00c237752c0ffa37ba5ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 09:23:26 +0100 Subject: [PATCH 239/395] block: simplify bdev/disk lookup in blkdev_get To simplify block device lookup and a few other upcoming areas, make sure that we always have a struct block_device available for each disk and each partition, and only find existing block devices in bdget. The only downside of this is that each device and partition uses a little more memory. The upside will be that a lot of code can be simplified. With that all we need to look up the block device is to lookup the inode and do a few sanity checks on the gendisk, instead of the separate lookup for the gendisk. For blk-cgroup which wants to access a gendisk without opening it, a new blkdev_{get,put}_no_open low-level interface is added to replace the previous get_gendisk use. Note that the change to look up block device directly instead of the two step lookup using struct gendisk causes a subtile change in behavior: accessing a non-existing partition on an existing block device can now cause a call to request_module. That call is harmless, and in practice no recent system will access these nodes as they aren't created by udev and static /dev/ setups are unusual. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 42 ++++---- block/blk-iocost.c | 36 +++---- block/blk.h | 2 +- block/genhd.c | 212 +++++-------------------------------- block/partitions/core.c | 29 ++--- fs/block_dev.c | 177 ++++++++++++++++++------------- include/linux/blk-cgroup.h | 4 +- include/linux/blkdev.h | 6 ++ include/linux/genhd.h | 7 +- 9 files changed, 195 insertions(+), 320 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c68bdf58c9a6..ad02289a4f7f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, } /** - * blkg_conf_prep - parse and prepare for per-blkg config update + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is + * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason. */ -struct gendisk *blkcg_conf_get_disk(char **inputp) +struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); @@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) return ERR_PTR(-EINVAL); input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; - return disk; + return bdev; } /** @@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { - struct gendisk *disk; + struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - q = disk->queue; + q = bdev->bd_disk->queue; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->disk = disk; + ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; @@ -700,7 +700,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); - put_disk_and_module(ctx->disk); + blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bbe86d1199dc..8e20fe4bddec 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3120,23 +3120,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3231,12 +3231,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3287,23 +3287,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3356,13 +3356,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } diff --git a/block/blk.h b/block/blk.h index dfab98465db9..c4839abcfa27 100644 --- a/block/blk.h +++ b/block/blk.h @@ -352,7 +352,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); -void blk_invalidate_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 @@ -384,6 +383,7 @@ static inline void hd_free_part(struct hd_struct *part) { free_percpu(part->dkstats); kfree(part->info); + bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index f46e89226fdf..bf8fa82f135f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,17 +27,11 @@ static struct kobject *block_depr; -static DEFINE_XARRAY(bdev_map); -static DEFINE_MUTEX(bdev_map_lock); +DECLARE_RWSEM(bdev_lookup_sem); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_lock prevents look up - * results from going away underneath its user. - */ -static DEFINE_SPINLOCK(ext_devt_lock); -static DEFINE_IDR(ext_devt_idr); +static DEFINE_IDA(ext_devt_ida); static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -580,14 +574,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return 0; } - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -606,26 +593,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} - -/* - * We invalidate devt by assigning NULL pointer for devt in idr. - */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } + if (MAJOR(devt) == BLOCK_EXT_MAJOR) + ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); } static char *bdevt_str(dev_t devt, char *buf) @@ -640,28 +609,6 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -static void blk_register_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) { - if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) - WARN_ON_ONCE(1); - } - mutex_unlock(&bdev_map_lock); -} - -static void blk_unregister_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) - xa_erase(&bdev_map, disk_devt(disk) + i); - mutex_unlock(&bdev_map_lock); -} - static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -805,7 +752,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk); + bdev_add(disk->part0.bdev, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -847,8 +794,8 @@ static void invalidate_partition(struct gendisk *disk, int partno) __invalidate_device(bdev, true); /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); bdput(bdev); @@ -890,7 +837,8 @@ void del_gendisk(struct gendisk *disk) * Block lookups of the disk until all bdevs are unhashed and the * disk is marked as dead (GENHD_FL_UP cleared). */ - down_write(&disk->lookup_sem); + down_write(&bdev_lookup_sem); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -903,7 +851,7 @@ void del_gendisk(struct gendisk *disk) invalidate_partition(disk, 0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); + up_write(&bdev_lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -916,16 +864,6 @@ void del_gendisk(struct gendisk *disk) } blk_unregister_queue(disk); - - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(disk_devt(disk)); kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); @@ -964,7 +902,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -static void request_gendisk_module(dev_t devt) +void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -984,84 +922,6 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static bool get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - - if (!disk->fops) - return false; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return false; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { - module_put(owner); - return false; - } - return true; - -} - -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. - * - * Context: can sleep - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) -{ - struct gendisk *disk = NULL; - - might_sleep(); - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - if (!disk) { - mutex_unlock(&bdev_map_lock); - request_gendisk_module(devt); - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - } - if (disk && !get_disk_and_module(disk)) - disk = NULL; - if (disk) - *partno = devt - disk_devt(disk); - mutex_unlock(&bdev_map_lock); - } else { - struct hd_struct *part; - - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); - } - spin_unlock_bh(&ext_devt_lock); - } - - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; -} - /** * bdget_disk - do bdget() by gendisk and partition number * @disk: gendisk of interest @@ -1559,11 +1419,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * * This function releases all allocated resources of the gendisk. * - * The struct gendisk refcount is incremented with get_gendisk() or - * get_disk_and_module(), and its refcount is decremented with - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this - * function is called. - * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the @@ -1748,16 +1603,17 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) + disk->part0.bdev = bdev_alloc(disk, 0); + if (!disk->part0.bdev) goto out_free_disk; - init_rwsem(&disk->lookup_sem); + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) + goto out_bdput; + disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - goto out_free_disk; - } + if (disk_expand_part_tbl(disk, 0)) + goto out_free_bdstats; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); @@ -1773,7 +1629,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) */ hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_part0; + goto out_free_bdstats; disk->minors = minors; rand_initialize_disk(disk); @@ -1782,8 +1638,10 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_part0: - hd_free_part(&disk->part0); +out_free_bdstats: + free_percpu(disk->part0.dkstats); +out_bdput: + bdput(disk->part0.bdev); out_free_disk: kfree(disk); return NULL; @@ -1807,26 +1665,6 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/** - * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to decrement the refcount for - * - * This is a counterpart of get_disk_and_module() and thus also of - * get_gendisk(). - * - * Context: Any context, but the last reference must not be dropped from - * atomic context. - */ -void put_disk_and_module(struct gendisk *disk) -{ - if (disk) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - } -} - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/block/partitions/core.c b/block/partitions/core.c index a02e22411594..696bd9ff63c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -340,12 +340,11 @@ void delete_partition(struct hd_struct *part) device_del(part_to_dev(part)); /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. */ - blk_invalidate_devt(part_devt(part)); + remove_inode_hash(part->bdev->bd_inode); + percpu_ref_kill(&part->ref); } @@ -368,6 +367,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; + struct block_device *bdev; struct disk_part_tbl *ptbl; const char *dname; int err; @@ -402,11 +402,15 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); + err = -ENOMEM; p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; + if (!p->dkstats) goto out_free; - } + + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto out_free_stats; + p->bdev = bdev; hd_sects_seq_init(p); pdev = part_to_dev(p); @@ -420,10 +424,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, struct partition_meta_info *pinfo; pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } + if (!pinfo) + goto out_bdput; memcpy(pinfo, info, sizeof(*info)); p->info = pinfo; } @@ -470,6 +472,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ @@ -479,6 +482,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); +out_bdput: + bdput(bdev); out_free_stats: free_percpu(p->dkstats); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d6e4d50834c..b350ed3af83b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -863,33 +863,48 @@ void __init bdev_cache_init(void) blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -static struct block_device *bdget(dev_t dev) +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; - inode = iget_locked(blockdev_superblock, dev); + inode = new_inode(blockdev_superblock); if (!inode) return NULL; + inode->i_mode = S_IFBLK; + inode->i_rdev = 0; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); - bdev = &BDEV_I(inode)->bdev; - - if (inode->i_state & I_NEW) { - spin_lock_init(&bdev->bd_size_lock); - bdev->bd_contains = NULL; - bdev->bd_super = NULL; - bdev->bd_inode = inode; - bdev->bd_part_count = 0; - bdev->bd_dev = dev; - inode->i_mode = S_IFBLK; - inode->i_rdev = dev; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } + bdev = I_BDEV(inode); + spin_lock_init(&bdev->bd_size_lock); + bdev->bd_disk = disk; + bdev->bd_partno = partno; + bdev->bd_contains = NULL; + bdev->bd_super = NULL; + bdev->bd_inode = inode; + bdev->bd_part_count = 0; return bdev; } +void bdev_add(struct block_device *bdev, dev_t dev) +{ + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; + insert_inode_hash(bdev->bd_inode); +} + +static struct block_device *bdget(dev_t dev) +{ + struct inode *inode; + + inode = ilookup(blockdev_superblock, dev); + if (!inode) + return NULL; + return &BDEV_I(inode)->bdev; +} + /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. @@ -1004,27 +1019,6 @@ int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ -static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) -{ - struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); - - if (!disk) - return NULL; - /* - * Now that we hold gendisk reference we make sure bdev we looked up is - * not stale. If it is, it means device got removed and created before - * we looked up gendisk and we fail open in such case. Associating - * unhashed bdev with newly created gendisk could lead to two bdevs - * (and thus two independent caches) being associated with one device - * which is bad. - */ - if (inode_unhashed(bdev->bd_inode)) { - put_disk_and_module(disk); - return NULL; - } - return disk; -} - static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); @@ -1347,19 +1341,17 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ -static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, - int partno, fmode_t mode) +static int __blkdev_get(struct block_device *bdev, fmode_t mode) { + struct gendisk *disk = bdev->bd_disk; int ret; if (!bdev->bd_openers) { - bdev->bd_disk = disk; bdev->bd_contains = bdev; - bdev->bd_partno = partno; - if (!partno) { + if (!bdev->bd_partno) { ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, 0); if (!bdev->bd_part) goto out_clear; @@ -1388,7 +1380,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, struct block_device *whole = bdget_disk(disk, 0); mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, disk, 0, mode); + ret = __blkdev_get(whole, mode); if (ret) { mutex_unlock(&whole->bd_mutex); bdput(whole); @@ -1398,7 +1390,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, mutex_unlock(&whole->bd_mutex); bdev->bd_contains = whole; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { __blkdev_put(whole, mode, 1); @@ -1430,12 +1422,53 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, out_clear: disk_put_part(bdev->bd_part); - bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_contains = NULL; return ret; } +struct block_device *blkdev_get_no_open(dev_t dev) +{ + struct block_device *bdev; + struct gendisk *disk; + + down_read(&bdev_lookup_sem); + bdev = bdget(dev); + if (!bdev) { + up_read(&bdev_lookup_sem); + blk_request_module(dev); + down_read(&bdev_lookup_sem); + + bdev = bdget(dev); + if (!bdev) + goto unlock; + } + + disk = bdev->bd_disk; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) + goto bdput; + if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + goto put_disk; + if (!try_module_get(bdev->bd_disk->fops->owner)) + goto put_disk; + up_read(&bdev_lookup_sem); + return bdev; +put_disk: + put_disk(disk); +bdput: + bdput(bdev); +unlock: + up_read(&bdev_lookup_sem); + return NULL; +} + +void blkdev_put_no_open(struct block_device *bdev) +{ + module_put(bdev->bd_disk->fops->owner); + put_disk(bdev->bd_disk); + bdput(bdev); +} + /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open @@ -1463,7 +1496,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; - int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, @@ -1473,18 +1505,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - bdev = bdget(dev); - if (!bdev) - return ERR_PTR(-ENOMEM); - /* * If we lost a race with 'disk' being deleted, try again. See md.c. */ retry: - ret = -ENXIO; - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - goto bdput; + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); + disk = bdev->bd_disk; if (mode & FMODE_EXCL) { WARN_ON_ONCE(!holder); @@ -1492,7 +1520,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) ret = -ENOMEM; claiming = bdget_disk(disk, 0); if (!claiming) - goto put_disk; + goto put_blkdev; ret = bd_prepare_to_claim(bdev, claiming, holder); if (ret) goto put_claiming; @@ -1501,12 +1529,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, disk, partno, mode); - if (!(mode & FMODE_EXCL)) { - ; /* nothing to do here */ - } else if (ret) { - bd_abort_claiming(bdev, claiming, holder); - } else { + ret =__blkdev_get(bdev, mode); + if (ret) + goto abort_claiming; + if (mode & FMODE_EXCL) { bd_finish_claiming(bdev, claiming, holder); /* @@ -1526,21 +1552,23 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (unblock_events) disk_unblock_events(disk); + if (mode & FMODE_EXCL) + bdput(claiming); + return bdev; +abort_claiming: + if (mode & FMODE_EXCL) + bd_abort_claiming(bdev, claiming, holder); + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); put_claiming: if (mode & FMODE_EXCL) bdput(claiming); -put_disk: - if (ret) - put_disk_and_module(disk); +put_blkdev: + blkdev_put_no_open(bdev); if (ret == -ERESTARTSYS) goto retry; -bdput: - if (ret) { - bdput(bdev); - return ERR_PTR(ret); - } - return bdev; + return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); @@ -1641,7 +1669,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - bdev->bd_disk = NULL; if (bdev_is_partition(bdev)) victim = bdev->bd_contains; bdev->bd_contains = NULL; @@ -1699,12 +1726,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); - bdput(bdev); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c8fc9792ac77..b9f3c246c3c9 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -197,12 +197,12 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { - struct gendisk *disk; + struct block_device *bdev; struct blkcg_gq *blkg; char *body; }; -struct gendisk *blkcg_conf_get_disk(char **inputp); +struct block_device *blkcg_conf_open_bdev(char **inputp); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd7339bcda4..5d48b92f5e43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1994,6 +1994,12 @@ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); +/* just for blk-cgroup, don't use elsewhere */ +struct block_device *blkdev_get_no_open(dev_t dev); +void blkdev_put_no_open(struct block_device *bdev); + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ca5e356084c3..42a51653c730 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,6 +65,7 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; + struct block_device *bdev; struct device __dev; struct kobject *holder_dir; int policy, partno; @@ -193,7 +194,6 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 - struct rw_semaphore lookup_sem; struct kobject *slave_dir; struct timer_rand_state *random; @@ -300,7 +300,6 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); -extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); extern void set_disk_ro(struct gendisk *disk, int flag); @@ -338,7 +337,6 @@ int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern void put_disk(struct gendisk *disk); -extern void put_disk_and_module(struct gendisk *disk); #define alloc_disk_node(minors, node_id) \ ({ \ @@ -388,7 +386,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +extern struct rw_semaphore bdev_lookup_sem; + dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK void printk_all_partitions(void); #else /* CONFIG_BLOCK */ From a954ea812018a84d350b316c39a2be3edc4b7ca8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:29:55 +0100 Subject: [PATCH 240/395] block: remove ->bd_contains Now that each hd_struct has a reference to the corresponding block_device, there is no need for the bd_contains pointer. Add a bdev_whole() helper to look up the whole device block_device struture instead. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- drivers/scsi/scsicam.c | 2 +- fs/block_dev.c | 22 ++++++++-------------- include/linux/blk_types.h | 4 +++- 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 26c7aafba7c5..c0df88b3300c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1088,7 +1088,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev->bd_contains; + claimed_bdev = bdev_whole(bdev); error = bd_prepare_to_claim(bdev, claimed_bdev, loop_configure); if (error) goto out_putf; diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 682cf08ab041..f1553a453616 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -32,7 +32,7 @@ */ unsigned char *scsi_bios_ptable(struct block_device *dev) { - struct address_space *mapping = dev->bd_contains->bd_inode->i_mapping; + struct address_space *mapping = bdev_whole(dev)->bd_inode->i_mapping; unsigned char *res = NULL; struct page *page; diff --git a/fs/block_dev.c b/fs/block_dev.c index b350ed3af83b..94baee369d26 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -119,7 +119,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, * under live filesystem. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev->bd_contains; + claimed_bdev = bdev_whole(bdev); err = bd_prepare_to_claim(bdev, claimed_bdev, truncate_bdev_range); if (err) @@ -880,7 +880,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; bdev->bd_partno = partno; - bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; @@ -1347,9 +1346,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) int ret; if (!bdev->bd_openers) { - bdev->bd_contains = bdev; - - if (!bdev->bd_partno) { + if (!bdev_is_partition(bdev)) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, 0); if (!bdev->bd_part) @@ -1389,7 +1386,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1405,7 +1401,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (bdev->bd_bdi == &noop_backing_dev_info) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { - if (bdev->bd_contains == bdev) { + if (!bdev_is_partition(bdev)) { ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); @@ -1423,7 +1419,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) out_clear: disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - bdev->bd_contains = NULL; return ret; } @@ -1670,8 +1665,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; if (bdev_is_partition(bdev)) - victim = bdev->bd_contains; - bdev->bd_contains = NULL; + victim = bdev_whole(bdev); } else { if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); @@ -1690,6 +1684,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { + struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* @@ -1700,13 +1695,12 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); - /* bd_contains might point to self, check in a separate step */ if ((bdev_free = !bdev->bd_holders)) bdev->bd_holder = NULL; - if (!bdev->bd_contains->bd_holders) - bdev->bd_contains->bd_holder = NULL; + if (!whole->bd_holders) + whole->bd_holder = NULL; spin_unlock(&bdev_lock); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 9698f459cc65..2e0a9bd9688d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -32,7 +32,6 @@ struct block_device { #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif - struct block_device * bd_contains; u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ @@ -49,6 +48,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; } __randomize_layout; +#define bdev_whole(_bdev) \ + ((_bdev)->bd_disk->part0.bdev) + #define bdev_kobj(_bdev) \ (&part_to_dev((_bdev)->bd_part)->kobj) From 37c3fc9abb25cd767ad5b048358336ac89488c16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Nov 2020 21:20:08 +0100 Subject: [PATCH 241/395] block: simplify the block device claiming interface Stop passing the whole device as a separate argument given that it can be trivially deducted and cleanup the !holder debug check. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 +++++----- fs/block_dev.c | 51 +++++++++++++++--------------------------- include/linux/blkdev.h | 6 ++--- 3 files changed, 25 insertions(+), 44 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c0df88b3300c..d643c67be6ac 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1069,7 +1069,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, struct file *file; struct inode *inode; struct address_space *mapping; - struct block_device *claimed_bdev = NULL; int error; loff_t size; bool partscan; @@ -1088,8 +1087,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev_whole(bdev); - error = bd_prepare_to_claim(bdev, claimed_bdev, loop_configure); + error = bd_prepare_to_claim(bdev, loop_configure); if (error) goto out_putf; } @@ -1176,15 +1174,15 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, mutex_unlock(&loop_ctl_mutex); if (partscan) loop_reread_partitions(lo, bdev); - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_configure); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, loop_configure); return 0; out_unlock: mutex_unlock(&loop_ctl_mutex); out_bdev: - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, loop_configure); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); out: diff --git a/fs/block_dev.c b/fs/block_dev.c index 94baee369d26..0569f5ebeb6f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -110,24 +110,20 @@ EXPORT_SYMBOL(invalidate_bdev); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend) { - struct block_device *claimed_bdev = NULL; - int err; - /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & FMODE_EXCL)) { - claimed_bdev = bdev_whole(bdev); - err = bd_prepare_to_claim(bdev, claimed_bdev, - truncate_bdev_range); + int err = bd_prepare_to_claim(bdev, truncate_bdev_range); if (err) return err; } + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(bdev, truncate_bdev_range); return 0; } EXPORT_SYMBOL(truncate_bdev_range); @@ -978,7 +974,6 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest - * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Claim @bdev. This function fails if @bdev is already claimed by another @@ -988,9 +983,12 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ -int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, - void *holder) +int bd_prepare_to_claim(struct block_device *bdev, void *holder) { + struct block_device *whole = bdev_whole(bdev); + + if (WARN_ON_ONCE(!holder)) + return -EINVAL; retry: spin_lock(&bdev_lock); /* if someone else claimed, fail */ @@ -1030,15 +1028,15 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest - * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -static void bd_finish_claiming(struct block_device *bdev, - struct block_device *whole, void *holder) +static void bd_finish_claiming(struct block_device *bdev, void *holder) { + struct block_device *whole = bdev_whole(bdev); + spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); /* @@ -1063,11 +1061,10 @@ static void bd_finish_claiming(struct block_device *bdev, * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ -void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, - void *holder) +void bd_abort_claiming(struct block_device *bdev, void *holder) { spin_lock(&bdev_lock); - bd_clear_claiming(whole, holder); + bd_clear_claiming(bdev_whole(bdev), holder); spin_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); @@ -1487,7 +1484,6 @@ void blkdev_put_no_open(struct block_device *bdev) */ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { - struct block_device *claiming; bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; @@ -1510,15 +1506,9 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk = bdev->bd_disk; if (mode & FMODE_EXCL) { - WARN_ON_ONCE(!holder); - - ret = -ENOMEM; - claiming = bdget_disk(disk, 0); - if (!claiming) - goto put_blkdev; - ret = bd_prepare_to_claim(bdev, claiming, holder); + ret = bd_prepare_to_claim(bdev, holder); if (ret) - goto put_claiming; + goto put_blkdev; } disk_block_events(disk); @@ -1528,7 +1518,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) goto abort_claiming; if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, claiming, holder); + bd_finish_claiming(bdev, holder); /* * Block event polling for write claims if requested. Any write @@ -1547,18 +1537,13 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (unblock_events) disk_unblock_events(disk); - if (mode & FMODE_EXCL) - bdput(claiming); return bdev; abort_claiming: if (mode & FMODE_EXCL) - bd_abort_claiming(bdev, claiming, holder); + bd_abort_claiming(bdev, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); -put_claiming: - if (mode & FMODE_EXCL) - bdput(claiming); put_blkdev: blkdev_put_no_open(bdev); if (ret == -ERESTARTSYS) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5d48b92f5e43..43a25d855e04 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1988,10 +1988,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); -int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, - void *holder); -void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, - void *holder); +int bd_prepare_to_claim(struct block_device *bdev, void *holder); +void bd_abort_claiming(struct block_device *bdev, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); /* just for blk-cgroup, don't use elsewhere */ From c64dc3bd87097e7f08b9437819440f8bfddef995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:38:15 +0100 Subject: [PATCH 242/395] block: simplify part_to_disk Now that struct hd_struct has a block_device pointer use that to find the disk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 42a51653c730..6ba91ee54cb2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -218,13 +218,9 @@ struct gendisk { static inline struct gendisk *part_to_disk(struct hd_struct *part) { - if (likely(part)) { - if (part->partno) - return dev_to_disk(part_to_dev(part)->parent); - else - return dev_to_disk(part_to_dev(part)); - } - return NULL; + if (unlikely(!part)) + return NULL; + return part->bdev->bd_disk; } static inline int disk_max_parts(struct gendisk *disk) From e6cb53827ed60019bbbc5cf189dd204b3b0e8121 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 15:41:40 +0100 Subject: [PATCH 243/395] block: initialize struct block_device in bdev_alloc Don't play tricks with slab constructors as bdev structures tends to not get reused very much, and this makes the code a lot less error prone. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 0569f5ebeb6f..a5b6955a841f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -784,20 +784,11 @@ static void bdev_free_inode(struct inode *inode) kmem_cache_free(bdev_cachep, BDEV_I(inode)); } -static void init_once(void *foo) +static void init_once(void *data) { - struct bdev_inode *ei = (struct bdev_inode *) foo; - struct block_device *bdev = &ei->bdev; + struct bdev_inode *ei = data; - memset(bdev, 0, sizeof(*bdev)); - mutex_init(&bdev->bd_mutex); -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif - bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); - /* Initialize mutex for freeze. */ - mutex_init(&bdev->bd_fsfreeze_mutex); } static void bdev_evict_inode(struct inode *inode) @@ -873,12 +864,17 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); + memset(bdev, 0, sizeof(*bdev)); + mutex_init(&bdev->bd_mutex); + mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; bdev->bd_partno = partno; - bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_part_count = 0; + bdev->bd_bdi = &noop_backing_dev_info; +#ifdef CONFIG_SYSFS + INIT_LIST_HEAD(&bdev->bd_holder_disks); +#endif return bdev; } From a782483cc1f875355690625d8253a232f2581418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:43:37 +0100 Subject: [PATCH 244/395] block: remove the nr_sects field in struct hd_struct Now that the hd_struct always has a block device attached to it, there is no need for having two size field that just get out of sync. Additionally the field in hd_struct did not use proper serialization, possibly allowing for torn writes. By only using the block_device field this problem also gets fixed. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +- block/blk-core.c | 2 +- block/blk.h | 53 ---------------------- block/genhd.c | 59 ++++++++++++++---------- block/partitions/core.c | 17 ++++--- drivers/block/loop.c | 1 - drivers/block/nbd.c | 2 +- drivers/block/xen-blkback/common.h | 4 +- drivers/md/bcache/super.c | 2 +- drivers/s390/block/dasd_ioctl.c | 4 +- drivers/target/target_core_pscsi.c | 5 +- fs/block_dev.c | 73 +----------------------------- fs/f2fs/super.c | 2 +- fs/pstore/blk.c | 2 +- include/linux/genhd.h | 31 ++++--------- kernel/trace/blktrace.c | 2 +- 16 files changed, 68 insertions(+), 195 deletions(-) diff --git a/block/bio.c b/block/bio.c index fa01bef35bb1..669bb47a3198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -613,8 +613,8 @@ void guard_bio_eod(struct bio *bio) rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = part_nr_sects_read(part); - else + maxsector = bdev_nr_sectors(part->bdev); + else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..988f45094a38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -755,7 +755,7 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) + if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), diff --git a/block/blk.h b/block/blk.h index c4839abcfa27..09cee7024fb4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -387,59 +387,6 @@ static inline void hd_free_part(struct hd_struct *part) percpu_ref_exit(&part->ref); } -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. - */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/genhd.c b/block/genhd.c index bf8fa82f135f..c65f485b9db5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -40,6 +40,16 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + struct block_device *bdev = disk->part0.bdev; + + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} +EXPORT_SYMBOL(set_capacity); + /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. @@ -47,18 +57,30 @@ static void disk_release_events(struct gendisk *disk); bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - revalidate_disk_size(disk, true); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. + */ + if (size == capacity || + (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + return false; - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - return true; - } + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, size, capacity); - return false; + /* + * Historically we did not send a uevent for changes to/from an empty + * device. + */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); @@ -247,7 +269,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part_nr_sects_read(part) && + if (!bdev_nr_sectors(part->bdev) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -284,7 +306,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); + sector < part->start_sect + bdev_nr_sectors(part->bdev); } /** @@ -986,8 +1008,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), + bdev_nr_sectors(part->bdev) >> 1, + disk_name(disk, part->partno, name_buf), part->info ? part->info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) @@ -1079,7 +1101,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, + bdev_nr_sectors(part->bdev) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1161,8 +1183,7 @@ ssize_t part_size_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); } ssize_t part_stat_show(struct device *dev, @@ -1618,16 +1639,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. - */ - hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) goto out_free_bdstats; diff --git a/block/partitions/core.c b/block/partitions/core.c index 696bd9ff63c6..bcfa8215bd5e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -85,6 +85,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -295,7 +302,7 @@ static void hd_struct_free_work(struct work_struct *work) put_device(disk_to_dev(disk)); part->start_sect = 0; - part->nr_sects = 0; + bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); } @@ -412,11 +419,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free_stats; p->bdev = bdev; - hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; - p->nr_sects = len; + bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -509,7 +515,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || + start >= part->start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->start_sect) continue; overlap = true; @@ -600,8 +606,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(bdevp, length); ret = 0; out_unlock: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d643c67be6ac..d2ce1ddc192d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1241,7 +1241,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { - bd_set_nr_sectors(bdev, 0); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 45b0423ef2c5..014683968ce1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1132,7 +1132,7 @@ static void nbd_bdev_reset(struct block_device *bdev) { if (bdev->bd_openers > 1) return; - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c6ea5d38c509..0762db247b41 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -358,9 +358,7 @@ struct pending_req { }; -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) +#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c55d3c58a7ef..04fa40868fbe 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1408,7 +1408,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, dc->bdev, &bcache_cached_ops); if (ret) return ret; diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 3359559517bf..304eba1acf16 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -54,8 +54,6 @@ dasd_ioctl_enable(struct block_device *bdev) return -ENODEV; dasd_enable_device(base); - /* Formatting the dasd device can change the capacity. */ - bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); dasd_put_device(base); return 0; } @@ -88,7 +86,7 @@ dasd_ioctl_disable(struct block_device *bdev) * Set i_size to zero, since read, write, etc. check against this * value. */ - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); dasd_put_device(base); return 0; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 4e37fa9b409d..7994f27e4527 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1029,9 +1029,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) { struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); - if (pdv->pdv_bd && pdv->pdv_bd->bd_part) - return pdv->pdv_bd->bd_part->nr_sects; - + if (pdv->pdv_bd) + return bdev_nr_sectors(pdv->pdv_bd); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a5b6955a841f..31ee5a857f71 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1208,70 +1208,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -/** - * check_disk_size_change - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @bdev: struct bdev to adjust. - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -static void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev, bool verbose) -{ - loff_t disk_size, bdev_size; - - spin_lock(&bdev->bd_size_lock); - disk_size = (loff_t)get_capacity(disk) << 9; - bdev_size = i_size_read(bdev->bd_inode); - if (disk_size != bdev_size) { - if (verbose) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); - } - i_size_write(bdev->bd_inode, disk_size); - } - spin_unlock(&bdev->bd_size_lock); -} - -/** - * revalidate_disk_size - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -void revalidate_disk_size(struct gendisk *disk, bool verbose) -{ - struct block_device *bdev; - - /* - * Hidden disks don't have associated bdev so there's no point in - * revalidating them. - */ - if (disk->flags & GENHD_FL_HIDDEN) - return; - - bdev = bdget_disk(disk, 0); - if (bdev) { - check_disk_size_change(disk, bdev, verbose); - bdput(bdev); - } -} - -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) -{ - spin_lock(&bdev->bd_size_lock); - i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); - spin_unlock(&bdev->bd_size_lock); -} -EXPORT_SYMBOL(bd_set_nr_sectors); - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) @@ -1305,8 +1241,6 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) disk->fops->revalidate_disk(disk); } - check_disk_size_change(disk, bdev, !invalidate); - if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) @@ -1349,10 +1283,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (disk->fops->open) ret = disk->fops->open(bdev, mode); - if (!ret) { - bd_set_nr_sectors(bdev, get_capacity(disk)); + if (!ret) set_init_blocksize(bdev); - } /* * If the device is invalidated, rescan partition @@ -1381,13 +1313,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev->bd_part->nr_sects) { + !bdev->bd_part || !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); ret = -ENXIO; goto out_clear; } - bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f51807..d4e7fab352ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3151,7 +3151,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; int ret; diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..777a26f7bbe2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -245,7 +245,7 @@ static struct block_device *psblk_get_bdev(void *holder, return bdev; } - nr_sects = part_nr_sects_read(bdev->bd_part); + nr_sects = bdev_nr_sectors(bdev); if (!nr_sects) { pr_err("not enough space for '%s'\n", blkdev); blkdev_put(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6ba91ee54cb2..30d4785b7df8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,15 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - /* - * nr_sects is protected by sequence counter. One might extend a - * partition while IO is happening to it and update of nr_sects - * can be non-atomic on 32bit machines with 64bit sector_t. - */ - sector_t nr_sects; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_t nr_sects_seq; -#endif unsigned long stamp; struct disk_stats __percpu *dkstats; struct percpu_ref ref; @@ -254,13 +245,6 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } -static inline void hd_sects_seq_init(struct hd_struct *p) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_init(&p->nr_sects_seq); -#endif -} - /* * Smarter partition iterator without context limits. */ @@ -318,13 +302,15 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_part->start_sect; } + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode) >> 9; +} + static inline sector_t get_capacity(struct gendisk *disk) { - return disk->part0.nr_sects; -} -static inline void set_capacity(struct gendisk *disk, sector_t size) -{ - disk->part0.nr_sects = size; + return bdev_nr_sectors(disk->part0.bdev); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); @@ -358,10 +344,9 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -void revalidate_disk_size(struct gendisk *disk, bool verbose); bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); +void set_capacity(struct gendisk *disk, sector_t size); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..7076d588a50d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -465,7 +465,7 @@ static void blk_trace_setup_lba(struct blk_trace *bt, if (part) { bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; + bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; From 15e3d2c5cd53298272e59ad9072d3468f9dd3781 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:00 +0100 Subject: [PATCH 245/395] block: move disk stat accounting to struct block_device Move the dkstats and stamp field to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 ++-- block/blk.h | 1 - block/genhd.c | 14 ++++---------- block/partitions/core.c | 9 +-------- fs/block_dev.c | 10 ++++++++++ include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 2 -- include/linux/part_stat.h | 38 +++++++++++++++++++------------------- 9 files changed, 39 insertions(+), 43 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad02289a4f7f..79aa96240cec 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk-core.c b/block/blk-core.c index 988f45094a38..d2c9cb24e087 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1264,9 +1264,9 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); + stamp = READ_ONCE(part->bdev->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->partno) { diff --git a/block/blk.h b/block/blk.h index 09cee7024fb4..3f801f6e86f8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_percpu(part->dkstats); kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); diff --git a/block/genhd.c b/block/genhd.c index c65f485b9db5..2cbda8139556 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -112,7 +112,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -891,7 +891,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + disk->part0.bdev->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1628,19 +1628,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk->part0.bdev) goto out_free_disk; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) - goto out_bdput; - disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) - goto out_free_bdstats; + goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_bdstats; + goto out_bdput; disk->minors = minors; rand_initialize_disk(disk); @@ -1649,8 +1645,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_bdstats: - free_percpu(disk->part0.dkstats); out_bdput: bdput(disk->part0.bdev); out_free_disk: diff --git a/block/partitions/core.c b/block/partitions/core.c index bcfa8215bd5e..8924e1ea8b2a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -409,14 +409,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - err = -ENOMEM; - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) - goto out_free; - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free_stats; + goto out_free; p->bdev = bdev; pdev = part_to_dev(p); @@ -490,8 +485,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, kfree(p->info); out_bdput: bdput(bdev); -out_free_stats: - free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); diff --git a/fs/block_dev.c b/fs/block_dev.c index 31ee5a857f71..0832c7830f3a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -781,6 +782,10 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) static void bdev_free_inode(struct inode *inode) { + struct block_device *bdev = I_BDEV(inode); + + free_percpu(bdev->bd_stats); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -875,6 +880,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_stats = alloc_percpu(struct disk_stats); + if (!bdev->bd_stats) { + iput(inode); + return NULL; + } return bdev; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2e0a9bd9688d..520011b95276 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,8 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + struct disk_stats __percpu *bd_stats; + unsigned long bd_stamp; dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d4785b7df8..804ac45fbfbc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,8 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - unsigned long stamp; - struct disk_stats __percpu *dkstats; struct percpu_ref ref; struct block_device *bdev; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 24125778ef3e..87ad60106e1d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,17 +25,17 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) + (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->dkstats->field) res = 0; \ + typeof((part)->bdev->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ res; \ }) @@ -44,7 +44,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) int i; for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, + memset(per_cpu_ptr(part->bdev->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->dkstats->field, addnd) + __this_cpu_add((part)->bdev->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ @@ -63,20 +63,20 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) field, addnd); \ } while (0) -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) +#define part_stat_dec(part, field) \ + part_stat_add(part, field, -1) +#define part_stat_inc(part, field) \ + part_stat_add(part, field, 1) +#define part_stat_sub(part, field, subnd) \ + part_stat_add(part, field, -subnd) -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) +#define part_stat_local_dec(part, field) \ + local_dec(&(part_stat_get(part, field))) +#define part_stat_local_inc(part, field) \ + local_inc(&(part_stat_get(part, field))) +#define part_stat_local_read(part, field) \ + local_read(&(part_stat_get(part, field))) +#define part_stat_local_read_cpu(part, field, cpu) \ + local_read(&(part_stat_get_cpu(part, field, cpu))) #endif /* _LINUX_PART_STAT_H */ From 29ff57c61094e7bbd921ab10b5a99dce9a0132e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:24 +0100 Subject: [PATCH 246/395] block: move the start_sect field to struct block_device Move the start_sect field to struct block_device in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-lib.c | 2 +- block/genhd.c | 4 ++-- block/partitions/core.c | 17 +++++++++-------- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 3 +-- kernel/trace/blktrace.c | 11 +++-------- 8 files changed, 22 insertions(+), 25 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d2c9cb24e087..9a3793d5ce38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -757,9 +757,10 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_sectors(bio)) { if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; - bio->bi_iter.bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->bdev->bd_start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector - + p->bdev->bd_start_sect); } bio->bi_partno = 0; ret = 0; diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d6a..752f9c722062 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -65,7 +65,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; diff --git a/block/genhd.c b/block/genhd.c index 2cbda8139556..5efb2df1f079 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -305,8 +305,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { - return part->start_sect <= sector && - sector < part->start_sect + bdev_nr_sectors(part->bdev); + return part->bdev->bd_start_sect <= sector && + sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); } /** diff --git a/block/partitions/core.c b/block/partitions/core.c index 8924e1ea8b2a..460a745812c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -192,7 +192,7 @@ static ssize_t part_start_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -219,7 +219,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -301,7 +301,7 @@ static void hd_struct_free_work(struct work_struct *work) */ put_device(disk_to_dev(disk)); - part->start_sect = 0; + part->bdev->bd_start_sect = 0; bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); @@ -416,7 +416,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); - p->start_sect = start; + bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -508,8 +508,9 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + bdev_nr_sectors(part->bdev) || - start + length <= part->start_sect) + start >= part->bdev->bd_start_sect + + bdev_nr_sectors(part->bdev) || + start + length <= part->bdev->bd_start_sect) continue; overlap = true; break; @@ -592,7 +593,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, mutex_lock_nested(&bdev->bd_mutex, 1); ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bdev->bd_start_sect) goto out_unlock; ret = -EBUSY; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 520011b95276..a690008f60cd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; dev_t bd_dev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43a25d855e04..619adea57098 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1488,7 +1488,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.alignment_offset; } @@ -1529,7 +1529,7 @@ static inline int bdev_discard_alignment(struct block_device *bdev) if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 804ac45fbfbc..50d27f5d38e2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -51,7 +51,6 @@ struct partition_meta_info { }; struct hd_struct { - sector_t start_sect; struct percpu_ref ref; struct block_device *bdev; @@ -298,7 +297,7 @@ extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { - return bdev->bd_part->start_sect; + return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7076d588a50d..8a723a91ec5a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -458,14 +458,9 @@ static struct rchan_callbacks blk_relay_callbacks = { static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { - struct hd_struct *part = NULL; - - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); + if (bdev) { + bt->start_lba = bdev->bd_start_sect; + bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; From 231926dbf0f084211e4ec4f4c006f0bf1f47809a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 12:01:45 +0100 Subject: [PATCH 247/395] block: move the partition_meta_info to struct block_device Move the partition_meta_info to struct block_device in preparation for killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 3 ++- block/partitions/core.c | 18 +++++++----------- fs/block_dev.c | 1 + include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 1 - init/do_mounts.c | 7 ++++--- 7 files changed, 16 insertions(+), 17 deletions(-) diff --git a/block/blk.h b/block/blk.h index 3f801f6e86f8..0bd4b58bcbaf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 5efb2df1f079..4273e89f07e8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,8 @@ void __init printk_all_partitions(void) bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); + part->bdev->bd_meta_info ? + part->bdev->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", diff --git a/block/partitions/core.c b/block/partitions/core.c index 460a745812c6..07df9ff55462 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -275,8 +275,9 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) struct hd_struct *part = dev_to_part(dev); add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", + part->bdev->bd_meta_info->volname); return 0; } @@ -422,13 +423,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) goto out_bdput; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; } dname = dev_name(ddev); @@ -444,7 +442,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_info; + goto out_bdput; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,8 +479,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; -out_free_info: - kfree(p->info); out_bdput: bdput(bdev); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 0832c7830f3a..0770f654b09c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -785,6 +785,7 @@ static void bdev_free_inode(struct inode *inode) struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); + kfree(bdev->bd_meta_info); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a690008f60cd..2f8ede04e5a9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,6 +49,8 @@ struct block_device { /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; struct super_block *bd_fsfreeze_sb; + + struct partition_meta_info *bd_meta_info; } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 50d27f5d38e2..30d7076155b4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -57,7 +57,6 @@ struct hd_struct { struct device __dev; struct kobject *holder_dir; int policy, partno; - struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif diff --git a/init/do_mounts.c b/init/do_mounts.c index 5879edf083b3..368ccb718501 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -79,8 +79,8 @@ static int match_dev_by_uuid(struct device *dev, const void *data) const struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || - strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + if (!part->bdev->bd_meta_info || + strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -169,7 +169,8 @@ static int match_dev_by_label(struct device *dev, const void *data) const char *label = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || strcmp(label, part->info->volname)) + if (!part->bdev->bd_meta_info || + strcmp(label, part->bdev->bd_meta_info->volname)) return 0; return 1; } From 1bdd5ae0251d678488dffcf455d4633c2beef1bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 19:00:13 +0100 Subject: [PATCH 248/395] block: move holder_dir to struct block_device Move the holder_dir field to struct block_device in preparation for kill struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++-- block/partitions/core.c | 8 ++++---- fs/block_dev.c | 11 +++++------ include/linux/blk_types.h | 1 + include/linux/genhd.h | 1 - 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 4273e89f07e8..0bd7026cee62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,7 +681,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->part0.bdev->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -887,7 +888,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0.bdev->bd_holder_dir); kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); diff --git a/block/partitions/core.c b/block/partitions/core.c index 07df9ff55462..c068471fa654 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -344,7 +344,7 @@ void delete_partition(struct hd_struct *part) */ get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); /* @@ -452,8 +452,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -487,7 +487,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_remove_file: device_remove_file(pdev, &dev_attr_whole_disk); out_del: - kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); diff --git a/fs/block_dev.c b/fs/block_dev.c index 0770f654b09c..381c22426f43 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1142,7 +1142,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -1165,14 +1165,14 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_free; - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. */ - kobject_get(bdev->bd_part->holder_dir); + kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -1207,9 +1207,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_part->holder_dir, - &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_part->holder_dir); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2f8ede04e5a9..c0591e52d7d7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -35,6 +35,7 @@ struct block_device { #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif + struct kobject *bd_holder_dir; u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d7076155b4..b4a5c05593b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - struct kobject *holder_dir; int policy, partno; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; From b309e9936347232c724eaa13f70533128b4864e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:28:47 +0100 Subject: [PATCH 249/395] block: move make_it_fail to struct block_device Move the make_it_fail flag to struct block_device an turn it into a bool in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/genhd.c | 4 ++-- include/linux/blk_types.h | 3 +++ include/linux/genhd.h | 3 --- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9a3793d5ce38..9121390be97a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -668,7 +668,8 @@ __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return part->bdev->bd_make_it_fail && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) diff --git a/block/genhd.c b/block/genhd.c index 0bd7026cee62..f9c957739d4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1292,7 +1292,7 @@ ssize_t part_fail_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, @@ -1303,7 +1303,7 @@ ssize_t part_fail_store(struct device *dev, int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; + p->bdev->bd_make_it_fail = i; return count; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c0591e52d7d7..b237f1e40814 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -52,6 +52,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; struct partition_meta_info *bd_meta_info; +#ifdef CONFIG_FAIL_MAKE_REQUEST + bool bd_make_it_fail; +#endif } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b4a5c05593b9..349cf6403ccd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -56,9 +56,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; int policy, partno; -#ifdef CONFIG_FAIL_MAKE_REQUEST - int make_it_fail; -#endif struct rcu_work rcu_work; }; From 83950d359010a493462d58c712b1124c877d1b3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:36:02 +0100 Subject: [PATCH 250/395] block: move the policy field to struct block_device Move the policy field to struct block_device and rename it to the more descriptive bd_read_only. Also turn the field into a bool as it is used as such. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/genhd.c | 8 ++++---- block/ioctl.c | 2 +- block/partitions/core.c | 4 ++-- include/linux/blk_types.h | 1 + include/linux/genhd.h | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9121390be97a..d64ffcb6f9ae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -696,7 +696,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); - if (part->policy && op_is_write(op)) { + if (part->bdev->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) diff --git a/block/genhd.c b/block/genhd.c index f9c957739d4b..2db1204920a9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1687,14 +1687,14 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.policy != flag) { + if (disk->part0.bdev->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; + disk->part0.bdev->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->policy = flag; + part->bdev->bd_read_only = flag; disk_part_iter_exit(&piter); } @@ -1704,7 +1704,7 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - return bdev->bd_part->policy; + return bdev->bd_read_only; } EXPORT_SYMBOL(bdev_read_only); diff --git a/block/ioctl.c b/block/ioctl.c index a6d8171221c7..d61d652078f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -345,7 +345,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - bdev->bd_part->policy = n; + bdev->bd_read_only = n; return 0; } diff --git a/block/partitions/core.c b/block/partitions/core.c index c068471fa654..060c1be13cd8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,7 +199,7 @@ static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); + return sprintf(buf, "%d\n", p->bdev->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -420,7 +420,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; - p->policy = get_disk_ro(disk); + bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b237f1e40814..758cf71c9aa2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -23,6 +23,7 @@ struct block_device { sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; + bool bd_read_only; /* read-only policy */ dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 349cf6403ccd..dcbf9ef7610e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,7 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - int policy, partno; + int partno; struct rcu_work rcu_work; }; @@ -278,7 +278,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.policy; + return disk->part0.bdev->bd_read_only; } extern void disk_block_events(struct gendisk *disk); From cb8432d650fe3be58bb962bc8e602dc405510327 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:47:17 +0100 Subject: [PATCH 251/395] block: allocate struct hd_struct as part of struct bdev_inode Allocate hd_struct together with struct block_device to pre-load the lifetime rule changes in preparation of merging the two structures. Note that part0 was previously embedded into struct gendisk, but is a separate allocation now, and already points to the block_device instead of the hd_struct. The lifetime of struct gendisk is still controlled by the struct device embedded in the part0 hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++--- block/blk-flush.c | 2 +- block/blk-merge.c | 2 - block/blk.h | 21 ---------- block/genhd.c | 50 +++++++++------------- block/partitions/core.c | 67 +++--------------------------- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 3 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/dm.c | 4 +- drivers/md/md.c | 2 +- fs/block_dev.c | 39 ++++++----------- include/linux/blk_types.h | 2 +- include/linux/genhd.h | 14 +++---- include/linux/part_stat.h | 4 +- 15 files changed, 61 insertions(+), 169 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d64ffcb6f9ae..9ea70275fc1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -714,7 +714,8 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0->bd_part, + bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -831,7 +832,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1203,7 +1204,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1272,7 +1273,7 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->partno) { - part = &part_to_disk(part)->part0; + part = part_to_disk(part)->part0->bd_part; goto again; } } @@ -1309,8 +1310,6 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); - - hd_struct_put(part); } } @@ -1354,7 +1353,7 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(disk->part0->bd_part, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); @@ -1376,14 +1375,13 @@ void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); - hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(disk->part0->bd_part, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index e32958f0b687..fcd0a60574df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct hd_struct *part = rq->rq_disk->part0->bd_part; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..cb351ab9b77d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -683,8 +683,6 @@ static void blk_account_io_merge_request(struct request *req) part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); - - hd_struct_put(req->part); } } diff --git a/block/blk.h b/block/blk.h index 0bd4b58bcbaf..32ac41f7557f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,27 +363,6 @@ int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); - -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - if (part->partno) - percpu_ref_put(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - bdput(part->bdev); - percpu_ref_exit(&part->ref); -} int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/genhd.c b/block/genhd.c index 2db1204920a9..c35b03dac5e5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -42,7 +42,7 @@ static void disk_release_events(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t sectors) { - struct block_device *bdev = disk->part0.bdev; + struct block_device *bdev = disk->part0; spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); @@ -318,9 +318,7 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * primarily used for stats accounting. * * CONTEXT: - * RCU read locked. The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. + * RCU read locked. * * RETURNS: * Found partition on success, part0 is returned if no partition matches @@ -336,26 +334,19 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + if (part && sector_in_part(part, sector)) goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; } } - part = &disk->part0; + part = disk->part0->bd_part; out_unlock: rcu_read_unlock(); return part; @@ -681,8 +672,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.bdev->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -748,7 +739,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); + retval = blk_alloc_devt(disk->part0->bd_part, &devt); if (retval) { WARN_ON(1); return; @@ -775,7 +766,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0.bdev, devt); + bdev_add(disk->part0, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -888,11 +879,11 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.bdev->bd_holder_dir); + kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(&disk->part0, 0); - disk->part0.bdev->bd_stamp = 0; + part_stat_set_all(disk->part0->bd_part, 0); + disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1005,7 +996,7 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; + bool is_part0 = part == disk->part0->bd_part; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), @@ -1460,7 +1451,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); + bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1626,8 +1617,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; - disk->part0.bdev = bdev_alloc(disk, 0); - if (!disk->part0.bdev) + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) goto out_free_disk; disk->node_id = node_id; @@ -1635,10 +1626,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - if (hd_ref_init(&disk->part0)) - goto out_bdput; + rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); disk->minors = minors; rand_initialize_disk(disk); @@ -1648,7 +1636,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) return disk; out_bdput: - bdput(disk->part0.bdev); + bdput(disk->part0); out_free_disk: kfree(disk); return NULL; @@ -1687,9 +1675,9 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.bdev->bd_read_only != flag) { + if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.bdev->bd_read_only = flag; + disk->part0->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); diff --git a/block/partitions/core.c b/block/partitions/core.c index 060c1be13cd8..6d1fca193cbd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -265,9 +265,9 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + bdput(p->bdev); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -288,46 +288,6 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) -{ - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - struct gendisk *disk = part_to_disk(part); - - /* - * Release the disk reference acquired in delete_partition here. - * We can't release it in hd_struct_free because the final put_device - * needs process context and thus can't be run directly from a - * percpu_ref ->release handler. - */ - put_device(disk_to_dev(disk)); - - part->bdev->bd_start_sect = 0; - bdev_set_nr_sectors(part->bdev, 0); - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - /* * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. @@ -342,8 +302,8 @@ void delete_partition(struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -353,7 +313,7 @@ void delete_partition(struct hd_struct *part) */ remove_inode_hash(part->bdev->bd_inode); - percpu_ref_kill(&part->ref); + put_device(part_to_dev(part)); } static ssize_t whole_disk_show(struct device *dev, @@ -406,15 +366,11 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (ptbl->part[partno]) return ERR_PTR(-EBUSY); - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free; - p->bdev = bdev; + return ERR_PTR(-ENOMEM); + p = bdev->bd_part; pdev = part_to_dev(p); bdev->bd_start_sect = start; @@ -463,13 +419,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); @@ -481,11 +430,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_bdput: bdput(bdev); -out_free: - kfree(p); return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index dc333dbe5232..9e5c2fdfda36 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ba56f3f05312..343f56b86bb7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1678,7 +1678,8 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; - device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors); + device->rs_last_events = + (int)part_stat_read_accum(disk->part0->bd_part, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b5f68951c9d2..6d84876a9cd0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(&zram->disk->part0, 0); + part_stat_set_all(zram->disk->part0->bd_part, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48051db006f3..1b2db4d530ea 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(&dm_disk(md)->part0, + __dm_part_stat_sub(dm_disk(md)->part0->bd_part, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = &dm_disk(md)->part0; + struct hd_struct *part = dm_disk(md)->part0->bd_part; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 7ce6047c856e..3696c2d77a4d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/fs/block_dev.c b/fs/block_dev.c index 381c22426f43..61cf33b6284f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,6 +39,7 @@ struct bdev_inode { struct block_device bdev; + struct hd_struct hd; struct inode vfs_inode; }; @@ -886,6 +887,9 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } + bdev->bd_part = &BDEV_I(inode)->hd; + memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); + bdev->bd_part->bdev = bdev; return bdev; } @@ -1280,15 +1284,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); static int __blkdev_get(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { - ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, 0); - if (!bdev->bd_part) - goto out_clear; - ret = 0; if (disk->fops->open) ret = disk->fops->open(bdev, mode); @@ -1307,7 +1306,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) - goto out_clear; + return ret; } else { struct block_device *whole = bdget_disk(disk, 0); @@ -1316,18 +1315,16 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (ret) { mutex_unlock(&whole->bd_mutex); bdput(whole); - goto out_clear; + return ret; } whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev_nr_sectors(bdev)) { + !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); - ret = -ENXIO; - goto out_clear; + return -ENXIO; } set_init_blocksize(bdev); } @@ -1336,7 +1333,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (!bdev_is_partition(bdev)) { - ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ @@ -1349,11 +1345,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) } bdev->bd_openers++; return 0; - - out_clear: - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - return ret; } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1580,18 +1571,12 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; if (bdev_is_partition(bdev)) victim = bdev_whole(bdev); - } else { - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); } + + if (!bdev_is_partition(bdev) && disk->fops->release) + disk->fops->release(disk, mode); mutex_unlock(&bdev->bd_mutex); if (victim) { __blkdev_put(victim, mode, 1); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 758cf71c9aa2..6edea5c16259 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,7 +59,7 @@ struct block_device { } __randomize_layout; #define bdev_whole(_bdev) \ - ((_bdev)->bd_disk->part0.bdev) + ((_bdev)->bd_disk->part0) #define bdev_kobj(_bdev) \ (&part_to_dev((_bdev)->bd_part)->kobj) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index dcbf9ef7610e..df7319da013c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,11 +19,12 @@ #include #include -#define dev_to_disk(device) container_of((device), struct gendisk, part0.__dev) #define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) +#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) +#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) + extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,12 +52,9 @@ struct partition_meta_info { }; struct hd_struct { - struct percpu_ref ref; - struct block_device *bdev; struct device __dev; int partno; - struct rcu_work rcu_work; }; /** @@ -168,7 +166,7 @@ struct gendisk { * helpers. */ struct disk_part_tbl __rcu *part_tbl; - struct hd_struct part0; + struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; @@ -278,7 +276,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.bdev->bd_read_only; + return disk->part0->bd_read_only; } extern void disk_block_events(struct gendisk *disk); @@ -302,7 +300,7 @@ static inline sector_t bdev_nr_sectors(struct block_device *bdev) static inline sector_t get_capacity(struct gendisk *disk) { - return bdev_nr_sectors(disk->part0.bdev); + return bdev_nr_sectors(disk->part0); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 87ad60106e1d..680de036691e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -59,8 +59,8 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ + __part_stat_add(part_to_disk((part))->part0->bd_part, \ + field, addnd); \ } while (0) #define part_stat_dec(part, field) \ From 8446fe9255be821cb38ffd306d7e8edc4b9ea662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:36:54 +0100 Subject: [PATCH 252/395] block: switch partition lookup to use struct block_device Use struct block_device to lookup partitions on a disk. This removes all usage of struct hd_struct from the I/O path. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +- block/blk-core.c | 66 ++++++++++++++---------------- block/blk-flush.c | 2 +- block/blk-mq.c | 9 ++-- block/blk-mq.h | 7 ++-- block/blk.h | 4 +- block/genhd.c | 57 +++++++++++++++----------- block/partitions/core.c | 7 +--- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/bcache/request.c | 4 +- drivers/md/dm.c | 4 +- drivers/md/md.c | 4 +- drivers/nvme/target/admin-cmd.c | 20 ++++----- fs/ext4/super.c | 18 +++----- fs/ext4/sysfs.c | 10 +---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 6 +-- include/linux/blkdev.h | 8 ++-- include/linux/genhd.h | 4 +- include/linux/part_stat.h | 17 ++++---- 22 files changed, 122 insertions(+), 137 deletions(-) diff --git a/block/bio.c b/block/bio.c index 669bb47a3198..ebb18136b86f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size) void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct hd_struct *part; + struct block_device *part; rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = bdev_nr_sectors(part->bdev); + maxsector = bdev_nr_sectors(part); else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 9ea70275fc1c..cee568389b7e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -666,10 +666,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +static bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->bdev->bd_make_it_fail && - should_fail(&fail_make_request, bytes); + return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -684,7 +683,7 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool should_fail_request(struct hd_struct *part, +static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; @@ -692,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline bool bio_check_ro(struct bio *bio, struct block_device *part) { const int op = bio_op(bio); - if (part->bdev->bd_read_only && op_is_write(op)) { + if (part->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); + bio_devname(bio, b), part->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,8 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0->bd_part, - bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -744,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct bio *bio) { - struct hd_struct *p; + struct block_device *p; int ret = -EIO; rcu_read_lock(); @@ -757,12 +755,12 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) + if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; - bio->bi_iter.bi_sector += p->bdev->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, bio->bi_iter.bi_sector - - p->bdev->bd_start_sect); + p->bd_start_sect); } bio->bi_partno = 0; ret = 0; @@ -832,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1204,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1263,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +static void update_io_ticks(struct block_device *part, unsigned long now, + bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->bdev->bd_stamp); + stamp = READ_ONCE(part->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } - if (part->partno) { - part = part_to_disk(part)->part0->bd_part; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } @@ -1282,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1301,14 +1298,11 @@ void blk_account_io_done(struct request *req, u64 now) if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); } } @@ -1325,7 +1319,7 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -static unsigned long __part_start_io_acct(struct hd_struct *part, +static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); @@ -1341,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, +unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); @@ -1353,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(disk->part0->bd_part, sectors, op); + return __part_start_io_acct(disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); -static void __part_end_io_acct(struct hd_struct *part, unsigned int op, +static void __part_end_io_acct(struct block_device *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -1371,7 +1365,7 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); @@ -1381,7 +1375,7 @@ EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(disk->part0->bd_part, op, start_time); + __part_end_io_acct(disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index fcd0a60574df..9507dcdd5881 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = rq->rq_disk->part0->bd_part; + struct block_device *part = rq->rq_disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..a2593748fa53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,7 +95,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; @@ -111,7 +111,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -120,8 +121,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; diff --git a/block/blk-mq.h b/block/blk-mq.h index a52703c98b77..c696515766c7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -182,9 +182,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/blk.h b/block/blk.h index 32ac41f7557f..d5bf8f3a0781 100644 --- a/block/blk.h +++ b/block/blk.h @@ -215,7 +215,7 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +struct block_device *__disk_get_part(struct gendisk *disk, int partno); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -348,7 +348,7 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); diff --git a/block/genhd.c b/block/genhd.c index c35b03dac5e5..ed06466b305d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -126,7 +126,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -141,7 +141,8 @@ static unsigned int part_in_flight(struct hd_struct *part) return inflight; } -static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +static void part_in_flight_rw(struct block_device *part, + unsigned int inflight[2]) { int cpu; @@ -157,7 +158,7 @@ static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); @@ -182,15 +183,21 @@ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { + struct block_device *bdev; struct hd_struct *part; rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); + bdev = __disk_get_part(disk, partno); + if (!bdev) + goto fail; + part = bdev->bd_part; + if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) + goto fail; rcu_read_unlock(); - return part; +fail: + rcu_read_unlock(); + return NULL; } /** @@ -264,19 +271,19 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* iterate to the next partition */ for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; + struct block_device *part; part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!bdev_nr_sectors(part->bdev) && + if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) continue; - get_device(part_to_dev(part)); - piter->part = part; + get_device(part_to_dev(part->bd_part)); + piter->part = part->bd_part; piter->idx += inc; break; } @@ -303,10 +310,10 @@ void disk_part_iter_exit(struct disk_part_iter *piter) } EXPORT_SYMBOL_GPL(disk_part_iter_exit); -static inline int sector_in_part(struct hd_struct *part, sector_t sector) +static inline int sector_in_part(struct block_device *part, sector_t sector) { - return part->bdev->bd_start_sect <= sector && - sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); + return part->bd_start_sect <= sector && + sector < part->bd_start_sect + bdev_nr_sectors(part); } /** @@ -324,10 +331,10 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * Found partition on success, part0 is returned if no partition matches * or the matched partition is being deleted. */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { struct disk_part_tbl *ptbl; - struct hd_struct *part; + struct block_device *part; int i; rcu_read_lock(); @@ -346,7 +353,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } } - part = disk->part0->bd_part; + part = disk->part0; out_unlock: rcu_read_unlock(); return part; @@ -882,7 +889,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(disk->part0->bd_part, 0); + part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); @@ -1189,9 +1196,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read_all(p, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, p->bdev); else - inflight = part_in_flight(p); + inflight = part_in_flight(p->bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1231,9 +1238,9 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p, inflight); + blk_mq_in_flight_rw(q, p->bdev, inflight); else - part_in_flight_rw(p, inflight); + part_in_flight_rw(p->bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1506,9 +1513,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) while ((hd = disk_part_iter_next(&piter))) { part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); + inflight = blk_mq_in_flight(gp->queue, hd->bdev); else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd->bdev); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1626,7 +1633,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); + rcu_assign_pointer(ptbl->part[0], disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 6d1fca193cbd..c2f6721633b8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -298,12 +298,9 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ rcu_assign_pointer(ptbl->part[part->partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -421,7 +418,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], p); + rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9e5c2fdfda36..09c86ef3f0fd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 343f56b86bb7..02044ab7f767 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1679,7 +1679,7 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; device->rs_last_events = - (int)part_stat_read_accum(disk->part0->bd_part, sectors); + (int)part_stat_read_accum(disk->part0, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6d84876a9cd0..dc8957d173d3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(zram->disk->part0->bd_part, 0); + part_stat_set_all(zram->disk->part0, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index afac8d07c1bd..85b1f2a9b72d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct hd_struct *part; + struct block_device *part; unsigned long start_time; struct btree_op op; @@ -1073,7 +1073,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct hd_struct *part; + struct block_device *part; }; static void detached_dev_end_io(struct bio *bio) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1b2db4d530ea..176adcff56b3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(dm_disk(md)->part0->bd_part, + __dm_part_stat_sub(dm_disk(md)->part0, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = dm_disk(md)->part0->bd_part; + struct block_device *part = dm_disk(md)->part0; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 3696c2d77a4d..0065736f05b4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,7 +464,7 @@ struct md_io { bio_end_io_t *orig_bi_end_io; void *orig_bi_private; unsigned long start_time; - struct hd_struct *part; + struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dca34489a1dc..8d90235e4fcc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -89,12 +89,12 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns->bdev) goto out; - host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[READ]), 1000); - host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[WRITE]), 1000); + host_reads = part_stat_read(ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -120,12 +120,12 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; - host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + host_reads += part_stat_read(ns->bdev, ios[READ]); data_units_read += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, sectors[READ]), 1000); - host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes += part_stat_read(ns->bdev, ios[WRITE]); data_units_written += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); + part_stat_read(ns->bdev, sectors[WRITE]), 1000); } put_unaligned_le64(host_reads, &slog->host_reads[0]); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6633b20224d5..c303a0ff0b17 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4048,9 +4048,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - if (sb->s_bdev->bd_part) - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -5509,15 +5508,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & SB_RDONLY)) ext4_update_tstamp(es, s_wtime); - if (sb->s_bdev->bd_part) - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); - else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4e27fe6ed3ae..075aa3a19ff5 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -62,11 +62,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -74,12 +71,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cb700d797296..49681a8d2b14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1675,7 +1675,7 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ + (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) - \ (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d4e7fab352ba..af9f449da64b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3700,10 +3700,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 619adea57098..1d4be1fc6007 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,7 +191,7 @@ struct request { }; struct gendisk *rq_disk; - struct hd_struct *part; + struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; @@ -1943,9 +1943,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, - struct bio *bio); -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +unsigned long part_start_io_acct(struct gendisk *disk, + struct block_device **part, struct bio *bio); +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time); /** diff --git a/include/linux/genhd.h b/include/linux/genhd.h index df7319da013c..fe6fee77e2b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -131,8 +131,8 @@ enum { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct __rcu *last_lookup; - struct hd_struct __rcu *part[]; + struct block_device __rcu *last_lookup; + struct block_device __rcu *part[]; }; struct disk_events; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 680de036691e..d2558121d48c 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,26 +25,26 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) + (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->bdev->bd_stats->field) res = 0; \ + typeof((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ + res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) -static inline void part_stat_set_all(struct hd_struct *part, int value) +static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) - memset(per_cpu_ptr(part->bdev->bd_stats, i), value, + memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,13 +54,12 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->bdev->bd_stats->field, addnd) + __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(part_to_disk((part))->part0->bd_part, \ - field, addnd); \ + if ((part)->bd_partno) \ + __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ From 41e5c81984eac8ce87f2b4f57fec0bd90a049b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:37:14 +0100 Subject: [PATCH 253/395] block: remove the partno field from struct hd_struct Just use the bd_partno field in struct block_device everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++------ block/partitions/core.c | 9 ++++----- include/linux/genhd.h | 1 - init/do_mounts.c | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index ed06466b305d..b7e39b41a275 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -589,8 +589,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) int idx; /* in consecutive minor range? */ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); + if (part->bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); return 0; } @@ -864,7 +864,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); + invalidate_partition(disk, part->bdev->bd_partno); delete_partition(part); } disk_part_iter_exit(&piter); @@ -1008,7 +1008,7 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->partno, name_buf), + disk_name(disk, part->bdev->bd_partno, name_buf), part->bdev->bd_meta_info ? part->bdev->bd_meta_info->uuid : ""); if (is_part0) { @@ -1102,7 +1102,7 @@ static int show_partition(struct seq_file *seqf, void *v) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->partno, buf)); + disk_name(sgp, part->bdev->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1525,7 +1525,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %u" "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + disk_name(gp, hd->bdev->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], diff --git a/block/partitions/core.c b/block/partitions/core.c index c2f6721633b8..6db9ca8b722d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -184,7 +184,7 @@ static ssize_t part_partition_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", p->bdev->bd_partno); } static ssize_t part_start_show(struct device *dev, @@ -274,7 +274,7 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { struct hd_struct *part = dev_to_part(dev); - add_uevent_var(env, "PARTN=%u", part->partno); + add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bdev->bd_meta_info->volname); @@ -298,7 +298,7 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); @@ -372,7 +372,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - p->partno = partno; bdev->bd_read_only = get_disk_ro(disk); if (info) { @@ -445,7 +444,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || + if (part->bdev->bd_partno == skip_partno || start >= part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->bdev->bd_start_sect) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe6fee77e2b9..3c13d4708e3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -54,7 +54,6 @@ struct partition_meta_info { struct hd_struct { struct block_device *bdev; struct device __dev; - int partno; }; /** diff --git a/init/do_mounts.c b/init/do_mounts.c index 368ccb718501..86bef93e72eb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -136,7 +136,7 @@ static dev_t devt_from_partuuid(const char *uuid_str) struct hd_struct *part; part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->partno + offset); + dev_to_part(dev)->bdev->bd_partno + offset); if (part) { devt = part_devt(part); put_device(part_to_dev(part)); From 9fc995a6e08349b5c5baff2cc31544b96ee2b1c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:17:46 +0100 Subject: [PATCH 254/395] block: pass a block_device to blk_alloc_devt Pass the block_device actually needed instead of the hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 14 +++++++------- block/partitions/core.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/block/blk.h b/block/blk.h index d5bf8f3a0781..9657c6da7c77 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,7 +350,7 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); -int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 diff --git a/block/genhd.c b/block/genhd.c index b7e39b41a275..fd6333332ab5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -570,8 +570,8 @@ static int blk_mangle_minor(int minor) } /** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for + * blk_alloc_devt - allocate a dev_t for a block device + * @bdev: block device to allocate dev_t for * @devt: out parameter for resulting dev_t * * Allocate a dev_t for block device. @@ -583,14 +583,14 @@ static int blk_mangle_minor(int minor) * CONTEXT: * Might sleep. */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +int blk_alloc_devt(struct block_device *bdev, dev_t *devt) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = bdev->bd_disk; int idx; /* in consecutive minor range? */ - if (part->bdev->bd_partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); + if (bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); return 0; } @@ -746,7 +746,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(disk->part0->bd_part, &devt); + retval = blk_alloc_devt(disk->part0, &devt); if (retval) { WARN_ON(1); return; diff --git a/block/partitions/core.c b/block/partitions/core.c index 6db9ca8b722d..3d8243334c7c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -392,7 +392,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(p, &devt); + err = blk_alloc_devt(bdev, &devt); if (err) goto out_bdput; pdev->devt = devt; From 71773cf797490e1cbe4909b25a2543937e7eea82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:20:37 +0100 Subject: [PATCH 255/395] block: pass a block_device to invalidate_partition Pass the block_device actually needed instead of looking it up using bdget_disk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index fd6333332ab5..452f7c646e02 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -803,14 +803,8 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); -static void invalidate_partition(struct gendisk *disk, int partno) +static void invalidate_partition(struct block_device *bdev) { - struct block_device *bdev; - - bdev = bdget_disk(disk, partno); - if (!bdev) - return; - fsync_bdev(bdev); __invalidate_device(bdev, true); @@ -819,7 +813,6 @@ static void invalidate_partition(struct gendisk *disk, int partno) * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); - bdput(bdev); } /** @@ -864,12 +857,12 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->bdev->bd_partno); + invalidate_partition(part->bdev); delete_partition(part); } disk_part_iter_exit(&piter); - invalidate_partition(disk, 0); + invalidate_partition(disk->part0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&bdev_lookup_sem); From ad1eaa5344b293552b6ba43f5709c76a9aa14d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:52:59 +0100 Subject: [PATCH 256/395] block: switch disk_part_iter_* to use a struct block_device Switch the partition iter infrastructure to iterate over block_device references instead of hd_struct ones mostly used to get at the block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 59 ++++++++++++++++++++------------------- block/partitions/core.c | 13 ++++----- drivers/s390/block/dasd.c | 8 +++--- include/linux/genhd.h | 4 +-- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 452f7c646e02..2d34dd2da4e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); * CONTEXT: * Don't care. */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { struct disk_part_tbl *ptbl; int inc, end; @@ -282,8 +282,9 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) piter->idx == 0)) continue; - get_device(part_to_dev(part->bd_part)); - piter->part = part->bd_part; + piter->part = bdgrab(part); + if (!piter->part) + continue; piter->idx += inc; break; } @@ -305,7 +306,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_next); */ void disk_part_iter_exit(struct disk_part_iter *piter) { - disk_put_part(piter->part); + if (piter->part) + bdput(piter->part); piter->part = NULL; } EXPORT_SYMBOL_GPL(disk_part_iter_exit); @@ -346,7 +348,6 @@ struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; @@ -647,7 +648,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, { struct device *ddev = disk_to_dev(disk); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; int err; ddev->parent = parent; @@ -697,7 +698,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, /* announce possible partitions */ disk_part_iter_init(&piter, disk, 0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + kobject_uevent(bdev_kobj(part), KOBJ_ADD); disk_part_iter_exit(&piter); if (disk->queue->backing_dev_info->dev) { @@ -837,7 +838,7 @@ static void invalidate_partition(struct block_device *bdev) void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; might_sleep(); @@ -857,8 +858,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part->bdev); - delete_partition(part); + invalidate_partition(part); + delete_partition(part->bd_part); } disk_part_iter_exit(&piter); @@ -977,7 +978,7 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; @@ -996,14 +997,14 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0->bd_part; + bool is_part0 = part == disk->part0; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->bdev->bd_partno, name_buf), - part->bdev->bd_meta_info ? - part->bdev->bd_meta_info->uuid : ""); + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, + disk_name(disk, part->bd_partno, name_buf), + part->bd_meta_info ? + part->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", @@ -1079,7 +1080,7 @@ static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -1093,9 +1094,9 @@ static int show_partition(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->bdev->bd_partno, buf)); + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, + disk_name(sgp, part->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1489,7 +1490,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct disk_part_iter piter; - struct hd_struct *hd; + struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; @@ -1504,11 +1505,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd, &stat); + part_stat_read_all(hd->bd_part, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd->bdev); + inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(hd->bdev); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1517,8 +1518,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->bdev->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), + disk_name(gp, hd->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1673,7 +1674,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); @@ -1682,7 +1683,7 @@ void set_disk_ro(struct gendisk *disk, int flag) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->bdev->bd_read_only = flag; + part->bd_read_only = flag; disk_part_iter_exit(&piter); } diff --git a/block/partitions/core.c b/block/partitions/core.c index 3d8243334c7c..4cb6df175f90 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -439,15 +439,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->bdev->bd_partno == skip_partno || - start >= part->bdev->bd_start_sect + - bdev_nr_sectors(part->bdev) || - start + length <= part->bdev->bd_start_sect) + if (part->bd_partno == skip_partno || + start >= part->bd_start_sect + bdev_nr_sectors(part) || + start + length <= part->bd_start_sect) continue; overlap = true; break; @@ -568,7 +567,7 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (bdev->bd_part_count) return -EBUSY; @@ -578,7 +577,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part); + delete_partition(part->bd_part); disk_part_iter_exit(&piter); return 0; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index db24e04ee978..1825fa8d05a7 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -432,7 +432,7 @@ dasd_state_ready_to_online(struct dasd_device * device) { struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; device->state = DASD_STATE_ONLINE; if (device->block) { @@ -445,7 +445,7 @@ dasd_state_ready_to_online(struct dasd_device * device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; @@ -459,7 +459,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) int rc; struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -472,7 +472,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3c13d4708e3f..cd23c80265b2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -244,14 +244,14 @@ static inline void disk_put_part(struct hd_struct *part) struct disk_part_iter { struct gendisk *disk; - struct hd_struct *part; + struct block_device *part; int idx; unsigned int flags; }; extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); -extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); +struct block_device *disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); extern bool disk_has_partitions(struct gendisk *disk); From 9499ffc7521742e3fea32f6ac6c1213b6fc4e914 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:21:57 +0100 Subject: [PATCH 257/395] f2fs: remove a few bd_part checks bd_part is never NULL for a block device in use by a file system, so remove the checks. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Chao Yu Signed-off-by: Jens Axboe --- fs/f2fs/checkpoint.c | 5 +---- fs/f2fs/sysfs.c | 9 --------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 023462e80e58..54a1905af052 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1395,7 +1395,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; int err; @@ -1489,9 +1488,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk += data_sum_blocks; /* Record write statistics in the hot node summary */ - kbytes_written = sbi->kbytes_written; - if (sb->s_bdev->bd_part) - kbytes_written += BD_PART_WRITTEN(sbi); + kbytes_written = sbi->kbytes_written + BD_PART_WRITTEN(sbi); seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index ec77ccfea923..24e876e849c5 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -90,11 +90,6 @@ static ssize_t free_segments_show(struct f2fs_attr *a, static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + BD_PART_WRITTEN(sbi))); @@ -103,12 +98,8 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); From 0d02129e76edf91cf04fabf1efbc3a9a1f1d729a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:43:51 +0100 Subject: [PATCH 258/395] block: merge struct block_device and struct hd_struct Instead of having two structures that represent each block device with different life time rules, merge them into a single one. This also greatly simplifies the reference counting rules, as we can use the inode reference count as the main reference count for the new struct block_device, with the device model reference front ending it for device model interaction. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 +-- block/blk.h | 2 +- block/genhd.c | 90 +++++++++-------------------- block/partitions/core.c | 116 +++++++++++++++----------------------- fs/block_dev.c | 9 --- include/linux/blk_types.h | 8 ++- include/linux/blkdev.h | 1 - include/linux/genhd.h | 40 +++---------- init/do_mounts.c | 21 ++++--- kernel/trace/blktrace.c | 43 +++----------- 10 files changed, 108 insertions(+), 230 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79aa96240cec..031114d454a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk.h b/block/blk.h index 9657c6da7c77..98f0b1ae2641 100644 --- a/block/blk.h +++ b/block/blk.h @@ -356,7 +356,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct hd_struct *part); +void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 2d34dd2da4e9..0fabfc90b8e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -106,13 +106,14 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -167,39 +168,6 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) return rcu_dereference(ptbl->part[partno]); } -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. - */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - struct hd_struct *part; - - rcu_read_lock(); - bdev = __disk_get_part(disk, partno); - if (!bdev) - goto fail; - part = bdev->bd_part; - if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) - goto fail; - rcu_read_unlock(); - return part; -fail: - rcu_read_unlock(); - return NULL; -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -859,7 +827,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); - delete_partition(part->bd_part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -952,13 +920,13 @@ void blk_request_module(dev_t devt) */ struct block_device *bdget_disk(struct gendisk *disk, int partno) { - struct hd_struct *part; struct block_device *bdev = NULL; - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); + rcu_read_lock(); + bdev = __disk_get_part(disk, partno); + if (bdev && !bdgrab(bdev)) + bdev = NULL; + rcu_read_unlock(); return bdev; } @@ -1175,24 +1143,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p->bdev); + inflight = blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p->bdev); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1227,14 +1193,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p->bdev, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p->bdev, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1282,20 +1248,17 @@ static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->bdev->bd_make_it_fail = i; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1505,7 +1468,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd->bd_part, &stat); + part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else @@ -1577,7 +1540,7 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; + struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1590,13 +1553,12 @@ dev_t blk_lookup_devt(const char *name, int partno) MINOR(dev->devt) + partno); break; } - part = disk_get_part(disk, partno); + part = bdget_disk(disk, partno); if (part) { - devt = part_devt(part); - disk_put_part(part); + devt = part->bd_dev; + bdput(part); break; } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4cb6df175f90..deca253583bd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -182,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->bdev->bd_read_only); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -264,20 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - bdput(p->bdev); + bdput(dev_to_bdev(dev)); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); - if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", - part->bdev->bd_meta_info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } @@ -292,25 +284,25 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct hd_struct *part) +void delete_partition(struct block_device *part) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = part->bd_disk; struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->bdev->bd_holder_dir); - device_del(part_to_dev(part)); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ - remove_inode_hash(part->bdev->bd_inode); + remove_inode_hash(part->bd_inode); - put_device(part_to_dev(part)); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -324,11 +316,10 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; @@ -367,9 +358,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!bdev) return ERR_PTR(-ENOMEM); - p = bdev->bd_part; - pdev = part_to_dev(p); - bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); bdev->bd_read_only = get_disk_ro(disk); @@ -381,6 +369,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_bdput; } + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -422,7 +411,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; out_bdput: bdput(bdev); @@ -459,7 +448,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct hd_struct *part; + struct block_device *part; mutex_lock(&bdev->bd_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { @@ -475,76 +464,59 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) + part = bdget_disk(bdev->bd_disk, partno); + if (!part) return -ENXIO; - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); - if (!part) - goto out_unlock; - ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); + sync_blockdev(part); + invalidate_bdev(part); delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&part->bd_mutex); + bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; + struct block_device *part; int ret = 0; - part = disk_get_part(bdev->bd_disk, partno); + part = bdget_disk(bdev->bd_disk, partno); if (!part) return -ENXIO; - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -EINVAL; - if (start != part->bdev->bd_start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - bdev_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&part->bd_mutex); mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + bdput(part); return ret; } @@ -577,7 +549,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part->bd_part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; @@ -592,7 +564,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -632,7 +604,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 61cf33b6284f..a9905d8fd02b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,7 +39,6 @@ struct bdev_inode { struct block_device bdev; - struct hd_struct hd; struct inode vfs_inode; }; @@ -887,9 +886,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } - bdev->bd_part = &BDEV_I(inode)->hd; - memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); - bdev->bd_part->bdev = bdev; return bdev; } @@ -926,11 +922,6 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); -struct block_device *bdget_part(struct hd_struct *part) -{ - return bdget(part_devt(part)); -} - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6edea5c16259..866f74261b3b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -8,6 +8,7 @@ #include #include +#include #include struct bio_set; @@ -30,6 +31,7 @@ struct block_device { struct super_block * bd_super; struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; + struct device bd_device; void * bd_holder; int bd_holders; bool bd_write_holder; @@ -38,7 +40,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 bd_partno; - struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; @@ -61,8 +62,11 @@ struct block_device { #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) +#define dev_to_bdev(device) \ + container_of((device), struct block_device, bd_device) + #define bdev_kobj(_bdev) \ - (&part_to_dev((_bdev)->bd_part)->kobj) + (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d4be1fc6007..17cedf0dc83d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,7 +1999,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd23c80265b2..809aaa32d53c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,12 +19,6 @@ #include #include -#define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define part_to_dev(part) (&((part)->__dev)) - -#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) -#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) - extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,11 +45,6 @@ struct partition_meta_info { u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; -struct hd_struct { - struct block_device *bdev; - struct device __dev; -}; - /** * DOC: genhd capability flags * @@ -190,19 +179,21 @@ struct gendisk { struct lockdep_map lockdep_map; }; +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. + */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif -static inline struct gendisk *part_to_disk(struct hd_struct *part) -{ - if (unlikely(!part)) - return NULL; - return part->bdev->bd_disk; -} - static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -221,19 +212,6 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } -static inline dev_t part_devt(struct hd_struct *part) -{ - return part_to_dev(part)->devt; -} - -extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); - -static inline void disk_put_part(struct hd_struct *part) -{ - if (likely(part)) - put_device(part_to_dev(part)); -} - /* * Smarter partition iterator without context limits. */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 86bef93e72eb..a78e44ee6adb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -76,11 +76,11 @@ struct uuidcmp { */ static int match_dev_by_uuid(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -133,13 +133,13 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - struct hd_struct *part; + struct block_device *part; - part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->bdev->bd_partno + offset); + part = bdget_disk(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); if (part) { - devt = part_devt(part); - put_device(part_to_dev(part)); + devt = part->bd_dev; + bdput(part); } } else { devt = dev->devt; @@ -166,11 +166,10 @@ static dev_t devt_from_partuuid(const char *uuid_str) */ static int match_dev_by_label(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const char *label = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strcmp(label, part->bdev->bd_meta_info->volname)) + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) return 0; return 1; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a723a91ec5a..a482a37848bf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1810,30 +1810,15 @@ static ssize_t blk_trace_mask2str(char *buf, int mask) return p - buf; } -static struct request_queue *blk_trace_get_queue(struct block_device *bdev) -{ - if (bdev->bd_disk == NULL) - return NULL; - - return bdev_get_queue(bdev); -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = bdget_part(dev_to_part(dev)); - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, @@ -1856,9 +1841,6 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); -out: return ret; } @@ -1866,8 +1848,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct block_device *bdev; - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1883,17 +1865,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; value = ret; } - } else if (kstrtoull(buf, 0, &value)) - goto out; - - ret = -ENXIO; - bdev = bdget_part(dev_to_part(dev)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; + } else { + if (kstrtoull(buf, 0, &value)) + goto out; + } mutex_lock(&q->debugfs_mutex); @@ -1931,8 +1906,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); out: return ret ? ret : count; } From 977115c0f664e016a6b2774d4f97116ade23d732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 10:41:07 +0100 Subject: [PATCH 259/395] block: stop using bdget_disk for partition 0 We can just dereference the point in struct gendisk instead. Also remove the now unused export. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 1 - drivers/block/nbd.c | 4 +--- drivers/block/xen-blkfront.c | 20 +++++--------------- drivers/block/zram/zram_drv.c | 14 ++------------ drivers/md/dm.c | 16 ++-------------- drivers/s390/block/dasd_ioctl.c | 5 ++--- fs/block_dev.c | 2 +- 7 files changed, 13 insertions(+), 49 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 0fabfc90b8e4..b84b8671e627 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -930,7 +930,6 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) return bdev; } -EXPORT_SYMBOL(bdget_disk); /* * print a full list of all partitions - intended for places where the root diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 014683968ce1..92f84ed0ba9e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1488,12 +1488,10 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) static void nbd_release(struct gendisk *disk, fmode_t mode) { struct nbd_device *nbd = disk->private_data; - struct block_device *bdev = bdget_disk(disk, 0); if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && - bdev->bd_openers == 0) + disk->part0->bd_openers == 0) nbd_disconnect_and_put(nbd); - bdput(bdev); nbd_config_put(nbd); nbd_put(nbd); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 79521e33d30e..188e0b47534b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2153,7 +2153,7 @@ static void blkfront_closing(struct blkfront_info *info) } if (info->gd) - bdev = bdget_disk(info->gd, 0); + bdev = bdgrab(info->gd->part0); mutex_unlock(&info->mutex); @@ -2518,7 +2518,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) disk = info->gd; if (disk) - bdev = bdget_disk(disk, 0); + bdev = bdgrab(disk->part0); info->xbdev = NULL; mutex_unlock(&info->mutex); @@ -2595,19 +2595,11 @@ static int blkif_open(struct block_device *bdev, fmode_t mode) static void blkif_release(struct gendisk *disk, fmode_t mode) { struct blkfront_info *info = disk->private_data; - struct block_device *bdev; struct xenbus_device *xbdev; mutex_lock(&blkfront_mutex); - - bdev = bdget_disk(disk, 0); - - if (!bdev) { - WARN(1, "Block device %s yanked out from us!\n", disk->disk_name); + if (disk->part0->bd_openers) goto out_mutex; - } - if (bdev->bd_openers) - goto out; /* * Check if we have been instructed to close. We will have @@ -2619,7 +2611,7 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) if (xbdev && xbdev->state == XenbusStateClosing) { /* pending switch to state closed */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + dev_info(disk_to_dev(disk), "releasing disk\n"); xlvbd_release_gendisk(info); xenbus_frontend_closed(info->xbdev); } @@ -2628,14 +2620,12 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) if (!xbdev) { /* sudden device removal */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + dev_info(disk_to_dev(disk), "releasing disk\n"); xlvbd_release_gendisk(info); disk->private_data = NULL; free_info(info); } -out: - bdput(bdev); out_mutex: mutex_unlock(&blkfront_mutex); } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dc8957d173d3..b0701bae6e98 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1760,15 +1760,12 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + bdev = zram->disk->part0; mutex_lock(&bdev->bd_mutex); /* Do not reset an active device or claimed device */ if (bdev->bd_openers || zram->claim) { mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return -EBUSY; } @@ -1779,7 +1776,6 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - bdput(bdev); mutex_lock(&bdev->bd_mutex); zram->claim = false; @@ -1965,16 +1961,11 @@ static int zram_add(void) static int zram_remove(struct zram *zram) { - struct block_device *bdev; - - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + struct block_device *bdev = zram->disk->part0; mutex_lock(&bdev->bd_mutex); if (bdev->bd_openers || zram->claim) { mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return -EBUSY; } @@ -1986,7 +1977,6 @@ static int zram_remove(struct zram *zram) /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - bdput(bdev); pr_info("Removed device: %s\n", zram->disk->disk_name); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 176adcff56b3..ed7e836efbcd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2375,16 +2375,11 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) */ static int lock_fs(struct mapped_device *md) { - struct block_device *bdev; int r; WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - bdev = bdget_disk(md->disk, 0); - if (!bdev) - return -ENOMEM; - r = freeze_bdev(bdev); - bdput(bdev); + r = freeze_bdev(md->disk->part0); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; @@ -2392,16 +2387,9 @@ static int lock_fs(struct mapped_device *md) static void unlock_fs(struct mapped_device *md) { - struct block_device *bdev; - if (!test_bit(DMF_FROZEN, &md->flags)) return; - - bdev = bdget_disk(md->disk, 0); - if (!bdev) - return; - thaw_bdev(bdev); - bdput(bdev); + thaw_bdev(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 304eba1acf16..9f6424408946 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -220,9 +220,8 @@ dasd_format(struct dasd_block *block, struct format_data_t *fdata) * enabling the device later. */ if (fdata->start_unit == 0) { - struct block_device *bdev = bdget_disk(block->gdp, 0); - bdev->bd_inode->i_blkbits = blksize_bits(fdata->blksize); - bdput(bdev); + block->gdp->part0->bd_inode->i_blkbits = + blksize_bits(fdata->blksize); } rc = base->discipline->format_device(base, fdata, 1); diff --git a/fs/block_dev.c b/fs/block_dev.c index a9905d8fd02b..9e56ee1f2652 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1299,7 +1299,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (ret) return ret; } else { - struct block_device *whole = bdget_disk(disk, 0); + struct block_device *whole = bdgrab(disk->part0); mutex_lock_nested(&whole->bd_mutex, 1); ret = __blkdev_get(whole, mode); From b0c03eff79a67aa43f17249dd42fac58e96718dc Mon Sep 17 00:00:00 2001 From: Matheus Castello Date: Wed, 25 Nov 2020 00:29:26 -0300 Subject: [PATCH 260/395] drivers: hv: vmbus: Fix checkpatch SPLIT_STRING Checkpatch emits WARNING: quoted string split across lines. To keep the code clean and with the 80 column length indentation the check and registration code for kmsg_dump_register has been transferred to a new function hv_kmsg_dump_register. Signed-off-by: Matheus Castello Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20201125032926.17002-1-matheus@castello.eng.br Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0a2711aa63a1..502f8cd95f6d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1387,6 +1387,24 @@ static struct kmsg_dumper hv_kmsg_dumper = { .dump = hv_kmsg_dump, }; +static void hv_kmsg_dump_register(void) +{ + int ret; + + hv_panic_page = hv_alloc_hyperv_zeroed_page(); + if (!hv_panic_page) { + pr_err("Hyper-V: panic message page memory allocation failed\n"); + return; + } + + ret = kmsg_dump_register(&hv_kmsg_dumper); + if (ret) { + pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); + hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_panic_page = NULL; + } +} + static struct ctl_table_header *hv_ctl_table_hdr; /* @@ -1477,21 +1495,8 @@ static int vmbus_bus_init(void) * capability is supported by the hypervisor. */ hv_get_crash_ctl(hyperv_crash_ctl); - if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) { - hv_panic_page = (void *)hv_alloc_hyperv_zeroed_page(); - if (hv_panic_page) { - ret = kmsg_dump_register(&hv_kmsg_dumper); - if (ret) { - pr_err("Hyper-V: kmsg dump register " - "error 0x%x\n", ret); - hv_free_hyperv_page( - (unsigned long)hv_panic_page); - hv_panic_page = NULL; - } - } else - pr_err("Hyper-V: panic message page memory " - "allocation failed"); - } + if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) + hv_kmsg_dump_register(); register_die_notifier(&hyperv_die_block); } From 6b6667aa4d1e0866f00b62d35a9be3875c7551f8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:12 +0000 Subject: [PATCH 261/395] block: optimise for_each_bvec() advance Because of how for_each_bvec() works it never advances across multiple entries at a time, so bvec_iter_advance() is an overkill. Add specialised bvec_iter_advance_single() that is faster. It also handles zero-len bvecs, so can kill bvec_iter_skip_zero_bvec(). text data bss dec hex filename before: 23977 805 0 24782 60ce lib/iov_iter.o before, bvec_iter_advance() w/o WARN_ONCE() 22886 600 0 23486 5bbe ./lib/iov_iter.o after: 21862 600 0 22462 57be lib/iov_iter.o Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bvec.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 2efec10bf792..ff832e698efb 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -121,18 +121,28 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv, return true; } -static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) +/* + * A simpler version of bvec_iter_advance(), @bytes should not span + * across multiple bvec entries, i.e. bytes <= bv[i->bi_idx].bv_len + */ +static inline void bvec_iter_advance_single(const struct bio_vec *bv, + struct bvec_iter *iter, unsigned int bytes) { - iter->bi_bvec_done = 0; - iter->bi_idx++; + unsigned int done = iter->bi_bvec_done + bytes; + + if (done == bv[iter->bi_idx].bv_len) { + done = 0; + iter->bi_idx++; + } + iter->bi_bvec_done = done; + iter->bi_size -= bytes; } #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ - (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter), \ - (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) + bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ From 22b56c2964386ddced252be407150b22f85e209e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:13 +0000 Subject: [PATCH 262/395] bio: optimise bvec iteration __bio_for_each_bvec(), __bio_for_each_segment() and bio_copy_data_iter() fall under conditions of bvec_iter_advance_single(), which is a faster and slimmer version of bvec_iter_advance(). Add bio_advance_iter_single() and convert them. Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- include/linux/bio.h | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index ebb18136b86f..1f2cc1fbe283 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, flush_dcache_page(dst_bv.bv_page); - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); diff --git a/include/linux/bio.h b/include/linux/bio.h index ecf67108f091..1edda614f7ce 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -148,11 +148,24 @@ static inline void bio_advance_iter(const struct bio *bio, /* TODO: It is reasonable to complete bio with error here. */ } +/* @bytes should be less or equal to bvec[i->bi_idx].bv_len */ +static inline void bio_advance_iter_single(const struct bio *bio, + struct bvec_iter *iter, + unsigned int bytes) +{ + iter->bi_sector += bytes >> 9; + + if (bio_no_advance_iter(bio)) + iter->bi_size -= bytes; + else + bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) @@ -161,7 +174,7 @@ static inline void bio_advance_iter(const struct bio *bio, for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) /* iterate over multi-page bvec */ #define bio_for_each_bvec(bvl, bio, iter) \ From 2c07343abd8932200a45ff7b10950e71081e9e77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Wed, 2 Dec 2020 17:26:43 +0100 Subject: [PATCH 263/395] selftests/seccomp: Update kernel config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit seccomp_bpf.c uses unshare(CLONE_NEWPID), which requires CONFIG_PID_NS to be set. Cc: Kees Cook Cc: Shuah Khan Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Mickaël Salaün Acked-by: Tycho Andersen Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201202162643.249276-1-mic@digikod.net --- tools/testing/selftests/seccomp/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config index 64c19d8eba79..ad431a5178fb 100644 --- a/tools/testing/selftests/seccomp/config +++ b/tools/testing/selftests/seccomp/config @@ -1,3 +1,4 @@ +CONFIG_PID_NS=y CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y CONFIG_USER_NS=y From b0d97557ebfc9d5ba5f2939339a9fdd267abafeb Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Wed, 2 Dec 2020 19:11:45 +0800 Subject: [PATCH 264/395] block: fix inflight statistics of part0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inflight of partition 0 doesn't include inflight IOs to all sub-partitions, since currently mq calculates inflight of specific partition by simply camparing the value of the partition pointer. Thus the following case is possible: $ cat /sys/block/vda/inflight        0        0 $ cat /sys/block/vda/vda1/inflight        0      128 While single queue device (on a previous version, e.g. v3.10) has no this issue: $cat /sys/block/sda/sda3/inflight 0 33 $cat /sys/block/sda/inflight 0 33 Partition 0 should be specially handled since it represents the whole disk. This issue is introduced since commit bf0ddaba65dd ("blk-mq: fix sysfs inflight counter"). Besides, this patch can also fix the inflight statistics of part 0 in /proc/diskstats. Before this patch, the inflight statistics of part 0 doesn't include that of sub partitions. (I have marked the 'inflight' field with asterisk.) $cat /proc/diskstats 259 0 nvme0n1 45974469 0 367814768 6445794 1 0 1 0 *0* 111062 6445794 0 0 0 0 0 0 259 2 nvme0n1p1 45974058 0 367797952 6445727 0 0 0 0 *33* 111001 6445727 0 0 0 0 0 0 This is introduced since commit f299b7c7a9de ("blk-mq: provide internal in-flight variant"). Fixes: bf0ddaba65dd ("blk-mq: fix sysfs inflight counter") Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Jeffle Xu Reviewed-by: Christoph Hellwig [axboe: adapt for 5.11 partition change] Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a2593748fa53..37c682855a63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -105,7 +105,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) + if ((!mi->part->bd_partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; From acaf523a7bf226b28504306c1cfee194520123b3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 26 Nov 2020 11:18:34 +0800 Subject: [PATCH 265/395] blk-throttle: don't check whether or not lower limit is valid if CONFIG_BLK_DEV_THROTTLING_LOW is off blk_throtl_update_limit_valid() will search for descendants to see if 'LIMIT_LOW' of bps/iops and READ/WRITE is nonzero. However, they're always zero if CONFIG_BLK_DEV_THROTTLING_LOW is not set, furthermore, a lot of time will be wasted to iterate descendants. Thus do nothing in blk_throtl_update_limit_valid() in such situation. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b771c4299982..d52cac9f3a7c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -587,6 +587,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -607,6 +608,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) From 6b3211842a115d697fbf78d09f3e83852200e413 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 30 Nov 2020 16:35:45 +0800 Subject: [PATCH 266/395] audit: replace atomic_add_return() atomic_inc_return() is a little neater Signed-off-by: Yejune Deng Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit.c b/kernel/audit.c index e22f22bdc000..1ffc2e059027 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1779,7 +1779,7 @@ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); - return atomic_add_return(1, &serial); + return atomic_inc_return(&serial); } static inline void audit_get_stamp(struct audit_context *ctx, From 79109a515ac3f1009632f4a4c81597e9438a2d65 Mon Sep 17 00:00:00 2001 From: Andrey Zhizhikin Date: Tue, 1 Dec 2020 22:29:20 +0000 Subject: [PATCH 267/395] MIPS: configs: drop unused BACKLIGHT_GENERIC option Commit 7ecdea4a0226 ("backlight: generic_bl: Remove this driver as it is unused") removed geenric_bl driver from the tree, together with corresponding config option. Remove BACKLIGHT_GENERIC config item from all MIPS configurations. Fixes: 7ecdea4a0226 ("backlight: generic_bl: Remove this driver as it is unused") Signed-off-by: Andrey Zhizhikin Reviewed-by: Krzysztof Kozlowski Acked-by: Daniel Thompson Acked-by: Sam Ravnborg Cc: Sam Ravnborg Signed-off-by: Thomas Bogendoerfer --- arch/mips/configs/gcw0_defconfig | 1 - arch/mips/configs/gpr_defconfig | 1 - arch/mips/configs/lemote2f_defconfig | 1 - arch/mips/configs/loongson3_defconfig | 1 - arch/mips/configs/mtx1_defconfig | 1 - arch/mips/configs/rs90_defconfig | 1 - 6 files changed, 6 deletions(-) diff --git a/arch/mips/configs/gcw0_defconfig b/arch/mips/configs/gcw0_defconfig index 7e28a4fe9d84..460683b52285 100644 --- a/arch/mips/configs/gcw0_defconfig +++ b/arch/mips/configs/gcw0_defconfig @@ -73,7 +73,6 @@ CONFIG_DRM_PANEL_NOVATEK_NT39016=y CONFIG_DRM_INGENIC=y CONFIG_DRM_ETNAVIV=y CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_GENERIC is not set CONFIG_BACKLIGHT_PWM=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 9085f4d6c698..87e20f3391ed 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -251,7 +251,6 @@ CONFIG_SSB_DRIVER_PCICORE=y # CONFIG_VGA_ARB is not set # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_GENERIC is not set # CONFIG_VGA_CONSOLE is not set CONFIG_USB_HID=m CONFIG_USB_HIDDEV=y diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig index 3a9a453b1264..688c91918db2 100644 --- a/arch/mips/configs/lemote2f_defconfig +++ b/arch/mips/configs/lemote2f_defconfig @@ -145,7 +145,6 @@ CONFIG_FB_SIS_300=y CONFIG_FB_SIS_315=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_BACKLIGHT_GENERIC=m # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index 38a817ead8e7..9c5fadef38cb 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -286,7 +286,6 @@ CONFIG_DRM_VIRTIO_GPU=y CONFIG_FB_RADEON=y CONFIG_LCD_CLASS_DEVICE=y CONFIG_LCD_PLATFORM=m -CONFIG_BACKLIGHT_GENERIC=m # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 914af125a7fa..0ef2373404e5 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -450,7 +450,6 @@ CONFIG_WDT_MTX1=y # CONFIG_VGA_ARB is not set # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_GENERIC is not set # CONFIG_VGA_CONSOLE is not set CONFIG_SOUND=m CONFIG_SND=m diff --git a/arch/mips/configs/rs90_defconfig b/arch/mips/configs/rs90_defconfig index dfbb9fed9a42..4f540bb94628 100644 --- a/arch/mips/configs/rs90_defconfig +++ b/arch/mips/configs/rs90_defconfig @@ -97,7 +97,6 @@ CONFIG_DRM_FBDEV_OVERALLOC=300 CONFIG_DRM_PANEL_SIMPLE=y CONFIG_DRM_INGENIC=y CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_GENERIC is not set CONFIG_BACKLIGHT_PWM=y # CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y From 8b3165e54566e8bb8f4b7d4e5f12ced78ce462bb Mon Sep 17 00:00:00 2001 From: Xingxing Su Date: Thu, 3 Dec 2020 15:22:51 +0800 Subject: [PATCH 268/395] MIPS: Enable GCOV Enable gcov profiling of the entire kernel on mips. Required changes include disabling profiling for: * arch/kernel/boot/compressed: not linked to main kernel. Lightly tested on Loongson 3A3000 an 3A4000, seems to work as expected. without "GCOV_PROFILE := n" in compressed Makefile, build errors as follows: ... ld: arch/mips/boot/compressed/string.o:(.data+0x88): undefined reference to `__gcov_merge_add' ld: arch/mips/boot/compressed/string.o: in function `_GLOBAL__sub_I_00100_0_memcpy': string.c:(.text.startup+0x4): undefined reference to `__gcov_init' ld: arch/mips/boot/compressed/string.o: in function `_GLOBAL__sub_D_00100_1_memcpy': string.c:(.text.exit+0x0): undefined reference to `__gcov_exit' ... Signed-off-by: Youling Tang Signed-off-by: Xingxing Su Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 1 + arch/mips/boot/compressed/Makefile | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 44a47adb1275..b99498958b5c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -9,6 +9,7 @@ config MIPS select ARCH_HAS_PTE_SPECIAL if !(32BIT && CPU_HAS_RIXI) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_SUPPORTS_UPROBES select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 4c3bc7e3d56d..47cd9dc7454a 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -36,6 +36,7 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS) -D__ASSEMBLY__ \ # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n +GCOV_PROFILE := n # decompressor objects (linked with vmlinuz) vmlinuzobjs-y := $(obj)/head.o $(obj)/decompress.o $(obj)/string.o From d121f125af22a16f0f679293756d28a9691fa46d Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 3 Dec 2020 13:36:48 +0100 Subject: [PATCH 269/395] MIPS: Don't round up kernel sections size for memblock_add() Linux doesn't own the memory immediately after the kernel image. On Octeon bootloader places a shared structure right close after the kernel _end, refer to "struct cvmx_bootinfo *octeon_bootinfo" in cavium-octeon/setup.c. If check_kernel_sections_mem() rounds the PFNs up, first memblock_alloc() inside early_init_dt_alloc_memory_arch() <= device_tree_init() returns memory block overlapping with the above octeon_bootinfo structure, which is being overwritten afterwards. Fixes: a94e4f24ec83 ("MIPS: init: Drop boot_mem_map") Signed-off-by: Alexander Sverdlin Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index ca579deef939..9d11f68a9e8b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -498,8 +498,8 @@ static void __init request_crashkernel(struct resource *res) static void __init check_kernel_sections_mem(void) { - phys_addr_t start = PFN_PHYS(PFN_DOWN(__pa_symbol(&_text))); - phys_addr_t size = PFN_PHYS(PFN_UP(__pa_symbol(&_end))) - start; + phys_addr_t start = __pa_symbol(&_text); + phys_addr_t size = __pa_symbol(&_end) - start; if (!memblock_is_region_memory(start, size)) { pr_info("Kernel sections are not in the memory maps\n"); From ca13300a88a37a90160d352cede05776ea723919 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Thu, 3 Dec 2020 13:36:49 +0100 Subject: [PATCH 270/395] MIPS: OCTEON: Don't add kernel sections into memblock allocator Because check_kernel_sections_mem() does exactly this for all platforms. Signed-off-by: Alexander Sverdlin Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/setup.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 561389d3fadb..982826ba0ef7 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -973,8 +973,6 @@ void __init plat_mem_setup(void) uint64_t crashk_end; #ifndef CONFIG_CRASH_DUMP int64_t memory; - uint64_t kernel_start; - uint64_t kernel_size; #endif total = 0; @@ -1078,13 +1076,6 @@ void __init plat_mem_setup(void) } } cvmx_bootmem_unlock(); - /* Add the memory region for the kernel. */ - kernel_start = (unsigned long) _text; - kernel_size = _end - _text; - - /* Adjust for physical offset. */ - kernel_start &= ~0xffffffff80000000ULL; - memblock_add(kernel_start, kernel_size); #endif /* CONFIG_CRASH_DUMP */ #ifdef CONFIG_CAVIUM_RESERVE32 From d8d3276bfc49e114103c54d5f93268c70dcf3600 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Thu, 3 Dec 2020 14:54:43 +0800 Subject: [PATCH 271/395] MIPS: SMP-CPS: Add support for irq migration when CPU offline Currently we won't migrate irqs when offline CPUs, which has been implemented on most architectures. That will lead to some devices work incorrectly if the bound cores are offline. While that can be easily supported by enabling GENERIC_IRQ_MIGRATION. But i don't pretty known the reason it was not supported on all MIPS platforms. This patch add the support for irq migration on MIPS CPS platform, and it's tested on the interAptiv processor. Signed-off-by: Wei Li Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 1 + arch/mips/kernel/smp-cps.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b99498958b5c..b49a390df1cb 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2488,6 +2488,7 @@ config MIPS_CPS select SYS_SUPPORTS_SCHED_SMT if CPU_MIPSR6 select SYS_SUPPORTS_SMP select WEAK_ORDERING + select GENERIC_IRQ_MIGRATION if HOTPLUG_CPU help Select this if you wish to run an SMP kernel across multiple cores within a MIPS Coherent Processing System. When this option is diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index dbb3f1fc71ab..8b027c72b8ef 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -461,6 +462,7 @@ static int cps_cpu_disable(void) smp_mb__after_atomic(); set_cpu_online(cpu, false); calculate_cpu_foreign_map(); + irq_migrate_all_off_this_cpu(); return 0; } From 4f1682b8a97dc24e57e8bcb62b23c216d8425266 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 3 Dec 2020 11:19:03 +0800 Subject: [PATCH 272/395] MIPS: Move memblock_dump_all() to the end of setup_arch() In order to get more memblock configuration with memblock=debug in the boot cmdline, move memblock_dump_all() to the end of setup_arch(), this can help us to get dmi_setup() and resource_init() memblock info, at least for now. Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 9d11f68a9e8b..7e1f8e277437 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -688,8 +688,6 @@ static void __init arch_mem_init(char **cmdline_p) fdt_init_reserved_mem(); - memblock_dump_all(); - early_memtest(PFN_PHYS(ARCH_PFN_OFFSET), PFN_PHYS(max_low_pfn)); } @@ -787,6 +785,8 @@ void __init setup_arch(char **cmdline_p) cpu_cache_init(); paging_init(); + + memblock_dump_all(); } unsigned long kernelsp[NR_CPUS]; From ce9fe18abb7c86a71b545e1cdd60fe333bf462a3 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:16 +0100 Subject: [PATCH 273/395] block/rnbd-clt: Make path parameter optional for map_device During map_device if the given session exists, then the path parameter is not used. In such a case, the path parameter is redundant. This commit makes the path parameter optional for map_device. When the path parameter is not given, if the session exists then that is used to establish the rtrs connection. If the session does not exist, and the path parameter is also missing, then map_device fails. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 1 - drivers/block/rnbd/rnbd-clt.c | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 4f4474eecadb..e7b41ec7cd6a 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -37,7 +37,6 @@ enum { }; static const unsigned int rnbd_opt_mandatory[] = { - RNBD_OPT_PATH, RNBD_OPT_DEV_PATH, RNBD_OPT_SESSNAME, }; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 8b2411ccbda9..edefa0761a81 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1193,6 +1193,12 @@ find_and_get_or_create_sess(const char *sessname, else if (!first) return sess; + if (!path_cnt) { + pr_err("Session %s not found, and path parameter not given", sessname); + err = -ENXIO; + goto put_sess; + } + rtrs_ops = (struct rtrs_clt_ops) { .priv = sess, .link_ev = rnbd_clt_link_ev, From 91f4acb2801ce4985483b0fa174bbe995d105417 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:17 +0100 Subject: [PATCH 274/395] block/rnbd-clt: support mapping two devices with the same name from different servers Previously, we can't map same device name from different sessions due to the limitation of sysfs naming mechanism. root@clt2:~# ls -l /sys/class/rnbd-client/ctl/devices/ total 0 lrwxrwxrwx 1 root 0 Sep 2 16:31 !dev!nullb1 -> ../../../block/rnbd0 We only use the device name in above, which caused device with the same name can't be mapped from another server. To address the issue, the sessname is appended to the node to differentiate where the device comes from. Also, we need to check if the pathname is existed in a specific session instead of search it in global sess_list. Signed-off-by: Guoqing Jiang Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 ++++ drivers/block/rnbd/rnbd-clt.c | 13 ++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e7b41ec7cd6a..5d3c3c80dab4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -480,6 +480,10 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, if (ret >= len) return -ENAMETOOLONG; + ret = snprintf(buf, len, "%s@%s", buf, dev->sess->sessname); + if (ret >= len) + return -ENAMETOOLONG; + return 0; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index edefa0761a81..1bb495e50931 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1410,13 +1410,16 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, return ERR_PTR(ret); } -static bool __exists_dev(const char *pathname) +static bool __exists_dev(const char *pathname, const char *sessname) { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; bool found = false; list_for_each_entry(sess, &sess_list, list) { + if (sessname && strncmp(sess->sessname, sessname, + sizeof(sess->sessname))) + continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { if (!strncmp(dev->pathname, pathname, @@ -1433,12 +1436,12 @@ static bool __exists_dev(const char *pathname) return found; } -static bool exists_devpath(const char *pathname) +static bool exists_devpath(const char *pathname, const char *sessname) { bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sessname); mutex_unlock(&sess_lock); return found; @@ -1451,7 +1454,7 @@ static bool insert_dev_if_not_exists_devpath(const char *pathname, bool found; mutex_lock(&sess_lock); - found = __exists_dev(pathname); + found = __exists_dev(pathname, sess->sessname); if (!found) { mutex_lock(&sess->lock); list_add_tail(&dev->list, &sess->devs_list); @@ -1481,7 +1484,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rnbd_clt_dev *dev; int ret; - if (exists_devpath(pathname)) + if (unlikely(exists_devpath(pathname, sessname))) return ERR_PTR(-EEXIST); sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); From 47479b795490f146ff045ec3ee5a724bbce294f0 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 26 Nov 2020 11:47:18 +0100 Subject: [PATCH 275/395] Documentation/ABI/rnbd-clt: fix typo in sysfs-class-rnbd-client /sys/block/rnbd is created, not /sys/block/rnbd_client/rnbd Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-client | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client index 00c0286733d4..ca3267b81886 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-client +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -66,7 +66,7 @@ Description: Expected format is the following:: The rnbd_server prepends the received from client with and tries to open the / block device. On success, - a /dev/rnbd device file, a /sys/block/rnbd_client/rnbd/ + a /dev/rnbd device file, a /sys/block/rnbd/ directory and an entry in /sys/class/rnbd-client/ctl/devices will be created. From 7578d5cd1e0fe71736970372fcf96341d69f2234 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 26 Nov 2020 11:47:19 +0100 Subject: [PATCH 276/395] Documentation/ABI/rnbd-clt: session name is appended to the device path When mapping a device, /sys/devices/virtual/rnbd-client/ctl/devices/ was created. But we found out that it had a problem when mapping the same file on different servers. So we append the session name after the device_id as below. /sys/devices/virtual/rnbd-client/ctl/devices/@ Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-client | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client index ca3267b81886..2aa05b3e348e 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-client +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -95,12 +95,12 @@ Description: Expected format is the following:: --------------------------------- After mapping, the device file can be found by: - o The symlink /sys/class/rnbd-client/ctl/devices/ + o The symlink /sys/class/rnbd-client/ctl/devices/@ points to /sys/block/. The last part of the symlink destination is the same as the device name. By extracting the last part of the path the path to the device /dev/ can be build. - * /dev/block/$(cat /sys/class/rnbd-client/ctl/devices//dev) + * /dev/block/$(cat /sys/class/rnbd-client/ctl/devices/@/dev) How to find the of the device is described on the next section. @@ -110,7 +110,7 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: For each device mapped on the client a new symbolic link is created as - /sys/class/rnbd-client/ctl/devices/, which points + /sys/class/rnbd-client/ctl/devices/@, which points to the block device created by rnbd (/sys/block/rnbd/). The of each device is created as follows: From 786998050cbc8ead32e6e9fcda2facb3bf3d198d Mon Sep 17 00:00:00 2001 From: Lutz Pogrell Date: Thu, 26 Nov 2020 11:47:20 +0100 Subject: [PATCH 277/395] block/rnbd-srv: close a mapped device from server side. The forceful close of an exported device is required for the use case, when the client side hangs, is crashed, or is not accessible. There have been cases observed, where only some of the devices are to be cleaned up, but the session shall remain. When the device is to be exported to a different client host, server side cleanup is required. Signed-off-by: Lutz Pogrell Signed-off-by: Jack Wang Reviewed-by: Gioh Kim Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 38 ++++++++++++++++++++++++++++- drivers/block/rnbd/rnbd-srv.c | 19 +++++++++++++-- drivers/block/rnbd/rnbd-srv.h | 4 ++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 106775c074d1..08ffb492ebfa 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -120,10 +120,46 @@ static ssize_t mapping_path_show(struct kobject *kobj, static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = __ATTR_RO(mapping_path); +static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + if (!sysfs_streq(buf, "1")) { + rnbd_srv_err(sess_dev, "%s: invalid value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + + rnbd_srv_info(sess_dev, "force close requested\n"); + + /* first remove sysfs itself to avoid deadlock */ + sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); + rnbd_srv_sess_dev_force_close(sess_dev); + + return count; +} + +static struct kobj_attribute rnbd_srv_dev_session_force_close_attr = + __ATTR(force_close, 0644, + rnbd_srv_dev_session_force_close_show, + rnbd_srv_dev_session_force_close_store); + static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = { &rnbd_srv_dev_session_access_mode_attr.attr, &rnbd_srv_dev_session_ro_attr.attr, &rnbd_srv_dev_session_mapping_path_attr.attr, + &rnbd_srv_dev_session_force_close_attr.attr, NULL, }; @@ -145,7 +181,7 @@ static void rnbd_srv_sess_dev_release(struct kobject *kobj) struct rnbd_srv_sess_dev *sess_dev; sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - rnbd_destroy_sess_dev(sess_dev); + rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id); } static struct kobj_type rnbd_srv_sess_dev_ktype = { diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index e1bc8b4cd592..d1ee72ed8384 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -212,12 +212,20 @@ static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) kref_put(&dev->kref, destroy_device_cb); } -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) { DECLARE_COMPLETION_ONSTACK(dc); - xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); + if (keep_id) + /* free the resources for the id but don't */ + /* allow to re-use the id itself because it */ + /* is still used by the client */ + xa_cmpxchg(&sess_dev->sess->index_idr, sess_dev->device_id, + sess_dev, NULL, 0); + else + xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); synchronize_rcu(); + sess_dev->destroy_comp = &dc; rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ @@ -328,6 +336,13 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, } } +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) +{ + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + sess_dev->keep_id = true; + +} + static int process_msg_close(struct rtrs_srv *rtrs, struct rnbd_srv_session *srv_sess, void *data, size_t datalen, const void *usr, diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 5a8544b5e74f..b157371c25ed 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -56,6 +56,7 @@ struct rnbd_srv_sess_dev { struct rnbd_srv_dev *dev; struct kobject kobj; u32 device_id; + bool keep_id; fmode_t open_flags; struct kref kref; struct completion *destroy_comp; @@ -63,6 +64,7 @@ struct rnbd_srv_sess_dev { enum rnbd_access_mode access_mode; }; +void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev); /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, @@ -73,6 +75,6 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); int rnbd_srv_create_sysfs_files(void); void rnbd_srv_destroy_sysfs_files(void); -void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id); #endif /* RNBD_SRV_H */ From 765c5c56ffde0a555ce69559ab275395fb1a12a9 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Thu, 26 Nov 2020 11:47:21 +0100 Subject: [PATCH 278/395] Documentation/ABI/rnbd-srv: add document for force_close describe force_close of device Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-class-rnbd-server | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-server b/Documentation/ABI/testing/sysfs-class-rnbd-server index ba60a90c0e45..6c5996cd7cfb 100644 --- a/Documentation/ABI/testing/sysfs-class-rnbd-server +++ b/Documentation/ABI/testing/sysfs-class-rnbd-server @@ -48,3 +48,11 @@ Date: Feb 2020 KernelVersion: 5.7 Contact: Jack Wang Danil Kipnis Description: Contains the device access mode: ro, rw or migration. + +What: /sys/class/rnbd-server/ctl/devices//sessions//force_close +Date: Nov 2020 +KernelVersion: 5.10 +Contact: Jack Wang Danil Kipnis +Description: Write "1" to the file to close the device on server side. Please + note that the client side device will not be closed, read or + write to the device will get -ENOTCONN. From d3a95ccaaf4df94743a958c90ab85f4703e3a687 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 26 Nov 2020 11:47:22 +0100 Subject: [PATCH 279/395] block/rnbd: call kobject_put in the failure path Per the comment of kobject_init_and_add, we need to cleanup the memory by call kobject_put. Also we need to call kobject_del for the other failure cases if the kobject_init_and_add doesn't fail. Signed-off-by: Guoqing Jiang Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 +++- drivers/block/rnbd/rnbd-srv-sysfs.c | 28 ++++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 5d3c3c80dab4..e3c3270b0cee 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -450,9 +450,11 @@ static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s", "rnbd"); - if (ret) + if (ret) { rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n", ret); + kobject_put(&dev->kobj); + } return ret; } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 08ffb492ebfa..05ffe488ddc6 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -47,13 +47,17 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, rnbd_devs_kobj, dev_name); - if (ret) + if (ret) { + kobject_put(&dev->dev_kobj); return ret; + } dev->dev_sessions_kobj = kobject_create_and_add("sessions", &dev->dev_kobj); - if (!dev->dev_sessions_kobj) - goto put_dev_kobj; + if (!dev->dev_sessions_kobj) { + ret = -ENOMEM; + goto free_dev_kobj; + } bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj; ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev"); @@ -64,7 +68,8 @@ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, put_sess_kobj: kobject_put(dev->dev_sessions_kobj); -put_dev_kobj: +free_dev_kobj: + kobject_del(&dev->dev_kobj); kobject_put(&dev->dev_kobj); return ret; } @@ -196,18 +201,17 @@ int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype, sess_dev->dev->dev_sessions_kobj, "%s", sess_dev->sess->sessname); - if (ret) + if (ret) { + kobject_put(&sess_dev->kobj); return ret; + } ret = sysfs_create_group(&sess_dev->kobj, &rnbd_srv_default_dev_session_attr_group); - if (ret) - goto err; - - return 0; - -err: - kobject_put(&sess_dev->kobj); + if (ret) { + kobject_del(&sess_dev->kobj); + kobject_put(&sess_dev->kobj); + } return ret; } From 64e8a6ece1a5b1fa21316918053d068baeac84af Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 26 Nov 2020 11:47:23 +0100 Subject: [PATCH 280/395] block/rnbd-clt: Dynamically alloc buffer for pathname & blk_symlink_name For every rnbd_clt_dev, we alloc the pathname and blk_symlink_name statically to NAME_MAX which is 255 bytes. In most of the cases we only need less than 10 bytes, so 500 bytes per block device are wasted. This commit dynamically allocates memory buffer for pathname and blk_symlink_name. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Reviewed-by: Lutz Pogrell Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 12 ++++++++++-- drivers/block/rnbd/rnbd-clt.c | 14 +++++++++++--- drivers/block/rnbd/rnbd-clt.h | 4 ++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e3c3270b0cee..c3c96a567568 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -434,6 +434,7 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) */ if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + kfree(dev->blk_symlink_name); module_put(THIS_MODULE); } } @@ -492,10 +493,17 @@ static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) { struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; - int ret; + int ret, len; + + len = strlen(dev->pathname) + strlen(dev->sess->sessname) + 2; + dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); + if (!dev->blk_symlink_name) { + rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); + goto out_err; + } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, - sizeof(dev->blk_symlink_name)); + len); if (ret) { rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n", ret); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 1bb495e50931..34bc6083b58d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -59,6 +59,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) ida_simple_remove(&index_ida, dev->clt_device_id); mutex_unlock(&ida_lock); kfree(dev->hw_queues); + kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); kfree(dev); @@ -1387,10 +1388,17 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, pathname, sess->sessname, ret); goto out_queues; } + + dev->pathname = kzalloc(strlen(pathname) + 1, GFP_KERNEL); + if (!dev->pathname) { + ret = -ENOMEM; + goto out_queues; + } + strlcpy(dev->pathname, pathname, strlen(pathname) + 1); + dev->clt_device_id = ret; dev->sess = sess; dev->access_mode = access_mode; - strlcpy(dev->pathname, pathname, sizeof(dev->pathname)); mutex_init(&dev->lock); refcount_set(&dev->refcount, 1); dev->dev_state = DEV_STATE_INIT; @@ -1422,8 +1430,8 @@ static bool __exists_dev(const char *pathname, const char *sessname) continue; mutex_lock(&sess->lock); list_for_each_entry(dev, &sess->devs_list, list) { - if (!strncmp(dev->pathname, pathname, - sizeof(dev->pathname))) { + if (strlen(dev->pathname) == strlen(pathname) && + !strcmp(dev->pathname, pathname)) { found = true; break; } diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index ed33654aa486..b193d5904050 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -108,7 +108,7 @@ struct rnbd_clt_dev { u32 clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; - char pathname[NAME_MAX]; + char *pathname; enum rnbd_access_mode access_mode; bool read_only; bool rotational; @@ -126,7 +126,7 @@ struct rnbd_clt_dev { struct list_head list; struct gendisk *gd; struct kobject kobj; - char blk_symlink_name[NAME_MAX]; + char *blk_symlink_name; refcount_t refcount; struct work_struct unmap_on_rmmod_work; }; From b81b8f40c5b43dcb2ff473236baccc421706435f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:35 +0100 Subject: [PATCH 281/395] block: remove the unused block_sleeprq tracepoint The block_sleeprq tracepoint was only used by the legacy request code. Remove it now that the legacy request code is gone. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/block.h | 18 ------------------ kernel/trace/blktrace.c | 22 ---------------------- 2 files changed, 40 deletions(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 34d64ca306b1..76459cf750e1 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -441,24 +441,6 @@ DEFINE_EVENT(block_get_rq, block_getrq, TP_ARGS(q, bio, rw) ); -/** - * block_sleeprq - waiting to get a free request entry in queue for block IO operation - * @q: queue for operation - * @bio: pending block IO operation (can be %NULL) - * @rw: low bit indicates a read (%0) or a write (%1) - * - * In the case where a request struct cannot be provided for queue @q - * the process needs to wait for an request struct to become - * available. This tracepoint event is generated each time the - * process goes to sleep waiting for request struct become available. - */ -DEFINE_EVENT(block_get_rq, block_sleeprq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw) -); - /** * block_plug - keep operations requests in request queue * @q: request queue to plug diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a482a37848bf..ced589df304b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -959,25 +959,6 @@ static void blk_add_trace_getrq(void *ignore, } } - -static void blk_add_trace_sleeprq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); - else { - struct blk_trace *bt; - - rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); - if (bt) - __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, 0); - rcu_read_unlock(); - } -} - static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt; @@ -1164,8 +1145,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); - WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1185,7 +1164,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); From e8a676d61c07eccfcd9d6fddfe4dcb630651c29a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:36 +0100 Subject: [PATCH 282/395] block: simplify and extend the block_bio_merge tracepoint class The block_bio_merge tracepoint class can be reused for most bio-based tracepoints. For that it just needs to lose the superfluous q and rq parameters. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 4 +- block/blk-mq.c | 2 +- block/bounce.c | 2 +- include/trace/events/block.h | 158 ++++++++--------------------------- kernel/trace/blktrace.c | 41 +++------ 6 files changed, 48 insertions(+), 161 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index cee568389b7e..cb24654983e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -907,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); + trace_block_bio_queue(bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index cb351ab9b77d..1a46d5bbd399 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -922,7 +922,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_backmerge(req->q, req, bio); + trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) @@ -946,7 +946,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_frontmerge(req->q, req, bio); + trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37c682855a63..21e2b4b6b742 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2184,7 +2184,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - trace_block_getrq(q, bio, bio->bi_opf); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); diff --git a/block/bounce.c b/block/bounce.c index 162a6eee8999..d3f51acd6e3b 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -340,7 +340,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, } } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(*bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 76459cf750e1..506c29dc7c76 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -226,45 +226,6 @@ DEFINE_EVENT(block_rq, block_rq_merge, TP_ARGS(q, rq) ); -/** - * block_bio_bounce - used bounce buffer when processing block operation - * @q: queue holding the block operation - * @bio: block operation - * - * A bounce buffer was used to handle the block operation @bio in @q. - * This occurs when hardware limitations prevent a direct transfer of - * data between the @bio data memory area and the IO device. Use of a - * bounce buffer requires extra copying of data and decreases - * performance. - */ -TRACE_EVENT(block_bio_bounce, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation @@ -301,11 +262,11 @@ TRACE_EVENT(block_bio_complete, __entry->nr_sector, __entry->error) ); -DECLARE_EVENT_CLASS(block_bio_merge, +DECLARE_EVENT_CLASS(block_bio, - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), + TP_PROTO(struct bio *bio), - TP_ARGS(q, rq, bio), + TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) @@ -329,116 +290,63 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->nr_sector, __entry->comm) ); +/** + * block_bio_bounce - used bounce buffer when processing block operation + * @bio: block operation + * + * A bounce buffer was used to handle the block operation @bio in @q. + * This occurs when hardware limitations prevent a direct transfer of + * data between the @bio data memory area and the IO device. Use of a + * bounce buffer requires extra copying of data and decreases + * performance. + */ +DEFINE_EVENT(block_bio, block_bio_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /** * block_bio_backmerge - merging block operation to the end of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block request @bio to the end of an existing block request - * in queue @q. + * Merging block request @bio to the end of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_backmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_backmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block IO operation @bio to the beginning of an existing block - * operation in queue @q. + * Merging block IO operation @bio to the beginning of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_frontmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_frontmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue - * @q: queue holding operation * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ -TRACE_EVENT(block_bio_queue, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - -DECLARE_EVENT_CLASS(block_get_rq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio ? bio_dev(bio) : 0; - __entry->sector = bio ? bio->bi_iter.bi_sector : 0; - __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, - bio ? bio->bi_opf : 0, __entry->nr_sector); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) +DEFINE_EVENT(block_bio, block_bio_queue, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations - * @q: queue for operations * @bio: pending block IO operation (can be %NULL) - * @rw: low bit indicates a read (%0) or a write (%1) * - * A request struct for queue @q has been allocated to handle the - * block IO operation @bio. + * A request struct has been allocated to handle the block IO operation @bio. */ -DEFINE_EVENT(block_get_rq, block_getrq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw) +DEFINE_EVENT(block_bio, block_getrq, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ced589df304b..7ab88e00c157 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -906,10 +906,9 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -919,44 +918,24 @@ static void blk_add_trace_bio_complete(void *ignore, blk_status_to_errno(bio->bi_status)); } -static void blk_add_trace_bio_backmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); } -static void blk_add_trace_bio_frontmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } -static void blk_add_trace_bio_queue(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); } -static void blk_add_trace_getrq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) +static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); - else { - struct blk_trace *bt; - - rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); - if (bt) - __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, 0); - rcu_read_unlock(); - } + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) From eb6f7f7cd3af0f67ce57b21fab1bc64beb643581 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:37 +0100 Subject: [PATCH 283/395] block: remove the request_queue argument to the block_split tracepoint The request_queue can trivially be derived from the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- drivers/md/dm.c | 2 +- include/trace/events/block.h | 14 ++++++-------- kernel/trace/blktrace.c | 5 ++--- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 1a46d5bbd399..4071daa88a5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,7 +338,7 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ed7e836efbcd..9a5bd90779c7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,7 +1612,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, part_stat_unlock(); bio_chain(b, bio); - trace_block_split(md->queue, b, bio->bi_iter.bi_sector); + trace_block_split(b, bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); break; } diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 506c29dc7c76..b415e4cba843 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -411,21 +411,19 @@ DEFINE_EVENT(block_unplug, block_unplug, /** * block_split - split a single bio struct into two bio structs - * @q: queue containing the bio * @bio: block operation being split * @new_sector: The starting sector for the new bio * - * The bio request @bio in request queue @q needs to be split into two - * bio requests. The newly created @bio request starts at - * @new_sector. This split may be required due to hardware limitation - * such as operation crossing device boundaries in a RAID system. + * The bio request @bio needs to be split into two bio requests. The newly + * created @bio request starts at @new_sector. This split may be required due to + * hardware limitations such as operation crossing device boundaries in a RAID + * system. */ TRACE_EVENT(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, - unsigned int new_sector), + TP_PROTO(struct bio *bio, unsigned int new_sector), - TP_ARGS(q, bio, new_sector), + TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ab88e00c157..3ca6d62114f4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -970,10 +970,9 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } -static void blk_add_trace_split(void *ignore, - struct request_queue *q, struct bio *bio, - unsigned int pdu) +static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; rcu_read_lock(); From 1c02fca620f7273b597591065d366e2cca948d8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:38 +0100 Subject: [PATCH 284/395] block: remove the request_queue argument to the block_bio_remap tracepoint The request_queue can trivially be derived from the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/md/dm.c | 3 +-- drivers/md/md-linear.c | 3 +-- drivers/md/md.c | 5 ++--- drivers/md/raid0.c | 4 ++-- drivers/md/raid1.c | 7 +++---- drivers/md/raid10.c | 6 ++---- drivers/md/raid5.c | 15 +++++++-------- drivers/nvme/host/multipath.c | 3 +-- include/trace/events/block.h | 8 +++----- kernel/trace/blktrace.c | 14 +++++--------- 11 files changed, 28 insertions(+), 42 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index cb24654983e1..96e5fcd7f071 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -758,7 +758,7 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; bio->bi_iter.bi_sector += p->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, + trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9a5bd90779c7..5181907dc595 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1276,8 +1276,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(io->orig_bio), sector); + trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 98f1b4b2bdce..68cac7d19278 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -257,8 +257,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, - bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0065736f05b4..c555be0a8dce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8591,9 +8591,8 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); + trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL(md_submit_discard_bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f44177593a5..e5d7411cba9b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -571,8 +571,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(mddev->gendisk), bio_sector); + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 960d854c07f8..c0347997f6ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1305,8 +1305,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, - disk_devt(mddev->gendisk), r1_bio->sector); + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1517,8 +1517,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)conf->mirrors[i].rdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..a6f99fa0b32c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1200,8 +1200,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, - read_bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), r10_bio->sector); submit_bio_noacct(read_bio); return; @@ -1250,8 +1249,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(conf->mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)rdev; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39343479ac2a..3a90cc0e43ca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1222,9 +1222,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bi->bi_disk->queue, - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(bi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1272,9 +1272,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(rbi->bi_disk->queue, - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(rbi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ -5468,8 +5468,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(align_bi->bi_disk->queue, - align_bi, disk_devt(mddev->gendisk), + trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bi); return 1; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c17..106cf5c44ee7 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -312,8 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(ns->head->disk), + trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); } else if (nvme_available_path(head)) { diff --git a/include/trace/events/block.h b/include/trace/events/block.h index b415e4cba843..8fb89574d867 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -450,9 +450,8 @@ TRACE_EVENT(block_split, /** * block_bio_remap - map request for a logical device to the raw device - * @q: queue holding the operation * @bio: revised operation - * @dev: device for the operation + * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the @@ -460,10 +459,9 @@ TRACE_EVENT(block_split, */ TRACE_EVENT(block_bio_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), + TP_PROTO(struct bio *bio, dev_t dev, sector_t from), - TP_ARGS(q, bio, dev, from), + TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3ca6d62114f4..405637144a03 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -993,20 +993,16 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) - * @q: queue the io is for * @bio: the source bio - * @dev: target device + * @dev: source device * @from: source sector * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * + * Called after a bio is remapped to a different device and/or sector. **/ -static void blk_add_trace_bio_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, + sector_t from) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; From a54895fa057c67700270777f7661d8d3c7fda88a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:39 +0100 Subject: [PATCH 285/395] block: remove the request_queue to argument request based tracepoints The request_queue can trivially be derived from the request. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 8 +++---- drivers/md/dm-rq.c | 2 +- drivers/s390/scsi/zfcp_fsf.c | 3 +-- include/linux/blktrace_api.h | 5 ++-- include/trace/events/block.h | 30 ++++++++++-------------- kernel/trace/blktrace.c | 44 ++++++++++++++---------------------- 8 files changed, 39 insertions(+), 57 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 4071daa88a5e..7497d86fff38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -799,7 +799,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d1eafe2c045c..deff4e826e23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { - trace_block_rq_insert(rq->q, rq); + trace_block_rq_insert(rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); diff --git a/block/blk-mq.c b/block/blk-mq.c index 21e2b4b6b742..cf3916e2852f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -760,7 +760,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -1821,7 +1821,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1878,7 +1878,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 729a72ec30cc..13b4385f4d5a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -397,7 +397,7 @@ static int map_request(struct dm_rq_target_io *tio) } /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 6cb963a06777..37d450f46952 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2359,8 +2359,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) } } - blk_add_driver_data(scsi->request->q, scsi->request, &blktrc, - sizeof(blktrc)); + blk_add_driver_data(scsi->request, &blktrc, sizeof(blktrc)); } /** diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 3b6ff5902edc..05556573b896 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -75,8 +75,7 @@ static inline bool blk_trace_note_message_enabled(struct request_queue *q) return ret; } -extern void blk_add_driver_data(struct request_queue *q, struct request *rq, - void *data, size_t len); +extern void blk_add_driver_data(struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg); @@ -90,7 +89,7 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_add_driver_data(rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8fb89574d867..0d782663a005 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -64,7 +64,6 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, /** * block_rq_requeue - place block IO request back on a queue - * @q: queue holding operation * @rq: block IO operation request * * The block operation request @rq is being placed back into queue @@ -73,9 +72,9 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, */ TRACE_EVENT(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -147,9 +146,9 @@ TRACE_EVENT(block_rq_complete, DECLARE_EVENT_CLASS(block_rq, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -181,7 +180,6 @@ DECLARE_EVENT_CLASS(block_rq, /** * block_rq_insert - insert block operation request into queue - * @q: target queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted @@ -191,14 +189,13 @@ DECLARE_EVENT_CLASS(block_rq, */ DEFINE_EVENT(block_rq, block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is sent to a @@ -206,14 +203,13 @@ DEFINE_EVENT(block_rq, block_rq_insert, */ DEFINE_EVENT(block_rq, block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is merged to another @@ -221,9 +217,9 @@ DEFINE_EVENT(block_rq, block_rq_issue, */ DEFINE_EVENT(block_rq, block_rq_merge, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** @@ -491,7 +487,6 @@ TRACE_EVENT(block_bio_remap, /** * block_rq_remap - map request for a block operation request - * @q: queue holding the operation * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation @@ -502,10 +497,9 @@ TRACE_EVENT(block_bio_remap, */ TRACE_EVENT(block_rq_remap, - TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, - sector_t from), + TP_PROTO(struct request *rq, dev_t dev, sector_t from), - TP_ARGS(q, rq, dev, from), + TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 405637144a03..7839a78205c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -795,12 +795,12 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) #endif static u64 -blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ - return blk_trace_bio_get_cgid(q, rq->bio); + return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* @@ -841,40 +841,35 @@ static void blk_add_trace_rq(struct request *rq, int error, rcu_read_unlock(); } -static void blk_add_trace_rq_insert(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_issue(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_merge(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_requeue(void *ignore, - struct request_queue *q, - struct request *rq) +static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, - blk_trace_request_get_cgid(rq->q, rq)); + blk_trace_request_get_cgid(rq)); } /** @@ -1037,16 +1032,14 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, * Add a trace for that action. * **/ -static void blk_add_trace_rq_remap(void *ignore, - struct request_queue *q, - struct request *rq, dev_t dev, +static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1058,13 +1051,12 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for * @rq: io request * @data: driver-specific data * @len: length of driver-specific data @@ -1073,14 +1065,12 @@ static void blk_add_trace_rq_remap(void *ignore, * Some drivers might want to write driver-specific data per request. * **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) +void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1088,7 +1078,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); From af2d22254e8ee4558d3803372735c0b4f6046cd2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 2 Dec 2020 22:49:29 +0900 Subject: [PATCH 286/395] gcc-plugins: remove code for GCC versions older than 4.9 Documentation/process/changes.rst says the minimal GCC version is 4.9. Hence, BUILDING_GCC_VERSION is greater than or equal to 4009. Signed-off-by: Masahiro Yamada Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201202134929.99883-1-masahiroy@kernel.org --- scripts/gcc-plugins/gcc-common.h | 407 ------------------ .../gcc-plugins/gcc-generate-gimple-pass.h | 12 - scripts/gcc-plugins/gcc-generate-ipa-pass.h | 23 - scripts/gcc-plugins/gcc-generate-rtl-pass.h | 17 - .../gcc-generate-simple_ipa-pass.h | 17 - scripts/gcc-plugins/latent_entropy_plugin.c | 12 - scripts/gcc-plugins/randomize_layout_plugin.c | 4 - scripts/gcc-plugins/sancov_plugin.c | 6 - scripts/gcc-plugins/stackleak_plugin.c | 4 +- scripts/gcc-plugins/structleak_plugin.c | 4 - 10 files changed, 1 insertion(+), 505 deletions(-) diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h index 9ad76b7f3f10..0c087614fc3e 100644 --- a/scripts/gcc-plugins/gcc-common.h +++ b/scripts/gcc-plugins/gcc-common.h @@ -55,47 +55,17 @@ #include "cfgloop.h" #include "cgraph.h" #include "opts.h" - -#if BUILDING_GCC_VERSION == 4005 -#include -#endif - -#if BUILDING_GCC_VERSION >= 4007 #include "tree-pretty-print.h" #include "gimple-pretty-print.h" -#endif - -#if BUILDING_GCC_VERSION >= 4006 -/* - * The c-family headers were moved into a subdirectory in GCC version - * 4.7, but most plugin-building users of GCC 4.6 are using the Debian - * or Ubuntu package, which has an out-of-tree patch to move this to the - * same location as found in 4.7 and later: - * https://sources.debian.net/src/gcc-4.6/4.6.3-14/debian/patches/pr45078.diff/ - */ #include "c-family/c-common.h" -#else -#include "c-common.h" -#endif - -#if BUILDING_GCC_VERSION <= 4008 -#include "tree-flow.h" -#else #include "tree-cfgcleanup.h" #include "tree-ssa-operands.h" #include "tree-into-ssa.h" -#endif - -#if BUILDING_GCC_VERSION >= 4008 #include "is-a.h" -#endif - #include "diagnostic.h" #include "tree-dump.h" #include "tree-pass.h" -#if BUILDING_GCC_VERSION >= 4009 #include "pass_manager.h" -#endif #include "predict.h" #include "ipa-utils.h" @@ -103,7 +73,6 @@ #include "stringpool.h" #endif -#if BUILDING_GCC_VERSION >= 4009 #include "attribs.h" #include "varasm.h" #include "stor-layout.h" @@ -122,18 +91,13 @@ #include "tree-eh.h" #include "stmt.h" #include "gimplify.h" -#endif - #include "gimple.h" - -#if BUILDING_GCC_VERSION >= 4009 #include "tree-ssa-operands.h" #include "tree-phinodes.h" #include "tree-cfg.h" #include "gimple-iterator.h" #include "gimple-ssa.h" #include "ssa-iterators.h" -#endif #if BUILDING_GCC_VERSION >= 5000 #include "builtins.h" @@ -143,15 +107,6 @@ void debug_dominance_info(enum cdi_direction dir); void debug_dominance_tree(enum cdi_direction dir, basic_block root); -#if BUILDING_GCC_VERSION == 4006 -void debug_gimple_stmt(gimple); -void debug_gimple_seq(gimple_seq); -void print_gimple_seq(FILE *, gimple_seq, int, int); -void print_gimple_stmt(FILE *, gimple, int, int); -void print_gimple_expr(FILE *, gimple, int, int); -void dump_gimple_stmt(pretty_printer *, gimple, int, int); -#endif - #ifndef __unused #define __unused __attribute__((__unused__)) #endif @@ -190,372 +145,12 @@ struct register_pass_info NAME##_pass_info = { \ .pos_op = POS, \ } -#if BUILDING_GCC_VERSION == 4005 -#define FOR_EACH_LOCAL_DECL(FUN, I, D) \ - for (tree vars = (FUN)->local_decls, (I) = 0; \ - vars && ((D) = TREE_VALUE(vars)); \ - vars = TREE_CHAIN(vars), (I)++) -#define DECL_CHAIN(NODE) (TREE_CHAIN(DECL_MINIMAL_CHECK(NODE))) -#define FOR_EACH_VEC_ELT(T, V, I, P) \ - for (I = 0; VEC_iterate(T, (V), (I), (P)); ++(I)) -#define TODO_rebuild_cgraph_edges 0 -#define SCOPE_FILE_SCOPE_P(EXP) (!(EXP)) - -#ifndef O_BINARY -#define O_BINARY 0 -#endif - -typedef struct varpool_node *varpool_node_ptr; - -static inline bool gimple_call_builtin_p(gimple stmt, enum built_in_function code) -{ - tree fndecl; - - if (!is_gimple_call(stmt)) - return false; - fndecl = gimple_call_fndecl(stmt); - if (!fndecl || DECL_BUILT_IN_CLASS(fndecl) != BUILT_IN_NORMAL) - return false; - return DECL_FUNCTION_CODE(fndecl) == code; -} - -static inline bool is_simple_builtin(tree decl) -{ - if (decl && DECL_BUILT_IN_CLASS(decl) != BUILT_IN_NORMAL) - return false; - - switch (DECL_FUNCTION_CODE(decl)) { - /* Builtins that expand to constants. */ - case BUILT_IN_CONSTANT_P: - case BUILT_IN_EXPECT: - case BUILT_IN_OBJECT_SIZE: - case BUILT_IN_UNREACHABLE: - /* Simple register moves or loads from stack. */ - case BUILT_IN_RETURN_ADDRESS: - case BUILT_IN_EXTRACT_RETURN_ADDR: - case BUILT_IN_FROB_RETURN_ADDR: - case BUILT_IN_RETURN: - case BUILT_IN_AGGREGATE_INCOMING_ADDRESS: - case BUILT_IN_FRAME_ADDRESS: - case BUILT_IN_VA_END: - case BUILT_IN_STACK_SAVE: - case BUILT_IN_STACK_RESTORE: - /* Exception state returns or moves registers around. */ - case BUILT_IN_EH_FILTER: - case BUILT_IN_EH_POINTER: - case BUILT_IN_EH_COPY_VALUES: - return true; - - default: - return false; - } -} - -static inline void add_local_decl(struct function *fun, tree d) -{ - gcc_assert(TREE_CODE(d) == VAR_DECL); - fun->local_decls = tree_cons(NULL_TREE, d, fun->local_decls); -} -#endif - -#if BUILDING_GCC_VERSION <= 4006 -#define ANY_RETURN_P(rtx) (GET_CODE(rtx) == RETURN) -#define C_DECL_REGISTER(EXP) DECL_LANG_FLAG_4(EXP) -#define EDGE_PRESERVE 0ULL -#define HOST_WIDE_INT_PRINT_HEX_PURE "%" HOST_WIDE_INT_PRINT "x" -#define flag_fat_lto_objects true - -#define get_random_seed(noinit) ({ \ - unsigned HOST_WIDE_INT seed; \ - sscanf(get_random_seed(noinit), "%" HOST_WIDE_INT_PRINT "x", &seed); \ - seed * seed; }) - -#define int_const_binop(code, arg1, arg2) \ - int_const_binop((code), (arg1), (arg2), 0) - -static inline bool gimple_clobber_p(gimple s __unused) -{ - return false; -} - -static inline bool gimple_asm_clobbers_memory_p(const_gimple stmt) -{ - unsigned i; - - for (i = 0; i < gimple_asm_nclobbers(stmt); i++) { - tree op = gimple_asm_clobber_op(stmt, i); - - if (!strcmp(TREE_STRING_POINTER(TREE_VALUE(op)), "memory")) - return true; - } - - return false; -} - -static inline tree builtin_decl_implicit(enum built_in_function fncode) -{ - return implicit_built_in_decls[fncode]; -} - -static inline int ipa_reverse_postorder(struct cgraph_node **order) -{ - return cgraph_postorder(order); -} - -static inline struct cgraph_node *cgraph_create_node(tree decl) -{ - return cgraph_node(decl); -} - -static inline struct cgraph_node *cgraph_get_create_node(tree decl) -{ - struct cgraph_node *node = cgraph_get_node(decl); - - return node ? node : cgraph_node(decl); -} - -static inline bool cgraph_function_with_gimple_body_p(struct cgraph_node *node) -{ - return node->analyzed && !node->thunk.thunk_p && !node->alias; -} - -static inline struct cgraph_node *cgraph_first_function_with_gimple_body(void) -{ - struct cgraph_node *node; - - for (node = cgraph_nodes; node; node = node->next) - if (cgraph_function_with_gimple_body_p(node)) - return node; - return NULL; -} - -static inline struct cgraph_node *cgraph_next_function_with_gimple_body(struct cgraph_node *node) -{ - for (node = node->next; node; node = node->next) - if (cgraph_function_with_gimple_body_p(node)) - return node; - return NULL; -} - -static inline bool cgraph_for_node_and_aliases(cgraph_node_ptr node, bool (*callback)(cgraph_node_ptr, void *), void *data, bool include_overwritable) -{ - cgraph_node_ptr alias; - - if (callback(node, data)) - return true; - - for (alias = node->same_body; alias; alias = alias->next) { - if (include_overwritable || cgraph_function_body_availability(alias) > AVAIL_OVERWRITABLE) - if (cgraph_for_node_and_aliases(alias, callback, data, include_overwritable)) - return true; - } - - return false; -} - -#define FOR_EACH_FUNCTION_WITH_GIMPLE_BODY(node) \ - for ((node) = cgraph_first_function_with_gimple_body(); (node); \ - (node) = cgraph_next_function_with_gimple_body(node)) - -static inline void varpool_add_new_variable(tree decl) -{ - varpool_finalize_decl(decl); -} -#endif - -#if BUILDING_GCC_VERSION <= 4007 -#define FOR_EACH_FUNCTION(node) \ - for (node = cgraph_nodes; node; node = node->next) -#define FOR_EACH_VARIABLE(node) \ - for (node = varpool_nodes; node; node = node->next) -#define PROP_loops 0 -#define NODE_SYMBOL(node) (node) -#define NODE_DECL(node) (node)->decl -#define INSN_LOCATION(INSN) RTL_LOCATION(INSN) -#define vNULL NULL - -static inline int bb_loop_depth(const_basic_block bb) -{ - return bb->loop_father ? loop_depth(bb->loop_father) : 0; -} - -static inline bool gimple_store_p(gimple gs) -{ - tree lhs = gimple_get_lhs(gs); - - return lhs && !is_gimple_reg(lhs); -} - -static inline void gimple_init_singleton(gimple g __unused) -{ -} -#endif - -#if BUILDING_GCC_VERSION == 4007 || BUILDING_GCC_VERSION == 4008 -static inline struct cgraph_node *cgraph_alias_target(struct cgraph_node *n) -{ - return cgraph_alias_aliased_node(n); -} -#endif - -#if BUILDING_GCC_VERSION <= 4008 -#define ENTRY_BLOCK_PTR_FOR_FN(FN) ENTRY_BLOCK_PTR_FOR_FUNCTION(FN) -#define EXIT_BLOCK_PTR_FOR_FN(FN) EXIT_BLOCK_PTR_FOR_FUNCTION(FN) -#define basic_block_info_for_fn(FN) ((FN)->cfg->x_basic_block_info) -#define n_basic_blocks_for_fn(FN) ((FN)->cfg->x_n_basic_blocks) -#define n_edges_for_fn(FN) ((FN)->cfg->x_n_edges) -#define last_basic_block_for_fn(FN) ((FN)->cfg->x_last_basic_block) -#define label_to_block_map_for_fn(FN) ((FN)->cfg->x_label_to_block_map) -#define profile_status_for_fn(FN) ((FN)->cfg->x_profile_status) -#define BASIC_BLOCK_FOR_FN(FN, N) BASIC_BLOCK_FOR_FUNCTION((FN), (N)) -#define NODE_IMPLICIT_ALIAS(node) (node)->same_body_alias -#define VAR_P(NODE) (TREE_CODE(NODE) == VAR_DECL) - -static inline bool tree_fits_shwi_p(const_tree t) -{ - if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST) - return false; - - if (TREE_INT_CST_HIGH(t) == 0 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) >= 0) - return true; - - if (TREE_INT_CST_HIGH(t) == -1 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) < 0 && !TYPE_UNSIGNED(TREE_TYPE(t))) - return true; - - return false; -} - -static inline bool tree_fits_uhwi_p(const_tree t) -{ - if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST) - return false; - - return TREE_INT_CST_HIGH(t) == 0; -} - -static inline HOST_WIDE_INT tree_to_shwi(const_tree t) -{ - gcc_assert(tree_fits_shwi_p(t)); - return TREE_INT_CST_LOW(t); -} - -static inline unsigned HOST_WIDE_INT tree_to_uhwi(const_tree t) -{ - gcc_assert(tree_fits_uhwi_p(t)); - return TREE_INT_CST_LOW(t); -} - -static inline const char *get_tree_code_name(enum tree_code code) -{ - gcc_assert(code < MAX_TREE_CODES); - return tree_code_name[code]; -} - -#define ipa_remove_stmt_references(cnode, stmt) - -typedef union gimple_statement_d gasm; -typedef union gimple_statement_d gassign; -typedef union gimple_statement_d gcall; -typedef union gimple_statement_d gcond; -typedef union gimple_statement_d gdebug; -typedef union gimple_statement_d ggoto; -typedef union gimple_statement_d gphi; -typedef union gimple_statement_d greturn; - -static inline gasm *as_a_gasm(gimple stmt) -{ - return stmt; -} - -static inline const gasm *as_a_const_gasm(const_gimple stmt) -{ - return stmt; -} - -static inline gassign *as_a_gassign(gimple stmt) -{ - return stmt; -} - -static inline const gassign *as_a_const_gassign(const_gimple stmt) -{ - return stmt; -} - -static inline gcall *as_a_gcall(gimple stmt) -{ - return stmt; -} - -static inline const gcall *as_a_const_gcall(const_gimple stmt) -{ - return stmt; -} - -static inline gcond *as_a_gcond(gimple stmt) -{ - return stmt; -} - -static inline const gcond *as_a_const_gcond(const_gimple stmt) -{ - return stmt; -} - -static inline gdebug *as_a_gdebug(gimple stmt) -{ - return stmt; -} - -static inline const gdebug *as_a_const_gdebug(const_gimple stmt) -{ - return stmt; -} - -static inline ggoto *as_a_ggoto(gimple stmt) -{ - return stmt; -} - -static inline const ggoto *as_a_const_ggoto(const_gimple stmt) -{ - return stmt; -} - -static inline gphi *as_a_gphi(gimple stmt) -{ - return stmt; -} - -static inline const gphi *as_a_const_gphi(const_gimple stmt) -{ - return stmt; -} - -static inline greturn *as_a_greturn(gimple stmt) -{ - return stmt; -} - -static inline const greturn *as_a_const_greturn(const_gimple stmt) -{ - return stmt; -} -#endif - -#if BUILDING_GCC_VERSION == 4008 -#define NODE_SYMBOL(node) (&(node)->symbol) -#define NODE_DECL(node) (node)->symbol.decl -#endif - -#if BUILDING_GCC_VERSION >= 4008 #define add_referenced_var(var) #define mark_sym_for_renaming(var) #define varpool_mark_needed_node(node) #define create_var_ann(var) #define TODO_dump_func 0 #define TODO_dump_cgraph 0 -#endif #if BUILDING_GCC_VERSION <= 4009 #define TODO_verify_il 0 @@ -676,7 +271,6 @@ static inline const greturn *as_a_const_greturn(const_gimple stmt) } #endif -#if BUILDING_GCC_VERSION >= 4009 #define TODO_ggc_collect 0 #define NODE_SYMBOL(node) (node) #define NODE_DECL(node) (node)->decl @@ -687,7 +281,6 @@ static inline opt_pass *get_pass_for_id(int id) { return g->get_passes()->get_pass_for_id(id); } -#endif #if BUILDING_GCC_VERSION >= 5000 && BUILDING_GCC_VERSION < 6000 /* gimple related */ diff --git a/scripts/gcc-plugins/gcc-generate-gimple-pass.h b/scripts/gcc-plugins/gcc-generate-gimple-pass.h index f20797e80b6d..51780828734e 100644 --- a/scripts/gcc-plugins/gcc-generate-gimple-pass.h +++ b/scripts/gcc-plugins/gcc-generate-gimple-pass.h @@ -73,18 +73,11 @@ #define TODO_FLAGS_FINISH 0 #endif -#if BUILDING_GCC_VERSION >= 4009 namespace { static const pass_data _PASS_NAME_PASS_DATA = { -#else -static struct gimple_opt_pass _PASS_NAME_PASS = { - .pass = { -#endif .type = GIMPLE_PASS, .name = _PASS_NAME_NAME, -#if BUILDING_GCC_VERSION >= 4008 .optinfo_flags = OPTGROUP_NONE, -#endif #if BUILDING_GCC_VERSION >= 5000 #elif BUILDING_GCC_VERSION == 4009 .has_gate = _HAS_GATE, @@ -102,12 +95,8 @@ static struct gimple_opt_pass _PASS_NAME_PASS = { .properties_destroyed = PROPERTIES_DESTROYED, .todo_flags_start = TODO_FLAGS_START, .todo_flags_finish = TODO_FLAGS_FINISH, -#if BUILDING_GCC_VERSION < 4009 - } -#endif }; -#if BUILDING_GCC_VERSION >= 4009 class _PASS_NAME_PASS : public gimple_opt_pass { public: _PASS_NAME_PASS() : gimple_opt_pass(_PASS_NAME_PASS_DATA, g) {} @@ -128,7 +117,6 @@ class _PASS_NAME_PASS : public gimple_opt_pass { #else virtual unsigned int execute(void) { return _EXECUTE(); } #endif -#endif }; } diff --git a/scripts/gcc-plugins/gcc-generate-ipa-pass.h b/scripts/gcc-plugins/gcc-generate-ipa-pass.h index 92bb4f3a87a4..c34ffec035bf 100644 --- a/scripts/gcc-plugins/gcc-generate-ipa-pass.h +++ b/scripts/gcc-plugins/gcc-generate-ipa-pass.h @@ -141,18 +141,11 @@ #define FUNCTION_TRANSFORM_TODO_FLAGS_START 0 #endif -#if BUILDING_GCC_VERSION >= 4009 namespace { static const pass_data _PASS_NAME_PASS_DATA = { -#else -static struct ipa_opt_pass_d _PASS_NAME_PASS = { - .pass = { -#endif .type = IPA_PASS, .name = _PASS_NAME_NAME, -#if BUILDING_GCC_VERSION >= 4008 .optinfo_flags = OPTGROUP_NONE, -#endif #if BUILDING_GCC_VERSION >= 5000 #elif BUILDING_GCC_VERSION == 4009 .has_gate = _HAS_GATE, @@ -170,23 +163,8 @@ static struct ipa_opt_pass_d _PASS_NAME_PASS = { .properties_destroyed = PROPERTIES_DESTROYED, .todo_flags_start = TODO_FLAGS_START, .todo_flags_finish = TODO_FLAGS_FINISH, -#if BUILDING_GCC_VERSION < 4009 - }, - .generate_summary = _GENERATE_SUMMARY, - .write_summary = _WRITE_SUMMARY, - .read_summary = _READ_SUMMARY, -#if BUILDING_GCC_VERSION >= 4006 - .write_optimization_summary = _WRITE_OPTIMIZATION_SUMMARY, - .read_optimization_summary = _READ_OPTIMIZATION_SUMMARY, -#endif - .stmt_fixup = _STMT_FIXUP, - .function_transform_todo_flags_start = FUNCTION_TRANSFORM_TODO_FLAGS_START, - .function_transform = _FUNCTION_TRANSFORM, - .variable_transform = _VARIABLE_TRANSFORM, -#endif }; -#if BUILDING_GCC_VERSION >= 4009 class _PASS_NAME_PASS : public ipa_opt_pass_d { public: _PASS_NAME_PASS() : ipa_opt_pass_d(_PASS_NAME_PASS_DATA, @@ -206,7 +184,6 @@ class _PASS_NAME_PASS : public ipa_opt_pass_d { virtual bool gate(function *) { return _GATE(); } #else virtual bool gate(void) { return _GATE(); } -#endif #endif virtual opt_pass *clone() { return new _PASS_NAME_PASS(); } diff --git a/scripts/gcc-plugins/gcc-generate-rtl-pass.h b/scripts/gcc-plugins/gcc-generate-rtl-pass.h index d69cd80b6c10..d14614f4b139 100644 --- a/scripts/gcc-plugins/gcc-generate-rtl-pass.h +++ b/scripts/gcc-plugins/gcc-generate-rtl-pass.h @@ -73,18 +73,11 @@ #define TODO_FLAGS_FINISH 0 #endif -#if BUILDING_GCC_VERSION >= 4009 namespace { static const pass_data _PASS_NAME_PASS_DATA = { -#else -static struct rtl_opt_pass _PASS_NAME_PASS = { - .pass = { -#endif .type = RTL_PASS, .name = _PASS_NAME_NAME, -#if BUILDING_GCC_VERSION >= 4008 .optinfo_flags = OPTGROUP_NONE, -#endif #if BUILDING_GCC_VERSION >= 5000 #elif BUILDING_GCC_VERSION == 4009 .has_gate = _HAS_GATE, @@ -102,12 +95,8 @@ static struct rtl_opt_pass _PASS_NAME_PASS = { .properties_destroyed = PROPERTIES_DESTROYED, .todo_flags_start = TODO_FLAGS_START, .todo_flags_finish = TODO_FLAGS_FINISH, -#if BUILDING_GCC_VERSION < 4009 - } -#endif }; -#if BUILDING_GCC_VERSION >= 4009 class _PASS_NAME_PASS : public rtl_opt_pass { public: _PASS_NAME_PASS() : rtl_opt_pass(_PASS_NAME_PASS_DATA, g) {} @@ -136,12 +125,6 @@ opt_pass *_MAKE_PASS_NAME_PASS(void) { return new _PASS_NAME_PASS(); } -#else -struct opt_pass *_MAKE_PASS_NAME_PASS(void) -{ - return &_PASS_NAME_PASS.pass; -} -#endif /* clean up user provided defines */ #undef PASS_NAME diff --git a/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h index 06800bc477e0..ef6f4c2cb6fa 100644 --- a/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h +++ b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h @@ -73,18 +73,11 @@ #define TODO_FLAGS_FINISH 0 #endif -#if BUILDING_GCC_VERSION >= 4009 namespace { static const pass_data _PASS_NAME_PASS_DATA = { -#else -static struct simple_ipa_opt_pass _PASS_NAME_PASS = { - .pass = { -#endif .type = SIMPLE_IPA_PASS, .name = _PASS_NAME_NAME, -#if BUILDING_GCC_VERSION >= 4008 .optinfo_flags = OPTGROUP_NONE, -#endif #if BUILDING_GCC_VERSION >= 5000 #elif BUILDING_GCC_VERSION == 4009 .has_gate = _HAS_GATE, @@ -102,12 +95,8 @@ static struct simple_ipa_opt_pass _PASS_NAME_PASS = { .properties_destroyed = PROPERTIES_DESTROYED, .todo_flags_start = TODO_FLAGS_START, .todo_flags_finish = TODO_FLAGS_FINISH, -#if BUILDING_GCC_VERSION < 4009 - } -#endif }; -#if BUILDING_GCC_VERSION >= 4009 class _PASS_NAME_PASS : public simple_ipa_opt_pass { public: _PASS_NAME_PASS() : simple_ipa_opt_pass(_PASS_NAME_PASS_DATA, g) {} @@ -136,12 +125,6 @@ opt_pass *_MAKE_PASS_NAME_PASS(void) { return new _PASS_NAME_PASS(); } -#else -struct opt_pass *_MAKE_PASS_NAME_PASS(void) -{ - return &_PASS_NAME_PASS.pass; -} -#endif /* clean up user provided defines */ #undef PASS_NAME diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index cbe1d6c4b1a5..9dced66d158e 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -125,11 +125,7 @@ static tree handle_latent_entropy_attribute(tree *node, tree name, bool *no_add_attrs) { tree type; -#if BUILDING_GCC_VERSION <= 4007 - VEC(constructor_elt, gc) *vals; -#else vec *vals; -#endif switch (TREE_CODE(*node)) { default: @@ -181,11 +177,7 @@ static tree handle_latent_entropy_attribute(tree *node, tree name, if (fld) break; -#if BUILDING_GCC_VERSION <= 4007 - vals = VEC_alloc(constructor_elt, gc, nelt); -#else vec_alloc(vals, nelt); -#endif for (fld = lst; fld; fld = TREE_CHAIN(fld)) { tree random_const, fld_t = TREE_TYPE(fld); @@ -225,11 +217,7 @@ static tree handle_latent_entropy_attribute(tree *node, tree name, elt_size_int = TREE_INT_CST_LOW(elt_size); nelt = array_size_int / elt_size_int; -#if BUILDING_GCC_VERSION <= 4007 - vals = VEC_alloc(constructor_elt, gc, nelt); -#else vec_alloc(vals, nelt); -#endif for (i = 0; i < nelt; i++) { tree cst = size_int(i); diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index bd29e4e7a524..334741a31d0a 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -590,16 +590,12 @@ static void register_attributes(void *event_data, void *data) randomize_layout_attr.name = "randomize_layout"; randomize_layout_attr.type_required = true; randomize_layout_attr.handler = handle_randomize_layout_attr; -#if BUILDING_GCC_VERSION >= 4007 randomize_layout_attr.affects_type_identity = true; -#endif no_randomize_layout_attr.name = "no_randomize_layout"; no_randomize_layout_attr.type_required = true; no_randomize_layout_attr.handler = handle_randomize_layout_attr; -#if BUILDING_GCC_VERSION >= 4007 no_randomize_layout_attr.affects_type_identity = true; -#endif randomize_considered_attr.name = "randomize_considered"; randomize_considered_attr.type_required = true; diff --git a/scripts/gcc-plugins/sancov_plugin.c b/scripts/gcc-plugins/sancov_plugin.c index caff4a6c7e9a..23bd023a283b 100644 --- a/scripts/gcc-plugins/sancov_plugin.c +++ b/scripts/gcc-plugins/sancov_plugin.c @@ -80,10 +80,8 @@ static void sancov_start_unit(void __unused *gcc_data, void __unused *user_data) nothrow_attr = tree_cons(get_identifier("nothrow"), NULL, NULL); decl_attributes(&sancov_fndecl, nothrow_attr, 0); gcc_assert(TREE_NOTHROW(sancov_fndecl)); -#if BUILDING_GCC_VERSION > 4005 leaf_attr = tree_cons(get_identifier("leaf"), NULL, NULL); decl_attributes(&sancov_fndecl, leaf_attr, 0); -#endif } __visible int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version) @@ -106,11 +104,7 @@ __visible int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gc }; /* BBs can be split afterwards?? */ -#if BUILDING_GCC_VERSION >= 4009 PASS_INFO(sancov, "asan", 0, PASS_POS_INSERT_BEFORE); -#else - PASS_INFO(sancov, "nrv", 1, PASS_POS_INSERT_BEFORE); -#endif if (!plugin_default_version_check(version, &gcc_version)) { error(G_("incompatible gcc/plugin versions")); diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index 48e141e07956..e9db7dcb3e5f 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -80,10 +80,8 @@ static bool is_alloca(gimple stmt) if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA)) return true; -#if BUILDING_GCC_VERSION >= 4007 if (gimple_call_builtin_p(stmt, BUILT_IN_ALLOCA_WITH_ALIGN)) return true; -#endif return false; } @@ -322,7 +320,7 @@ static void remove_stack_tracking_gcall(void) /* Delete the stackleak_track_stack() call */ delete_insn_and_edges(insn); -#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION < 8000 +#if BUILDING_GCC_VERSION < 8000 if (GET_CODE(next) == NOTE && NOTE_KIND(next) == NOTE_INSN_CALL_ARG_LOCATION) { insn = next; diff --git a/scripts/gcc-plugins/structleak_plugin.c b/scripts/gcc-plugins/structleak_plugin.c index b9ef2e162107..29b480c33a8d 100644 --- a/scripts/gcc-plugins/structleak_plugin.c +++ b/scripts/gcc-plugins/structleak_plugin.c @@ -68,9 +68,7 @@ static void register_attributes(void *event_data, void *data) { user_attr.name = "user"; user_attr.handler = handle_user_attribute; -#if BUILDING_GCC_VERSION >= 4007 user_attr.affects_type_identity = true; -#endif register_attribute(&user_attr); } @@ -137,11 +135,9 @@ static void initialize(tree var) if (!gimple_assign_single_p(stmt)) continue; rhs1 = gimple_assign_rhs1(stmt); -#if BUILDING_GCC_VERSION >= 4007 /* ... of a non-clobbering expression... */ if (TREE_CLOBBER_P(rhs1)) continue; -#endif /* ... to our variable... */ if (gimple_get_lhs(stmt) != var) continue; From 1e860048c53ee77ee9870dcce94847a28544b753 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 3 Dec 2020 21:57:00 +0900 Subject: [PATCH 287/395] gcc-plugins: simplify GCC plugin-dev capability test Linus pointed out a third of the time in the Kconfig parse stage comes from the single invocation of cc1plus in scripts/gcc-plugin.sh [1], and directly testing plugin-version.h for existence cuts down the overhead a lot. [2] This commit takes one step further to kill the build test entirely. The small piece of code was probably intended to test the C++ designated initializer, which was not supported until C++20. In fact, with -pedantic option given, both GCC and Clang emit a warning. $ echo 'class test { public: int test; } test = { .test = 1 };' | g++ -x c++ -pedantic - -fsyntax-only :1:43: warning: C++ designated initializers only available with '-std=c++2a' or '-std=gnu++2a' [-Wpedantic] $ echo 'class test { public: int test; } test = { .test = 1 };' | clang++ -x c++ -pedantic - -fsyntax-only :1:43: warning: designated initializers are a C++20 extension [-Wc++20-designator] class test { public: int test; } test = { .test = 1 }; ^ 1 warning generated. Otherwise, modern C++ compilers should be able to build the code, and hopefully skipping this test should not make any practical problem. Checking the existence of plugin-version.h is still needed to ensure the plugin-dev package is installed. The test code is now small enough to be embedded in scripts/gcc-plugins/Kconfig. [1] https://lore.kernel.org/lkml/CAHk-=wjU4DCuwQ4pXshRbwDCUQB31ScaeuDo1tjoZ0_PjhLHzQ@mail.gmail.com/ [2] https://lore.kernel.org/lkml/CAHk-=whK0aQxs6Q5ijJmYF1n2ch8cVFSUzU5yUM_HOjig=+vnw@mail.gmail.com/ Reported-by: Linus Torvalds Signed-off-by: Masahiro Yamada Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201203125700.161354-1-masahiroy@kernel.org --- scripts/gcc-plugin.sh | 19 ------------------- scripts/gcc-plugins/Kconfig | 2 +- 2 files changed, 1 insertion(+), 20 deletions(-) delete mode 100755 scripts/gcc-plugin.sh diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh deleted file mode 100755 index b79fd0bea838..000000000000 --- a/scripts/gcc-plugin.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -set -e - -srctree=$(dirname "$0") - -gccplugins_dir=$($* -print-file-name=plugin) - -# we need a c++ compiler that supports the designated initializer GNU extension -$HOSTCC -c -x c++ -std=gnu++98 - -fsyntax-only -I $srctree/gcc-plugins -I $gccplugins_dir/include 2>/dev/null < Date: Fri, 4 Dec 2020 14:11:05 -0800 Subject: [PATCH 288/395] MAINTAINERS: Drop inactive gcc-plugins maintainer Adjust MAINTAINERS since Emese hasn't sent email to LKML in almost 3 years. Signed-off-by: Kees Cook --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2daa6ee673f7..23703d3df6d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7284,7 +7284,6 @@ F: drivers/staging/gasket/ GCC PLUGINS M: Kees Cook -R: Emese Revfy L: linux-hardening@vger.kernel.org S: Maintained F: Documentation/kbuild/gcc-plugins.rst From c0aac3a51cb6364bed367ee3e1a96ed414f386b4 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Fri, 4 Dec 2020 09:11:46 +0800 Subject: [PATCH 289/395] MIPS: KASLR: Avoid endless loop in sync_icache if synci_step is zero Most platforms do not need to do synci instruction operations when synci_step is 0. But for example, the synci implementation on Loongson64 platform has some changes. On the one hand, it ensures that the memory access instructions have been completed. On the other hand, it guarantees that all prefetch instructions need to be fetched again. And its address information is useless. Thus, only one synci operation is required when synci_step is 0 on Loongson64 platform. I guess that some other platforms have similar implementations on synci, so add judgment conditions in `while` to ensure that at least all platforms perform synci operations once. For those platforms that do not need synci, they just do one more operation similar to nop. Signed-off-by: Jinyang He Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/relocate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c index 57bdd2761c78..47aeb3350a76 100644 --- a/arch/mips/kernel/relocate.c +++ b/arch/mips/kernel/relocate.c @@ -64,7 +64,7 @@ static void __init sync_icache(void *kbase, unsigned long kernel_length) : "r" (kbase)); kbase += step; - } while (kbase < kend); + } while (step && kbase < kend); /* Completion barrier */ __sync(); From 991838f90e9315468cd1d1daed29d27faae77a9b Mon Sep 17 00:00:00 2001 From: Alexander Dahl Date: Sat, 28 Nov 2020 22:53:53 +0100 Subject: [PATCH 290/395] MIPS: DTS: img: Fix schema warnings for pwm-leds The node names for devices using the pwm-leds driver follow a certain naming scheme (now). Parent node name is not enforced, but recommended by DT project. Signed-off-by: Alexander Dahl Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/img/pistachio_marduk.dts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/boot/dts/img/pistachio_marduk.dts b/arch/mips/boot/dts/img/pistachio_marduk.dts index bf69da96dc8b..a8708783f04b 100644 --- a/arch/mips/boot/dts/img/pistachio_marduk.dts +++ b/arch/mips/boot/dts/img/pistachio_marduk.dts @@ -46,9 +46,10 @@ internal_dac_supply: internal-dac-supply { regulator-max-microvolt = <1800000>; }; - leds { + led-controller { compatible = "pwm-leds"; - heartbeat { + + led-1 { label = "marduk:red:heartbeat"; pwms = <&pwm 3 300000>; max-brightness = <255>; From 5c641fee4ccfd27520b7863bf4a66491faea6d2a Mon Sep 17 00:00:00 2001 From: Stefan Eschenbacher Date: Sun, 6 Dec 2020 11:48:50 +0100 Subject: [PATCH 291/395] drivers/hv: remove obsolete TODO and fix misleading typo in comment Removes an obsolete TODO in the VMBus module and fixes a misleading typo in the comment for the macro MAX_NUM_CHANNELS, where two digits have been twisted. Signed-off-by: Stefan Eschenbacher Co-developed-by: Max Stolze Signed-off-by: Max Stolze Link: https://lore.kernel.org/r/20201206104850.24843-1-stefan.eschenbacher@fau.de Signed-off-by: Wei Liu --- drivers/hv/hyperv_vmbus.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 02f3e8988836..9416e09ebd58 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -187,14 +187,13 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, u64 *requestid, bool raw); /* - * The Maximum number of channels (16348) is determined by the size of the + * The Maximum number of channels (16384) is determined by the size of the * interrupt page, which is HV_HYP_PAGE_SIZE. 1/2 of HV_HYP_PAGE_SIZE is to * send endpoint interrupts, and the other is to receive endpoint interrupts. */ #define MAX_NUM_CHANNELS ((HV_HYP_PAGE_SIZE >> 1) << 3) /* The value here must be in multiple of 32 */ -/* TODO: Need to make this configurable */ #define MAX_NUM_CHANNELS_SUPPORTED 256 #define MAX_CHANNEL_RELIDS \ From 733c15bd3a944b8eeaacdddf061759b6a83dd3f4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Dec 2020 14:54:46 +0000 Subject: [PATCH 292/395] block/rnbd: fix a null pointer dereference on dev->blk_symlink_name Currently in the case where dev->blk_symlink_name fails to be allocates the error return path attempts to set an end-of-string character to the unallocated dev->blk_symlink_name causing a null pointer dereference error. Fix this by returning with an explicity ENOMEM error (which also is missing in the original code as was not initialized). Fixes: 1eb54f8f5dd8 ("block/rnbd: client: sysfs interface functions") Signed-off-by: Colin Ian King Addresses-Coverity: ("Dereference after null check") Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index c3c96a567568..a7caeedeb198 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -499,7 +499,7 @@ static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) dev->blk_symlink_name = kzalloc(len, GFP_KERNEL); if (!dev->blk_symlink_name) { rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n"); - goto out_err; + return -ENOMEM; } ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, From 45dc656aeb4d50e6a4b2ca110345fb0c96cf1189 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2020 14:40:48 +0100 Subject: [PATCH 293/395] blktrace: fix up a kerneldoc comment Fixes: a54895fa057c ("block: remove the request_queue to argument request based tracepoints") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7839a78205c2..2c5b3c5317c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1022,7 +1022,6 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation * @ignore: trace callback data parameter (not used) - * @q: queue the io is for * @rq: the source request * @dev: target device * @from: source sector From 5ba1add216fe82289769045627d97f233bbcc645 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:11 +0800 Subject: [PATCH 294/395] blk-iocost: Fix some typos in comments Fix some typos in comments. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8e20fe4bddec..087ae215529e 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -492,7 +492,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather @@ -1046,7 +1046,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { @@ -2400,7 +2400,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || From 647c9f03b2b66cf1f505208c313998fc833ed28b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:12 +0800 Subject: [PATCH 295/395] blk-iocost: Remove unnecessary advance declaration Remove unnecessary advance declaration of struct ioc_gq. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 087ae215529e..ec4865206353 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -370,8 +370,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; From c09245f61c6ac4ef253a5fcf97e5bcfc0ce25fc7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:13 +0800 Subject: [PATCH 296/395] blk-iocost: Move the usage ratio calculation to the correct place We only use the hweight based usage ratio to calculate the new hweight_inuse of the iocg to decide if this iocg can donate some surplus vtime. Thus move the usage ratio calculation to the correct place to avoid unnecessary calculation for some vtime shortage iocgs. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ec4865206353..09f22f9a6ba4 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2168,8 +2168,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2200,30 +2200,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. From 2474787a75b4f358e81f367653c73edecd67aa2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:14 +0800 Subject: [PATCH 297/395] blk-iocost: Factor out the active iocgs' state check into a separate function Factor out the iocgs' state check into a separate function to simplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 140 +++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 63 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 09f22f9a6ba4..7dd1424d5833 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2069,13 +2069,88 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } +/* + * Check the active iocgs' state to avoid oversleeping and deactive + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. + */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) +{ + int nr_debtors = 0; + struct ioc_gq *iocg, *tiocg; + + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + !iocg->delay && !iocg_is_idle(iocg)) + continue; + + spin_lock(&iocg->waitq.lock); + + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; + } + if (iocg->indebt_since) { + iocg->local_stat.indebt_us += + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; + } + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; + } + + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { + /* might be oversleeping vtime / hweight changes, kick */ + iocg_kick_waitq(iocg, true, now); + if (iocg->abs_vdebt || iocg->delay) + nr_debtors++; + } else if (iocg_is_idle(iocg)) { + /* no waiter and idle, deactivate */ + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + + __propagate_weights(iocg, 0, 0, false, now); + list_del_init(&iocg->active_list); + } + + spin_unlock(&iocg->waitq.lock); + } + + commit_weights(ioc); + return nr_debtors; +} + static void ioc_timer_fn(struct timer_list *timer) { struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; @@ -2097,68 +2172,7 @@ static void ioc_timer_fn(struct timer_list *timer) return; } - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. - */ - list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && - !iocg->delay && !iocg_is_idle(iocg)) - continue; - - spin_lock(&iocg->waitq.lock); - - /* flush wait and indebt stat deltas */ - if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; - } - if (iocg->indebt_since) { - iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; - } - if (iocg->indelay_since) { - iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; - } - - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || - iocg->delay) { - /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); - if (iocg->abs_vdebt || iocg->delay) - nr_debtors++; - } else if (iocg_is_idle(iocg)) { - /* no waiter and idle, deactivate */ - u64 vtime = atomic64_read(&iocg->vtime); - s64 excess; - - /* - * @iocg has been inactive for a full duration and will - * have a high budget. Account anything above target as - * error and throw away. On reactivation, it'll start - * with the target budget. - */ - excess = now.vnow - vtime - ioc->margins.target; - if (excess > 0) { - u32 old_hwi; - - current_hweight(iocg, NULL, &old_hwi); - ioc->vtime_err -= div64_u64(excess * old_hwi, - WEIGHT_ONE); - } - - __propagate_weights(iocg, 0, 0, false, &now); - list_del_init(&iocg->active_list); - } - - spin_unlock(&iocg->waitq.lock); - } - commit_weights(ioc); + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation From 926f75f6a9ef503d45dced061e304d0324beeba1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:15 +0800 Subject: [PATCH 298/395] blk-iocost: Factor out the base vrate change into a separate function Factor out the base vrate change code into a separate function to fimplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 99 +++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7dd1424d5833..ffa418c0dcb1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -971,6 +971,58 @@ static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -2323,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. - */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); From df4ad53242158f9f1f97daf4feddbb4f8b77f080 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 8 Dec 2020 00:39:15 +0800 Subject: [PATCH 299/395] bcache: fix race between setting bdev state to none and new write request direct to backing There is a race condition in detaching as below: A. detaching B. Write request (1) writing back (2) write back done, set bdev state to clean. (3) cached_dev_put() and schedule_work(&dc->detach); (4) write data [0 - 4K] directly into backing and ack to user. (5) power-failure... When we restart this bcache device, this bdev is clean but not detached, and read [0 - 4K], we will get unexpected old data from cache device. To fix this problem, set the bdev state to none when we writeback done in detaching, and then if power-failure happened as above, the data in cache will not be used in next bcache device starting, it's detached, we will read the correct data from backing derectly. Signed-off-by: Dongsheng Yang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 9 --------- drivers/md/bcache/writeback.c | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46a00134a36a..b1a6ba9a5adb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1114,9 +1114,6 @@ static void cancel_writeback_rate_update_dwork(struct cached_dev *dc) static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); - struct closure cl; - - closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); @@ -1130,12 +1127,6 @@ static void cached_dev_detach_finish(struct work_struct *w) dc->writeback_thread = NULL; } - memset(&dc->sb.set_uuid, 0, 16); - SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); - - bch_write_bdev_super(dc, &cl); - closure_sync(&cl); - mutex_lock(&bch_register_lock); calc_cached_dev_sectors(dc->disk.c); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3c74996978da..a129e4d2707c 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -705,6 +705,15 @@ static int bch_writeback_thread(void *arg) * bch_cached_dev_detach(). */ if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) { + struct closure cl; + + closure_init_stack(&cl); + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + up_write(&dc->writeback_lock); break; } From f6f371f7db42917c7b2a861c4fc923cb352ce5a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 14:04:39 +0000 Subject: [PATCH 300/395] blk-mq: skip hybrid polling if iopoll doesn't spin If blk_poll() is not going to spin (i.e. @spin=false), it also must not sleep in hybrid polling, otherwise it might be pretty suprising for users trying to do a quick check and expecting no-wait behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index cf3916e2852f..2881a457de83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3865,9 +3865,10 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) * the state. Like for the other success return cases, the * caller is responsible for checking if the IO completed. If * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. + * straight to the busy poll loop. If specified not to spin, + * we also should not sleep. */ - if (blk_mq_poll_hybrid(q, hctx, cookie)) + if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) return 1; hctx->poll_considered++; From b78beea038a3087df63bba7adaacb476a8ca95af Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 22 Nov 2020 15:35:45 +0000 Subject: [PATCH 301/395] sbitmap: optimise sbitmap_deferred_clear() Because of spinlocks and atomics sbitmap_deferred_clear() have to reload &sb->map[index] on each access even though the map address won't change. Pass in sbitmap_word instead of {sb, index}, so it's cached in a variable. It also improves code generation of sbitmap_find_bit_in_index(). Signed-off-by: Pavel Begunkov Reviewed-by: John Garry Signed-off-by: Jens Axboe --- lib/sbitmap.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 267aa7709416..c1c8a4e69325 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -12,32 +12,32 @@ /* * See if we have deferred clears that we can batch move */ -static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index) +static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask, val; bool ret = false; unsigned long flags; - spin_lock_irqsave(&sb->map[index].swap_lock, flags); + spin_lock_irqsave(&map->swap_lock, flags); - if (!sb->map[index].cleared) + if (!map->cleared) goto out_unlock; /* * First get a stable cleared mask, setting the old mask to 0. */ - mask = xchg(&sb->map[index].cleared, 0); + mask = xchg(&map->cleared, 0); /* * Now clear the masked bits in our free word */ do { - val = sb->map[index].word; - } while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val); + val = map->word; + } while (cmpxchg(&map->word, val, val & ~mask) != val); ret = true; out_unlock: - spin_unlock_irqrestore(&sb->map[index].swap_lock, flags); + spin_unlock_irqrestore(&map->swap_lock, flags); return ret; } @@ -92,7 +92,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) unsigned int i; for (i = 0; i < sb->map_nr; i++) - sbitmap_deferred_clear(sb, i); + sbitmap_deferred_clear(&sb->map[i]); sb->depth = depth; sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); @@ -139,15 +139,15 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth, static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index, unsigned int alloc_hint, bool round_robin) { + struct sbitmap_word *map = &sb->map[index]; int nr; do { - nr = __sbitmap_get_word(&sb->map[index].word, - sb->map[index].depth, alloc_hint, + nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint, !round_robin); if (nr != -1) break; - if (!sbitmap_deferred_clear(sb, index)) + if (!sbitmap_deferred_clear(map)) break; } while (1); @@ -207,7 +207,7 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, break; } - if (sbitmap_deferred_clear(sb, index)) + if (sbitmap_deferred_clear(&sb->map[index])) goto again; /* Jump to next index. */ From 661d4f55a79483aee4970a76e3bd9d4cdc74ac79 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 22 Nov 2020 15:35:46 +0000 Subject: [PATCH 302/395] sbitmap: remove swap_lock map->swap_lock protects map->cleared from concurrent modification, however sbitmap_deferred_clear() is already atomically drains it, so it's guaranteed to not loose bits on concurrent sbitmap_deferred_clear(). A one threaded tag heavy test on top of nullbk showed ~1.5% t-put increase, and 3% -> 1% cycle reduction of sbitmap_get() according to perf. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 5 ----- lib/sbitmap.c | 14 +++----------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index e40d019c3d9d..74cc6384715e 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -32,11 +32,6 @@ struct sbitmap_word { * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; - - /** - * @swap_lock: Held while swapping word <-> cleared - */ - spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** diff --git a/lib/sbitmap.c b/lib/sbitmap.c index c1c8a4e69325..4fd877048ba8 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -15,13 +15,9 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { unsigned long mask, val; - bool ret = false; - unsigned long flags; - spin_lock_irqsave(&map->swap_lock, flags); - - if (!map->cleared) - goto out_unlock; + if (!READ_ONCE(map->cleared)) + return false; /* * First get a stable cleared mask, setting the old mask to 0. @@ -35,10 +31,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) val = map->word; } while (cmpxchg(&map->word, val, val & ~mask) != val); - ret = true; -out_unlock: - spin_unlock_irqrestore(&map->swap_lock, flags); - return ret; + return true; } int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, @@ -80,7 +73,6 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, for (i = 0; i < sb->map_nr; i++) { sb->map[i].depth = min(depth, bits_per_word); depth -= sb->map[i].depth; - spin_lock_init(&sb->map[i].swap_lock); } return 0; } From c3250c8d2451ffbea14ba95164c59edd943ee4be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 22 Nov 2020 15:35:47 +0000 Subject: [PATCH 303/395] sbitmap: replace CAS with atomic and sbitmap_deferred_clear() does CAS loop to propagate cleared bits, replace it with equivalent atomic bitwise and. That's slightly faster and makes wait-free instead of lock-free as before. The atomic can be relaxed (i.e. barrier-less) because following sbitmap_get*() deal with synchronisation, see comments in sbitmap_queue_clear(). It's ok to cast to atomic_long_t, that's what bitops/lock.h does. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- lib/sbitmap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 4fd877048ba8..c18b518a16ba 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -14,7 +14,7 @@ */ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) { - unsigned long mask, val; + unsigned long mask; if (!READ_ONCE(map->cleared)) return false; @@ -27,10 +27,8 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map) /* * Now clear the masked bits in our free word */ - do { - val = map->word; - } while (cmpxchg(&map->word, val, val & ~mask) != val); - + atomic_long_andnot(mask, (atomic_long_t *)&map->word); + BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word)); return true; } From 0eff1f1a38a95b20fec83d0b69409c8da967fe1e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 22 Nov 2020 15:35:48 +0000 Subject: [PATCH 304/395] sbitmap: simplify wrap check __sbitmap_get_word() doesn't warp if it's starting from the beginning (i.e. initial hint is 0). Instead of stashing the original hint just set @wrap accordingly. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- lib/sbitmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index c18b518a16ba..d693d9213ceb 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -97,9 +97,11 @@ EXPORT_SYMBOL_GPL(sbitmap_resize); static int __sbitmap_get_word(unsigned long *word, unsigned long depth, unsigned int hint, bool wrap) { - unsigned int orig_hint = hint; int nr; + /* don't wrap if starting from 0 */ + wrap = wrap && hint; + while (1) { nr = find_next_zero_bit(word, depth, hint); if (unlikely(nr >= depth)) { @@ -108,8 +110,8 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth, * offset to 0 in a failure case, so start from 0 to * exhaust the map. */ - if (orig_hint && hint && wrap) { - hint = orig_hint = 0; + if (hint && wrap) { + hint = 0; continue; } return -1; From 2afdeb23e4750acb4ff16fd86f566c9074708691 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Nov 2020 16:36:06 +0900 Subject: [PATCH 305/395] block: Improve blk_revalidate_disk_zones() checks Improves the checks on the zones of a zoned block device done in blk_revalidate_disk_zones() by making sure that the device report_zones method did report at least one zone and that the zones reported exactly cover the entire disk capacity, that is, that there are no missing zones at the end of the disk sector range. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6817a673e5ce..7a68b6e4300c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -508,15 +508,29 @@ int blk_revalidate_disk_zones(struct gendisk *disk, noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } memalloc_noio_restore(noio_flag); + /* + * If zones where reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != get_capacity(disk)) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps). */ blk_mq_freeze_queue(q); - if (ret >= 0) { + if (ret > 0) { blk_queue_chunk_sectors(q, args.zone_sectors); q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); From 0ebcdd702f49aeb0ad2e2d894f8c124a0acc6e23 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:11 +0900 Subject: [PATCH 306/395] null_blk: Fix zone size initialization For a null_blk device with zoned mode enabled is currently initialized with a number of zones equal to the device capacity divided by the zone size, without considering if the device capacity is a multiple of the zone size. If the zone size is not a divisor of the capacity, the zones end up not covering the entire capacity, potentially resulting is out of bounds accesses to the zone array. Fix this by adding one last smaller zone with a size equal to the remainder of the disk capacity divided by the zone size if the capacity is not a multiple of the zone size. For such smaller last zone, the zone capacity is also checked so that it does not exceed the smaller zone size. Reported-by: Naohiro Aota Fixes: ca4b2a011948 ("null_blk: add zone support") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index beb34b4f76b0..1d0370d91fe7 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -6,8 +6,7 @@ #define CREATE_TRACE_POINTS #include "null_blk_trace.h" -/* zone_size in MBs to sectors. */ -#define ZONE_SIZE_SHIFT 11 +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) { @@ -16,7 +15,7 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { - sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t dev_capacity_sects, zone_capacity_sects; sector_t sector = 0; unsigned int i; @@ -38,9 +37,13 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } - dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; - dev->nr_zones = dev_size >> - (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity); + dev_capacity_sects = MB_TO_SECTS(dev->size); + dev->zone_size_sects = MB_TO_SECTS(dev->zone_size); + dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects); + if (dev_capacity_sects & (dev->zone_size_sects - 1)) + dev->nr_zones++; + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), GFP_KERNEL | __GFP_ZERO); if (!dev->zones) @@ -101,8 +104,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) struct blk_zone *zone = &dev->zones[i]; zone->start = zone->wp = sector; - zone->len = dev->zone_size_sects; - zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; + if (zone->start + dev->zone_size_sects > dev_capacity_sects) + zone->len = dev_capacity_sects - zone->start; + else + zone->len = dev->zone_size_sects; + zone->capacity = + min_t(sector_t, zone->len, zone_capacity_sects); zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; From 2e896d89510f23927ec393bee1e0570db3d5a6c6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:12 +0900 Subject: [PATCH 307/395] null_blk: Fail zone append to conventional zones Conventional zones do not have a write pointer and so cannot accept zone append writes. Make sure to fail any zone append write command issued to a conventional zone. Reported-by: Naohiro Aota Fixes: e0489ed5daeb ("null_blk: Support REQ_OP_ZONE_APPEND") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 1d0370d91fe7..172f720b8d63 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -339,8 +339,11 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, trace_nullb_zone_op(cmd, zno, zone->cond); - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) { + if (append) + return BLK_STS_IOERR; return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); + } null_lock_zone(dev, zno); From 817046ecddbc5f3cdd93fb84dd58c58ced987dee Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:13 +0900 Subject: [PATCH 308/395] block: Align max_hw_sectors to logical blocksize Block device drivers do not have to call blk_queue_max_hw_sectors() to set a limit on request size if the default limit BLK_SAFE_MAX_SECTORS is acceptable. However, this limit (255 sectors) may not be aligned to the device logical block size which cannot be used as is for a request maximum size. This is the case for the null_blk device driver. Modify blk_queue_max_hw_sectors() to make sure that the request size limits specified by the max_hw_sectors and max_sectors queue limits are always aligned to the device logical block size. Additionally, to avoid introducing a dependence on the execution order of this function with blk_queue_logical_block_size(), also modify blk_queue_logical_block_size() to perform the same alignment when the logical block size is set after max_hw_sectors. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-settings.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 9741d1d83e98..dde5c2e9a728 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -157,10 +157,16 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } + max_hw_sectors = round_down(max_hw_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = round_down(max_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -321,13 +327,20 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { - q->limits.logical_block_size = size; + struct queue_limits *limits = &q->limits; - if (q->limits.physical_block_size < size) - q->limits.physical_block_size = size; + limits->logical_block_size = size; - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + if (limits->physical_block_size < size) + limits->physical_block_size = size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; + + limits->max_hw_sectors = + round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); + limits->max_sectors = + round_down(limits->max_sectors, size >> SECTOR_SHIFT); } EXPORT_SYMBOL(blk_queue_logical_block_size); From 2b8b7ed7f3fc2b1536a0add3941ae159529d23bd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:14 +0900 Subject: [PATCH 309/395] null_blk: improve zone locking With memory backing disabled, using a single spinlock for protecting zone information and zone resource management prevents the parallel execution on multiple queue of IO requests to different zones. Furthermore, regardless of the use of memory backing, if a null_blk device is created without limits on the number of open and active zones, accounting for zone resource management is not necessary. >From these observations, zone locking is changed as follows to improve performance: 1) the zone_lock spinlock is renamed zone_res_lock and used only if zone resource management is necessary, that is, if either zone_max_open or zone_max_active are not 0. This is indicated using the new boolean need_zone_res_mgmt in the nullb_device structure. null_zone_write() is modified to reduce the amount of code executed with the zone_res_lock spinlock held. 2) With memory backing disabled, per zone locking is changed to a spinlock per zone. 3) Introduce the structure nullb_zone to replace the use of struct blk_zone for zone information. This new structure includes a union of a spinlock and a mutex for zone locking. The spinlock is used when memory backing is disabled and the mutex is used with memory backing. With these changes, fio performance with zonemode=zbd for 4K random read and random write on a dual socket (24 cores per socket) machine using the none schedulder is as follows: before patch: write (psync x 96 jobs) = 465 KIOPS read (libaio@qd=8 x 96 jobs) = 1361 KIOPS after patch: write (psync x 96 jobs) = 456 KIOPS read (libaio@qd=8 x 96 jobs) = 4096 KIOPS Write performance remains mostly unchanged but read performance is three times higher. Performance when using the mq-deadline scheduler is not changed by this patch as mq-deadline becomes the bottleneck for a multi-queue device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 28 +++- drivers/block/null_blk_zoned.c | 280 +++++++++++++++++++-------------- 2 files changed, 188 insertions(+), 120 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index c24d9b5ad81a..14546ead1d66 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include struct nullb_cmd { struct request *rq; @@ -32,6 +34,26 @@ struct nullb_queue { struct nullb_cmd *cmds; }; +struct nullb_zone { + /* + * Zone lock to prevent concurrent modification of a zone write + * pointer position and condition: with memory backing, a write + * command execution may sleep on memory allocation. For this case, + * use mutex as the zone lock. Otherwise, use the spinlock for + * locking the zone. + */ + union { + spinlock_t spinlock; + struct mutex mutex; + }; + enum blk_zone_type type; + enum blk_zone_cond cond; + sector_t start; + sector_t wp; + unsigned int len; + unsigned int capacity; +}; + struct nullb_device { struct nullb *nullb; struct config_item item; @@ -45,10 +67,10 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; - struct blk_zone *zones; + struct nullb_zone *zones; sector_t zone_size_sects; - spinlock_t zone_lock; - unsigned long *zone_locks; + bool need_zone_res_mgmt; + spinlock_t zone_res_lock; unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 172f720b8d63..4d5c0b938618 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -13,9 +13,49 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } +static inline void null_lock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_lock_irq(&dev->zone_res_lock); +} + +static inline void null_unlock_zone_res(struct nullb_device *dev) +{ + if (dev->need_zone_res_mgmt) + spin_unlock_irq(&dev->zone_res_lock); +} + +static inline void null_init_zone_lock(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_init(&zone->spinlock); + else + mutex_init(&zone->mutex); +} + +static inline void null_lock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_lock_irq(&zone->spinlock); + else + mutex_lock(&zone->mutex); +} + +static inline void null_unlock_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + if (!dev->memory_backed) + spin_unlock_irq(&zone->spinlock); + else + mutex_unlock(&zone->mutex); +} + int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) { sector_t dev_capacity_sects, zone_capacity_sects; + struct nullb_zone *zone; sector_t sector = 0; unsigned int i; @@ -44,26 +84,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) if (dev_capacity_sects & (dev->zone_size_sects - 1)) dev->nr_zones++; - dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), - GFP_KERNEL | __GFP_ZERO); + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone), + GFP_KERNEL | __GFP_ZERO); if (!dev->zones) return -ENOMEM; - /* - * With memory backing, the zone_lock spinlock needs to be temporarily - * released to avoid scheduling in atomic context. To guarantee zone - * information protection, use a bitmap to lock zones with - * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing - * implies that the queue is marked with BLK_MQ_F_BLOCKING. - */ - spin_lock_init(&dev->zone_lock); - if (dev->memory_backed) { - dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL); - if (!dev->zone_locks) { - kvfree(dev->zones); - return -ENOMEM; - } - } + spin_lock_init(&dev->zone_res_lock); if (dev->zone_nr_conv >= dev->nr_zones) { dev->zone_nr_conv = dev->nr_zones - 1; @@ -86,10 +112,12 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); } + dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; for (i = 0; i < dev->zone_nr_conv; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = sector; zone->len = dev->zone_size_sects; zone->capacity = zone->len; @@ -101,8 +129,9 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) } for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - struct blk_zone *zone = &dev->zones[i]; + zone = &dev->zones[i]; + null_init_zone_lock(dev, zone); zone->start = zone->wp = sector; if (zone->start + dev->zone_size_sects > dev_capacity_sects) zone->len = dev_capacity_sects - zone->start; @@ -147,32 +176,17 @@ int null_register_zoned_dev(struct nullb *nullb) void null_free_zoned_dev(struct nullb_device *dev) { - bitmap_free(dev->zone_locks); kvfree(dev->zones); } -static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno) -{ - if (dev->memory_backed) - wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&dev->zone_lock); -} - -static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno) -{ - spin_unlock_irq(&dev->zone_lock); - - if (dev->memory_backed) - clear_and_wake_up_bit(zno, dev->zone_locks); -} - int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; - unsigned int first_zone, i, zno; - struct blk_zone zone; + unsigned int first_zone, i; + struct nullb_zone *zone; + struct blk_zone blkz; int error; first_zone = null_zone_no(dev, sector); @@ -182,19 +196,25 @@ int null_report_zones(struct gendisk *disk, sector_t sector, nr_zones = min(nr_zones, dev->nr_zones - first_zone); trace_nullb_report_zones(nullb, nr_zones); - zno = first_zone; - for (i = 0; i < nr_zones; i++, zno++) { + memset(&blkz, 0, sizeof(struct blk_zone)); + zone = &dev->zones[first_zone]; + for (i = 0; i < nr_zones; i++, zone++) { /* * Stacked DM target drivers will remap the zone information by * modifying the zone information passed to the report callback. * So use a local copy to avoid corruption of the device zone * array. */ - null_lock_zone(dev, zno); - memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone)); - null_unlock_zone(dev, zno); + null_lock_zone(dev, zone); + blkz.start = zone->start; + blkz.len = zone->len; + blkz.wp = zone->wp; + blkz.type = zone->type; + blkz.cond = zone->cond; + blkz.capacity = zone->capacity; + null_unlock_zone(dev, zone); - error = cb(&zone, i, data); + error = cb(&blkz, i, data); if (error) return error; } @@ -210,7 +230,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len) { struct nullb_device *dev = nullb->dev; - struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)]; + struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; unsigned int nr_sectors = len >> SECTOR_SHIFT; /* Read must be below the write pointer position */ @@ -224,11 +244,9 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t __null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - return BLK_STS_IOERR; - switch (zone->cond) { case BLK_ZONE_COND_CLOSED: /* close operation on closed is not an error */ @@ -261,7 +279,7 @@ static void null_close_first_imp_zone(struct nullb_device *dev) for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - null_close_zone(dev, &dev->zones[i]); + __null_close_zone(dev, &dev->zones[i]); return; } } @@ -310,7 +328,8 @@ static blk_status_t null_check_open(struct nullb_device *dev) * it is not certain that closing an implicit open zone will allow a new zone * to be opened, since we might already be at the active limit capacity. */ -static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_check_zone_resources(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; @@ -334,7 +353,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, { struct nullb_device *dev = cmd->nq->dev; unsigned int zno = null_zone_no(dev, sector); - struct blk_zone *zone = &dev->zones[zno]; + struct nullb_zone *zone = &dev->zones[zno]; blk_status_t ret; trace_nullb_zone_op(cmd, zno, zone->cond); @@ -345,26 +364,12 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); } - null_lock_zone(dev, zno); + null_lock_zone(dev, zone); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: + if (zone->cond == BLK_ZONE_COND_FULL) { /* Cannot write to a full zone */ ret = BLK_STS_IOERR; goto unlock; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - break; - default: - /* Invalid zone condition */ - ret = BLK_STS_IOERR; - goto unlock; } /* @@ -389,60 +394,69 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, goto unlock; } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) { + null_lock_zone_res(dev); + + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + null_unlock_zone_res(dev); + goto unlock; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } + + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + null_unlock_zone_res(dev); } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - /* - * Memory backing allocation may sleep: release the zone_lock spinlock - * to avoid scheduling in atomic context. Zone operation atomicity is - * still guaranteed through the zone_locks bitmap. - */ - if (dev->memory_backed) - spin_unlock_irq(&dev->zone_lock); ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); - if (dev->memory_backed) - spin_lock_irq(&dev->zone_lock); - if (ret != BLK_STS_OK) goto unlock; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { + null_lock_zone_res(dev); if (zone->cond == BLK_ZONE_COND_EXP_OPEN) dev->nr_zones_exp_open--; else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) dev->nr_zones_imp_open--; zone->cond = BLK_ZONE_COND_FULL; + null_unlock_zone_res(dev); } + ret = BLK_STS_OK; unlock: - null_unlock_zone(dev, zno); + null_unlock_zone(dev, zone); return ret; } -static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_open_zone(struct nullb_device *dev, + struct nullb_zone *zone) { - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: /* open operation on exp open is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -450,35 +464,57 @@ static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zo case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond = BLK_ZONE_COND_EXP_OPEN; dev->nr_zones_exp_open++; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_close_zone(struct nullb_device *dev, + struct nullb_zone *zone) { blk_status_t ret; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + ret = __null_close_zone(dev, zone); + null_unlock_zone_res(dev); + + return ret; +} + +static blk_status_t null_finish_zone(struct nullb_device *dev, + struct nullb_zone *zone) +{ + blk_status_t ret = BLK_STS_OK; + + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + return BLK_STS_IOERR; + + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_FULL: /* finish operation on full is not an error */ - return BLK_STS_OK; + goto unlock; case BLK_ZONE_COND_EMPTY: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -489,27 +525,35 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone * case BLK_ZONE_COND_CLOSED: ret = null_check_zone_resources(dev, zone); if (ret != BLK_STS_OK) - return ret; + goto unlock; dev->nr_zones_closed--; break; default: - return BLK_STS_IOERR; + ret = BLK_STS_IOERR; + goto unlock; } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; - return BLK_STS_OK; +unlock: + null_unlock_zone_res(dev); + + return ret; } -static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_reset_zone(struct nullb_device *dev, + struct nullb_zone *zone) { if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; + null_lock_zone_res(dev); + switch (zone->cond) { case BLK_ZONE_COND_EMPTY: /* reset operation on empty is not an error */ + null_unlock_zone_res(dev); return BLK_STS_OK; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -523,12 +567,15 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *z case BLK_ZONE_COND_FULL: break; default: + null_unlock_zone_res(dev); return BLK_STS_IOERR; } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; + null_unlock_zone_res(dev); + return BLK_STS_OK; } @@ -537,19 +584,19 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, { struct nullb_device *dev = cmd->nq->dev; unsigned int zone_no; - struct blk_zone *zone; + struct nullb_zone *zone; blk_status_t ret; size_t i; if (op == REQ_OP_ZONE_RESET_ALL) { for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - null_lock_zone(dev, i); zone = &dev->zones[i]; + null_lock_zone(dev, zone); if (zone->cond != BLK_ZONE_COND_EMPTY) { null_reset_zone(dev, zone); trace_nullb_zone_op(cmd, i, zone->cond); } - null_unlock_zone(dev, i); + null_unlock_zone(dev, zone); } return BLK_STS_OK; } @@ -557,7 +604,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, zone_no = null_zone_no(dev, sector); zone = &dev->zones[zone_no]; - null_lock_zone(dev, zone_no); + null_lock_zone(dev, zone); switch (op) { case REQ_OP_ZONE_RESET: @@ -580,7 +627,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, if (ret == BLK_STS_OK) trace_nullb_zone_op(cmd, zone_no, zone->cond); - null_unlock_zone(dev, zone_no); + null_unlock_zone(dev, zone); return ret; } @@ -588,29 +635,28 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, sector_t nr_sectors) { - struct nullb_device *dev = cmd->nq->dev; - unsigned int zno = null_zone_no(dev, sector); + struct nullb_device *dev; + struct nullb_zone *zone; blk_status_t sts; switch (op) { case REQ_OP_WRITE: - sts = null_zone_write(cmd, sector, nr_sectors, false); - break; + return null_zone_write(cmd, sector, nr_sectors, false); case REQ_OP_ZONE_APPEND: - sts = null_zone_write(cmd, sector, nr_sectors, true); - break; + return null_zone_write(cmd, sector, nr_sectors, true); case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - sts = null_zone_mgmt(cmd, op, sector); - break; + return null_zone_mgmt(cmd, op, sector); default: - null_lock_zone(dev, zno); - sts = null_process_cmd(cmd, op, sector, nr_sectors); - null_unlock_zone(dev, zno); - } + dev = cmd->nq->dev; + zone = &dev->zones[null_zone_no(dev, sector)]; - return sts; + null_lock_zone(dev, zone); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + null_unlock_zone(dev, zone); + return sts; + } } From 2e8c6e0e1d2d65562c637940747cfa30559f976a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:15 +0900 Subject: [PATCH 310/395] null_blk: Improve implicit zone close When open zone resource management is enabled, that is, when a null_blk zoned device is created with zone_max_open different than 0, implicitly or explicitly opening a zone may require implicitly closing a zone that is already implicitly open. This operation is done using the function null_close_first_imp_zone(), which search for an implicitly open zone to close starting from the first sequential zone. This implementation is simple but may result in the same being constantly implicitly closed and then implicitly reopened on write, namely, the lowest numbered zone that is being written. Avoid this by starting the search for an implicitly open zone starting from the zone following the last zone that was implicitly closed. The function null_close_first_imp_zone() is renamed null_close_imp_open_zone(). Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_zoned.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 14546ead1d66..29a8817fadfc 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -67,6 +67,7 @@ struct nullb_device { unsigned int nr_zones_imp_open; unsigned int nr_zones_exp_open; unsigned int nr_zones_closed; + unsigned int imp_close_zone_no; struct nullb_zone *zones; sector_t zone_size_sects; bool need_zone_res_mgmt; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4d5c0b938618..4dad8748a61d 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -113,6 +113,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) pr_info("zone_max_open limit disabled, limit >= zone count\n"); } dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open; + dev->imp_close_zone_no = dev->zone_nr_conv; for (i = 0; i < dev->zone_nr_conv; i++) { zone = &dev->zones[i]; @@ -273,13 +274,24 @@ static blk_status_t __null_close_zone(struct nullb_device *dev, return BLK_STS_OK; } -static void null_close_first_imp_zone(struct nullb_device *dev) +static void null_close_imp_open_zone(struct nullb_device *dev) { - unsigned int i; + struct nullb_zone *zone; + unsigned int zno, i; + + zno = dev->imp_close_zone_no; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) { - if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, &dev->zones[i]); + zone = &dev->zones[zno]; + zno++; + if (zno >= dev->nr_zones) + zno = dev->zone_nr_conv; + + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + __null_close_zone(dev, zone); + dev->imp_close_zone_no = zno; return; } } @@ -307,7 +319,7 @@ static blk_status_t null_check_open(struct nullb_device *dev) if (dev->nr_zones_imp_open) { if (null_check_active(dev) == BLK_STS_OK) { - null_close_first_imp_zone(dev); + null_close_imp_open_zone(dev); return BLK_STS_OK; } } From 49c7089f3ded981fcea387f853fa394788e60fb2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:16 +0900 Subject: [PATCH 311/395] null_blk: cleanup discard handling null_handle_discard() is called from both null_handle_rq() and null_handle_bio(). As these functions are only passed a nullb_cmd structure, this forces pointer dereferences to identiify the discard operation code and to access the sector range to be discarded. Simplify all this by changing the interface of the functions null_handle_discard() and null_handle_memory_backed() to pass along the operation code, operation start sector and number of sectors. With this change null_handle_discard() can be called directly from null_handle_memory_backed(). Also add a message warning that the discard configuration attribute has no effect when memory backing is disabled. No functional change is introduced by this patch. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk_main.c | 43 ++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 4685ea401d5b..a223bee24e76 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,13 +1076,16 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) +static blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { + struct nullb *nullb = dev->nullb; + size_t n = nr_sectors << SECTOR_SHIFT; size_t temp; spin_lock_irq(&nullb->lock); while (n > 0) { - temp = min_t(size_t, n, nullb->dev->blocksize); + temp = min_t(size_t, n, dev->blocksize); null_free_sector(nullb, sector, false); if (null_cache_active(nullb)) null_free_sector(nullb, sector, true); @@ -1090,6 +1093,8 @@ static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n) n -= temp; } spin_unlock_irq(&nullb->lock); + + return BLK_STS_OK; } static int null_handle_flush(struct nullb *nullb) @@ -1149,17 +1154,10 @@ static int null_handle_rq(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = blk_rq_pos(rq); struct req_iterator iter; struct bio_vec bvec; - sector = blk_rq_pos(rq); - - if (req_op(rq) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, blk_rq_bytes(rq)); - return 0; - } - spin_lock_irq(&nullb->lock); rq_for_each_segment(bvec, rq, iter) { len = bvec.bv_len; @@ -1183,18 +1181,10 @@ static int null_handle_bio(struct nullb_cmd *cmd) struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; - sector_t sector; + sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; struct bvec_iter iter; - sector = bio->bi_iter.bi_sector; - - if (bio_op(bio) == REQ_OP_DISCARD) { - null_handle_discard(nullb, sector, - bio_sectors(bio) << SECTOR_SHIFT); - return 0; - } - spin_lock_irq(&nullb->lock); bio_for_each_segment(bvec, bio, iter) { len = bvec.bv_len; @@ -1263,11 +1253,16 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op) + enum req_opf op, + sector_t sector, + sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; int err; + if (op == REQ_OP_DISCARD) + return null_handle_discard(dev, sector, nr_sectors); + if (dev->queue_mode == NULL_Q_BIO) err = null_handle_bio(cmd); else @@ -1343,7 +1338,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, } if (dev->memory_backed) - return null_handle_memory_backed(cmd, op); + return null_handle_memory_backed(cmd, op, sector, nr_sectors); return BLK_STS_OK; } @@ -1589,6 +1584,12 @@ static void null_config_discard(struct nullb *nullb) if (nullb->dev->discard == false) return; + if (!nullb->dev->memory_backed) { + nullb->dev->discard = false; + pr_info("discard option is ignored without memory backing\n"); + return; + } + if (nullb->dev->zoned) { nullb->dev->discard = false; pr_info("discard option is ignored in zoned mode\n"); From 0ec4d913ac69ec86757eec117fc2733018552aa7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:17 +0900 Subject: [PATCH 312/395] null_blk: discard zones on reset When memory backing is enabled, use null_handle_discard() to free the backing memory used by a zone when the zone is being reset. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 2 ++ drivers/block/null_blk_main.c | 4 ++-- drivers/block/null_blk_zoned.c | 3 +++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 29a8817fadfc..63000aeeb2f3 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -116,6 +116,8 @@ struct nullb { char disk_name[DISK_NAME_LEN]; }; +blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, + sector_t nr_sectors); blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_opf op, sector_t sector, unsigned int nr_sectors); diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index a223bee24e76..b758b9366630 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1076,8 +1076,8 @@ static void nullb_fill_pattern(struct nullb *nullb, struct page *page, kunmap_atomic(dst); } -static blk_status_t null_handle_discard(struct nullb_device *dev, - sector_t sector, sector_t nr_sectors) +blk_status_t null_handle_discard(struct nullb_device *dev, + sector_t sector, sector_t nr_sectors) { struct nullb *nullb = dev->nullb; size_t n = nr_sectors << SECTOR_SHIFT; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 4dad8748a61d..65464f7559e0 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -588,6 +588,9 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, null_unlock_zone_res(dev); + if (dev->memory_backed) + return null_handle_discard(dev, zone->start, zone->len); + return BLK_STS_OK; } From ea17fd354ca8afd3e8962a77236b1a9a59262fdd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:18 +0900 Subject: [PATCH 313/395] null_blk: Allow controlling max_hw_sectors limit Add the module option and configfs attribute max_sectors to allow configuring the maximum size of a command issued to a null_blk device. This allows exercising the block layer BIO splitting with different limits than the default BLK_SAFE_MAX_SECTORS. This is also useful for testing the zone append write path of file systems as the max_hw_sectors limit value is also used for the max_zone_append_sectors limit. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 63000aeeb2f3..83504f3cc9d6 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -85,6 +85,7 @@ struct nullb_device { unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ unsigned int blocksize; /* block size */ + unsigned int max_sectors; /* Max sectors per command */ unsigned int irqmode; /* IRQ completion handler */ unsigned int hw_queue_depth; /* queue depth */ unsigned int index; /* index of the disk, only valid with a disk */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index b758b9366630..5357c3a4a36f 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -152,6 +152,10 @@ static int g_bs = 512; module_param_named(bs, g_bs, int, 0444); MODULE_PARM_DESC(bs, "Block size (in bytes)"); +static int g_max_sectors; +module_param_named(max_sectors, g_max_sectors, int, 0444); +MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); + static unsigned int nr_devices = 1; module_param(nr_devices, uint, 0444); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); @@ -346,6 +350,7 @@ NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); NULLB_DEVICE_ATTR(home_node, uint, NULL); NULLB_DEVICE_ATTR(queue_mode, uint, NULL); NULLB_DEVICE_ATTR(blocksize, uint, NULL); +NULLB_DEVICE_ATTR(max_sectors, uint, NULL); NULLB_DEVICE_ATTR(irqmode, uint, NULL); NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); NULLB_DEVICE_ATTR(index, uint, NULL); @@ -463,6 +468,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_home_node, &nullb_device_attr_queue_mode, &nullb_device_attr_blocksize, + &nullb_device_attr_max_sectors, &nullb_device_attr_irqmode, &nullb_device_attr_hw_queue_depth, &nullb_device_attr_index, @@ -533,7 +539,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n"); + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -588,6 +594,7 @@ static struct nullb_device *null_alloc_dev(void) dev->home_node = g_home_node; dev->queue_mode = g_queue_mode; dev->blocksize = g_bs; + dev->max_sectors = g_max_sectors; dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; @@ -1867,6 +1874,11 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_logical_block_size(nullb->q, dev->blocksize); blk_queue_physical_block_size(nullb->q, dev->blocksize); + if (!dev->max_sectors) + dev->max_sectors = queue_max_hw_sectors(nullb->q); + dev->max_sectors = min_t(unsigned int, dev->max_sectors, + BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); null_config_discard(nullb); @@ -1910,6 +1922,12 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } + if (g_max_sectors > BLK_DEF_MAX_SECTORS) { + pr_warn("invalid max sectors\n"); + pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS); + g_max_sectors = BLK_DEF_MAX_SECTORS; + } + if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { pr_err("invalid home_node value\n"); g_home_node = NUMA_NO_NODE; From eebf34a85c8c724676eba502d15202854f199b05 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 20 Nov 2020 10:55:19 +0900 Subject: [PATCH 314/395] null_blk: Move driver into its own directory Move null_blk driver code into the new sub-directory drivers/block/null_blk. Suggested-by: Bart Van Assche Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 8 +------- drivers/block/Makefile | 7 +------ drivers/block/null_blk/Kconfig | 12 ++++++++++++ drivers/block/null_blk/Makefile | 11 +++++++++++ drivers/block/{null_blk_main.c => null_blk/main.c} | 0 drivers/block/{ => null_blk}/null_blk.h | 0 drivers/block/{null_blk_trace.c => null_blk/trace.c} | 2 +- drivers/block/{null_blk_trace.h => null_blk/trace.h} | 2 +- drivers/block/{null_blk_zoned.c => null_blk/zoned.c} | 2 +- 9 files changed, 28 insertions(+), 16 deletions(-) create mode 100644 drivers/block/null_blk/Kconfig create mode 100644 drivers/block/null_blk/Makefile rename drivers/block/{null_blk_main.c => null_blk/main.c} (100%) rename drivers/block/{ => null_blk}/null_blk.h (100%) rename drivers/block/{null_blk_trace.c => null_blk/trace.c} (93%) rename drivers/block/{null_blk_trace.h => null_blk/trace.h} (97%) rename drivers/block/{null_blk_zoned.c => null_blk/zoned.c} (99%) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ecceaaa1a66f..262326973ee0 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -16,13 +16,7 @@ menuconfig BLK_DEV if BLK_DEV -config BLK_DEV_NULL_BLK - tristate "Null test block driver" - select CONFIGFS_FS - -config BLK_DEV_NULL_BLK_FAULT_INJECTION - bool "Support fault injection for Null test block driver" - depends on BLK_DEV_NULL_BLK && FAULT_INJECTION +source "drivers/block/null_blk/Kconfig" config BLK_DEV_FD tristate "Normal floppy disk support" diff --git a/drivers/block/Makefile b/drivers/block/Makefile index e1f63117ee94..a3170859e01d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,12 +41,7 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o -null_blk-objs := null_blk_main.o -ifeq ($(CONFIG_BLK_DEV_ZONED), y) -null_blk-$(CONFIG_TRACING) += null_blk_trace.o -endif -null_blk-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig new file mode 100644 index 000000000000..6bf1f8ca20a2 --- /dev/null +++ b/drivers/block/null_blk/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Null block device driver configuration +# + +config BLK_DEV_NULL_BLK + tristate "Null test block driver" + select CONFIGFS_FS + +config BLK_DEV_NULL_BLK_FAULT_INJECTION + bool "Support fault injection for Null test block driver" + depends on BLK_DEV_NULL_BLK && FAULT_INJECTION diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile new file mode 100644 index 000000000000..84c36e512ab8 --- /dev/null +++ b/drivers/block/null_blk/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +# needed for trace events +ccflags-y += -I$(src) + +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +null_blk-objs := main.o +ifeq ($(CONFIG_BLK_DEV_ZONED), y) +null_blk-$(CONFIG_TRACING) += trace.o +endif +null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk/main.c similarity index 100% rename from drivers/block/null_blk_main.c rename to drivers/block/null_blk/main.c diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk/null_blk.h similarity index 100% rename from drivers/block/null_blk.h rename to drivers/block/null_blk/null_blk.h diff --git a/drivers/block/null_blk_trace.c b/drivers/block/null_blk/trace.c similarity index 93% rename from drivers/block/null_blk_trace.c rename to drivers/block/null_blk/trace.c index f246e7bff698..3711cba16071 100644 --- a/drivers/block/null_blk_trace.c +++ b/drivers/block/null_blk/trace.c @@ -4,7 +4,7 @@ * * Copyright (C) 2020 Western Digital Corporation or its affiliates. */ -#include "null_blk_trace.h" +#include "trace.h" /* * Helper to use for all null_blk traces to extract disk name. diff --git a/drivers/block/null_blk_trace.h b/drivers/block/null_blk/trace.h similarity index 97% rename from drivers/block/null_blk_trace.h rename to drivers/block/null_blk/trace.h index 4f83032eb544..ce3b430e88c5 100644 --- a/drivers/block/null_blk_trace.h +++ b/drivers/block/null_blk/trace.h @@ -73,7 +73,7 @@ TRACE_EVENT(nullb_report_zones, #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE null_blk_trace +#define TRACE_INCLUDE_FILE trace /* This part must be outside protection */ #include diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk/zoned.c similarity index 99% rename from drivers/block/null_blk_zoned.c rename to drivers/block/null_blk/zoned.c index 65464f7559e0..148b871f263b 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -4,7 +4,7 @@ #include "null_blk.h" #define CREATE_TRACE_POINTS -#include "null_blk_trace.h" +#include "trace.h" #define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT) From cc29e1bf0d63f728a5bd60ef22638bbf77369552 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 26 Nov 2020 17:18:52 +0800 Subject: [PATCH 315/395] block: disable iopoll for split bio iopoll is initially for small size, latency sensitive IO. It doesn't work well for big IO, especially when it needs to be split to multiple bios. In this case, the returned cookie of __submit_bio_noacct_mq() is indeed the cookie of the last split bio. The completion of *this* last split bio done by iopoll doesn't mean the whole original bio has completed. Callers of iopoll still need to wait for completion of other split bios. Besides bio splitting may cause more trouble for iopoll which isn't supposed to be used in case of big IO. iopoll for split bio may cause potential race if CPU migration happens during bio submission. Since the returned cookie is that of the last split bio, polling on the corresponding hardware queue doesn't help complete other split bios, if these split bios are enqueued into different hardware queues. Since interrupts are disabled for polling queues, the completion of these other split bios depends on timeout mechanism, thus causing a potential hang. iopoll for split bio may also cause hang for sync polling. Currently both the blkdev and iomap-based fs (ext4/xfs, etc) support sync polling in direct IO routine. These routines will submit bio without REQ_NOWAIT flag set, and then start sync polling in current process context. The process may hang in blk_mq_get_tag() if the submitted bio has to be split into multiple bios and can rapidly exhaust the queue depth. The process are waiting for the completion of the previously allocated requests, which should be reaped by the following polling, and thus causing a deadlock. To avoid these subtle trouble described above, just disable iopoll for split bio and return BLK_QC_T_NONE in this case. The side effect is that non-HIPRI IO also returns BLK_QC_T_NONE now. It should be acceptable since the returned cookie is never used for non-HIPRI IO. Suggested-by: Ming Lei Signed-off-by: Jeffle Xu Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ block/blk-mq.c | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 7497d86fff38..c3399bf29e9c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,6 +279,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, return NULL; split: *segs = nsegs; + + /* + * Bio splitting may cause subtle trouble such as hang when doing sync + * iopoll in direct IO routine. Given performance gain of iopoll for + * big IO can be trival, disable iopoll when split needed. + */ + bio->bi_opf &= ~REQ_HIPRI; + return bio_split(bio, sectors, GFP_NOIO, bs); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 2881a457de83..95ecc4c69969 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2159,6 +2159,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs; blk_qc_t cookie; blk_status_t ret; + bool hipri; blk_queue_bounce(q, &bio); __blk_queue_split(&bio, &nr_segs); @@ -2175,6 +2176,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); + hipri = bio->bi_opf & REQ_HIPRI; + data.cmd_flags = bio->bi_opf; rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { @@ -2267,6 +2270,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) blk_mq_sched_insert_request(rq, false, true, true); } + if (!hipri) + return BLK_QC_T_NONE; return cookie; queue_exit: blk_queue_exit(q); From fb01a2932e81a1fb2273f87ff92dc8172b8880ee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:36 +0800 Subject: [PATCH 316/395] blk-mq: add new API of blk_mq_hctx_set_fq_lock_class flush_end_io() may be called recursively from some driver, such as nvme-loop, so lockdep may complain 'possible recursive locking'. Commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing") tried to address this issue by assigning dynamically allocated per-flush-queue lock class. This solution adds synchronize_rcu() for each hctx's release handler, and causes horrible SCSI MQ probe delay(more than half an hour on megaraid sas). Add new API of blk_mq_hctx_set_fq_lock_class() for these drivers, so we just need to use driver specific lock class for avoiding the lockdep warning of 'possible recursive locking'. Tested-by: Kashyap Desai Reported-by: Qian Cai Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++++++++++++++++ include/linux/blk-mq.h | 3 +++ 2 files changed, 28 insertions(+) diff --git a/block/blk-flush.c b/block/blk-flush.c index 9507dcdd5881..bf51588762d8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -490,3 +490,28 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. + * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class + * key. We need to assign different lock class for these driver's + * fq->mq_flush_lock for avoiding the lockdep warning. + * + * Use dynamically allocated lock class key for each 'blk_flush_queue' + * instance is over-kill, and more worse it introduces horrible boot delay + * issue because synchronize_rcu() is implied in lockdep_unregister_key which + * is called for each hctx release. SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernel always enable lockdep option. It is observed that more than half + * an hour is taken during SCSI MQ probe with per-fq lock class. + */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..5f639240760e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -5,6 +5,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -594,5 +595,7 @@ static inline void blk_mq_cleanup_rq(struct request *rq) } blk_qc_t blk_mq_submit_bio(struct bio *bio); +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key); #endif From 88c9979334aa5ff8c814ddf578f3113ed6c5ce8e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:37 +0800 Subject: [PATCH 317/395] nvme-loop: use blk_mq_hctx_set_fq_lock_class to set loop's lock class Set nvme-loop's lock class via blk_mq_hctx_set_fq_lock_class for avoiding lockdep possible recursive locking, then we can remove the dynamically allocated lock class for each flush queue, finally we can avoid horrible SCSI probe delay. This way may not address situation in which one nvme-loop is backed on another nvme-loop. However, in reality, people seldom uses this way for test. Even though someone played in this way, it is just one recursive locking false positive, no real deadlock issue. Tested-by: Kashyap Desai Reported-by: Qian Cai Reviewed-by: Christoph Hellwig Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/target/loop.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f6d81239be21..07806016c09d 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -211,6 +211,8 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } +static struct lock_class_key loop_hctx_fq_lock_key; + static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -219,6 +221,14 @@ static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); + /* + * flush_end_io() can be called recursively for us, so use our own + * lock class key for avoiding lockdep possible recursive locking, + * then we can remove the dynamically allocated lock class for each + * flush queue, that way may cause horrible boot delay. + */ + blk_mq_hctx_set_fq_lock_class(hctx, &loop_hctx_fq_lock_key); + hctx->driver_data = queue; return 0; } From 7aa390ec2d9db0cd6677d95d0b8f307f9c086770 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:38 +0800 Subject: [PATCH 318/395] Revert "block: Fix a lockdep complaint triggered by request queue flushing" This reverts commit b3c6a59975415bde29cfd76ff1ab008edbf614a9. Now we can avoid nvme-loop lockdep warning of 'lockdep possible recursive locking' by nvme-loop's lock class, no need to apply dynamically allocated lock class key, so revert commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing"). This way fixes horrible SCSI probe delay issue on megaraid_sas, and it is reported the whole probe may take more than half an hour. Tested-by: Kashyap Desai Reported-by: Qian Cai Reviewed-by: Christoph Hellwig Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 5 ----- block/blk.h | 1 - 2 files changed, 6 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index bf51588762d8..996d5d03dade 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,7 +69,6 @@ #include #include #include -#include #include "blk.h" #include "blk-mq.h" @@ -469,9 +468,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); - lockdep_register_key(&fq->key); - lockdep_set_class(&fq->mq_flush_lock, &fq->key); - return fq; fail_rq: @@ -486,7 +482,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } diff --git a/block/blk.h b/block/blk.h index 98f0b1ae2641..d23d018fd2cd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,7 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - struct lock_class_key key; spinlock_t mq_flush_lock; }; From f87905660ed01d85e45eac22d479f31f380b2f50 Mon Sep 17 00:00:00 2001 From: tangzhenhao Date: Sun, 29 Nov 2020 23:23:56 -0800 Subject: [PATCH 319/395] drivers/lightnvm: fix a null-ptr-deref bug in pblk-core.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At line 294 in drivers/lightnvm/pblk-write.c, function pblk_gen_run_ws is called with actual param GFP_ATOMIC. pblk_gen_run_ws call mempool_alloc using "GFP_ATOMIC" flag, so mempool_alloc can return null. So we need to check the return-val of mempool_alloc to avoid null-ptr-deref bug. Signed-off-by: tangzhenhao Reviewed-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 97c68731406b..1dddba11e721 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1869,6 +1869,10 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, struct pblk_line_ws *line_ws; line_ws = mempool_alloc(&pblk->gen_ws_pool, gfp_mask); + if (!line_ws) { + pblk_err(pblk, "pblk: could not allocate memory\n"); + return; + } line_ws->pblk = pblk; line_ws->line = line; From 28cea78af44918b920306df150afbd116bd94301 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Sep 2020 10:51:17 -0600 Subject: [PATCH 320/395] io_uring: allow non-fixed files with SQPOLL The restriction of needing fixed files for SQPOLL is problematic, and prevents/inhibits several valid uses cases. With the referenced files_struct that we have now, it's trivially supportable. Treat ->files like we do the mm for the SQPOLL thread - grab a reference to it (and assign it), and drop it when we're done. This feature is exposed as IORING_FEAT_SQPOLL_NONFIXED. Signed-off-by: Jens Axboe --- fs/io_uring.c | 87 +++++++++++++++++++++++++++-------- include/uapi/linux/io_uring.h | 1 + 2 files changed, 70 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d17198733f6a..c1f3980945e4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -999,8 +999,9 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } -static void io_sq_thread_drop_mm(void) +static void io_sq_thread_drop_mm_files(void) { + struct files_struct *files = current->files; struct mm_struct *mm = current->mm; if (mm) { @@ -1008,6 +1009,40 @@ static void io_sq_thread_drop_mm(void) mmput(mm); current->mm = NULL; } + if (files) { + struct nsproxy *nsproxy = current->nsproxy; + + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); + put_files_struct(files); + put_nsproxy(nsproxy); + } +} + +static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) +{ + if (!current->files) { + struct files_struct *files; + struct nsproxy *nsproxy; + + task_lock(ctx->sqo_task); + files = ctx->sqo_task->files; + if (!files) { + task_unlock(ctx->sqo_task); + return; + } + atomic_inc(&files->count); + get_nsproxy(ctx->sqo_task->nsproxy); + nsproxy = ctx->sqo_task->nsproxy; + task_unlock(ctx->sqo_task); + + task_lock(current); + current->files = files; + current->nsproxy = nsproxy; + task_unlock(current); + } } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) @@ -1035,12 +1070,21 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) return -EFAULT; } -static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, + struct io_kiocb *req) { - if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM)) - return 0; - return __io_sq_thread_acquire_mm(ctx); + const struct io_op_def *def = &io_op_defs[req->opcode]; + + if (def->work_flags & IO_WQ_WORK_MM) { + int ret = __io_sq_thread_acquire_mm(ctx); + if (unlikely(ret)) + return ret; + } + + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) + __io_sq_thread_acquire_files(ctx); + + return 0; } static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, @@ -2061,6 +2105,7 @@ static void __io_req_task_submit(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; if (!__io_sq_thread_acquire_mm(ctx)) { + __io_sq_thread_acquire_files(ctx); mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); @@ -2603,7 +2648,7 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; - ret = io_sq_thread_acquire_mm(req->ctx, req); + ret = io_sq_thread_acquire_mm_files(req->ctx, req); if (io_resubmit_prep(req, ret)) { refcount_inc(&req->refs); @@ -6168,13 +6213,7 @@ static struct file *io_file_get(struct io_submit_state *state, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, int fd) { - bool fixed; - - fixed = (req->flags & REQ_F_FIXED_FILE) != 0; - if (unlikely(!fixed && io_async_submit(req->ctx))) - return -EBADF; - - req->file = io_file_get(state, req, fd, fixed); + req->file = io_file_get(state, req, fd, req->flags & REQ_F_FIXED_FILE); if (req->file || io_op_defs[req->opcode].needs_file_no_error) return 0; return -EBADF; @@ -6551,7 +6590,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); @@ -6739,7 +6778,7 @@ static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, * adding ourselves to the waitqueue, as the unuse/drop * may sleep. */ - io_sq_thread_drop_mm(); + io_sq_thread_drop_mm_files(); /* * We're polling. If we're within the defined idle @@ -6808,11 +6847,18 @@ static void io_sqd_init_new(struct io_sq_data *sqd) static int io_sq_thread(void *data) { struct cgroup_subsys_state *cur_css = NULL; + struct files_struct *old_files = current->files; + struct nsproxy *old_nsproxy = current->nsproxy; const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; unsigned long start_jiffies; + task_lock(current); + current->files = NULL; + current->nsproxy = NULL; + task_unlock(current); + start_jiffies = jiffies; while (!kthread_should_stop()) { enum sq_ret ret = 0; @@ -6845,7 +6891,7 @@ static int io_sq_thread(void *data) ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); - io_sq_thread_drop_mm(); + io_sq_thread_drop_mm_files(); } if (ret & SQT_SPIN) { @@ -6870,6 +6916,11 @@ static int io_sq_thread(void *data) if (old_cred) revert_creds(old_cred); + task_lock(current); + current->files = old_files; + current->nsproxy = old_nsproxy; + task_unlock(current); + kthread_parkme(); return 0; @@ -9415,7 +9466,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS; + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e943bf07c959..2301c37e86cb 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -254,6 +254,7 @@ struct io_uring_params { #define IORING_FEAT_CUR_PERSONALITY (1U << 4) #define IORING_FEAT_FAST_POLL (1U << 5) #define IORING_FEAT_POLL_32BITS (1U << 6) +#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) /* * io_uring_register(2) opcodes and arguments From 14587a46646d30d2b4a6b69865682cfe6bbdcd1f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 5 Sep 2020 11:36:08 -0600 Subject: [PATCH 321/395] io_uring: enable file table usage for SQPOLL rings Now that SQPOLL supports non-registered files and grabs the file table, we can relax the restriction on open/close/accept/connect and allow them on a ring that is setup with IORING_SETUP_SQPOLL. Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c1f3980945e4..1824bd4329ee 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -869,7 +869,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + IO_WQ_WORK_FS | IO_WQ_WORK_MM, }, [IORING_OP_CLOSE] = { .needs_file = 1, @@ -921,7 +921,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT2] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG, + IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -3897,7 +3897,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { u64 flags, mode; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; mode = READ_ONCE(sqe->len); flags = READ_ONCE(sqe->open_flags); @@ -3911,7 +3911,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); @@ -4305,7 +4305,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_req_init_async(req); req->work.flags |= IO_WQ_WORK_NO_CANCEL; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) @@ -4786,7 +4786,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; @@ -4827,7 +4827,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_connect *conn = &req->connect; struct io_async_connect *io = req->async_data; - if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; From e886663cfd029b64a1d8da7efae7014526d884e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Sep 2020 17:20:17 -0600 Subject: [PATCH 322/395] fs: make do_renameat2() take struct filename Pass in the struct filename pointers instead of the user string, and update the three callers to do the same. This behaves like do_unlinkat(), which also takes a filename struct and puts it when it is done. Converting callers is then trivial. Signed-off-by: Jens Axboe --- fs/internal.h | 2 ++ fs/namei.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index a7cd0f64faa4..6fd14ea213c3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -78,6 +78,8 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, long do_rmdir(int dfd, struct filename *name); long do_unlinkat(int dfd, struct filename *name); int may_linkat(struct path *link); +int do_renameat2(int olddfd, struct filename *oldname, int newdfd, + struct filename *newname, unsigned int flags); /* * namespace.c diff --git a/fs/namei.c b/fs/namei.c index d4a6dd772303..03d0e11e4f36 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,8 +4346,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, } EXPORT_SYMBOL(vfs_rename); -static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, - const char __user *newname, unsigned int flags) +int do_renameat2(int olddfd, struct filename *from, int newdfd, + struct filename *to, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; @@ -4355,32 +4355,30 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - struct filename *from; - struct filename *to; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; - int error; + int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) - return -EINVAL; + goto put_both; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) - return -EINVAL; + goto put_both; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: - from = filename_parentat(olddfd, getname(oldname), lookup_flags, - &old_path, &old_last, &old_type); + from = filename_parentat(olddfd, from, lookup_flags, &old_path, + &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); - goto exit; + goto put_new; } - to = filename_parentat(newdfd, getname(newname), lookup_flags, - &new_path, &new_last, &new_type); + to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, + &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -4473,34 +4471,40 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); - putname(to); exit1: path_put(&old_path); - putname(from); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } -exit: +put_both: + if (!IS_ERR(from)) + putname(from); +put_new: + if (!IS_ERR(to)) + putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { - return do_renameat2(olddfd, oldname, newdfd, newname, flags); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return do_renameat2(olddfd, oldname, newdfd, newname, 0); + return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), + 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, + getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link) From 80a261fd00327898e272ddc84ccc9510c036453c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Sep 2020 14:23:58 -0600 Subject: [PATCH 323/395] io_uring: add support for IORING_OP_RENAMEAT IORING_OP_RENAMEAT behaves like renameat2(), and takes the same flags etc. Signed-off-by: Jens Axboe --- fs/io_uring.c | 70 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 + 2 files changed, 72 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1824bd4329ee..94a5e1618368 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -546,6 +546,15 @@ struct io_shutdown { int how; }; +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + struct io_completion { struct file *file; struct list_head list; @@ -673,6 +682,7 @@ struct io_kiocb { struct io_provide_buf pbuf; struct io_statx statx; struct io_shutdown shutdown; + struct io_rename rename; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -943,6 +953,10 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SHUTDOWN] = { .needs_file = 1, }, + [IORING_OP_RENAMEAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -3645,6 +3659,53 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, return ret; } +static int io_renameat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = &req->rename; + const char __user *oldf, *newf; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_renameat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_rename *ren = &req->rename; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5869,6 +5930,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_tee_prep(req, sqe); case IORING_OP_SHUTDOWN: return io_shutdown_prep(req, sqe); + case IORING_OP_RENAMEAT: + return io_renameat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6006,6 +6069,10 @@ static void __io_clean_op(struct io_kiocb *req) if (req->open.filename) putname(req->open.filename); break; + case IORING_OP_RENAMEAT: + putname(req->rename.oldpath); + putname(req->rename.newpath); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6115,6 +6182,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_SHUTDOWN: ret = io_shutdown(req, force_nonblock); break; + case IORING_OP_RENAMEAT: + ret = io_renameat(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2301c37e86cb..c9a58bc7e4be 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,7 @@ struct io_uring_sqe { __u32 statx_flags; __u32 fadvise_advice; __u32 splice_flags; + __u32 rename_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -133,6 +134,7 @@ enum { IORING_OP_REMOVE_BUFFERS, IORING_OP_TEE, IORING_OP_SHUTDOWN, + IORING_OP_RENAMEAT, /* this goes last, obviously */ IORING_OP_LAST, From 14a1143b68ee2e4ec4e8d54f71cddb9724f9ec70 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Sep 2020 14:27:37 -0600 Subject: [PATCH 324/395] io_uring: add support for IORING_OP_UNLINKAT IORING_OP_UNLINKAT behaves like unlinkat(2) and takes the same flags and arguments. Signed-off-by: Jens Axboe --- fs/io_uring.c | 64 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 2 ++ 2 files changed, 66 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 94a5e1618368..c8ecbc0bd286 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -555,6 +555,13 @@ struct io_rename { int flags; }; +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + struct io_completion { struct file *file; struct list_head list; @@ -683,6 +690,7 @@ struct io_kiocb { struct io_statx statx; struct io_shutdown shutdown; struct io_rename rename; + struct io_unlink unlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -957,6 +965,10 @@ static const struct io_op_def io_op_defs[] = { .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_UNLINKAT] = { + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -3706,6 +3718,50 @@ static int io_renameat(struct io_kiocb *req, bool force_nonblock) return 0; } +static int io_unlinkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = &req->unlink; + const char __user *fname; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_unlinkat(struct io_kiocb *req, bool force_nonblock) +{ + struct io_unlink *un = &req->unlink; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail_links(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5932,6 +5988,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_shutdown_prep(req, sqe); case IORING_OP_RENAMEAT: return io_renameat_prep(req, sqe); + case IORING_OP_UNLINKAT: + return io_unlinkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6073,6 +6131,9 @@ static void __io_clean_op(struct io_kiocb *req) putname(req->rename.oldpath); putname(req->rename.newpath); break; + case IORING_OP_UNLINKAT: + putname(req->unlink.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; } @@ -6185,6 +6246,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_RENAMEAT: ret = io_renameat(req, force_nonblock); break; + case IORING_OP_UNLINKAT: + ret = io_unlinkat(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index c9a58bc7e4be..557e7eae497f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -43,6 +43,7 @@ struct io_uring_sqe { __u32 fadvise_advice; __u32 splice_flags; __u32 rename_flags; + __u32 unlink_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -135,6 +136,7 @@ enum { IORING_OP_TEE, IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT, + IORING_OP_UNLINKAT, /* this goes last, obviously */ IORING_OP_LAST, From 018043be1f1bc43ad6956bfd39b7beea12fb4ca6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:17:18 +0000 Subject: [PATCH 325/395] io_uring: split poll and poll_remove structs Don't use a single struct for polls and poll remove requests, they have totally different layouts. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c8ecbc0bd286..10cfb6d17994 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -395,16 +395,18 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - union { - struct wait_queue_head *head; - u64 addr; - }; + struct wait_queue_head *head; __poll_t events; bool done; bool canceled; struct wait_queue_entry wait; }; +struct io_poll_remove { + struct file *file; + u64 addr; +}; + struct io_close { struct file *file; struct file *put_file; @@ -672,6 +674,7 @@ struct io_kiocb { struct file *file; struct io_rw rw; struct io_poll_iocb poll; + struct io_poll_remove poll_remove; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -5538,7 +5541,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, sqe->poll_events) return -EINVAL; - req->poll.addr = READ_ONCE(sqe->addr); + req->poll_remove.addr = READ_ONCE(sqe->addr); return 0; } @@ -5549,12 +5552,10 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - u64 addr; int ret; - addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, addr); + ret = io_poll_cancel(ctx, req->poll_remove.addr); spin_unlock_irq(&ctx->completion_lock); if (ret < 0) From 863e05604a6fb45f0f56b3e9eca5cd533001253b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:35 +0000 Subject: [PATCH 326/395] io_uring: track link's head and tail during submit Explicitly save not only a link's head in io_submit_sqe[s]() but the tail as well. That's in preparation for keeping linked requests in a singly linked list. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 10cfb6d17994..0441185ac510 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6536,8 +6536,13 @@ static inline void io_queue_link_head(struct io_kiocb *req, io_queue_sqe(req, NULL, cs); } +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **link, struct io_comp_state *cs) + struct io_submit_link *link, struct io_comp_state *cs) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -6549,8 +6554,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, * submitted sync once the chain is complete. If none of those * conditions are true (normal request), then just queue it. */ - if (*link) { - struct io_kiocb *head = *link; + if (link->head) { + struct io_kiocb *head = link->head; /* * Taking sequential execution of a link, draining both sides @@ -6571,11 +6576,12 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } trace_io_uring_link(ctx, req, head); list_add_tail(&req->link_list, &head->link_list); + link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { io_queue_link_head(head, cs); - *link = NULL; + link->head = NULL; } } else { if (unlikely(ctx->drain_next)) { @@ -6589,7 +6595,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; - *link = req; + link->head = req; + link->last = req; } else { io_queue_sqe(req, sqe, cs); } @@ -6769,7 +6776,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { struct io_submit_state state; - struct io_kiocb *link = NULL; + struct io_submit_link link; int i, submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ @@ -6789,6 +6796,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) refcount_add(nr, ¤t->usage); io_submit_state_start(&state, ctx, nr); + link.head = NULL; for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; @@ -6834,8 +6842,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link) - io_queue_link_head(link, &state.comp); + if (link.head) + io_queue_link_head(link.head, &state.comp); io_submit_state_end(&state); /* Commit SQ ring head once we've consumed and submitted all SQEs */ From 90cd7e424969d29aff653333b4dcb4e2e199d791 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:36 +0000 Subject: [PATCH 327/395] io_uring: track link timeout's master explicitly In preparation for converting singly linked lists for chaining requests, make linked timeouts save requests that they're responsible for and not count on doubly linked list for back referencing. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0441185ac510..2d14a4f636d6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -446,6 +446,8 @@ struct io_timeout { u32 off; u32 target_seq; struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; }; struct io_timeout_rem { @@ -1984,6 +1986,7 @@ static void io_kill_linked_timeout(struct io_kiocb *req) int ret; list_del_init(&link->link_list); + link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { io_cqring_fill_event(link, -ECANCELED); @@ -6358,26 +6361,22 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); - struct io_kiocb *req = data->req; + struct io_kiocb *prev, *req = data->req; struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *prev = NULL; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); + prev = req->timeout.head; + req->timeout.head = NULL; /* * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->link_list)) { - prev = list_entry(req->link_list.prev, struct io_kiocb, - link_list); - if (refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); - else - prev = NULL; - } - + if (prev && refcount_inc_not_zero(&prev->refs)) + list_del_init(&req->link_list); + else + prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { @@ -6396,7 +6395,7 @@ static void __io_queue_linked_timeout(struct io_kiocb *req) * If the list is now empty, then our linked request finished before * we got a chance to setup the timer */ - if (!list_empty(&req->link_list)) { + if (req->timeout.head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; @@ -6431,6 +6430,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; + nxt->timeout.head = req; nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; From f2f87370bb6664e5babb6705e886cfb340f163e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:37 +0000 Subject: [PATCH 328/395] io_uring: link requests with singly linked list Singly linked list for keeping linked requests is enough, because we almost always operate on the head and traverse forward with the exception of linked timeouts going 1 hop backwards. Replace ->link_list with a handmade singly linked list. Also kill REQ_F_LINK_HEAD in favour of checking a newly added ->list for NULL directly. That saves 8B in io_kiocb, is not as heavy as list fixup, makes better use of cache by not touching a previous request (i.e. last request of the link) each time on list modification and optimises cache use further in the following patch, and actually makes travesal easier removing in the end some lines. Also, keeping invariant in ->list instead of having REQ_F_LINK_HEAD is less error-prone. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 133 ++++++++++++++++++++------------------------------ 1 file changed, 52 insertions(+), 81 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d14a4f636d6..a0a6d13c0c16 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -600,7 +600,6 @@ enum { REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_LINK_HEAD_BIT, REQ_F_FAIL_LINK_BIT, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, @@ -632,8 +631,6 @@ enum { /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* head of a link */ - REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT), /* fail rest of links */ REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), /* on inflight list */ @@ -715,7 +712,7 @@ struct io_kiocb { struct task_struct *task; u64 user_data; - struct list_head link_list; + struct io_kiocb *link; /* * 1. used with ctx->iopoll_list with reads/writes @@ -1023,6 +1020,9 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) + static inline void io_clean_op(struct io_kiocb *req) { if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED | @@ -1501,10 +1501,8 @@ static void io_prep_async_link(struct io_kiocb *req) { struct io_kiocb *cur; - io_prep_async_work(req); - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(cur, &req->link_list, link_list) - io_prep_async_work(cur); + io_for_each_link(cur, req) + io_prep_async_work(cur); } static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) @@ -1687,20 +1685,15 @@ static inline bool __io_match_files(struct io_kiocb *req, req->work.identity->files == files; } -static bool io_match_files(struct io_kiocb *req, - struct files_struct *files) +static bool io_match_files(struct io_kiocb *head, struct files_struct *files) { - struct io_kiocb *link; + struct io_kiocb *req; if (!files) return true; - if (__io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (__io_match_files(link, files)) - return true; - } + io_for_each_link(req, head) { + if (__io_match_files(req, files)) + return true; } return false; } @@ -1967,6 +1960,14 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + static void io_kill_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1975,8 +1976,8 @@ static void io_kill_linked_timeout(struct io_kiocb *req) unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - link = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); + link = req->link; + /* * Can happen if a linked timeout fired and link had been like * req -> link t-out -> link t-out [-> ...] @@ -1985,7 +1986,7 @@ static void io_kill_linked_timeout(struct io_kiocb *req) struct io_timeout_data *io = link->async_data; int ret; - list_del_init(&link->link_list); + io_remove_next_linked(req); link->timeout.head = NULL; ret = hrtimer_try_to_cancel(&io->timer); if (ret != -1) { @@ -2003,41 +2004,22 @@ static void io_kill_linked_timeout(struct io_kiocb *req) } } -static struct io_kiocb *io_req_link_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt; - /* - * The list should never be empty when we are called here. But could - * potentially happen if the chain is messed up, check to be on the - * safe side. - */ - if (unlikely(list_empty(&req->link_list))) - return NULL; - - nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list); - list_del_init(&req->link_list); - if (!list_empty(&nxt->link_list)) - nxt->flags |= REQ_F_LINK_HEAD; - return nxt; -} - -/* - * Called if REQ_F_LINK_HEAD is set, and we fail the head request - */ static void io_fail_links(struct io_kiocb *req) { + struct io_kiocb *link, *nxt; struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); - while (!list_empty(&req->link_list)) { - struct io_kiocb *link = list_first_entry(&req->link_list, - struct io_kiocb, link_list); + link = req->link; + req->link = NULL; + + while (link) { + nxt = link->link; + link->link = NULL; - list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link, -ECANCELED); /* @@ -2049,8 +2031,8 @@ static void io_fail_links(struct io_kiocb *req) io_put_req_deferred(link, 2); else io_double_put_req(link); + link = nxt; } - io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -2059,7 +2041,6 @@ static void io_fail_links(struct io_kiocb *req) static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) { - req->flags &= ~REQ_F_LINK_HEAD; if (req->flags & REQ_F_LINK_TIMEOUT) io_kill_linked_timeout(req); @@ -2069,15 +2050,19 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (likely(!(req->flags & REQ_F_FAIL_LINK))) - return io_req_link_next(req); + if (likely(!(req->flags & REQ_F_FAIL_LINK))) { + struct io_kiocb *nxt = req->link; + + req->link = NULL; + return nxt; + } io_fail_links(req); return NULL; } -static struct io_kiocb *io_req_find_next(struct io_kiocb *req) +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { - if (likely(!(req->flags & REQ_F_LINK_HEAD))) + if (likely(!(req->link) && !(req->flags & REQ_F_LINK_TIMEOUT))) return NULL; return __io_req_find_next(req); } @@ -2173,7 +2158,7 @@ static void io_req_task_queue(struct io_kiocb *req) } } -static void io_queue_next(struct io_kiocb *req) +static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2230,8 +2215,7 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req) io_free_req(req); return; } - if (req->flags & REQ_F_LINK_HEAD) - io_queue_next(req); + io_queue_next(req); if (req->task != rb->task) { if (rb->task) { @@ -6015,11 +5999,10 @@ static u32 io_get_sequence(struct io_kiocb *req) { struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; - u32 total_submitted, nr_reqs = 1; + u32 total_submitted, nr_reqs = 0; - if (req->flags & REQ_F_LINK_HEAD) - list_for_each_entry(pos, &req->link_list, link_list) - nr_reqs++; + io_for_each_link(pos, req) + nr_reqs++; total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; return total_submitted - nr_reqs; @@ -6374,7 +6357,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * race with the completion of the linked work. */ if (prev && refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); + io_remove_next_linked(prev); else prev = NULL; spin_unlock_irqrestore(&ctx->completion_lock, flags); @@ -6392,8 +6375,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void __io_queue_linked_timeout(struct io_kiocb *req) { /* - * If the list is now empty, then our linked request finished before - * we got a chance to setup the timer + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer */ if (req->timeout.head) { struct io_timeout_data *data = req->async_data; @@ -6418,16 +6401,10 @@ static void io_queue_linked_timeout(struct io_kiocb *req) static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) { - struct io_kiocb *nxt; + struct io_kiocb *nxt = req->link; - if (!(req->flags & REQ_F_LINK_HEAD)) - return NULL; - if (req->flags & REQ_F_LINK_TIMEOUT) - return NULL; - - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, - link_list); - if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || + nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; nxt->timeout.head = req; @@ -6575,7 +6552,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return ret; } trace_io_uring_link(ctx, req, head); - list_add_tail(&req->link_list, &head->link_list); + link->last->link = req; link->last = req; /* last request of a link, enqueue the link */ @@ -6589,9 +6566,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - req->flags |= REQ_F_LINK_HEAD; - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); if (unlikely(ret)) req->flags |= REQ_F_FAIL_LINK; @@ -6724,6 +6698,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->ctx = ctx; req->flags = 0; + req->link = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; @@ -8682,14 +8657,10 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) { struct io_kiocb *link; - if (!(preq->flags & REQ_F_LINK_HEAD)) - return false; - - list_for_each_entry(link, &preq->link_list, link_list) { + io_for_each_link(link, preq->link) { if (link == req) return true; } - return false; } From 0415767e7f0542b3cd1ab270c2e61e90e87aafa2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Oct 2020 23:25:38 +0000 Subject: [PATCH 329/395] io_uring: rearrange io_kiocb fields for better caching We've got extra 8 bytes in the 2nd cacheline, put ->fixed_file_refs there, so inline execution path mostly doesn't touch the 3rd cacheline for fixed_file requests as well. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a0a6d13c0c16..b651d6e6d609 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -713,14 +713,13 @@ struct io_kiocb { u64 user_data; struct io_kiocb *link; + struct percpu_ref *fixed_file_refs; /* * 1. used with ctx->iopoll_list with reads/writes * 2. to track reqs with ->files (see io_op_def::file_table) */ struct list_head inflight_entry; - - struct percpu_ref *fixed_file_refs; struct callback_head task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; From 27926b683db03be307c6905b44ecfc1f081d9d6f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Oct 2020 09:33:23 -0600 Subject: [PATCH 330/395] io_uring: only plug when appropriate We unconditionally call blk_start_plug() when starting the IO submission, but we only really should do that if we have more than 1 request to submit AND we're potentially dealing with block based storage underneath. For any other type of request, it's just a waste of time to do so. Add a ->plug bit to io_op_def and set it for read/write requests. We could make this more precise and check the file itself as well, but it doesn't matter that much and would quickly become more expensive. Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b651d6e6d609..11ce97d6259c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -750,6 +750,8 @@ struct io_submit_state { void *reqs[IO_IOPOLL_BATCH]; unsigned int free_reqs; + bool plug_started; + /* * Batch completion logic */ @@ -782,6 +784,8 @@ struct io_op_def { unsigned buffer_select : 1; /* must always have async data allocated */ unsigned needs_async_data : 1; + /* should block plug */ + unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; unsigned work_flags; @@ -795,6 +799,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -804,6 +809,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .needs_async_data = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -816,6 +822,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, @@ -824,6 +831,7 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | IO_WQ_WORK_MM, @@ -907,6 +915,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, @@ -914,6 +923,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .plug = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, @@ -6585,7 +6595,8 @@ static void io_submit_state_end(struct io_submit_state *state) { if (!list_empty(&state->comp.list)) io_submit_flush_completions(&state->comp); - blk_finish_plug(&state->plug); + if (state->plug_started) + blk_finish_plug(&state->plug); io_state_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); @@ -6597,7 +6608,7 @@ static void io_submit_state_end(struct io_submit_state *state) static void io_submit_state_start(struct io_submit_state *state, struct io_ring_ctx *ctx, unsigned int max_ios) { - blk_start_plug(&state->plug); + state->plug_started = false; state->comp.nr = 0; INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; @@ -6739,6 +6750,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags |= sqe_flags; + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. + */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + if (!io_op_defs[req->opcode].needs_file) return 0; From c73ebb685fb6dfb513d394cbea64fb81ba3d994f Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Tue, 3 Nov 2020 10:54:37 +0800 Subject: [PATCH 331/395] io_uring: add timeout support for io_uring_enter() Now users who want to get woken when waiting for events should submit a timeout command first. It is not safe for applications that split SQ and CQ handling between two threads, such as mysql. Users should synchronize the two threads explicitly to protect SQ and that will impact the performance. This patch adds support for timeout to existing io_uring_enter(). To avoid overloading arguments, it introduces a new parameter structure which contains sigmask and timeout. I have tested the workloads with one thread submiting nop requests while the other reaping the cqe with timeout. It shows 1.8~2x faster when the iodepth is 16. Signed-off-by: Jiufei Xue Signed-off-by: Hao Xu [axboe: various cleanups/fixes, and name change to SIG_IS_DATA] Signed-off-by: Jens Axboe --- fs/io_uring.c | 69 +++++++++++++++++++++++++++++++---- include/linux/syscalls.h | 2 +- include/uapi/linux/io_uring.h | 9 +++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 11ce97d6259c..ee25c70527aa 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7118,7 +7118,8 @@ static int io_run_task_work_sig(void) * application must reap them itself, as they reside on the shared cq ring. */ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, - const sigset_t __user *sig, size_t sigsz) + const sigset_t __user *sig, size_t sigsz, + struct __kernel_timespec __user *uts) { struct io_wait_queue iowq = { .wq = { @@ -7130,6 +7131,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, .to_wait = min_events, }; struct io_rings *rings = ctx->rings; + struct timespec64 ts; + signed long timeout = 0; int ret = 0; do { @@ -7152,6 +7155,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } + if (uts) { + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = timespec64_to_jiffies(&ts); + } + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { @@ -7165,7 +7174,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; if (io_should_wake(&iowq, false)) break; - schedule(); + if (uts) { + timeout = schedule_timeout(timeout); + if (timeout == 0) { + ret = -ETIME; + break; + } + } else { + schedule(); + } } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -9167,9 +9184,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) finish_wait(&ctx->sqo_sq_wait, &wait); } +static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, + struct __kernel_timespec __user **ts, + const sigset_t __user **sig) +{ + struct io_uring_getevents_arg arg; + + /* + * If EXT_ARG isn't set, then we have no timespec and the argp pointer + * is just a pointer to the sigset_t. + */ + if (!(flags & IORING_ENTER_EXT_ARG)) { + *sig = (const sigset_t __user *) argp; + *ts = NULL; + return 0; + } + + /* + * EXT_ARG is set - ensure we agree on the size of it and copy in our + * timespec and sigset_t pointers if good. + */ + if (*argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; + *sig = u64_to_user_ptr(arg.sigmask); + *argsz = arg.sigmask_sz; + *ts = u64_to_user_ptr(arg.ts); + return 0; +} + SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, - u32, min_complete, u32, flags, const sigset_t __user *, sig, - size_t, sigsz) + u32, min_complete, u32, flags, const void __user *, argp, + size_t, argsz) { struct io_ring_ctx *ctx; long ret = -EBADF; @@ -9179,7 +9226,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT)) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)) return -EINVAL; f = fdget(fd); @@ -9225,6 +9272,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { + const sigset_t __user *sig; + struct __kernel_timespec __user *ts; + + ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); + if (unlikely(ret)) + goto out; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -9237,7 +9291,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, !(ctx->flags & IORING_SETUP_SQPOLL)) { ret = io_iopoll_check(ctx, min_complete); } else { - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); } } @@ -9600,7 +9654,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED; + IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | + IORING_FEAT_EXT_ARG; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 37bea07c12f2..8576e8bf92fe 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,7 +317,7 @@ asmlinkage long sys_io_uring_setup(u32 entries, struct io_uring_params __user *p); asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, - const sigset_t __user *sig, size_t sigsz); + const void __user *argp, size_t argsz); asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, void __user *arg, unsigned int nr_args); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 557e7eae497f..6bb8229de892 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -231,6 +231,7 @@ struct io_cqring_offsets { #define IORING_ENTER_GETEVENTS (1U << 0) #define IORING_ENTER_SQ_WAKEUP (1U << 1) #define IORING_ENTER_SQ_WAIT (1U << 2) +#define IORING_ENTER_EXT_ARG (1U << 3) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -259,6 +260,7 @@ struct io_uring_params { #define IORING_FEAT_FAST_POLL (1U << 5) #define IORING_FEAT_POLL_32BITS (1U << 6) #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) +#define IORING_FEAT_EXT_ARG (1U << 8) /* * io_uring_register(2) opcodes and arguments @@ -335,4 +337,11 @@ enum { IORING_RESTRICTION_LAST }; +struct io_uring_getevents_arg { + __u64 sigmask; + __u32 sigmask_sz; + __u32 pad; + __u64 ts; +}; + #endif From 1a38ffc9cbca361cc274d6e234f5ef8922f0b6d9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Nov 2020 12:55:55 +0000 Subject: [PATCH 332/395] io_uring: NULL files dereference by SQPOLL SQPOLL task may find sqo_task->files == NULL and __io_sq_thread_acquire_files() would leave it unset, so following fget_many() and others try to dereference NULL and fault. Propagate an error files are missing. [ 118.962785] BUG: kernel NULL pointer dereference, address: 0000000000000020 [ 118.963812] #PF: supervisor read access in kernel mode [ 118.964534] #PF: error_code(0x0000) - not-present page [ 118.969029] RIP: 0010:__fget_files+0xb/0x80 [ 119.005409] Call Trace: [ 119.005651] fget_many+0x2b/0x30 [ 119.005964] io_file_get+0xcf/0x180 [ 119.006315] io_submit_sqes+0x3a4/0x950 [ 119.007481] io_sq_thread+0x1de/0x6a0 [ 119.007828] kthread+0x114/0x150 [ 119.008963] ret_from_fork+0x22/0x30 Reported-by: Josef Grieb Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ee25c70527aa..775a83ebc8ab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1061,7 +1061,7 @@ static void io_sq_thread_drop_mm_files(void) } } -static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) +static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) { if (!current->files) { struct files_struct *files; @@ -1071,7 +1071,7 @@ static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) files = ctx->sqo_task->files; if (!files) { task_unlock(ctx->sqo_task); - return; + return -EOWNERDEAD; } atomic_inc(&files->count); get_nsproxy(ctx->sqo_task->nsproxy); @@ -1083,6 +1083,7 @@ static void __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) current->nsproxy = nsproxy; task_unlock(current); } + return 0; } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) @@ -1114,15 +1115,19 @@ static int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; + int ret; if (def->work_flags & IO_WQ_WORK_MM) { - int ret = __io_sq_thread_acquire_mm(ctx); + ret = __io_sq_thread_acquire_mm(ctx); if (unlikely(ret)) return ret; } - if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) - __io_sq_thread_acquire_files(ctx); + if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { + ret = __io_sq_thread_acquire_files(ctx); + if (unlikely(ret)) + return ret; + } return 0; } @@ -2130,8 +2135,8 @@ static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (!__io_sq_thread_acquire_mm(ctx)) { - __io_sq_thread_acquire_files(ctx); + if (!__io_sq_thread_acquire_mm(ctx) && + !__io_sq_thread_acquire_files(ctx)) { mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); From 10fc72e43352753a08f9cf83aa5c40baec00d212 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 7 Nov 2020 13:16:25 +0000 Subject: [PATCH 333/395] fs/io_uring Don't use the return value from import_iovec(). This is the only code that relies on import_iovec() returning iter.count on success. This allows a better interface to import_iovec(). Signed-off-by: David Laight Signed-off-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 775a83ebc8ab..c1ac352e3e15 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3179,7 +3179,7 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; - return ret < 0 ? ret : sqe_len; + return ret; } if (req->flags & REQ_F_BUFFER_SELECT) { @@ -3205,7 +3205,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (!iorw) return __io_import_iovec(rw, req, iovec, iter, needs_lock); *iovec = NULL; - return iov_iter_count(&iorw->iter); + return 0; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) @@ -3474,7 +3474,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (ret < 0) return ret; iov_count = iov_iter_count(iter); - io_size = ret; + io_size = iov_count; req->result = io_size; ret = 0; @@ -3602,7 +3602,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, if (ret < 0) return ret; iov_count = iov_iter_count(iter); - io_size = ret; + io_size = iov_count; req->result = io_size; /* Ensure we clear previously set non-block flag */ From 632546c4b5a4dad8e3ac456406c65c0db9a0b570 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Nov 2020 13:16:26 +0000 Subject: [PATCH 334/395] io_uring: remove duplicated io_size from rw io_size and iov_count in io_read() and io_write() hold the same value, kill the last one. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c1ac352e3e15..b74048b5134a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3464,7 +3464,6 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; ssize_t io_size, ret, ret2; - size_t iov_count; bool no_async; if (rw) @@ -3473,8 +3472,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; - iov_count = iov_iter_count(iter); - io_size = iov_count; + io_size = iov_iter_count(iter); req->result = io_size; ret = 0; @@ -3490,7 +3488,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (no_async) goto copy_iov; - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3509,7 +3507,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, if (req->file->f_flags & O_NONBLOCK) goto done; /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; goto copy_iov; } else if (ret < 0) { @@ -3592,7 +3590,6 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter __iter, *iter = &__iter; struct io_async_rw *rw = req->async_data; - size_t iov_count; ssize_t ret, ret2, io_size; if (rw) @@ -3601,8 +3598,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); if (ret < 0) return ret; - iov_count = iov_iter_count(iter); - io_size = iov_count; + io_size = iov_iter_count(iter); req->result = io_size; /* Ensure we clear previously set non-block flag */ @@ -3620,7 +3616,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, (req->flags & REQ_F_ISREG)) goto copy_iov; - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); if (unlikely(ret)) goto out_free; @@ -3663,7 +3659,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, } else { copy_iov: /* some cases will consume bytes even on error returns */ - iov_iter_revert(iter, iov_count - iov_iter_count(iter)); + iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); if (!ret) return -EAGAIN; From 2846c481c9dd1f1fb504b4885bcb815c311df532 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Nov 2020 13:16:27 +0000 Subject: [PATCH 335/395] io_uring: inline io_import_iovec() Inline io_import_iovec() and leave only its former __io_import_iovec() renamed to the original name. That makes it more obious what is reused in io_read/write(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b74048b5134a..12a213b147c0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3150,7 +3150,7 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, needs_lock); } -static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, +static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter, bool needs_lock) { @@ -3196,18 +3196,6 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req, req->ctx->compat); } -static ssize_t io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct iov_iter *iter, - bool needs_lock) -{ - struct io_async_rw *iorw = req->async_data; - - if (!iorw) - return __io_import_iovec(rw, req, iovec, iter, needs_lock); - *iovec = NULL; - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; @@ -3331,7 +3319,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) struct iovec *iov = iorw->fast_iov; ssize_t ret; - ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false); + ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); if (unlikely(ret < 0)) return ret; @@ -3466,12 +3454,14 @@ static int io_read(struct io_kiocb *req, bool force_nonblock, ssize_t io_size, ret, ret2; bool no_async; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; + iovec = NULL; + } else { + ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } io_size = iov_iter_count(iter); req->result = io_size; ret = 0; @@ -3592,12 +3582,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock, struct io_async_rw *rw = req->async_data; ssize_t ret, ret2, io_size; - if (rw) + if (rw) { iter = &rw->iter; - - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) - return ret; + iovec = NULL; + } else { + ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); + if (ret < 0) + return ret; + } io_size = iov_iter_count(iter); req->result = io_size; From 06de5f5973c641c7ae033f133ecfaaf64fe633a6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:21 +0000 Subject: [PATCH 336/395] io_uring: simplify io_task_match() If IORING_SETUP_SQPOLL is set all requests belong to the corresponding SQPOLL task, so skip task checking in that case and always match. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 12a213b147c0..baff81313f54 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1563,11 +1563,7 @@ static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) if (!tsk || req->task == tsk) return true; - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (ctx->sq_data && req->task == ctx->sq_data->thread) - return true; - } - return false; + return (ctx->flags & IORING_SETUP_SQPOLL); } /* From 08d23634643c239ddae706758f54d3a8e0c24962 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:22 +0000 Subject: [PATCH 337/395] io_uring: add a {task,files} pair matching helper Add io_match_task() that matches both task and files. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index baff81313f54..35828af37065 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1039,6 +1039,26 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } +static bool io_match_task(struct io_kiocb *head, + struct task_struct *task, + struct files_struct *files) +{ + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (!files) + return true; + + io_for_each_link(req, head) { + if ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES) && + req->work.identity->files == files) + return true; + } + return false; +} + static void io_sq_thread_drop_mm_files(void) { struct files_struct *files = current->files; @@ -1687,27 +1707,6 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool __io_match_files(struct io_kiocb *req, - struct files_struct *files) -{ - return ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) && - req->work.identity->files == files; -} - -static bool io_match_files(struct io_kiocb *head, struct files_struct *files) -{ - struct io_kiocb *req; - - if (!files) - return true; - io_for_each_link(req, head) { - if (__io_match_files(req, files)) - return true; - } - return false; -} - /* Returns true if there are no backlogged entries after the flush */ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -1735,9 +1734,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, cqe = NULL; list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) { - if (tsk && req->task != tsk) - continue; - if (!io_match_files(req, files)) + if (!io_match_task(req, tsk, files)) continue; cqe = io_get_cqring(ctx); @@ -8787,8 +8784,7 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_task_match(de->req, task) && - io_match_files(de->req, files)) { + if (io_match_task(de->req, task, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } From df9923f96717d0aebb0a73adbcf6285fa79e38cb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:23 +0000 Subject: [PATCH 338/395] io_uring: cancel only requests of current task io_uring_cancel_files() cancels all request that match files regardless of task. There is no real need in that, cancel only requests of the specified task. That also handles SQPOLL case as it already changes task to it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 35828af37065..c42fd31cb314 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8665,14 +8665,6 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -static bool io_wq_files_match(struct io_wq_work *work, void *data) -{ - struct files_struct *files = data; - - return !files || ((work->flags & IO_WQ_WORK_FILES) && - work->identity->files == files); -} - /* * Returns true if 'preq' is the link parent of 'req' */ @@ -8805,21 +8797,20 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, * Returns true if we found and killed one or more files pinning requests */ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { if (list_empty_careful(&ctx->inflight_list)) return false; - /* cancel all at once, should be faster than doing it one by one*/ - io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); - while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (files && (req->work.flags & IO_WQ_WORK_FILES) && + if (req->task == task && + (req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files != files) continue; /* req is being completed, ignore */ @@ -8862,7 +8853,7 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { bool ret; - ret = io_uring_cancel_files(ctx, files); + ret = io_uring_cancel_files(ctx, task, files); if (!files) { enum io_wq_cancel cret; @@ -8901,11 +8892,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_sq_thread_park(ctx->sq_data); } - if (files) - io_cancel_defer_files(ctx, NULL, files); - else - io_cancel_defer_files(ctx, task, NULL); - + io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); while (__io_uring_cancel_task_requests(ctx, task, files)) { From b52fda00dd9df8b4a6de5784df94f9617f6133a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:24 +0000 Subject: [PATCH 339/395] io_uring: don't iterate io_uring_cancel_files() io_uring_cancel_files() guarantees to cancel all matching requests, that's not necessary to do that in a loop. Move it up in the callchain into io_uring_cancel_task_requests(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c42fd31cb314..364f96486e05 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8793,16 +8793,10 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, } } -/* - * Returns true if we found and killed one or more files pinning requests - */ -static bool io_uring_cancel_files(struct io_ring_ctx *ctx, +static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) { - if (list_empty_careful(&ctx->inflight_list)) - return false; - while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); @@ -8835,8 +8829,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, schedule(); finish_wait(&ctx->inflight_wait, &wait); } - - return true; } static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8847,15 +8839,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return io_task_match(req, task); } -static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task) { - bool ret; - - ret = io_uring_cancel_files(ctx, task, files); - if (!files) { + while (1) { enum io_wq_cancel cret; + bool ret = false; cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); if (cret != IO_WQ_CANCEL_NOTFOUND) @@ -8871,9 +8860,11 @@ static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, ret |= io_poll_remove_all(ctx, task); ret |= io_kill_timeouts(ctx, task); + if (!ret) + break; + io_run_task_work(); + cond_resched(); } - - return ret; } /* @@ -8894,11 +8885,10 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); + io_uring_cancel_files(ctx, task, files); - while (__io_uring_cancel_task_requests(ctx, task, files)) { - io_run_task_work(); - cond_resched(); - } + if (!files) + __io_uring_cancel_task_requests(ctx, task); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); From 6b81928d4ca8668513251f9c04cdcb9d38ef51c7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:25 +0000 Subject: [PATCH 340/395] io_uring: pass files into kill timeouts/poll Make io_poll_remove_all() and io_kill_timeouts() to match against files as well. A preparation patch, effectively not used by now. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 364f96486e05..a51dcd33a1b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1589,14 +1589,15 @@ static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) /* * Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct io_kiocb *req, *tmp; int canceled = 0; spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_task_match(req, tsk)) { + if (io_match_task(req, tsk, files)) { io_kill_timeout(req); canceled++; } @@ -5473,7 +5474,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) +static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5485,7 +5487,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk) list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_task_match(req, tsk)) + if (io_match_task(req, tsk, files)) posted += io_poll_remove_one(req); } } @@ -8626,8 +8628,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL); - io_poll_remove_all(ctx, NULL); + io_kill_timeouts(ctx, NULL, NULL); + io_poll_remove_all(ctx, NULL, NULL); if (ctx->io_wq) io_wq_cancel_all(ctx->io_wq); @@ -8858,8 +8860,8 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } } - ret |= io_poll_remove_all(ctx, task); - ret |= io_kill_timeouts(ctx, task); + ret |= io_poll_remove_all(ctx, task, NULL); + ret |= io_kill_timeouts(ctx, task, NULL); if (!ret) break; io_run_task_work(); From f6edbabb8359798c541b0776616c5eab3a840d3d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 6 Nov 2020 13:00:26 +0000 Subject: [PATCH 341/395] io_uring: always batch cancel in *cancel_files() Instead of iterating over each request and cancelling it individually in io_uring_cancel_files(), try to cancel all matching requests and use ->inflight_list only to check if there anything left. In many cases it should be faster, and we can reuse a lot of code from task cancellation. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io-wq.c | 10 ---- fs/io-wq.h | 1 - fs/io_uring.c | 135 ++++++++------------------------------------------ 3 files changed, 21 insertions(+), 125 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b53c055bea6a..f72d53848dcb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1078,16 +1078,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) -{ - return work == data; -} - -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) -{ - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index cba36f03c355..069496c6d4f9 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -129,7 +129,6 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) } void io_wq_cancel_all(struct io_wq *wq); -enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index a51dcd33a1b8..733ba91e0205 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1577,15 +1577,6 @@ static void io_kill_timeout(struct io_kiocb *req) } } -static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (!tsk || req->task == tsk) - return true; - return (ctx->flags & IORING_SETUP_SQPOLL); -} - /* * Returns true if we found and killed one or more timeouts */ @@ -8667,108 +8658,31 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } -/* - * Returns true if 'preq' is the link parent of 'req' - */ -static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) -{ - struct io_kiocb *link; +struct io_task_cancel { + struct task_struct *task; + struct files_struct *files; +}; - io_for_each_link(link, preq->link) { - if (link == req) - return true; - } - return false; -} - -/* - * We're looking to cancel 'req' because it's holding on to our files, but - * 'req' could be a link to another request. See if it is, and cancel that - * parent request if so. - */ -static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) -{ - struct hlist_node *tmp; - struct io_kiocb *preq; - bool found = false; - int i; - - spin_lock_irq(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(preq, tmp, list, hash_node) { - found = io_match_link(preq, req); - if (found) { - io_poll_remove_one(preq); - break; - } - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} - -static bool io_timeout_remove_link(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_kiocb *preq; - bool found = false; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry(preq, &ctx->timeout_list, timeout.list) { - found = io_match_link(preq, req); - if (found) { - __io_timeout_cancel(preq); - break; - } - } - spin_unlock_irq(&ctx->completion_lock); - return found; -} - -static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_task_cancel *cancel = data; bool ret; - if (req->flags & REQ_F_LINK_TIMEOUT) { + if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ spin_lock_irqsave(&ctx->completion_lock, flags); - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); spin_unlock_irqrestore(&ctx->completion_lock, flags); } else { - ret = io_match_link(req, data); + ret = io_match_task(req, cancel->task, cancel->files); } return ret; } -static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) -{ - enum io_wq_cancel cret; - - /* cancel this particular work, if it's running */ - cret = io_wq_cancel_work(ctx->io_wq, &req->work); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* find links that hold this pending, cancel those */ - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); - if (cret != IO_WQ_CANCEL_NOTFOUND) - return; - - /* if we have a poll link holding this pending, cancel that */ - if (io_poll_remove_link(ctx, req)) - return; - - /* final option, timeout link is holding this req pending */ - io_timeout_remove_link(ctx, req); -} - static void io_cancel_defer_files(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files) @@ -8800,8 +8714,10 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL, *req; + struct io_task_cancel cancel = { .task = task, .files = NULL, }; + struct io_kiocb *req; DEFINE_WAIT(wait); + bool found = false; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -8809,23 +8725,21 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, (req->work.flags & IO_WQ_WORK_FILES) && req->work.identity->files != files) continue; - /* req is being completed, ignore */ - if (!refcount_inc_not_zero(&req->refs)) - continue; - cancel_req = req; + found = true; break; } - if (cancel_req) + if (found) prepare_to_wait(&ctx->inflight_wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); /* We need to keep going until we don't find a matching req */ - if (!cancel_req) + if (!found) break; - /* cancel this request, or head link requests */ - io_attempt_cancel(ctx, cancel_req); - io_put_req(cancel_req); + + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); + io_poll_remove_all(ctx, task, files); + io_kill_timeouts(ctx, task, files); /* cancellations _may_ trigger task work */ io_run_task_work(); schedule(); @@ -8833,22 +8747,15 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } -static bool io_cancel_task_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct task_struct *task = data; - - return io_task_match(req, task); -} - static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task) { while (1) { + struct io_task_cancel cancel = { .task = task, .files = NULL, }; enum io_wq_cancel cret; bool ret = false; - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true); + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); if (cret != IO_WQ_CANCEL_NOTFOUND) ret = true; From 08369246344077a9cf8109c1cf92a640733314f2 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 3 Nov 2020 14:15:59 +0800 Subject: [PATCH 342/395] io_uring: refactor io_sq_thread() handling There are some issues about current io_sq_thread() implementation: 1. The prepare_to_wait() usage in __io_sq_thread() is weird. If multiple ctxs share one same poll thread, one ctx will put poll thread in TASK_INTERRUPTIBLE, but if other ctxs have work to do, we don't need to change task's stat at all. I think only if all ctxs don't have work to do, we can do it. 2. We use round-robin strategy to make multiple ctxs share one same poll thread, but there are various condition in __io_sq_thread(), which seems complicated and may affect round-robin strategy. To improve above issues, I take below actions: 1. If multiple ctxs share one same poll thread, only if all all ctxs don't have work to do, we can call prepare_to_wait() and schedule() to make poll thread enter sleep state. 2. To make round-robin strategy more straight, I simplify __io_sq_thread() a bit, it just does io poll and sqes submit work once, does not check various condition. 3. For multiple ctxs share one same poll thread, we choose the biggest sq_thread_idle among these ctxs as timeout condition, and will update it when ctx is in or out. 4. Not need to check EBUSY especially, if io_submit_sqes() returns EBUSY, IORING_SQ_CQ_OVERFLOW should be set, helper in liburing should be aware of cq overflow and enters kernel to flush work. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 169 ++++++++++++++++++++------------------------------ 1 file changed, 67 insertions(+), 102 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 733ba91e0205..b789b9af2f4c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -245,6 +245,8 @@ struct io_sq_data { struct task_struct *thread; struct wait_queue_head wait; + + unsigned sq_thread_idle; }; struct io_ring_ctx { @@ -310,7 +312,6 @@ struct io_ring_ctx { struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; - struct wait_queue_entry sqo_wait_entry; struct list_head sqd_list; /* @@ -6841,111 +6842,49 @@ static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode, - int sync, void *key) +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) { - struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry); - int ret; - - ret = autoremove_wake_function(wqe, mode, sync, key); - if (ret) { - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - } - return ret; -} - -enum sq_ret { - SQT_IDLE = 1, - SQT_SPIN = 2, - SQT_DID_WORK = 4, -}; - -static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx, - unsigned long start_jiffies, bool cap_entries) -{ - unsigned long timeout = start_jiffies + ctx->sq_thread_idle; - struct io_sq_data *sqd = ctx->sq_data; unsigned int to_submit; int ret = 0; -again: if (!list_empty(&ctx->iopoll_list)) { unsigned nr_events = 0; mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list) && !need_resched()) + if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); mutex_unlock(&ctx->uring_lock); } to_submit = io_sqring_entries(ctx); - - /* - * If submit got -EBUSY, flag us as needing the application - * to enter the kernel to reap and flush events. - */ - if (!to_submit || ret == -EBUSY || need_resched()) { - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - io_sq_thread_drop_mm_files(); - - /* - * We're polling. If we're within the defined idle - * period, then let us spin without work before going - * to sleep. The exception is if we got EBUSY doing - * more IO, we should wait for the application to - * reap events and wake us up. - */ - if (!list_empty(&ctx->iopoll_list) || need_resched() || - (!time_after(jiffies, timeout) && ret != -EBUSY && - !percpu_ref_is_dying(&ctx->refs))) - return SQT_SPIN; - - prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry, - TASK_INTERRUPTIBLE); - - /* - * While doing polled IO, before going to sleep, we need - * to check if there are new reqs added to iopoll_list, - * it is because reqs may have been punted to io worker - * and will be added to iopoll_list later, hence check - * the iopoll_list again. - */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - goto again; - } - - to_submit = io_sqring_entries(ctx); - if (!to_submit || ret == -EBUSY) - return SQT_IDLE; - } - - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); - io_ring_clear_wakeup_flag(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ if (cap_entries && to_submit > 8) to_submit = 8; - mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); + if (to_submit) { + mutex_lock(&ctx->uring_lock); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + } if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) wake_up(&ctx->sqo_sq_wait); - return SQT_DID_WORK; + return ret; +} + +static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if (sq_thread_idle < ctx->sq_thread_idle) + sq_thread_idle = ctx->sq_thread_idle; + } + + sqd->sq_thread_idle = sq_thread_idle; } static void io_sqd_init_new(struct io_sq_data *sqd) @@ -6954,11 +6893,11 @@ static void io_sqd_init_new(struct io_sq_data *sqd) while (!list_empty(&sqd->ctx_new_list)) { ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list); - init_wait(&ctx->sqo_wait_entry); - ctx->sqo_wait_entry.func = io_sq_wake_function; list_move_tail(&ctx->sqd_list, &sqd->ctx_list); complete(&ctx->sq_thread_comp); } + + io_sqd_update_thread_idle(sqd); } static int io_sq_thread(void *data) @@ -6969,17 +6908,17 @@ static int io_sq_thread(void *data) const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - unsigned long start_jiffies; + unsigned long timeout; + DEFINE_WAIT(wait); task_lock(current); current->files = NULL; current->nsproxy = NULL; task_unlock(current); - start_jiffies = jiffies; while (!kthread_should_stop()) { - enum sq_ret ret = 0; - bool cap_entries; + int ret; + bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the @@ -6989,11 +6928,13 @@ static int io_sq_thread(void *data) if (kthread_should_park()) kthread_parkme(); - if (unlikely(!list_empty(&sqd->ctx_new_list))) + if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); + timeout = jiffies + sqd->sq_thread_idle; + } + sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { if (current->cred != ctx->creds) { if (old_cred) @@ -7006,24 +6947,49 @@ static int io_sq_thread(void *data) current->sessionid = ctx->sessionid; #endif - ret |= __io_sq_thread(ctx, start_jiffies, cap_entries); + ret = __io_sq_thread(ctx, cap_entries); + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + sqt_spin = true; io_sq_thread_drop_mm_files(); } - if (ret & SQT_SPIN) { + if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); cond_resched(); - } else if (ret == SQT_IDLE) { - if (kthread_should_park()) - continue; + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + if (kthread_should_park()) + continue; + + needs_sched = true; + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); + schedule(); - start_jiffies = jiffies; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; } io_run_task_work(); @@ -7335,12 +7301,11 @@ static void io_sq_thread_stop(struct io_ring_ctx *ctx) mutex_lock(&sqd->ctx_lock); list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); mutex_unlock(&sqd->ctx_lock); - if (sqd->thread) { - finish_wait(&sqd->wait, &ctx->sqo_wait_entry); + if (sqd->thread) io_sq_thread_unpark(sqd); - } io_put_sq_data(sqd); ctx->sq_data = NULL; From a0d9205f7d36bf72279f34a93850fd14789fdc7e Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 12 Nov 2020 14:55:59 +0800 Subject: [PATCH 343/395] io_uring: initialize 'timeout' properly in io_sq_thread() Some static checker reports below warning: fs/io_uring.c:6939 io_sq_thread() error: uninitialized symbol 'timeout'. This is a false positive, but let's just initialize 'timeout' to make sure we don't trip over this. Reported-by: Dan Carpenter Signed-off-by: Xiaoguang Wang Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b789b9af2f4c..d52d6f529dc6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6908,7 +6908,7 @@ static int io_sq_thread(void *data) const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; - unsigned long timeout; + unsigned long timeout = 0; DEFINE_WAIT(wait); task_lock(current); From 906a3c6f9ca072e917c701f7421647e169740954 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 12 Nov 2020 14:56:00 +0800 Subject: [PATCH 344/395] io_uring: don't acquire uring_lock twice Both IOPOLL and sqes handling need to acquire uring_lock, combine them together, then we just need to acquire uring_lock once. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d52d6f529dc6..67bf9047d230 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6847,23 +6847,19 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) unsigned int to_submit; int ret = 0; - if (!list_empty(&ctx->iopoll_list)) { - unsigned nr_events = 0; - - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0); - mutex_unlock(&ctx->uring_lock); - } - to_submit = io_sqring_entries(ctx); /* if we're handling multiple rings, cap submit size for fairness */ if (cap_entries && to_submit > 8) to_submit = 8; - if (to_submit) { + if (!list_empty(&ctx->iopoll_list) || to_submit) { + unsigned nr_events = 0; + mutex_lock(&ctx->uring_lock); - if (likely(!percpu_ref_is_dying(&ctx->refs))) + if (!list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, &nr_events, 0); + + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } From 2e9dbe902d1020ef70f968e8675c8d2457c4ffaa Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 13 Nov 2020 00:44:08 +0800 Subject: [PATCH 345/395] io_uring: only wake up sq thread while current task is in io worker context If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread task context or in io worker task context. If current task context is sq thread, we don't need to check whether should wake up sq thread. io_iopoll_req_issued() calls wq_has_sleeper(), which has smp_mb() memory barrier, before this patch, perf shows obvious overhead: Samples: 481K of event 'cycles', Event count (approx.): 299807382878 Overhead Comma Shared Object Symbol 3.69% :9630 [kernel.vmlinux] [k] io_issue_sqe With this patch, perf shows: Samples: 482K of event 'cycles', Event count (approx.): 299929547283 Overhead Comma Shared Object Symbol 0.70% :4015 [kernel.vmlinux] [k] io_issue_sqe It shows some obvious improvements. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 67bf9047d230..3617bde95de2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2712,7 +2712,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_iopoll_getevents() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) { struct io_ring_ctx *ctx = req->ctx; @@ -2741,7 +2741,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) else list_add_tail(&req->inflight_entry, &ctx->iopoll_list); - if ((ctx->flags & IORING_SETUP_SQPOLL) && + /* + * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread + * task context or in io worker task context. If current task context is + * sq thread, we don't need to check whether should wake up sq thread. + */ + if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); } @@ -6241,7 +6246,7 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, if (in_async) mutex_lock(&ctx->uring_lock); - io_iopoll_req_issued(req); + io_iopoll_req_issued(req, in_async); if (in_async) mutex_unlock(&ctx->uring_lock); From 10cad2c40dcb04bb46b2bf399e00ca5ea93d36b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 7 Nov 2020 13:20:39 +0000 Subject: [PATCH 346/395] io_uring: don't take fs for recvmsg/sendmsg We don't even allow not plain data msg_control, which is disallowed in __sys_{send,revb}msg_sock(). So no need in fs for IORING_OP_SENDMSG and IORING_OP_RECVMSG. fs->lock is less contanged not as much as before, but there are cases that can be, e.g. IOSQE_ASYNC. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3617bde95de2..33c448776731 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -852,8 +852,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -862,8 +861,7 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, From c98de08c990e190fc7cc3aaf8079b4a0674c6425 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 15 Nov 2020 12:56:32 +0000 Subject: [PATCH 347/395] io_uring: replace inflight_wait with tctx->wait As tasks now cancel only theirs requests, and inflight_wait is awaited only in io_uring_cancel_files(), which should be called with ->in_idle set, instead of keeping a separate inflight_wait use tctx->wait. That will add some spurious wakeups but actually is safer from point of not hanging the task. e.g. task1 | IRQ | *start* io_complete_rw_common(link) | link: req1 -> req2 -> req3(with files) *cancel_files() | io_wq_cancel(), etc. | | put_req(link), adds to io-wq req2 schedule() | So, task1 will never try to cancel req2 or req3. If req2 is long-standing (e.g. read(empty_pipe)), this may hang. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 33c448776731..c42321ecda5d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -287,7 +287,6 @@ struct io_ring_ctx { struct list_head timeout_list; struct list_head cq_overflow_list; - wait_queue_head_t inflight_wait; struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; @@ -1291,7 +1290,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); - init_waitqueue_head(&ctx->inflight_wait); spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); @@ -6046,12 +6044,13 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_req_drop_files(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + struct io_uring_task *tctx = req->task->io_uring; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); list_del(&req->inflight_entry); - if (waitqueue_active(&ctx->inflight_wait)) - wake_up(&ctx->inflight_wait); + if (atomic_read(&tctx->in_idle)) + wake_up(&tctx->wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); req->flags &= ~REQ_F_INFLIGHT; put_files_struct(req->work.identity->files); @@ -8693,8 +8692,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, break; } if (found) - prepare_to_wait(&ctx->inflight_wait, &wait, - TASK_UNINTERRUPTIBLE); + prepare_to_wait(&task->io_uring->wait, &wait, + TASK_UNINTERRUPTIBLE); spin_unlock_irq(&ctx->inflight_lock); /* We need to keep going until we don't find a matching req */ @@ -8707,7 +8706,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, /* cancellations _may_ trigger task work */ io_run_task_work(); schedule(); - finish_wait(&ctx->inflight_wait, &wait); + finish_wait(&task->io_uring->wait, &wait); } } From 36f72fe2792c4304f1203a44a6a7178e49b447f7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 18 Nov 2020 19:57:26 +0000 Subject: [PATCH 348/395] io_uring: share fixed_file_refs b/w multiple rsrcs Double fixed files for splice/tee are done in a nasty way, it takes 2 ref_node refs, and during the second time it blindly overrides req->fixed_file_refs hoping that it haven't changed. That works because all that is done under iouring_lock in a single go but is error-prone. Bind everything explicitly to a single ref_node and take only one ref, with current ref_node ordering it's guaranteed to keep all files valid awhile the request is inflight. That's mainly a cleanup + preparation for generic resource handling, but also saves pcpu_ref get/put for splice/tee with 2 fixed files. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c42321ecda5d..5818a7b3d29b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1037,6 +1037,16 @@ static inline void io_clean_op(struct io_kiocb *req) __io_clean_op(req); } +static inline void io_set_resource_node(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!req->fixed_file_refs) { + req->fixed_file_refs = &ctx->file_data->node->refs; + percpu_ref_get(req->fixed_file_refs); + } +} + static bool io_match_task(struct io_kiocb *head, struct task_struct *task, struct files_struct *files) @@ -1919,9 +1929,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, static inline void io_put_file(struct io_kiocb *req, struct file *file, bool fixed) { - if (fixed) - percpu_ref_put(req->fixed_file_refs); - else + if (!fixed) fput(file); } @@ -1933,7 +1941,8 @@ static void io_dismantle_req(struct io_kiocb *req) kfree(req->async_data); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - + if (req->fixed_file_refs) + percpu_ref_put(req->fixed_file_refs); io_req_clean_work(req); } @@ -6310,10 +6319,7 @@ static struct file *io_file_get(struct io_submit_state *state, return NULL; fd = array_index_nospec(fd, ctx->nr_user_files); file = io_file_from_index(ctx, fd); - if (file) { - req->fixed_file_refs = &ctx->file_data->node->refs; - percpu_ref_get(req->fixed_file_refs); - } + io_set_resource_node(req); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); @@ -6691,6 +6697,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->ctx = ctx; req->flags = 0; req->link = NULL; + req->fixed_file_refs = NULL; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); req->task = current; From 65b2b213484acd89a3c20dbb524e52a2f3793b78 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 19 Nov 2020 17:44:46 +0800 Subject: [PATCH 349/395] io_uring: check kthread stopped flag when sq thread is unparked syzbot reports following issue: INFO: task syz-executor.2:12399 can't die for more than 143 seconds. task:syz-executor.2 state:D stack:28744 pid:12399 ppid: 8504 flags:0x00004004 Call Trace: context_switch kernel/sched/core.c:3773 [inline] __schedule+0x893/0x2170 kernel/sched/core.c:4522 schedule+0xcf/0x270 kernel/sched/core.c:4600 schedule_timeout+0x1d8/0x250 kernel/time/timer.c:1847 do_wait_for_common kernel/sched/completion.c:85 [inline] __wait_for_common kernel/sched/completion.c:106 [inline] wait_for_common kernel/sched/completion.c:117 [inline] wait_for_completion+0x163/0x260 kernel/sched/completion.c:138 kthread_stop+0x17a/0x720 kernel/kthread.c:596 io_put_sq_data fs/io_uring.c:7193 [inline] io_sq_thread_stop+0x452/0x570 fs/io_uring.c:7290 io_finish_async fs/io_uring.c:7297 [inline] io_sq_offload_create fs/io_uring.c:8015 [inline] io_uring_create fs/io_uring.c:9433 [inline] io_uring_setup+0x19b7/0x3730 fs/io_uring.c:9507 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: Unable to access opcode bytes at RIP 0x45de8f. RSP: 002b:00007f174e51ac78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9 RAX: ffffffffffffffda RBX: 0000000000008640 RCX: 000000000045deb9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 00000000000050e5 RBP: 000000000118bf58 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007ffed9ca723f R14: 00007f174e51b9c0 R15: 000000000118bf2c INFO: task syz-executor.2:12399 blocked for more than 143 seconds. Not tainted 5.10.0-rc3-next-20201110-syzkaller #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. Currently we don't have a reproducer yet, but seems that there is a race in current codes: => io_put_sq_data ctx_list is empty now. | ==> kthread_park(sqd->thread); | | T1: sq thread is parked now. ==> kthread_stop(sqd->thread); | KTHREAD_SHOULD_STOP is set now.| ===> kthread_unpark(k); | | T2: sq thread is now unparkd, run again. | | T3: sq thread is now preempted out. | ===> wake_up_process(k); | | | T4: Since sqd ctx_list is empty, needs_sched will be true, | then sq thread sets task state to TASK_INTERRUPTIBLE, | and schedule, now sq thread will never be waken up. ===> wait_for_completion | I have artificially used mdelay() to simulate above race, will get same stack like this syzbot report, but to be honest, I'm not sure this code race triggers syzbot report. To fix this possible code race, when sq thread is unparked, need to check whether sq thread has been stopped. Reported-by: syzbot+03beeb595f074db9cfd1@syzkaller.appspotmail.com Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5818a7b3d29b..6f18b5f27404 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6930,8 +6930,16 @@ static int io_sq_thread(void *data) * kthread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) + if (kthread_should_park()) { kthread_parkme(); + /* + * When sq thread is unparked, in case the previous park operation + * comes from io_put_sq_data(), which means that sq thread is going + * to be stopped, so here needs to have a check. + */ + if (kthread_should_stop()) + break; + } if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); From 6e1271e60c1d5e822fd1a32a56d52d9ae1823e62 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 15:50:50 +0000 Subject: [PATCH 350/395] io_uring: change submit file state invariant Keep submit state invariant of whether there are file refs left based on state->nr_refs instead of (state->file==NULL), and always check against the first one. It's easier to track and allows to remove 1 if. It also automatically leaves struct submit_state in a consistent state after io_submit_state_end(), that's not used yet but nice. btw rename has_refs to file_refs for more clarity. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f18b5f27404..a83593cadcc9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -762,7 +762,7 @@ struct io_submit_state { */ struct file *file; unsigned int fd; - unsigned int has_refs; + unsigned int file_refs; unsigned int ios_left; }; @@ -2756,16 +2756,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) wake_up(&ctx->sq_data->wait); } -static void __io_state_file_put(struct io_submit_state *state) +static inline void __io_state_file_put(struct io_submit_state *state) { - if (state->has_refs) - fput_many(state->file, state->has_refs); - state->file = NULL; + fput_many(state->file, state->file_refs); + state->file_refs = 0; } static inline void io_state_file_put(struct io_submit_state *state) { - if (state->file) + if (state->file_refs) __io_state_file_put(state); } @@ -2779,19 +2778,19 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) if (!state) return fget(fd); - if (state->file) { + if (state->file_refs) { if (state->fd == fd) { - state->has_refs--; + state->file_refs--; return state->file; } __io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); - if (!state->file) + if (unlikely(!state->file)) return NULL; state->fd = fd; - state->has_refs = state->ios_left - 1; + state->file_refs = state->ios_left - 1; return state->file; } @@ -6601,7 +6600,7 @@ static void io_submit_state_start(struct io_submit_state *state, INIT_LIST_HEAD(&state->comp.list); state->comp.ctx = ctx; state->free_reqs = 0; - state->file = NULL; + state->file_refs = 0; state->ios_left = max_ios; } From bd5bbda72f7fa013ddea0ff7c4d91daedb821869 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 15:50:51 +0000 Subject: [PATCH 351/395] io_uring: fix miscounting ios_left io_req_init() doesn't decrement state->ios_left if a request doesn't need ->file, it just returns before that on if(!needs_file). That's not really a problem but may cause overhead for an additional fput(). Also inline and kill io_req_set_file() as it's of no use anymore. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a83593cadcc9..91536947ef08 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6327,15 +6327,6 @@ static struct file *io_file_get(struct io_submit_state *state, return file; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd) -{ - req->file = io_file_get(state, req, fd, req->flags & REQ_F_FIXED_FILE); - if (req->file || io_op_defs[req->opcode].needs_file_no_error) - return 0; - return -EBADF; -} - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6748,10 +6739,16 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->plug_started = true; } - if (!io_op_defs[req->opcode].needs_file) - return 0; + ret = 0; + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file && + !io_op_defs[req->opcode].needs_file_no_error)) + ret = -EBADF; + } - ret = io_req_set_file(state, req, READ_ONCE(sqe->fd)); state->ios_left--; return ret; } From ac0648a56c1ff66c1cbf735075ad33a26cbc50de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 Nov 2020 09:37:51 -0700 Subject: [PATCH 352/395] io_uring: use bottom half safe lock for fixed file data io_file_data_ref_zero() can be invoked from soft-irq from the RCU core, hence we need to ensure that the file_data lock is bottom half safe. Use the _bh() variants when grabbing this lock. Reported-by: syzbot+1f4ba1e5520762c523c6@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91536947ef08..e66888d45778 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7185,9 +7185,9 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) if (!data) return -ENXIO; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node = data->node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (ref_node) percpu_ref_kill(&ref_node->refs); @@ -7569,7 +7569,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) data = ref_node->file_data; ctx = data->ctx; - spin_lock(&data->lock); + spin_lock_bh(&data->lock); ref_node->done = true; while (!list_empty(&data->ref_list)) { @@ -7581,7 +7581,7 @@ static void io_file_data_ref_zero(struct percpu_ref *ref) list_del(&ref_node->node); first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist); } - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); if (percpu_ref_is_dying(&data->refs)) delay = 0; @@ -7704,9 +7704,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } file_data->node = ref_node; - spin_lock(&file_data->lock); + spin_lock_bh(&file_data->lock); list_add_tail(&ref_node->node, &file_data->ref_list); - spin_unlock(&file_data->lock); + spin_unlock_bh(&file_data->lock); percpu_ref_get(&file_data->refs); return ret; out_fput: @@ -7863,10 +7863,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - spin_lock(&data->lock); + spin_lock_bh(&data->lock); list_add_tail(&ref_node->node, &data->ref_list); data->node = ref_node; - spin_unlock(&data->lock); + spin_unlock_bh(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else destroy_fixed_file_ref_node(ref_node); From bee749b187ac57d1faf00b2ab356ff322230fce8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 25 Nov 2020 02:19:23 +0000 Subject: [PATCH 353/395] io_uring: fix files cancellation io_uring_cancel_files()'s task check condition mistakenly got flipped. 1. There can't be a request in the inflight list without IO_WQ_WORK_FILES, kill this check to keep the whole condition simpler. 2. Also, don't call the function for files==NULL to not do such a check, all that staff is already handled well by its counter part, __io_uring_cancel_task_requests(). With that just flip the task check. Also, it iowq-cancels all request of current task there, don't forget to set right ->files into struct io_task_cancel. Fixes: c1973b38bf639 ("io_uring: cancel only requests of current task") Reported-by: syzbot+c0d52d0b3c0c3ffb9525@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e66888d45778..f47de27e5125 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8688,15 +8688,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { while (!list_empty_careful(&ctx->inflight_list)) { - struct io_task_cancel cancel = { .task = task, .files = NULL, }; + struct io_task_cancel cancel = { .task = task, .files = files }; struct io_kiocb *req; DEFINE_WAIT(wait); bool found = false; spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { - if (req->task == task && - (req->work.flags & IO_WQ_WORK_FILES) && + if (req->task != task || req->work.identity->files != files) continue; found = true; @@ -8768,10 +8767,11 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_cancel_defer_files(ctx, task, files); io_cqring_overflow_flush(ctx, true, task, files); - io_uring_cancel_files(ctx, task, files); if (!files) __io_uring_cancel_task_requests(ctx, task); + else + io_uring_cancel_files(ctx, task, files); if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { atomic_dec(&task->io_uring->in_idle); From fbd15848f3c13506253b6c5de0077a603947cb67 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Nov 2020 19:11:15 +0000 Subject: [PATCH 354/395] io_uring: restructure io_timeout_cancel() Add io_timeout_extract() helper, which searches and disarms timeouts, but doesn't complete them. No functional changes. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f47de27e5125..3930b11dcd58 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5633,24 +5633,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static int __io_timeout_cancel(struct io_kiocb *req) -{ - struct io_timeout_data *io = req->async_data; - int ret; - - ret = hrtimer_try_to_cancel(&io->timer); - if (ret == -1) - return -EALREADY; - list_del_init(&req->timeout.list); - - req_set_fail_links(req); - io_cqring_fill_event(req, -ECANCELED); - io_put_req_deferred(req, 1); - return 0; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, + __u64 user_data) { + struct io_timeout_data *io; struct io_kiocb *req; int ret = -ENOENT; @@ -5662,9 +5648,27 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) } if (ret == -ENOENT) - return ret; + return ERR_PTR(ret); - return __io_timeout_cancel(req); + io = req->async_data; + ret = hrtimer_try_to_cancel(&io->timer); + if (ret == -1) + return ERR_PTR(-EALREADY); + list_del_init(&req->timeout.list); + return req; +} + +static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) +{ + struct io_kiocb *req = io_timeout_extract(ctx, user_data); + + if (IS_ERR(req)) + return PTR_ERR(req); + + req_set_fail_links(req); + io_cqring_fill_event(req, -ECANCELED); + io_put_req_deferred(req, 1); + return 0; } static int io_timeout_remove_prep(struct io_kiocb *req, From 9c8e11b36c9b640a85a4a33a9e9dff418993cc34 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Nov 2020 19:11:16 +0000 Subject: [PATCH 355/395] io_uring: add timeout update Support timeout updates through IORING_OP_TIMEOUT_REMOVE with passed in IORING_TIMEOUT_UPDATE. Updates doesn't support offset timeout mode. Oirignal timeout.off will be ignored as well. Signed-off-by: Pavel Begunkov [axboe: remove now unused 'ret' variable] Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 ++++++++++++++++++++++++++++++++--- include/uapi/linux/io_uring.h | 1 + 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 3930b11dcd58..b40083cde733 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -453,6 +453,10 @@ struct io_timeout { struct io_timeout_rem { struct file *file; u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; }; struct io_rw { @@ -867,7 +871,10 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_timeout_data), .work_flags = IO_WQ_WORK_MM, }, - [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .work_flags = IO_WQ_WORK_MM, + }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -5671,17 +5678,48 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) +{ + struct io_kiocb *req = io_timeout_extract(ctx, user_data); + struct io_timeout_data *data; + + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout.off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&req->timeout.list, &ctx->timeout_list); + hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_timeout_rem *tr = &req->timeout_rem; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags) + if (sqe->ioprio || sqe->buf_index || sqe->len) return -EINVAL; - req->timeout_rem.addr = READ_ONCE(sqe->addr); + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE) { + if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ + return -EINVAL; + } + return 0; } @@ -5690,11 +5728,19 @@ static int io_timeout_remove_prep(struct io_kiocb *req, */ static int io_timeout_remove(struct io_kiocb *req) { + struct io_timeout_rem *tr = &req->timeout_rem; struct io_ring_ctx *ctx = req->ctx; int ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, req->timeout_rem.addr); + if (req->timeout_rem.flags & IORING_TIMEOUT_UPDATE) { + enum hrtimer_mode mode = (tr->flags & IORING_TIMEOUT_ABS) + ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + } else { + ret = io_timeout_cancel(ctx, tr->addr); + } io_cqring_fill_event(req, ret); io_commit_cqring(ctx); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6bb8229de892..d31a2a1e8ef9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -151,6 +151,7 @@ enum { * sqe->timeout_flags */ #define IORING_TIMEOUT_ABS (1U << 0) +#define IORING_TIMEOUT_UPDATE (1U << 1) /* * sqe->splice_flags From dad1b1242fd5717af18ae4ac9d12b9f65849e13a Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sun, 6 Dec 2020 22:22:42 +0000 Subject: [PATCH 356/395] io_uring: always let io_iopoll_complete() complete polled io Abaci Fuzz reported a double-free or invalid-free BUG in io_commit_cqring(): [ 95.504842] BUG: KASAN: double-free or invalid-free in io_commit_cqring+0x3ec/0x8e0 [ 95.505921] [ 95.506225] CPU: 0 PID: 4037 Comm: io_wqe_worker-0 Tainted: G B W 5.10.0-rc5+ #1 [ 95.507434] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 95.508248] Call Trace: [ 95.508683] dump_stack+0x107/0x163 [ 95.509323] ? io_commit_cqring+0x3ec/0x8e0 [ 95.509982] print_address_description.constprop.0+0x3e/0x60 [ 95.510814] ? vprintk_func+0x98/0x140 [ 95.511399] ? io_commit_cqring+0x3ec/0x8e0 [ 95.512036] ? io_commit_cqring+0x3ec/0x8e0 [ 95.512733] kasan_report_invalid_free+0x51/0x80 [ 95.513431] ? io_commit_cqring+0x3ec/0x8e0 [ 95.514047] __kasan_slab_free+0x141/0x160 [ 95.514699] kfree+0xd1/0x390 [ 95.515182] io_commit_cqring+0x3ec/0x8e0 [ 95.515799] __io_req_complete.part.0+0x64/0x90 [ 95.516483] io_wq_submit_work+0x1fa/0x260 [ 95.517117] io_worker_handle_work+0xeac/0x1c00 [ 95.517828] io_wqe_worker+0xc94/0x11a0 [ 95.518438] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.519151] ? __kthread_parkme+0x11d/0x1d0 [ 95.519806] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.520512] ? io_worker_handle_work+0x1c00/0x1c00 [ 95.521211] kthread+0x396/0x470 [ 95.521727] ? _raw_spin_unlock_irq+0x24/0x30 [ 95.522380] ? kthread_mod_delayed_work+0x180/0x180 [ 95.523108] ret_from_fork+0x22/0x30 [ 95.523684] [ 95.523985] Allocated by task 4035: [ 95.524543] kasan_save_stack+0x1b/0x40 [ 95.525136] __kasan_kmalloc.constprop.0+0xc2/0xd0 [ 95.525882] kmem_cache_alloc_trace+0x17b/0x310 [ 95.533930] io_queue_sqe+0x225/0xcb0 [ 95.534505] io_submit_sqes+0x1768/0x25f0 [ 95.535164] __x64_sys_io_uring_enter+0x89e/0xd10 [ 95.535900] do_syscall_64+0x33/0x40 [ 95.536465] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 95.537199] [ 95.537505] Freed by task 4035: [ 95.538003] kasan_save_stack+0x1b/0x40 [ 95.538599] kasan_set_track+0x1c/0x30 [ 95.539177] kasan_set_free_info+0x1b/0x30 [ 95.539798] __kasan_slab_free+0x112/0x160 [ 95.540427] kfree+0xd1/0x390 [ 95.540910] io_commit_cqring+0x3ec/0x8e0 [ 95.541516] io_iopoll_complete+0x914/0x1390 [ 95.542150] io_do_iopoll+0x580/0x700 [ 95.542724] io_iopoll_try_reap_events.part.0+0x108/0x200 [ 95.543512] io_ring_ctx_wait_and_kill+0x118/0x340 [ 95.544206] io_uring_release+0x43/0x50 [ 95.544791] __fput+0x28d/0x940 [ 95.545291] task_work_run+0xea/0x1b0 [ 95.545873] do_exit+0xb6a/0x2c60 [ 95.546400] do_group_exit+0x12a/0x320 [ 95.546967] __x64_sys_exit_group+0x3f/0x50 [ 95.547605] do_syscall_64+0x33/0x40 [ 95.548155] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The reason is that once we got a non EAGAIN error in io_wq_submit_work(), we'll complete req by calling io_req_complete(), which will hold completion_lock to call io_commit_cqring(), but for polled io, io_iopoll_complete() won't hold completion_lock to call io_commit_cqring(), then there maybe concurrent access to ctx->defer_list, double free may happen. To fix this bug, we always let io_iopoll_complete() complete polled io. Cc: # 5.5+ Reported-by: Abaci Fuzz Signed-off-by: Xiaoguang Wang Reviewed-by: Pavel Begunkov Reviewed-by: Joseph Qi Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b40083cde733..89fd893a6952 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6341,8 +6341,19 @@ static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work) } if (ret) { - req_set_fail_links(req); - io_req_complete(req, ret); + /* + * io_iopoll_complete() does not hold completion_lock to complete + * polled io, so here for polled io, just mark it done and still let + * io_iopoll_complete() complete it. + */ + if (req->ctx->flags & IORING_SETUP_IOPOLL) { + struct kiocb *kiocb = &req->rw.kiocb; + + kiocb_done(kiocb, ret, NULL); + } else { + req_set_fail_links(req); + io_req_complete(req, ret); + } } return io_steal_work(req); From 31bff9a51b264df6d144931a6a5f1d6cc815ed4b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 22:22:43 +0000 Subject: [PATCH 357/395] io_uring: fix racy IOPOLL completions IOPOLL allows buffer remove/provide requests, but they doesn't synchronise by rules of IOPOLL, namely it have to hold uring_lock. Cc: # 5.7+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 89fd893a6952..d030a4404b8f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4152,11 +4152,17 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock, head = idr_find(&ctx->io_buffer_idr, p->bgid); if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); - - io_ring_submit_lock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + + /* need to hold the lock to complete IOPOLL requests */ + if (ctx->flags & IORING_SETUP_IOPOLL) { + __io_req_complete(req, ret, 0, cs); + io_ring_submit_unlock(ctx, !force_nonblock); + } else { + io_ring_submit_unlock(ctx, !force_nonblock); + __io_req_complete(req, ret, 0, cs); + } return 0; } @@ -4241,10 +4247,17 @@ static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock, } } out: - io_ring_submit_unlock(ctx, !force_nonblock); if (ret < 0) req_set_fail_links(req); - __io_req_complete(req, ret, 0, cs); + + /* need to hold the lock to complete IOPOLL requests */ + if (ctx->flags & IORING_SETUP_IOPOLL) { + __io_req_complete(req, ret, 0, cs); + io_ring_submit_unlock(ctx, !force_nonblock); + } else { + io_ring_submit_unlock(ctx, !force_nonblock); + __io_req_complete(req, ret, 0, cs); + } return 0; } From 634578f800652035debba3098d8ab0d21af7c7a5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 22:22:44 +0000 Subject: [PATCH 358/395] io_uring: fix racy IOPOLL flush overflow It's not safe to call io_cqring_overflow_flush() for IOPOLL mode without hodling uring_lock, because it does synchronisation differently. Make sure we have it. As for io_ring_exit_work(), we don't even need it there because io_ring_ctx_wait_and_kill() already set force flag making all overflowed requests to be dropped. Cc: # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d030a4404b8f..c0306f77211a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8654,8 +8654,6 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - if (ctx->rings) - io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); @@ -8665,6 +8663,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true, NULL, NULL); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -8674,8 +8674,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_wq_cancel_all(ctx->io_wq); /* if we failed setting up the ctx, we might not have any rings */ - if (ctx->rings) - io_cqring_overflow_flush(ctx, true, NULL, NULL); io_iopoll_try_reap_events(ctx); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); @@ -8840,7 +8838,9 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } io_cancel_defer_files(ctx, task, files); + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); io_cqring_overflow_flush(ctx, true, task, files); + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (!files) __io_uring_cancel_task_requests(ctx, task); @@ -9172,8 +9172,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { + io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (!list_empty_careful(&ctx->cq_overflow_list)) io_cqring_overflow_flush(ctx, false, NULL, NULL); + io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) From 59850d226e4907a6f37c1d2fe5ba97546a8691a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 22:22:45 +0000 Subject: [PATCH 359/395] io_uring: fix io_cqring_events()'s noflush Checking !list_empty(&ctx->cq_overflow_list) around noflush in io_cqring_events() is racy, because if it fails but a request overflowed just after that, io_cqring_overflow_flush() still will be called. Remove the second check, it shouldn't be a problem for performance, because there is cq_check_overflow bit check just above. Cc: # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index c0306f77211a..f53356ced5ab 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2329,7 +2329,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) * we wake up the task, and the next invocation will flush the * entries. We cannot safely to it from here. */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) + if (noflush) return -1U; io_cqring_overflow_flush(ctx, false, NULL, NULL); From 5a9a8897c253a075805401d38d987ec1ac1824b6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Oct 2020 09:11:42 -0600 Subject: [PATCH 360/395] alpha: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for alpha. Cc: linux-alpha@vger.kernel.org Signed-off-by: Jens Axboe --- arch/alpha/include/asm/thread_info.h | 2 ++ arch/alpha/kernel/entry.S | 2 +- arch/alpha/kernel/signal.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 807d7b9a1860..2592356e3215 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -62,6 +62,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 4 /* syscall audit active */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_DIE_IF_KERNEL 9 /* dik recursion lock */ #define TIF_MEMDIE 13 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 14 /* idle is polling for TIF_NEED_RESCHED */ @@ -71,6 +72,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define _TIF_NEED_RESCHED (1< Date: Fri, 9 Oct 2020 14:35:34 -0600 Subject: [PATCH 361/395] c6x: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for c6x. Cc: linux-c6x-dev@linux-c6x.org Signed-off-by: Jens Axboe --- arch/c6x/include/asm/thread_info.h | 1 + arch/c6x/kernel/asm-offsets.c | 1 + arch/c6x/kernel/signal.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index f70382844b96..dd8913d57189 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -82,6 +82,7 @@ struct thread_info *current_thread_info(void) #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_MEMDIE 17 /* OOM killer killed process */ diff --git a/arch/c6x/kernel/asm-offsets.c b/arch/c6x/kernel/asm-offsets.c index 0f8fde494875..4a264ef87dcb 100644 --- a/arch/c6x/kernel/asm-offsets.c +++ b/arch/c6x/kernel/asm-offsets.c @@ -116,6 +116,7 @@ void foo(void) DEFINE(_TIF_NOTIFY_RESUME, (1< #include +#include #include #include @@ -313,7 +314,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags, int syscall) { /* deal with pending signal delivery */ - if (thread_info_flags & (1 << TIF_SIGPENDING)) + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs, syscall); if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) From 2f9799ad0111ee742ccc02dd2ea2c87646746fc1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 14:42:33 -0600 Subject: [PATCH 362/395] h8300: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for h8300. Cc: uclinux-h8-devel@lists.sourceforge.jp Signed-off-by: Jens Axboe --- arch/h8300/include/asm/thread_info.h | 4 +++- arch/h8300/kernel/signal.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 0cdaa302d3d2..a518214d4ddd 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -73,6 +73,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 8 /* for ftrace syscall instrumentation */ #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +#define TIF_NOTIFY_SIGNAL 10 /* signal notifications exist */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) @@ -83,6 +84,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) /* work to do in syscall trace */ #define _TIF_WORK_SYSCALL_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ @@ -92,7 +94,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_ALLWORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SYSCALL_AUDIT | \ _TIF_SINGLESTEP | _TIF_NOTIFY_RESUME | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_NOTIFY_SIGNAL) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 75d9b7e626b2..75a1c36b105a 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -279,7 +279,7 @@ static void do_signal(struct pt_regs *regs) asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) { - if (thread_info_flags & _TIF_SIGPENDING) + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) From b269c229b0e89aedb7943c06673b56b6052cf5e5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 14:49:43 -0600 Subject: [PATCH 363/395] ia64: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for ia64. Cc: linux-ia64@vger.kernel.org [axboe: added fixes from Mike Rapoport ] Signed-off-by: Jens Axboe --- arch/ia64/include/asm/thread_info.h | 4 +++- arch/ia64/kernel/process.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 64a1011f6812..51d20cb37706 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -103,6 +103,7 @@ struct thread_info { #define TIF_SYSCALL_TRACE 2 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notification exist */ #define TIF_NOTIFY_RESUME 6 /* resumption notification requested */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ @@ -115,6 +116,7 @@ struct thread_info { #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SYSCALL_TRACEAUDIT (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) @@ -124,7 +126,7 @@ struct thread_info { /* "work to do on user-return" bits */ #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ - _TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE) + _TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_NOTIFY_SIGNAL) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6b61a703bcf5..8d4e1cab9190 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -171,7 +171,8 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) } /* deal with pending signal delivery */ - if (test_thread_flag(TIF_SIGPENDING)) { + if (test_thread_flag(TIF_SIGPENDING) || + test_thread_flag(TIF_NOTIFY_SIGNAL)) { local_irq_enable(); /* force interrupt enable */ ia64_do_signal(scr, in_syscall); } From b13e8bf615fe26fb6a6dfe1b5a1c65e1624dfee2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:18:43 -0600 Subject: [PATCH 364/395] nds32: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for nds32. Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Signed-off-by: Jens Axboe --- arch/nds32/include/asm/thread_info.h | 2 ++ arch/nds32/kernel/ex-exit.S | 2 +- arch/nds32/kernel/signal.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/nds32/include/asm/thread_info.h b/arch/nds32/include/asm/thread_info.h index c135111ec44e..d3967ad184f0 100644 --- a/arch/nds32/include/asm/thread_info.h +++ b/arch/nds32/include/asm/thread_info.h @@ -48,6 +48,7 @@ struct thread_info { #define TIF_NEED_RESCHED 2 #define TIF_SINGLESTEP 3 #define TIF_NOTIFY_RESUME 4 /* callback before returning to user */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 8 #define TIF_POLLING_NRFLAG 17 #define TIF_MEMDIE 18 @@ -57,6 +58,7 @@ struct thread_info { #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) diff --git a/arch/nds32/kernel/ex-exit.S b/arch/nds32/kernel/ex-exit.S index 6a2966c2d8c8..b30699911b81 100644 --- a/arch/nds32/kernel/ex-exit.S +++ b/arch/nds32/kernel/ex-exit.S @@ -120,7 +120,7 @@ work_pending: andi $p1, $r1, #_TIF_NEED_RESCHED bnez $p1, work_resched - andi $p1, $r1, #_TIF_SIGPENDING|#_TIF_NOTIFY_RESUME + andi $p1, $r1, #_TIF_SIGPENDING|#_TIF_NOTIFY_RESUME|#_TIF_NOTIFY_SIGNAL beqz $p1, no_work_pending move $r0, $sp ! 'regs' diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 2acb94812af9..7e3ca430a223 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -376,7 +376,7 @@ static void do_signal(struct pt_regs *regs) asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int thread_flags) { - if (thread_flags & _TIF_SIGPENDING) + if (thread_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_flags & _TIF_NOTIFY_RESUME) From 24a31b81e38309b1604f24520110aae1f83f3cbf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 14:29:17 -0600 Subject: [PATCH 365/395] riscv: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for riscv. Cc: linux-riscv@lists.infradead.org Signed-off-by: Jens Axboe --- arch/riscv/include/asm/thread_info.h | 5 ++++- arch/riscv/kernel/signal.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index a390711129de..97bf5a1575d2 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -74,6 +74,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing */ #define TIF_SECCOMP 8 /* syscall secure computing */ +#define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -82,9 +83,11 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_WORK_MASK \ - (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED) + (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NOTIFY_SIGNAL) #define _TIF_SYSCALL_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \ diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index bc6841867b51..469aef8ed922 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -310,7 +310,7 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) { /* Handle pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) do_signal(regs); if (thread_info_flags & _TIF_NOTIFY_RESUME) From f50a7052f5e70ee7a6a5e2ed08660994dc3df2a5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 15:44:37 -0600 Subject: [PATCH 366/395] sparc: add support for TIF_NOTIFY_SIGNAL Wire up TIF_NOTIFY_SIGNAL handling for sparc. Cc: sparclinux@vger.kernel.org Signed-off-by: Jens Axboe --- arch/sparc/include/asm/thread_info_32.h | 4 +++- arch/sparc/include/asm/thread_info_64.h | 6 ++++-- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 548b366165dd..45b4955b253f 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -104,6 +104,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ +#define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_USEDFPU 8 /* FPU was used by this task * this quantum (SMP) */ #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling @@ -115,11 +116,12 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_NOTIFY_RESUME (1< Date: Fri, 9 Oct 2020 16:01:33 -0600 Subject: [PATCH 367/395] task_work: remove legacy TWA_SIGNAL path All archs now support TIF_NOTIFY_SIGNAL. Signed-off-by: Jens Axboe --- kernel/task_work.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 15b087286bea..9cde961875c0 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,34 +5,6 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ -/* - * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster - * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is - * shared for threads, and can cause contention on sighand->lock. Even for - * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking - * or IRQ disabling is involved for notification (or running) purposes. - */ -static void task_work_notify_signal(struct task_struct *task) -{ -#if defined(TIF_NOTIFY_SIGNAL) - set_notify_signal(task); -#else - unsigned long flags; - - /* - * Only grab the sighand lock if we don't already have some - * task_work pending. This pairs with the smp_store_mb() - * in get_signal(), see comment there. - */ - if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && - lock_task_sighand(task, &flags)) { - task->jobctl |= JOBCTL_TASK_WORK; - signal_wake_up(task, 0); - unlock_task_sighand(task, &flags); - } -#endif -} - /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -76,7 +48,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, set_notify_resume(task); break; case TWA_SIGNAL: - task_work_notify_signal(task); + set_notify_signal(task); break; default: WARN_ON_ONCE(1); From 792ee0f6db5b942ee68ee7c9aea9d34dde4c4ff2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Oct 2020 20:17:18 -0600 Subject: [PATCH 368/395] io_uring: JOBCTL_TASK_WORK is no longer used by task_work Remove the dead code, TWA_SIGNAL will never set JOBCTL_TASK_WORK at this point. Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8018c7076b25..a170488507cf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6846,13 +6846,8 @@ static int io_run_task_work_sig(void) return 1; if (!signal_pending(current)) return 0; - if (current->jobctl & JOBCTL_TASK_WORK) { - spin_lock_irq(¤t->sighand->siglock); - current->jobctl &= ~JOBCTL_TASK_WORK; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 1; - } + if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL)) + return -ERESTARTSYS; return -EINTR; } From 98b89b649fce39dacb9dc036d6d0fdb8caff73f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:03:01 -0600 Subject: [PATCH 369/395] signal: kill JOBCTL_TASK_WORK It's no longer used, get rid of it. Signed-off-by: Jens Axboe --- include/linux/sched/jobctl.h | 4 +--- kernel/signal.c | 20 -------------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d2b4204ba4d3..fa067de9f1a9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,7 +19,6 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ -#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -29,10 +28,9 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) -#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); diff --git a/kernel/signal.c b/kernel/signal.c index 923230ff6cfc..cf8b057ca2ac 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2556,26 +2556,6 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - /* - * Make sure we can safely read ->jobctl() in task_work add. As Oleg - * states: - * - * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we - * roughly have - * - * task_work_add: get_signal: - * STORE(task->task_works, new_work); STORE(task->jobctl); - * mb(); mb(); - * LOAD(task->jobctl); LOAD(task->task_works); - * - * and we can rely on STORE-MB-LOAD [ in task_work_add]. - */ - smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); - if (unlikely(current->task_works)) { - spin_unlock_irq(&sighand->siglock); - task_work_run(); - goto relock; - } /* * Every stopped thread goes here after wakeup. Check to see if From e296dc4996b8094ccde45d19090d804c4103513e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:04:39 -0600 Subject: [PATCH 370/395] kernel: remove checking for TIF_NOTIFY_SIGNAL It's available everywhere now, no need to check or add dummy defines. Signed-off-by: Jens Axboe --- include/linux/entry-common.h | 4 ---- include/linux/sched/signal.h | 2 -- include/linux/tracehook.h | 4 ---- kernel/signal.c | 2 -- 4 files changed, 12 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b9711e813ec2..abec3a5ae799 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -37,10 +37,6 @@ # define _TIF_UPROBE (0) #endif -#ifndef _TIF_NOTIFY_SIGNAL -# define _TIF_NOTIFY_SIGNAL (0) -#endif - /* * TIF flags handled in syscall_enter_from_user_mode() */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index bd5afa076189..24b7b862e043 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -360,7 +360,6 @@ static inline int task_sigpending(struct task_struct *p) static inline int signal_pending(struct task_struct *p) { -#if defined(TIF_NOTIFY_SIGNAL) /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops @@ -368,7 +367,6 @@ static inline int signal_pending(struct task_struct *p) */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; -#endif return task_sigpending(p); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..ee9ab7dbc8c3 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -205,12 +205,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) */ static inline void tracehook_notify_signal(void) { -#if defined(TIF_NOTIFY_SIGNAL) clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); if (current->task_works) task_work_run(); -#endif } /* @@ -218,11 +216,9 @@ static inline void tracehook_notify_signal(void) */ static inline void set_notify_signal(struct task_struct *task) { -#if defined(TIF_NOTIFY_SIGNAL) if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE)) kick_process(task); -#endif } #endif /* */ diff --git a/kernel/signal.c b/kernel/signal.c index cf8b057ca2ac..ccd530509201 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2535,14 +2535,12 @@ bool get_signal(struct ksignal *ksig) * that the arch handlers don't all have to do it. If we get here * without TIF_SIGPENDING, just exit after running signal work. */ -#ifdef TIF_NOTIFY_SIGNAL if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { if (test_thread_flag(TIF_NOTIFY_SIGNAL)) tracehook_notify_signal(); if (!task_sigpending(current)) return false; } -#endif if (unlikely(uprobe_deny_signal())) return false; From 355fb9e2b78e78b38ec00f5cd9b05c6aceb98335 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Oct 2020 20:19:35 -0600 Subject: [PATCH 371/395] io_uring: remove 'twa_signal_ok' deadlock work-around The TIF_NOTIFY_SIGNAL based implementation of TWA_SIGNAL is always safe to use, regardless of context, as we won't be recursing into the signal lock. So now that all archs are using that, we can drop this deadlock work-around as it's always safe to use TWA_SIGNAL. Signed-off-by: Jens Axboe --- fs/io_uring.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a170488507cf..30d88f3d42b0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1995,7 +1995,7 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) +static int io_req_task_work_add(struct io_kiocb *req) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; @@ -2012,7 +2012,7 @@ static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok) * will do the job. */ notify = TWA_NONE; - if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok) + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, &req->task_work, notify); @@ -2074,7 +2074,7 @@ static void io_req_task_queue(struct io_kiocb *req) init_task_work(&req->task_work, io_req_task_submit); percpu_ref_get(&req->ctx->refs); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -2196,7 +2196,7 @@ static void io_free_req_deferred(struct io_kiocb *req) int ret; init_task_work(&req->task_work, io_put_req_deferred_cb); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -3305,7 +3305,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, /* submit ref gets dropped, acquire a new one */ refcount_inc(&req->refs); - ret = io_req_task_work_add(req, true); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; @@ -4843,7 +4843,6 @@ struct io_poll_table { static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { - bool twa_signal_ok; int ret; /* for instances that support it check for an event match first: */ @@ -4858,21 +4857,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, init_task_work(&req->task_work, func); percpu_ref_get(&req->ctx->refs); - /* - * If we using the signalfd wait_queue_head for this wakeup, then - * it's not safe to use TWA_SIGNAL as we could be recursing on the - * tsk->sighand->siglock on doing the wakeup. Should not be needed - * either, as the normal wakeup will suffice. - */ - twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh); - /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = io_req_task_work_add(req, twa_signal_ok); + ret = io_req_task_work_add(req); if (unlikely(ret)) { struct task_struct *tsk; From b5f32555567cfe0a5d5dbe7c1e85ebe37b3f545a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Dec 2020 17:48:48 +0100 Subject: [PATCH 372/395] cdrom: Reset sector_size back it is not 2048. In v2.4.0-test2pre2 mmc_ioctl_cdrom_read_data() was extended by issuing a MODE_SELECT opcode to change the sector size and READ_10 to perform the actual read if the READ_CD opcode is not support. The sector size is never changed back to the previous value of 2048 bytes which is however denoted by the comment for version 3.09 of the cdrom.c file. Use cdrom_switch_blocksize() to change the sector size only if the requested size deviates from 2048. Change it back to 2048 after the read operation if a change was mode. Link: https://lkml.kernel.org/r/20201204164803.ovwurzs3257em2rp@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 0c271b9e3c5b..8f0e52a71493 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2996,13 +2996,15 @@ static noinline int mmc_ioctl_cdrom_read_data(struct cdrom_device_info *cdi, * SCSI-II devices are not required to support * READ_CD, so let's try switching block size */ - /* FIXME: switch back again... */ - ret = cdrom_switch_blocksize(cdi, blocksize); - if (ret) - goto out; + if (blocksize != CD_FRAMESIZE) { + ret = cdrom_switch_blocksize(cdi, blocksize); + if (ret) + goto out; + } cgc->sshdr = NULL; ret = cdrom_read_cd(cdi, cgc, lba, blocksize, 1); - ret |= cdrom_switch_blocksize(cdi, blocksize); + if (blocksize != CD_FRAMESIZE) + ret |= cdrom_switch_blocksize(cdi, CD_FRAMESIZE); } if (!ret && copy_to_user(arg, cgc->buffer, blocksize)) ret = -EFAULT; From 8d2ac857a81d5a44b9643038291ea958bbf05c7f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Dec 2020 17:48:49 +0100 Subject: [PATCH 373/395] sr: Switch the sector size back to 2048 if sr_read_sector() changed it. sr_read_sector() is hardly used since v2.3.16. Its only purpose is to check if it is a XA medium via sr_is_xa(). This check is only enabled if the module parameter `xa_test' is enabled. Change the sector size back to 2048 if it was changed. With this change, there is no lazy sector size changing left. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- drivers/scsi/sr_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index ffcf902da390..5703f8400b73 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -549,6 +549,8 @@ static int sr_read_sector(Scsi_CD *cd, int lba, int blksize, unsigned char *dest cgc.timeout = IOCTL_TIMEOUT; rc = sr_do_ioctl(cd, &cgc); + if (blksize != CD_FRAMESIZE) + rc |= sr_set_blocklength(cd, CD_FRAMESIZE); return rc; } From 31cc07761ccb389c7c01f904f6a6479544abbd11 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Dec 2020 17:48:50 +0100 Subject: [PATCH 374/395] sr: Remove in_interrupt() usage in sr_init_command(). The in_interrupt() check in sr_init_command() is a leftover from the past, pre v2.3.16 era to be exact. Back then the ioctl() was served by `sr' itself and sector size changes by CDROMREADMODE2 (as noted in the comment) were accounted within sr's data structures which allowed a "lazy" reset so it could be skipped on the next request and reset back to the default value once the device node was closed or before a command from the blockqueue was issued. This does not work like that anymore. The CDROMREADMODE2 is served by cdrom's mmc_ioctl() function which may change the sector size but the `sr' driver does not learn about it and so its ->sector_size is not updated. The ioctl() resets the changed sector size back to 2048. sr_read_sector() also resets the sector size back to the default once it is done. Remove the conditional sector size update from sr_init_command() and sr_release() because it is not needed. Link: https://lkml.kernel.org/r/20201204164803.ovwurzs3257em2rp@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- drivers/scsi/sr.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index fd4b582110b2..e4633b84c556 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -416,19 +416,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) goto out; } - /* - * we do lazy blocksize switching (when reading XA sectors, - * see CDROMREADMODE2 ioctl) - */ s_size = cd->device->sector_size; - if (s_size > 2048) { - if (!in_interrupt()) - sr_set_blocklength(cd, 2048); - else - scmd_printk(KERN_INFO, SCpnt, - "can't switch blocksize: in interrupt\n"); - } - if (s_size != 512 && s_size != 1024 && s_size != 2048) { scmd_printk(KERN_ERR, SCpnt, "bad sector size %d\n", s_size); goto out; @@ -701,11 +689,6 @@ static int sr_open(struct cdrom_device_info *cdi, int purpose) static void sr_release(struct cdrom_device_info *cdi) { - struct scsi_cd *cd = cdi->handle; - - if (cd->device->sector_size > 2048) - sr_set_blocklength(cd, 2048); - } static int sr_probe(struct device *dev) From 91cdf265b74bf63a69949d6db08a60523207400c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:53 +0900 Subject: [PATCH 375/395] blk-mq: add helper allocating tagset->tags tagset->set is allocated from blk_mq_alloc_tag_set() rather than being reallocated. This patch added a helper to make its meaning explicitly which is to allocate rather than to reallocate. Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ecc4c69969..e2bd9ef81d55 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3382,6 +3382,12 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } +static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, + int new_nr_hw_queues) +{ + return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3435,7 +3441,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; From d220a21410e445324b8ae67d93f9c51406f99a29 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:54 +0900 Subject: [PATCH 376/395] blk-mq: update arg in comment of blk_mq_map_queue Update mis-named argument description of blk_mq_map_queue(). This patch also updates description that argument to software queue percpu context. Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index c696515766c7..c1458d9502f1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -99,7 +99,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: cpu ctx + * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, From fa94ba8a7b22890e6a17b39b9359e114fe18cd59 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:55 +0900 Subject: [PATCH 377/395] blk-mq: fix msec comment from micro to milli seconds Delay to wait for queue running is milli second unit which is passed to delayed work via msecs_to_jiffies() which is to convert milliseconds to jiffies. Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e2bd9ef81d55..6f207ec9ef83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1594,7 +1594,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * If !@async, try to run the queue now. Else, run the queue asynchronously and * with a delay of @msecs. @@ -1623,7 +1623,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ @@ -1687,7 +1687,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { From aeb2b0b1a3da5791d3b216e71ec72db7570f3571 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sat, 12 Dec 2020 06:13:02 +0100 Subject: [PATCH 378/395] block: drop dead assignments in loop_init() Commit 8410d38c2552 ("loop: use __register_blkdev to allocate devices on demand") simplified loop_init(); so computing the range of the block region is not required anymore and can be dropped. Drop dead assignments in loop_init(). As compilers will detect these unneeded assignments and optimize this, the resulting object code is identical before and after this change. No functional change. No change in object code. Signed-off-by: Lukas Bulwahn Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 22e59410b971..a45248c6e319 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2316,7 +2316,6 @@ MODULE_ALIAS("devname:loop-control"); static int __init loop_init(void) { int i, nr; - unsigned long range; struct loop_device *lo; int err; @@ -2353,13 +2352,10 @@ static int __init loop_init(void) * /dev/loop-control interface, or be instantiated by accessing * a 'dead' device node. */ - if (max_loop) { + if (max_loop) nr = max_loop; - range = max_loop << part_shift; - } else { + else nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT; - range = 1UL << MINORBITS; - } err = misc_register(&loop_misc); if (err < 0) From 7f3f227b41e81f8669e906c49a240c1678c65cfe Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 2 Dec 2020 17:12:44 +0100 Subject: [PATCH 379/395] hv_balloon: simplify math in alloc_balloon_pages() 'alloc_unit' in alloc_balloon_pages() is either '512' for 2M allocations or '1' for 4k allocations. So 1 << get_order(alloc_unit << PAGE_SHIFT) equals to 'alloc_unit' and the for loop basically sets all them offline. Simplify the math to improve the readability. Signed-off-by: Vitaly Kuznetsov Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20201202161245.2406143-2-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index eb56e09ae15f..da3b6bd2367c 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1238,7 +1238,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); /* mark all pages offline */ - for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) + for (j = 0; j < alloc_unit; j++) __SetPageOffline(pg + j); bl_resp->range_count++; From d1df458cbfdb0c3384c03c7fbcb1689bc02a746c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 2 Dec 2020 17:12:45 +0100 Subject: [PATCH 380/395] hv_balloon: do adjust_managed_page_count() when ballooning/un-ballooning Unlike virtio_balloon/virtio_mem/xen balloon drivers, Hyper-V balloon driver does not adjust managed pages count when ballooning/un-ballooning and this leads to incorrect stats being reported, e.g. unexpected 'free' output. Note, the calculation in post_status() seems to remain correct: ballooned out pages are never 'available' and we manually add dm->num_pages_ballooned to 'commited'. Suggested-by: David Hildenbrand Signed-off-by: Vitaly Kuznetsov Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20201202161245.2406143-3-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/hv_balloon.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index da3b6bd2367c..8c471823a5af 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -1198,6 +1198,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; + adjust_managed_page_count(pg, 1); } } @@ -1238,8 +1239,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); /* mark all pages offline */ - for (j = 0; j < alloc_unit; j++) + for (j = 0; j < alloc_unit; j++) { __SetPageOffline(pg + j); + adjust_managed_page_count(pg + j, -1); + } bl_resp->range_count++; bl_resp->range_array[i].finfo.start_page = From fed1755b118147721f2c87b37b9d66e62c39b668 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Dec 2020 10:02:45 +0100 Subject: [PATCH 381/395] xen/xenbus: Allow watches discard events before queueing If handling logics of watch events are slower than the events enqueue logic and the events can be created from the guests, the guests could trigger memory pressure by intensively inducing the events, because it will create a huge number of pending events that exhausting the memory. Fortunately, some watch events could be ignored, depending on its handler callback. For example, if the callback has interest in only one single path, the watch wouldn't want multiple pending events. Or, some watches could ignore events to same path. To let such watches to volutarily help avoiding the memory pressure situation, this commit introduces new watch callback, 'will_handle'. If it is not NULL, it will be called for each new event just before enqueuing it. Then, if the callback returns false, the event will be discarded. No watch is using the callback for now, though. This is part of XSA-349 Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park Reported-by: Michael Kurth Reported-by: Pawel Wieczorkiewicz Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/net/xen-netback/xenbus.c | 4 ++++ drivers/xen/xenbus/xenbus_client.c | 1 + drivers/xen/xenbus/xenbus_xs.c | 5 ++++- include/xen/xenbus.h | 7 +++++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index f1c1624cec8f..00f6f8dc56c8 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -557,12 +557,14 @@ static int xen_register_credit_watch(struct xenbus_device *dev, return -ENOMEM; snprintf(node, maxlen, "%s/rate", dev->nodename); vif->credit_watch.node = node; + vif->credit_watch.will_handle = NULL; vif->credit_watch.callback = xen_net_rate_changed; err = register_xenbus_watch(&vif->credit_watch); if (err) { pr_err("Failed to set watcher %s\n", vif->credit_watch.node); kfree(node); vif->credit_watch.node = NULL; + vif->credit_watch.will_handle = NULL; vif->credit_watch.callback = NULL; } return err; @@ -609,6 +611,7 @@ static int xen_register_mcast_ctrl_watch(struct xenbus_device *dev, snprintf(node, maxlen, "%s/request-multicast-control", dev->otherend); vif->mcast_ctrl_watch.node = node; + vif->mcast_ctrl_watch.will_handle = NULL; vif->mcast_ctrl_watch.callback = xen_mcast_ctrl_changed; err = register_xenbus_watch(&vif->mcast_ctrl_watch); if (err) { @@ -616,6 +619,7 @@ static int xen_register_mcast_ctrl_watch(struct xenbus_device *dev, vif->mcast_ctrl_watch.node); kfree(node); vif->mcast_ctrl_watch.node = NULL; + vif->mcast_ctrl_watch.will_handle = NULL; vif->mcast_ctrl_watch.callback = NULL; } return err; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index fd80e318b99c..0a21a12d9c34 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -133,6 +133,7 @@ int xenbus_watch_path(struct xenbus_device *dev, const char *path, int err; watch->node = path; + watch->will_handle = NULL; watch->callback = callback; err = register_xenbus_watch(watch); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 3a06eb699f33..e8bdbd0a1e26 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -705,7 +705,10 @@ int xs_watch_msg(struct xs_watch_event *event) spin_lock(&watches_lock); event->handle = find_watch(event->token); - if (event->handle != NULL) { + if (event->handle != NULL && + (!event->handle->will_handle || + event->handle->will_handle(event->handle, + event->path, event->token))) { spin_lock(&watch_events_lock); list_add_tail(&event->list, &watch_events); wake_up(&watch_events_waitq); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 5a8315e6d8a6..baa88bf0b9bc 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -61,6 +61,13 @@ struct xenbus_watch /* Path being watched. */ const char *node; + /* + * Called just before enqueing new event while a spinlock is held. + * The event will be discarded if this callback returns false. + */ + bool (*will_handle)(struct xenbus_watch *, + const char *path, const char *token); + /* Callback (executed in a process context with no locks held). */ void (*callback)(struct xenbus_watch *, const char *path, const char *token); From 2e85d32b1c865bec703ce0c962221a5e955c52c2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Dec 2020 10:04:18 +0100 Subject: [PATCH 382/395] xen/xenbus: Add 'will_handle' callback support in xenbus_watch_path() Some code does not directly make 'xenbus_watch' object and call 'register_xenbus_watch()' but use 'xenbus_watch_path()' instead. This commit adds support of 'will_handle' callback in the 'xenbus_watch_path()' and it's wrapper, 'xenbus_watch_pathfmt()'. This is part of XSA-349 Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park Reported-by: Michael Kurth Reported-by: Pawel Wieczorkiewicz Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/block/xen-blkback/xenbus.c | 3 ++- drivers/net/xen-netback/xenbus.c | 2 +- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 9 +++++++-- drivers/xen/xenbus/xenbus_probe.c | 2 +- include/xen/xenbus.h | 6 +++++- 6 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 76912c584a76..1d8b8d24496c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -675,7 +675,8 @@ static int xen_blkbk_probe(struct xenbus_device *dev, /* setup back pointer */ be->blkif->be = be; - err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, + err = xenbus_watch_pathfmt(dev, &be->backend_watch, NULL, + backend_changed, "%s/%s", dev->nodename, "physical-device"); if (err) goto fail; diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 00f6f8dc56c8..6f10e0998f1c 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -824,7 +824,7 @@ static void connect(struct backend_info *be) xenvif_carrier_on(be->vif); unregister_hotplug_status_watch(be); - err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, + err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, NULL, hotplug_status_changed, "%s/%s", dev->nodename, "hotplug-status"); if (!err) diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 4b99ec3dec58..e7c692cfb2cf 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -689,7 +689,7 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, /* watch the backend node for backend configuration information */ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, - xen_pcibk_be_watch); + NULL, xen_pcibk_be_watch); if (err) goto out; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 0a21a12d9c34..0cd728961fce 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -127,19 +127,22 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); */ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *)) { int err; watch->node = path; - watch->will_handle = NULL; + watch->will_handle = will_handle; watch->callback = callback; err = register_xenbus_watch(watch); if (err) { watch->node = NULL; + watch->will_handle = NULL; watch->callback = NULL; xenbus_dev_fatal(dev, err, "adding watch on %s", path); } @@ -166,6 +169,8 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); */ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *), const char *pathfmt, ...) @@ -182,7 +187,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; } - err = xenbus_watch_path(dev, path, watch, callback); + err = xenbus_watch_path(dev, path, watch, will_handle, callback); if (err) kfree(path); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 38725d97d909..4c3d1b84aa0a 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -136,7 +136,7 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus); return xenbus_watch_pathfmt(dev, &dev->otherend_watch, - bus->otherend_changed, + NULL, bus->otherend_changed, "%s/%s", dev->otherend, "state"); } diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index baa88bf0b9bc..c8574d1b814c 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -204,10 +204,14 @@ void xenbus_probe(struct work_struct *); int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *)); -__printf(4, 5) +__printf(5, 6) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *), const char *pathfmt, ...); From be987200fbaceaef340872841d4f7af2c5ee8dc3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Dec 2020 10:05:47 +0100 Subject: [PATCH 383/395] xen/xenbus/xen_bus_type: Support will_handle watch callback This commit adds support of the 'will_handle' watch callback for 'xen_bus_type' users. This is part of XSA-349 Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park Reported-by: Michael Kurth Reported-by: Pawel Wieczorkiewicz Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus.h | 2 ++ drivers/xen/xenbus/xenbus_probe.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 5f5b8a7d5b80..2a93b7c9c159 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -44,6 +44,8 @@ struct xen_bus_type { int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); + bool (*otherend_will_handle)(struct xenbus_watch *watch, + const char *path, const char *token); void (*otherend_changed)(struct xenbus_watch *watch, const char *path, const char *token); struct bus_type bus; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 4c3d1b84aa0a..44634d970a5c 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -136,7 +136,8 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus); return xenbus_watch_pathfmt(dev, &dev->otherend_watch, - NULL, bus->otherend_changed, + bus->otherend_will_handle, + bus->otherend_changed, "%s/%s", dev->otherend, "state"); } From 3dc86ca6b4c8cfcba9da7996189d1b5a358a94fc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Dec 2020 10:07:13 +0100 Subject: [PATCH 384/395] xen/xenbus: Count pending messages for each watch This commit adds a counter of pending messages for each watch in the struct. It is used to skip unnecessary pending messages lookup in 'unregister_xenbus_watch()'. It could also be used in 'will_handle' callback. This is part of XSA-349 Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park Reported-by: Michael Kurth Reported-by: Pawel Wieczorkiewicz Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_xs.c | 29 ++++++++++++++++++----------- include/xen/xenbus.h | 2 ++ 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e8bdbd0a1e26..12e02eb01f59 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -711,6 +711,7 @@ int xs_watch_msg(struct xs_watch_event *event) event->path, event->token))) { spin_lock(&watch_events_lock); list_add_tail(&event->list, &watch_events); + event->handle->nr_pending++; wake_up(&watch_events_waitq); spin_unlock(&watch_events_lock); } else @@ -768,6 +769,8 @@ int register_xenbus_watch(struct xenbus_watch *watch) sprintf(token, "%lX", (long)watch); + watch->nr_pending = 0; + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); @@ -817,11 +820,14 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) /* Cancel pending watch events. */ spin_lock(&watch_events_lock); - list_for_each_entry_safe(event, tmp, &watch_events, list) { - if (event->handle != watch) - continue; - list_del(&event->list); - kfree(event); + if (watch->nr_pending) { + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) + continue; + list_del(&event->list); + kfree(event); + } + watch->nr_pending = 0; } spin_unlock(&watch_events_lock); @@ -868,7 +874,6 @@ void xs_suspend_cancel(void) static int xenwatch_thread(void *unused) { - struct list_head *ent; struct xs_watch_event *event; xenwatch_pid = current->pid; @@ -883,13 +888,15 @@ static int xenwatch_thread(void *unused) mutex_lock(&xenwatch_mutex); spin_lock(&watch_events_lock); - ent = watch_events.next; - if (ent != &watch_events) - list_del(ent); + event = list_first_entry_or_null(&watch_events, + struct xs_watch_event, list); + if (event) { + list_del(&event->list); + event->handle->nr_pending--; + } spin_unlock(&watch_events_lock); - if (ent != &watch_events) { - event = list_entry(ent, struct xs_watch_event, list); + if (event) { event->handle->callback(event->handle, event->path, event->token); kfree(event); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index c8574d1b814c..00c7235ae93e 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -61,6 +61,8 @@ struct xenbus_watch /* Path being watched. */ const char *node; + unsigned int nr_pending; + /* * Called just before enqueing new event while a spinlock is held. * The event will be discarded if this callback returns false. From 9996bd494794a2fe393e97e7a982388c6249aa76 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 14 Dec 2020 10:08:40 +0100 Subject: [PATCH 385/395] xenbus/xenbus_backend: Disallow pending watch messages 'xenbus_backend' watches 'state' of devices, which is writable by guests. Hence, if guests intensively updates it, dom0 will have lots of pending events that exhausting memory of dom0. In other words, guests can trigger dom0 memory pressure. This is known as XSA-349. However, the watch callback of it, 'frontend_changed()', reads only 'state', so doesn't need to have the pending events. To avoid the problem, this commit disallows pending watch messages for 'xenbus_backend' using the 'will_handle()' watch callback. This is part of XSA-349 Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park Reported-by: Michael Kurth Reported-by: Pawel Wieczorkiewicz Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_probe_backend.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 2ba699897e6d..5abded97e1a7 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -180,6 +180,12 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, return err; } +static bool frontend_will_handle(struct xenbus_watch *watch, + const char *path, const char *token) +{ + return watch->nr_pending == 0; +} + static void frontend_changed(struct xenbus_watch *watch, const char *path, const char *token) { @@ -191,6 +197,7 @@ static struct xen_bus_type xenbus_backend = { .levels = 3, /* backend/type// */ .get_bus_id = backend_bus_id, .probe = xenbus_probe_backend, + .otherend_will_handle = frontend_will_handle, .otherend_changed = frontend_changed, .bus = { .name = "xen-backend", From 1c728719a4da6e654afb9cc047164755072ed7c9 Mon Sep 17 00:00:00 2001 From: Pawel Wieczorkiewicz Date: Mon, 14 Dec 2020 10:25:57 +0100 Subject: [PATCH 386/395] xen-blkback: set ring->xenblkd to NULL after kthread_stop() When xen_blkif_disconnect() is called, the kernel thread behind the block interface is stopped by calling kthread_stop(ring->xenblkd). The ring->xenblkd thread pointer being non-NULL determines if the thread has been already stopped. Normally, the thread's function xen_blkif_schedule() sets the ring->xenblkd to NULL, when the thread's main loop ends. However, when the thread has not been started yet (i.e. wake_up_process() has not been called on it), the xen_blkif_schedule() function would not be called yet. In such case the kthread_stop() call returns -EINTR and the ring->xenblkd remains dangling. When this happens, any consecutive call to xen_blkif_disconnect (for example in frontend_changed() callback) leads to a kernel crash in kthread_stop() (e.g. NULL pointer dereference in exit_creds()). This is XSA-350. Cc: # 4.12 Fixes: a24fa22ce22a ("xen/blkback: don't use xen_blkif_get() in xen-blkback kthread") Reported-by: Olivier Benjamin Reported-by: Pawel Wieczorkiewicz Signed-off-by: Pawel Wieczorkiewicz Reviewed-by: Julien Grall Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/block/xen-blkback/xenbus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 1d8b8d24496c..9860d4842f36 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -274,6 +274,7 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) if (ring->xenblkd) { kthread_stop(ring->xenblkd); + ring->xenblkd = NULL; wake_up(&ring->shutdown_wq); } From 5b058973d3205578aa6c9a71392e072a11ca44ef Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 11 Dec 2020 11:24:37 +0100 Subject: [PATCH 387/395] mips: lib: uncached: fix non-standard usage of variable 'sp' When building mips tinyconfig with clang the following warning show up: arch/mips/lib/uncached.c:45:6: warning: variable 'sp' is uninitialized when used here [-Wuninitialized] if (sp >= (long)CKSEG0 && sp < (long)CKSEG2) ^~ arch/mips/lib/uncached.c:40:18: note: initialize the variable 'sp' to silence this warning register long sp __asm__("$sp"); ^ = 0 1 warning generated. Rework to make an explicit inline move, instead of the non-standard use of specifying registers for local variables. This is what's written from the gcc-10 manual [1] about specifying registers for local variables: "6.47.5.2 Specifying Registers for Local Variables ................................................. [...] "The only supported use for this feature is to specify registers for input and output operands when calling Extended 'asm' (*note Extended Asm::). [...]". [1] https://docs.w3cub.com/gcc~10/local-register-variables Signed-off-by: Anders Roxell Reported-by: Nathan Chancellor Reported-by: Naresh Kamboju Reviewed-by: Nick Desaulniers Signed-off-by: Thomas Bogendoerfer --- arch/mips/lib/uncached.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/lib/uncached.c b/arch/mips/lib/uncached.c index 09d5deea747f..f80a67c092b6 100644 --- a/arch/mips/lib/uncached.c +++ b/arch/mips/lib/uncached.c @@ -37,10 +37,12 @@ */ unsigned long run_uncached(void *func) { - register long sp __asm__("$sp"); register long ret __asm__("$2"); long lfunc = (long)func, ufunc; long usp; + long sp; + + __asm__("move %0, $sp" : "=r" (sp)); if (sp >= (long)CKSEG0 && sp < (long)CKSEG2) usp = CKSEG1ADDR(sp); From a8c0f1c634507a36ef87a23cfd93720f6142ad9a Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 7 Dec 2020 20:21:42 +0800 Subject: [PATCH 388/395] MIPS: Select ARCH_KEEP_MEMBLOCK if DEBUG_KERNEL to enable sysfs memblock debug In the current code, CONFIG_ARCH_KEEP_MEMBLOCK is not set for MIPS arch, memblock_discard() will discard memory and reserved arrays if they were allocated, select ARCH_KEEP_MEMBLOCK if DEBUG_KERNEL to give a chance to track "memory" and "reserved" memblocks after early boot, with this patch, we can see the following two sysfs interfaces under DEBUG_FS. /sys/kernel/debug/memblock/memory /sys/kernel/debug/memblock/reserved Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b49a390df1cb..85c7b0637734 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -10,6 +10,7 @@ config MIPS select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if DEBUG_KERNEL select ARCH_SUPPORTS_UPROBES select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT From 41bb1a9b85dd613787a54414a1ae7d4181b9de5d Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Fri, 27 Nov 2020 10:16:03 +0100 Subject: [PATCH 389/395] MIPS: mm: Add back define for PAGE_SHARED There are still some drivers using PAGE_SHARED constant so put it back. Reported-by: kernel test robot Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 158ba3aa3bf1..4f9c37616d42 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -25,6 +25,8 @@ struct mm_struct; struct vm_area_struct; +#define PAGE_SHARED vm_get_page_prot(VM_READ|VM_WRITE|VM_SHARED) + #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _page_cachable_default) #define PAGE_KERNEL_NC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ From 99fbc70f8547c0782dcde25679c647a11393b801 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Fri, 27 Nov 2020 12:13:55 +0100 Subject: [PATCH 390/395] MIPS: Octeon: irq: Alloc desc before configuring IRQ Allocate the IRQ descriptors where necessary before configuring them via irq_set_chip_and_handler(). Fixes the following soft lockup: watchdog: BUG: soft lockup - CPU#5 stuck for 22s! [modprobe:72] Modules linked in: irq event stamp: 33288 hardirqs last enabled at (33287): [] restore_partial+0x74/0x150 hardirqs last disabled at (33288): [] handle_int+0x128/0x178 softirqs last enabled at (33284): [] __do_softirq+0x5c4/0x6d0 softirqs last disabled at (33279): [] irq_exit+0xe8/0xf0 CPU: 5 PID: 72 Comm: modprobe Not tainted 4.19.80-... #1 $ 0 : 0000000000000000 0000000000000001 0000000000000003 8000000002bdc640 $ 4 : 0000000000000000 0000000000000000 0000000000000000 0000000000000000 $ 8 : 0000000000000001 0000000000000001 0000000000000000 ffffffff803076cc $12 : 0000000000000000 0000000000000000 ffffffff817f0000 0000000008000000 $16 : ffffffff80a96d10 ffffffff80a90000 8000000002c41780 8000000002c41788 $20 : 0000000000000001 ffffffff8013b248 800000008ef28080 ffffffff80bb8700 $24 : 0000000003bf0000 ffffffff802d0610 $28 : 800000008ef20000 800000008ef23bd0 0000000000000006 ffffffff8020d6f8 Hi : 0000000000000160 Lo : 0000000000000014 epc : ffffffff8020d72c smp_call_function_many+0x2f4/0x370 ra : ffffffff8020d6f8 smp_call_function_many+0x2c0/0x370 Status: 10008ce3 KX SX UX KERNEL EXL IE Cause : 40808000 (ExcCode 00) PrId : 000d900a (Cavium Octeon II) CPU: 5 PID: 72 Comm: modprobe Not tainted 4.19.80-... #1 Stack : ffffffff80ab0000 00000051801c0da0 0000000010000ce0 5e70a8a65518aeac 5e70a8a65518aeac 0000000000000000 800000008e0cfb48 ffffffff81820000 800000008e0cfad4 00000000f0ce6f64 0000000000000001 0000000000000000 ffffffff801ccfb8 0000000000000000 0000000000000000 ffffffff817f0000 800000008531d840 ffffffff80a90000 fffe000000000000 0000000000000000 ffffffff80b20000 ffffffffffffffff ffffffff80bb3980 ffffffff80bb3980 ffffffff80a90000 00000000fffffffe ffffffff8057a760 0000000000000028 ffffffff80c50028 800000008ef20000 800000008e0cfb40 ffffffff80b20000 ffffffff80835d6c 0000000000000000 800000008e0cfc78 5e70a8a65518aeac ffffffff80a9dbf7 ffffffff80835c2c ffffffff801357a4 ffffffff809bdd50 ... Call Trace: [] show_stack+0x9c/0x130 [] dump_stack+0xdc/0x140 [] watchdog_timer_fn+0x3e8/0x478 [] __hrtimer_run_queues+0x18c/0x6d8 [] hrtimer_interrupt+0x104/0x2e8 [] c0_compare_interrupt+0x60/0x90 [] __handle_irq_event_percpu+0xb4/0x4a0 [] handle_irq_event_percpu+0x34/0x90 [] handle_percpu_irq+0x9c/0xe0 [] generic_handle_irq+0x34/0x50 [] do_IRQ+0x18/0x28 [] plat_irq_dispatch+0x90/0x128 [] handle_int+0x16c/0x178 [] smp_call_function_many+0x2f4/0x370 [] smp_call_function+0x40/0xa0 [] flush_tlb_mm+0x44/0x140 [] tlb_flush_mmu+0x38/0x90 [] arch_tlb_finish_mmu+0x4c/0x88 [] tlb_finish_mmu+0x24/0x50 [] exit_mmap+0x11c/0x1b8 [] mmput+0x84/0x138 [] do_exit+0x314/0xc88 [] do_group_exit+0x48/0xb0 [] __wake_up_parent+0x0/0x18 Signed-off-by: Alexander Sverdlin Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/octeon-irq.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 6501a842c41a..bd47e15d02c7 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -1505,10 +1505,20 @@ static int __init octeon_irq_init_ciu( goto err; } + r = irq_alloc_desc_at(OCTEON_IRQ_MBOX0, -1); + if (r < 0) { + pr_err("Failed to allocate desc for %s\n", "OCTEON_IRQ_MBOX0"); + goto err; + } r = octeon_irq_set_ciu_mapping( OCTEON_IRQ_MBOX0, 0, 32, 0, chip_mbox, handle_percpu_irq); if (r) goto err; + r = irq_alloc_desc_at(OCTEON_IRQ_MBOX1, -1); + if (r < 0) { + pr_err("Failed to allocate desc for %s\n", "OCTEON_IRQ_MBOX1"); + goto err; + } r = octeon_irq_set_ciu_mapping( OCTEON_IRQ_MBOX1, 0, 33, 0, chip_mbox, handle_percpu_irq); if (r) @@ -1546,6 +1556,11 @@ static int __init octeon_irq_init_ciu( if (r) goto err; + r = irq_alloc_descs(OCTEON_IRQ_WDOG0, OCTEON_IRQ_WDOG0, 16, -1); + if (r < 0) { + pr_err("Failed to allocate desc for %s\n", "OCTEON_IRQ_WDOGx"); + goto err; + } /* CIU_1 */ for (i = 0; i < 16; i++) { r = octeon_irq_set_ciu_mapping( From 47683459ba8f032fec81399dde1b346a1d2a4ff6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sat, 5 Dec 2020 16:55:07 +0800 Subject: [PATCH 391/395] MAINTAINERS: chenhc@lemote.com -> chenhuacai@kernel.org Use @kernel.org address as the main communications end point. Update the corresponding M-entries and .mailmap (for git shortlog translation). Signed-off-by: Huacai Chen Signed-off-by: Thomas Bogendoerfer --- .mailmap | 2 ++ MAINTAINERS | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 1e14566a3d56..e6ea21990a67 100644 --- a/.mailmap +++ b/.mailmap @@ -119,6 +119,8 @@ Henk Vergonet Henrik Kretzschmar Henrik Rydberg Herbert Xu +Huacai Chen +Huacai Chen Jacob Shin Jaegeuk Kim Jaegeuk Kim diff --git a/MAINTAINERS b/MAINTAINERS index e73636b75f29..88319de9c0b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9640,7 +9640,7 @@ F: arch/arm64/kvm/ F: include/kvm/arm_* KERNEL VIRTUAL MACHINE FOR MIPS (KVM/mips) -M: Huacai Chen +M: Huacai Chen M: Aleksandar Markovic L: linux-mips@vger.kernel.org L: kvm@vger.kernel.org @@ -11728,7 +11728,7 @@ F: drivers/*/*/*loongson2* F: drivers/*/*loongson2* MIPS/LOONGSON64 ARCHITECTURE -M: Huacai Chen +M: Huacai Chen M: Jiaxun Yang L: linux-mips@vger.kernel.org S: Maintained From e22a26421fce36802785d742acaa4b2f4c37b995 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 29 Oct 2020 16:44:13 +0100 Subject: [PATCH 392/395] MAINTAINERS: Remove JZ4780 DMA driver entry The entry for MIPS Ingenic JZ4780 DMA driver is not up to date anymore. Zubair Lutfullah Kakakhel's email bounces and no maintenance is provided. Suggested-by: Paul Cercueil Signed-off-by: Krzysztof Kozlowski Signed-off-by: Thomas Bogendoerfer --- MAINTAINERS | 5 ----- 1 file changed, 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 88319de9c0b0..f463320197b1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8673,11 +8673,6 @@ F: include/uapi/rdma/ F: samples/bpf/ibumad_kern.c F: samples/bpf/ibumad_user.c -INGENIC JZ4780 DMA Driver -M: Zubair Lutfullah Kakakhel -S: Maintained -F: drivers/dma/dma-jz4780.c - INGENIC JZ4780 NAND DRIVER M: Harvey Hunt L: linux-mtd@lists.infradead.org From 3bd5a2350262f1d316c9ff7b86a97335da5f2118 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 29 Oct 2020 16:44:14 +0100 Subject: [PATCH 393/395] MAINTAINERS: Add linux-mips mailing list to JZ47xx entries The entries for JZ47xx SoCs and its drivers lacked MIPS mailing list. Only MTD NAND driver pointed linux-mtd. Add linux-mips so the relevant patches will get attention of MIPS developers. Signed-off-by: Krzysztof Kozlowski Acked-by: Paul Cercueil Signed-off-by: Thomas Bogendoerfer --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f463320197b1..c70c72ef7c66 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8676,11 +8676,13 @@ F: samples/bpf/ibumad_user.c INGENIC JZ4780 NAND DRIVER M: Harvey Hunt L: linux-mtd@lists.infradead.org +L: linux-mips@vger.kernel.org S: Maintained F: drivers/mtd/nand/raw/ingenic/ INGENIC JZ47xx SoCs M: Paul Cercueil +L: linux-mips@vger.kernel.org S: Maintained F: arch/mips/boot/dts/ingenic/ F: arch/mips/generic/board-ingenic.c From ad4fddef5f2345aa9214e979febe2f47639c10d9 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 27 Nov 2020 09:39:43 +0100 Subject: [PATCH 394/395] mips: fix Section mismatch in reference When building mips tinyconfig with clang the following error show up: WARNING: modpost: vmlinux.o(.text+0x1940c): Section mismatch in reference from the function r4k_cache_init() to the function .init.text:loongson3_sc_init() The function r4k_cache_init() references the function __init loongson3_sc_init(). This is often because r4k_cache_init lacks a __init annotation or the annotation of loongson3_sc_init is wrong. Remove marked __init from function loongson3_sc_init(), mips_sc_probe_cm3(), and mips_sc_probe(). Signed-off-by: Anders Roxell Reviewed-by: Nick Desaulniers Signed-off-by: Thomas Bogendoerfer --- arch/mips/mm/c-r4k.c | 2 +- arch/mips/mm/sc-mips.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 99521764c75b..4f976d687ab0 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1609,7 +1609,7 @@ static void __init loongson2_sc_init(void) c->options |= MIPS_CPU_INCLUSIVE_CACHES; } -static void __init loongson3_sc_init(void) +static void loongson3_sc_init(void) { struct cpuinfo_mips *c = ¤t_cpu_data; unsigned int config2, lsize; diff --git a/arch/mips/mm/sc-mips.c b/arch/mips/mm/sc-mips.c index dd0a5becaabd..06ec304ad4d1 100644 --- a/arch/mips/mm/sc-mips.c +++ b/arch/mips/mm/sc-mips.c @@ -146,7 +146,7 @@ static inline int mips_sc_is_activated(struct cpuinfo_mips *c) return 1; } -static int __init mips_sc_probe_cm3(void) +static int mips_sc_probe_cm3(void) { struct cpuinfo_mips *c = ¤t_cpu_data; unsigned long cfg = read_gcr_l2_config(); @@ -180,7 +180,7 @@ static int __init mips_sc_probe_cm3(void) return 0; } -static inline int __init mips_sc_probe(void) +static inline int mips_sc_probe(void) { struct cpuinfo_mips *c = ¤t_cpu_data; unsigned int config1, config2; From 39b1e779b6e2d4ca7967b49b26f1e4358f20c90c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Thu, 26 Nov 2020 14:06:00 +0100 Subject: [PATCH 395/395] parisc: pci-dma: fix warning unused-function When building tinyconfig on parisc the following warnign shows up: /tmp/arch/parisc/kernel/pci-dma.c:338:12: warning: 'proc_pcxl_dma_show' defined but not used [-Wunused-function] static int proc_pcxl_dma_show(struct seq_file *m, void *v) ^~~~~~~~~~~~~~~~~~ Mark the function as __maybe_unused to fix the warning. Signed-off-by: Anders Roxell Signed-off-by: Helge Deller --- arch/parisc/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 36610a5c029f..36a57aa38e87 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -335,7 +335,7 @@ pcxl_free_range(unsigned long vaddr, size_t size) dump_resmap(); } -static int proc_pcxl_dma_show(struct seq_file *m, void *v) +static int __maybe_unused proc_pcxl_dma_show(struct seq_file *m, void *v) { #if 0 u_long i = 0;