From 40fb68c7725aee024ed99ad38504f5d25820c6f0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 27 Jan 2021 09:50:41 -0600 Subject: [PATCH 01/10] Revert "PCI/ASPM: Save/restore L1SS Capability for suspend/resume" This reverts commit 4257f7e008ea394fcecc050f1569c3503b8bcc15. Kenneth reported that after 4257f7e008ea, he sees a torrent of disk I/O errors on his NVMe device after suspend/resume until a reboot. Link: https://lore.kernel.org/linux-pci/20201228040513.GA611645@bjorn-Precision-5520/ Reported-by: Kenneth R. Crudup Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 7 ------- drivers/pci/pci.h | 4 ---- drivers/pci/pcie/aspm.c | 44 ----------------------------------------- 3 files changed, 55 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b9fecc25d213..790393d1e318 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1558,7 +1558,6 @@ int pci_save_state(struct pci_dev *dev) return i; pci_save_ltr_state(dev); - pci_save_aspm_l1ss_state(dev); pci_save_dpc_state(dev); pci_save_aer_state(dev); pci_save_ptm_state(dev); @@ -1665,7 +1664,6 @@ void pci_restore_state(struct pci_dev *dev) * LTR itself (in the PCIe capability). */ pci_restore_ltr_state(dev); - pci_restore_aspm_l1ss_state(dev); pci_restore_pcie_state(dev); pci_restore_pasid_state(dev); @@ -3353,11 +3351,6 @@ void pci_allocate_cap_save_buffers(struct pci_dev *dev) if (error) pci_err(dev, "unable to allocate suspend buffer for LTR\n"); - error = pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_L1SS, - 2 * sizeof(u32)); - if (error) - pci_err(dev, "unable to allocate suspend buffer for ASPM-L1SS\n"); - pci_allocate_vc_save_buffers(dev); } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 5c59365092fa..a7bdf0b1d45d 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -582,15 +582,11 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); -void pci_save_aspm_l1ss_state(struct pci_dev *dev); -void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } -static inline void pci_save_aspm_l1ss_state(struct pci_dev *dev) { } -static inline void pci_restore_aspm_l1ss_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_ECRC diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index a08e7d6dc248..ac0557a305af 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -734,50 +734,6 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_L1SS_CTL1_L1SS_MASK, val); } -void pci_save_aspm_l1ss_state(struct pci_dev *dev) -{ - int aspm_l1ss; - struct pci_cap_saved_state *save_state; - u32 *cap; - - if (!pci_is_pcie(dev)) - return; - - aspm_l1ss = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_L1SS); - if (!aspm_l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_read_config_dword(dev, aspm_l1ss + PCI_L1SS_CTL1, cap++); - pci_read_config_dword(dev, aspm_l1ss + PCI_L1SS_CTL2, cap++); -} - -void pci_restore_aspm_l1ss_state(struct pci_dev *dev) -{ - int aspm_l1ss; - struct pci_cap_saved_state *save_state; - u32 *cap; - - if (!pci_is_pcie(dev)) - return; - - aspm_l1ss = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_L1SS); - if (!aspm_l1ss) - return; - - save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_L1SS); - if (!save_state) - return; - - cap = (u32 *)&save_state->cap.data[0]; - pci_write_config_dword(dev, aspm_l1ss + PCI_L1SS_CTL1, *cap++); - pci_write_config_dword(dev, aspm_l1ss + PCI_L1SS_CTL2, *cap++); -} - static void pcie_config_aspm_dev(struct pci_dev *pdev, u32 val) { pcie_capability_clear_and_set_word(pdev, PCI_EXP_LNKCTL, From 9efb069de4ba748d284f6129e71de239f801053a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 28 Jan 2021 10:22:48 +0100 Subject: [PATCH 02/10] ovl: add warning on user_ns mismatch Currently there's no way to create an overlay filesystem outside of the current user namespace. Make sure that if this assumption changes it doesn't go unnoticed. Reported-by: "Eric W. Biederman" Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2bd570cbe8a4..82cd6d55a5a1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1923,6 +1923,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; int err; + err = -EIO; + if (WARN_ON(sb->s_user_ns != current_user_ns())) + goto out; + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; From 554677b97257b0b69378bd74e521edb7e94769ff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 28 Jan 2021 10:22:48 +0100 Subject: [PATCH 03/10] ovl: perform vfs_getxattr() with mounter creds The vfs_getxattr() in ovl_xattr_set() is used to check whether an xattr exist on a lower layer file that is to be removed. If the xattr does not exist, then no need to copy up the file. This call of vfs_getxattr() wasn't wrapped in credential override, and this is probably okay. But for consitency wrap this instance as well. Reported-by: "Eric W. Biederman" Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d739e14c6814..cf41bcb664bc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -352,7 +352,9 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, goto out; if (!value && !upperdentry) { + old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(realdentry, name, NULL, 0); + revert_creds(old_cred); if (err < 0) goto out_drop_write; } From f2b00be488730522d0fb7a8a5de663febdcefe0a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 28 Jan 2021 10:22:48 +0100 Subject: [PATCH 04/10] cap: fix conversions on getxattr If a capability is stored on disk in v2 format cap_inode_getsecurity() will currently return in v2 format unconditionally. This is wrong: v2 cap should be equivalent to a v3 cap with zero rootid, and so the same conversions performed on it. If the rootid cannot be mapped, v3 is returned unconverted. Fix this so that both v2 and v3 return -EOVERFLOW if the rootid (or the owner of the fs user namespace in case of v2) cannot be mapped into the current user namespace. Signed-off-by: Miklos Szeredi Acked-by: "Eric W. Biederman" --- security/commoncap.c | 67 ++++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/security/commoncap.c b/security/commoncap.c index bacc1111d871..26c1cb725dcb 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -371,10 +371,11 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, { int size, ret; kuid_t kroot; + u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; - struct vfs_ns_cap_data *nscap; + struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; @@ -396,46 +397,61 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { - /* If this is sizeof(vfs_cap_data) then we're ok with the - * on-disk value, so return that. */ - if (alloc) - *buffer = tmpbuf; - else - kfree(tmpbuf); - return ret; - } else if (!is_v3header((size_t) ret, cap)) { - kfree(tmpbuf); - return -EINVAL; + root = 0; + } else if (is_v3header((size_t) ret, cap)) { + nscap = (struct vfs_ns_cap_data *) tmpbuf; + root = le32_to_cpu(nscap->rootid); + } else { + size = -EINVAL; + goto out_free; } - nscap = (struct vfs_ns_cap_data *) tmpbuf; - root = le32_to_cpu(nscap->rootid); kroot = make_kuid(fs_ns, root); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { + size = sizeof(struct vfs_ns_cap_data); if (alloc) { - *buffer = tmpbuf; + if (!nscap) { + /* v2 -> v3 conversion */ + nscap = kzalloc(size, GFP_ATOMIC); + if (!nscap) { + size = -ENOMEM; + goto out_free; + } + nsmagic = VFS_CAP_REVISION_3; + magic = le32_to_cpu(cap->magic_etc); + if (magic & VFS_CAP_FLAGS_EFFECTIVE) + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + nscap->magic_etc = cpu_to_le32(nsmagic); + } else { + /* use allocated v3 buffer */ + tmpbuf = NULL; + } nscap->rootid = cpu_to_le32(mappedroot); - } else - kfree(tmpbuf); - return size; + *buffer = nscap; + } + goto out_free; } if (!rootid_owns_currentns(kroot)) { - kfree(tmpbuf); - return -EOPNOTSUPP; + size = -EOVERFLOW; + goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { - *buffer = kmalloc(size, GFP_ATOMIC); - if (*buffer) { - struct vfs_cap_data *cap = *buffer; - __le32 nsmagic, magic; + if (nscap) { + /* v3 -> v2 conversion */ + cap = kzalloc(size, GFP_ATOMIC); + if (!cap) { + size = -ENOMEM; + goto out_free; + } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) @@ -443,9 +459,12 @@ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { - size = -ENOMEM; + /* use unconverted v2 */ + tmpbuf = NULL; } + *buffer = cap; } +out_free: kfree(tmpbuf); return size; } From b854cc659dcb80f172cb35dbedc15d39d49c383f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Jan 2021 08:36:11 +0800 Subject: [PATCH 05/10] ovl: avoid deadlock on directory ioctl The function ovl_dir_real_file() currently uses the inode lock to serialize writes to the od->upperfile field. However, this function will get called by ovl_ioctl_set_flags(), which utilizes the inode lock too. In this case ovl_dir_real_file() will try to claim a lock that is owned by a function in its call stack, which won't get released before ovl_dir_real_file() returns. Fix by replacing the open coded compare and exchange by an explicit atomic op. Fixes: 61536bed2149 ("ovl: support [S|G]ETFLAGS and FS[S|G]ETXATTR ioctls for directories") Cc: stable@vger.kernel.org # v5.10 Reported-by: Icenowy Zheng Tested-by: Icenowy Zheng Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 01620ebae1bd..60d751f28fea 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -865,7 +865,7 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; - struct file *realfile = od->realfile; + struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; @@ -874,29 +874,20 @@ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { - struct inode *inode = file_inode(file); - realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); + if (IS_ERR(realfile)) + return realfile; - inode_lock(inode); - if (!od->upperfile) { - if (IS_ERR(realfile)) { - inode_unlock(inode); - return realfile; - } - smp_store_release(&od->upperfile, realfile); - } else { - /* somebody has beaten us to it */ - if (!IS_ERR(realfile)) - fput(realfile); - realfile = od->upperfile; + old = cmpxchg_release(&od->upperfile, NULL, realfile); + if (old) { + fput(realfile); + realfile = old; } - inode_unlock(inode); } } From e04527fefba6e4e66492f122cf8cc6314f3cf3bf Mon Sep 17 00:00:00 2001 From: Liangyan Date: Tue, 22 Dec 2020 11:06:26 +0800 Subject: [PATCH 06/10] ovl: fix dentry leak in ovl_get_redirect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to lock d_parent->d_lock before dget_dlock, or this may have d_lockref updated parallelly like calltrace below which will cause dentry->d_lockref leak and risk a crash. CPU 0 CPU 1 ovl_set_redirect lookup_fast ovl_get_redirect __d_lookup dget_dlock //no lock protection here spin_lock(&dentry->d_lock) dentry->d_lockref.count++ dentry->d_lockref.count++ [   49.799059] PGD 800000061fed7067 P4D 800000061fed7067 PUD 61fec5067 PMD 0 [   49.799689] Oops: 0002 [#1] SMP PTI [   49.800019] CPU: 2 PID: 2332 Comm: node Not tainted 4.19.24-7.20.al7.x86_64 #1 [   49.800678] Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 8a46cfe 04/01/2014 [   49.801380] RIP: 0010:_raw_spin_lock+0xc/0x20 [   49.803470] RSP: 0018:ffffac6fc5417e98 EFLAGS: 00010246 [   49.803949] RAX: 0000000000000000 RBX: ffff93b8da3446c0 RCX: 0000000a00000000 [   49.804600] RDX: 0000000000000001 RSI: 000000000000000a RDI: 0000000000000088 [   49.805252] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffff993cf040 [   49.805898] R10: ffff93b92292e580 R11: ffffd27f188a4b80 R12: 0000000000000000 [   49.806548] R13: 00000000ffffff9c R14: 00000000fffffffe R15: ffff93b8da3446c0 [   49.807200] FS:  00007ffbedffb700(0000) GS:ffff93b927880000(0000) knlGS:0000000000000000 [   49.807935] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [   49.808461] CR2: 0000000000000088 CR3: 00000005e3f74006 CR4: 00000000003606a0 [   49.809113] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [   49.809758] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [   49.810410] Call Trace: [   49.810653]  d_delete+0x2c/0xb0 [   49.810951]  vfs_rmdir+0xfd/0x120 [   49.811264]  do_rmdir+0x14f/0x1a0 [   49.811573]  do_syscall_64+0x5b/0x190 [   49.811917]  entry_SYSCALL_64_after_hwframe+0x44/0xa9 [   49.812385] RIP: 0033:0x7ffbf505ffd7 [   49.814404] RSP: 002b:00007ffbedffada8 EFLAGS: 00000297 ORIG_RAX: 0000000000000054 [   49.815098] RAX: ffffffffffffffda RBX: 00007ffbedffb640 RCX: 00007ffbf505ffd7 [   49.815744] RDX: 0000000004449700 RSI: 0000000000000000 RDI: 0000000006c8cd50 [   49.816394] RBP: 00007ffbedffaea0 R08: 0000000000000000 R09: 0000000000017d0b [   49.817038] R10: 0000000000000000 R11: 0000000000000297 R12: 0000000000000012 [   49.817687] R13: 00000000072823d8 R14: 00007ffbedffb700 R15: 00000000072823d8 [   49.818338] Modules linked in: pvpanic cirrusfb button qemu_fw_cfg atkbd libps2 i8042 [   49.819052] CR2: 0000000000000088 [   49.819368] ---[ end trace 4e652b8aa299aa2d ]--- [   49.819796] RIP: 0010:_raw_spin_lock+0xc/0x20 [   49.821880] RSP: 0018:ffffac6fc5417e98 EFLAGS: 00010246 [   49.822363] RAX: 0000000000000000 RBX: ffff93b8da3446c0 RCX: 0000000a00000000 [   49.823008] RDX: 0000000000000001 RSI: 000000000000000a RDI: 0000000000000088 [   49.823658] RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffff993cf040 [   49.825404] R10: ffff93b92292e580 R11: ffffd27f188a4b80 R12: 0000000000000000 [   49.827147] R13: 00000000ffffff9c R14: 00000000fffffffe R15: ffff93b8da3446c0 [   49.828890] FS:  00007ffbedffb700(0000) GS:ffff93b927880000(0000) knlGS:0000000000000000 [   49.830725] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [   49.832359] CR2: 0000000000000088 CR3: 00000005e3f74006 CR4: 00000000003606a0 [   49.834085] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [   49.835792] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Cc: Fixes: a6c606551141 ("ovl: redirect on rename-dir") Signed-off-by: Liangyan Reviewed-by: Joseph Qi Suggested-by: Al Viro Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 28a075b5f5b2..d1efa3a5a503 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -992,8 +992,8 @@ static char *ovl_get_redirect(struct dentry *dentry, bool abs_redirect) buflen -= thislen; memcpy(&buf[buflen], name, thislen); - tmp = dget_dlock(d->d_parent); spin_unlock(&d->d_lock); + tmp = dget_parent(d); dput(d); d = tmp; From 03fedf93593c82538b18476d8c4f0e8f8435ea70 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sat, 19 Dec 2020 12:16:08 +0200 Subject: [PATCH 07/10] ovl: skip getxattr of security labels When inode has no listxattr op of its own (e.g. squashfs) vfs_listxattr calls the LSM inode_listsecurity hooks to list the xattrs that LSMs will intercept in inode_getxattr hooks. When selinux LSM is installed but not initialized, it will list the security.selinux xattr in inode_listsecurity, but will not intercept it in inode_getxattr. This results in -ENODATA for a getxattr call for an xattr returned by listxattr. This situation was manifested as overlayfs failure to copy up lower files from squashfs when selinux is built-in but not initialized, because ovl_copy_xattr() iterates the lower inode xattrs by vfs_listxattr() and vfs_getxattr(). ovl_copy_xattr() skips copy up of security labels that are indentified by inode_copy_up_xattr LSM hooks, but it does that after vfs_getxattr(). Since we are not going to copy them, skip vfs_getxattr() of the security labels. Reported-by: Michael Labriola Tested-by: Michael Labriola Link: https://lore.kernel.org/linux-unionfs/2nv9d47zt7.fsf@aldarion.sourceruckus.org/ Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/copy_up.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index e5b616c93e11..0fed532efa68 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -84,6 +84,14 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, if (ovl_is_private_xattr(sb, name)) continue; + + error = security_inode_copy_up_xattr(name); + if (error < 0 && error != -EOPNOTSUPP) + break; + if (error == 1) { + error = 0; + continue; /* Discard */ + } retry: size = vfs_getxattr(old, name, value, value_size); if (size == -ERANGE) @@ -107,13 +115,6 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, goto retry; } - error = security_inode_copy_up_xattr(name); - if (error < 0 && error != -EOPNOTSUPP) - break; - if (error == 1) { - error = 0; - continue; /* Discard */ - } error = vfs_setxattr(new, name, value, size, 0); if (error) { if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) From 335d3fc57941e5c6164c69d439aec1cb7a800876 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Thu, 7 Jan 2021 16:10:43 -0800 Subject: [PATCH 08/10] ovl: implement volatile-specific fsync error behaviour Overlayfs's volatile option allows the user to bypass all forced sync calls to the upperdir filesystem. This comes at the cost of safety. We can never ensure that the user's data is intact, but we can make a best effort to expose whether or not the data is likely to be in a bad state. The best way to handle this in the time being is that if an overlayfs's upperdir experiences an error after a volatile mount occurs, that error will be returned on fsync, fdatasync, sync, and syncfs. This is contradictory to the traditional behaviour of VFS which fails the call once, and only raises an error if a subsequent fsync error has occurred, and been raised by the filesystem. One awkward aspect of the patch is that we have to manually set the superblock's errseq_t after the sync_fs callback as opposed to just returning an error from syncfs. This is because the call chain looks something like this: sys_syncfs -> sync_filesystem -> __sync_filesystem -> /* The return value is ignored here sb->s_op->sync_fs(sb) _sync_blockdev /* Where the VFS fetches the error to raise to userspace */ errseq_check_and_advance Because of this we call errseq_set every time the sync_fs callback occurs. Due to the nature of this seen / unseen dichotomy, if the upperdir is an inconsistent state at the initial mount time, overlayfs will refuse to mount, as overlayfs cannot get a snapshot of the upperdir's errseq that will increment on error until the user calls syncfs. Signed-off-by: Sargun Dhillon Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Fixes: c86243b090bc ("ovl: provide a mount option "volatile"") Cc: stable@vger.kernel.org Reviewed-by: Vivek Goyal Reviewed-by: Jeff Layton Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.rst | 8 ++++++ fs/overlayfs/file.c | 5 ++-- fs/overlayfs/overlayfs.h | 1 + fs/overlayfs/ovl_entry.h | 2 ++ fs/overlayfs/readdir.c | 5 ++-- fs/overlayfs/super.c | 34 ++++++++++++++++++++----- fs/overlayfs/util.c | 27 ++++++++++++++++++++ 7 files changed, 71 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst index 587a93973929..78240e29b0bb 100644 --- a/Documentation/filesystems/overlayfs.rst +++ b/Documentation/filesystems/overlayfs.rst @@ -586,6 +586,14 @@ without significant effort. The advantage of mounting with the "volatile" option is that all forms of sync calls to the upper filesystem are omitted. +In order to avoid a giving a false sense of safety, the syncfs (and fsync) +semantics of volatile mounts are slightly different than that of the rest of +VFS. If any writeback error occurs on the upperdir's filesystem after a +volatile mount takes place, all sync functions will return an error. Once this +condition is reached, the filesystem will not recover, and every subsequent sync +call will return an error, even if the upperdir has not experience a new error +since the last sync call. + When overlay is mounted with "volatile" option, the directory "$workdir/work/incompat/volatile" is created. During next mount, overlay checks for this directory and refuses to mount if present. This is a strong diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index bd9dd38347ae..077d3ad343f6 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -398,8 +398,9 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) const struct cred *old_cred; int ret; - if (!ovl_should_sync(OVL_FS(file_inode(file)->i_sb))) - return 0; + ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); + if (ret <= 0) + return ret; ret = ovl_real_fdget_meta(file, &real, !datasync); if (ret) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b487e48c7fd4..cb4e2d60ecf9 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -324,6 +324,7 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, int padding); +int ovl_sync_status(struct ovl_fs *ofs); static inline bool ovl_is_impuredir(struct super_block *sb, struct dentry *dentry) diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index fbd5e27ce66b..63efee554f69 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -81,6 +81,8 @@ struct ovl_fs { atomic_long_t last_ino; /* Whiteout dentry cache */ struct dentry *whiteout; + /* r/o snapshot of upperdir sb's only taken on volatile mounts */ + errseq_t errseq; }; static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 60d751f28fea..f404a78e6b60 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -900,8 +900,9 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, struct file *realfile; int err; - if (!ovl_should_sync(OVL_FS(file->f_path.dentry->d_sb))) - return 0; + err = ovl_sync_status(OVL_FS(file->f_path.dentry->d_sb)); + if (err <= 0) + return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 82cd6d55a5a1..d58b8f2bf9d0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -264,11 +264,20 @@ static int ovl_sync_fs(struct super_block *sb, int wait) struct super_block *upper_sb; int ret; - if (!ovl_upper_mnt(ofs)) - return 0; + ret = ovl_sync_status(ofs); + /* + * We have to always set the err, because the return value isn't + * checked in syncfs, and instead indirectly return an error via + * the sb's writeback errseq, which VFS inspects after this call. + */ + if (ret < 0) { + errseq_set(&sb->s_wb_err, -EIO); + return -EIO; + } + + if (!ret) + return ret; - if (!ovl_should_sync(ofs)) - return 0; /* * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC). * All the super blocks will be iterated, including upper_sb. @@ -1993,6 +2002,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { + struct super_block *upper_sb; + if (!ofs->config.workdir) { pr_err("missing 'workdir'\n"); goto out_err; @@ -2002,6 +2013,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_err; + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + if (!ovl_should_sync(ofs)) { + ofs->errseq = errseq_sample(&upper_sb->s_wb_err); + if (errseq_check(&upper_sb->s_wb_err, ofs->errseq)) { + err = -EIO; + pr_err("Cannot mount volatile when upperdir has an unseen error. Sync upperdir fs to clear state.\n"); + goto out_err; + } + } + err = ovl_get_workdir(sb, ofs, &upperpath); if (err) goto out_err; @@ -2009,9 +2030,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->workdir) sb->s_flags |= SB_RDONLY; - sb->s_stack_depth = ovl_upper_mnt(ofs)->mnt_sb->s_stack_depth; - sb->s_time_gran = ovl_upper_mnt(ofs)->mnt_sb->s_time_gran; - + sb->s_stack_depth = upper_sb->s_stack_depth; + sb->s_time_gran = upper_sb->s_time_gran; } oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); err = PTR_ERR(oe); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6569031af3cd..9826b003f1d2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -962,3 +962,30 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry, kfree(buf); return ERR_PTR(res); } + +/* + * ovl_sync_status() - Check fs sync status for volatile mounts + * + * Returns 1 if this is not a volatile mount and a real sync is required. + * + * Returns 0 if syncing can be skipped because mount is volatile, and no errors + * have occurred on the upperdir since the mount. + * + * Returns -errno if it is a volatile mount, and the error that occurred since + * the last mount. If the error code changes, it'll return the latest error + * code. + */ + +int ovl_sync_status(struct ovl_fs *ofs) +{ + struct vfsmount *mnt; + + if (ovl_should_sync(ofs)) + return 1; + + mnt = ovl_upper_mnt(ofs); + if (!mnt) + return 0; + + return errseq_check(&mnt->mnt_sb->s_wb_err, ofs->errseq); +} From 0f347aa07f15b346a001e557f4a0a45069f7fa3d Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 1 Feb 2021 17:34:19 +0100 Subject: [PATCH 09/10] ACPI: scan: Fix battery devices sometimes never binding With the new 2 step scanning process, which defers instantiating some ACPI-devices based on their _DEP to the second step, the following may happen: 1. During the first acpi_walk_namespace(acpi_bus_check_add) call acpi_scan_check_dep() gets called on the Battery ACPI dev handle and adds one or more deps for this handle to the acpi_dep_list 2. During the first acpi_bus_attach() call one or more of the suppliers of these deps get their driver attached and acpi_walk_dep_device_list(supplier_handle) gets called. At this point acpi_bus_get_device(dep->consumer) get called, but since the battery has DEPs it has not been instantiated during the first acpi_walk_namespace(acpi_bus_check_add), so the acpi_bus_get_device(dep->consumer) call fails. Before this commit, acpi_walk_dep_device_list() would now continue *without* removing the acpi_dep_data entry for this supplier,consumer pair from the acpi_dep_list. 3. During the second acpi_walk_namespace(acpi_bus_check_add) call an acpi_device gets instantiated for the battery and acpi_scan_dep_init() gets called to initialize its dep_unmet val. Before this commit, the dep_unmet count would include DEPs for suppliers for which acpi_walk_dep_device_list(supplier_handle) has already been called, so it will never become 0 and the ACPI battery driver will never get attached / bind. Fix the ACPI battery driver never binding in this scenario by making acpi_walk_dep_device_list() always remove matching acpi_dep_data entries independent of the acpi_bus_get_device(dep->consumer) call succeeding or not. Fixes: 71da201f38df ("ACPI: scan: Defer enumeration of devices with _DEP lists") Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 1db063b02f63..22566b4b3150 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2123,12 +2123,12 @@ void acpi_walk_dep_device_list(acpi_handle handle) list_for_each_entry_safe(dep, tmp, &acpi_dep_list, node) { if (dep->supplier == handle) { acpi_bus_get_device(dep->consumer, &adev); - if (!adev) - continue; - adev->dep_unmet--; - if (!adev->dep_unmet) - acpi_bus_attach(adev, true); + if (adev) { + adev->dep_unmet--; + if (!adev->dep_unmet) + acpi_bus_attach(adev, true); + } list_del(&dep->node); kfree(dep); From 5c279c4cf206e03995e04fd3404fa95ffd243a97 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 4 Feb 2021 20:12:37 +0200 Subject: [PATCH 10/10] Revert "x86/setup: don't remove E820_TYPE_RAM for pfn 0" This reverts commit bde9cfa3afe4324ec251e4af80ebf9b7afaf7afe. Changing the first memory page type from E820_TYPE_RESERVED to E820_TYPE_RAM makes it a part of "System RAM" resource rather than a reserved resource and this in turn causes devmem_is_allowed() to treat is as area that can be accessed but it is filled with zeroes instead of the actual data as previously. The change in /dev/mem output causes lilo to fail as was reported at slakware users forum, and probably other legacy applications will experience similar problems. Link: https://www.linuxquestions.org/questions/slackware-14/slackware-current-lilo-vesa-warnings-after-recent-updates-4175689617/#post6214439 Signed-off-by: Mike Rapoport Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3412c4595efd..740f3bdb3f61 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -660,6 +660,17 @@ static void __init trim_platform_memory_ranges(void) static void __init trim_bios_range(void) { + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); + /* * special case: Some BIOSes report the PC BIOS * area (640Kb -> 1Mb) as RAM even though it is not. @@ -717,15 +728,6 @@ early_param("reservelow", parse_reservelow); static void __init trim_low_memory_range(void) { - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - * - * This typically reserves additional memory (64KiB by default) - * since some BIOSes are known to corrupt low memory. See the - * Kconfig help text for X86_RESERVE_LOW. - */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); }