From b4de0e9be963b95c46c4a5426e94059923d236d6 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:10 +0000 Subject: [PATCH 1/3] iomap: Rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW In future xfs will support a SW-based atomic write, so rename IOMAP_ATOMIC -> IOMAP_ATOMIC_HW to be clear which mode is being used. Also relocate setting of IOMAP_ATOMIC_HW to the write path in __iomap_dio_rw(), to be clear that this flag is only relevant to writes. Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-3-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 4 ++-- fs/ext4/inode.c | 2 +- fs/iomap/direct-io.c | 18 +++++++++--------- fs/iomap/trace.h | 2 +- include/linux/iomap.h | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index d1535109587a..0b9d7be23bce 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -514,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. - * ``IOMAP_ATOMIC``: This write is being issued with torn-write - protection. + * ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write + protection based on HW-offload support. Only a single bio can be created for the write, and the write must not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be set. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..ba2f1e3db7c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) return false; /* atomic writes are all-or-nothing */ - if (flags & IOMAP_ATOMIC) + if (flags & IOMAP_ATOMIC_HW) return false; /* can only try again if we wrote nothing */ diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e1e32e2bb0bf..c696ce980796 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -317,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) + const struct iomap *iomap, bool use_fua, bool atomic_hw) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -329,7 +329,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) + if (atomic_hw) opflags |= REQ_ATOMIC; return opflags; @@ -340,8 +340,8 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; + bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; @@ -351,7 +351,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) + if (atomic_hw && length != fs_block_size) return -EINVAL; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || @@ -428,7 +428,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -461,7 +461,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE(atomic_hw && n != length)) { /* * This bio should have covered the complete length, * which it doesn't, so error. We may need to zero out @@ -652,9 +652,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -689,6 +686,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC_HW; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 9eab2c8ac3c5..69af89044ebd 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" }, \ - { IOMAP_ATOMIC, "ATOMIC" } + { IOMAP_ATOMIC_HW, "ATOMIC_HW" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ea29388b2fba..87cd7079aaf3 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,7 +189,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) #define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { From 794ca29dcc924cd3f16d12b6fba61074c992b8fd Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 3 Mar 2025 17:11:13 +0000 Subject: [PATCH 2/3] iomap: Support SW-based atomic writes Currently atomic write support requires dedicated HW support. This imposes a restriction on the filesystem that disk blocks need to be aligned and contiguously mapped to FS blocks to issue atomic writes. XFS has no method to guarantee FS block alignment for regular, non-RT files. As such, atomic writes are currently limited to 1x FS block there. To deal with the scenario that we are issuing an atomic write over misaligned or discontiguous data blocks - and raise the atomic write size limit - support a SW-based software emulated atomic write mode. For XFS, this SW-based atomic writes would use CoW support to issue emulated untorn writes. It is the responsibility of the FS to detect discontiguous atomic writes and switch to IOMAP_DIO_ATOMIC_SW mode and retry the write. Indeed, SW-based atomic writes could be used always when the mounted bdev does not support HW offload, but this strategy is not initially expected to be used. Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-6-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/operations.rst | 16 ++++++++++++++-- fs/iomap/direct-io.c | 4 +++- include/linux/iomap.h | 8 +++++++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 0b9d7be23bce..b08a79d11d9f 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -526,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements: conversion or copy on write), all updates for the entire file range must be committed atomically as well. Only one space mapping is allowed per untorn write. - Untorn writes must be aligned to, and must not be longer than, a - single file block. + Untorn writes may be longer than a single file block. In all cases, + the mapping start disk block must have at least the same alignment as + the write offset. + + * ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write + protection via a software mechanism provided by the filesystem. + All the disk block alignment and single bio restrictions which apply + to IOMAP_ATOMIC_HW do not apply here. + SW-based untorn writes would typically be used as a fallback when + HW-based untorn writes may not be issued, e.g. the range of the write + covers multiple extents, meaning that it is not possible to issue + a single bio. + All filesystem metadata updates for the entire file range must be + committed atomically as well. Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c696ce980796..c594f2cf3ab4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -686,7 +686,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } - if (iocb->ki_flags & IOCB_ATOMIC) + if (dio_flags & IOMAP_DIO_ATOMIC_SW) + iomi.flags |= IOMAP_ATOMIC_SW; + else if (iocb->ki_flags & IOCB_ATOMIC) iomi.flags |= IOMAP_ATOMIC_HW; /* for data sync or sync, we need sync completion processing */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 87cd7079aaf3..9cd93530013c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -189,8 +189,9 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC_HW (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) +#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -502,6 +503,11 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Use software-based torn-write protection. + */ +#define IOMAP_DIO_ATOMIC_SW (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); From 786e3080cbe916f6a4c0987124d4f930c1814a97 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 3 Mar 2025 17:11:14 +0000 Subject: [PATCH 3/3] iomap: Lift blocksize restriction on atomic writes Filesystems like ext4 can submit writes in multiples of blocksizes. But we still can't allow the writes to be split. Hence let's check if the iomap_length() is same as iter->len or not. It is the role of the FS to ensure that a single mapping may be created for an atomic write. The FS will also continue to check size and alignment legality. Signed-off-by: "Ritesh Harjani (IBM)" jpg: Tweak commit message Reviewed-by: "Darrick J. Wong" Signed-off-by: John Garry Link: https://lore.kernel.org/r/20250303171120.2837067-7-john.g.garry@oracle.com Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c594f2cf3ab4..5299f70428ef 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -351,7 +351,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) u64 copied = 0; size_t orig_count; - if (atomic_hw && length != fs_block_size) + if (atomic_hw && length != iter->len) return -EINVAL; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||