Merge patch series "iomap: allow the file system to submit the writeback bios"

Christoph Hellwig <hch@lst.de> says:

This series contains the iomap prep work to support zoned XFS.

The biggest changes are:

 - an option to reuse the ioend code for direct writes in addition to the
   current use for buffered writeback, which allows the file system to
   track completions on a per-bio basis instead of the current end_io
   callback which operates on the entire I/O.
   Note that it might make sense to split the ioend code from
   buffered-io.c into its own file with this.  Let me know what you think
   of that and I can include it in the next version
 - change of the writeback_ops so that the submit_bio call can be done by
   the file system.  Note that btrfs will also need this eventually when
   it starts using iomap
 - helpers to split ioend to the zone append queue_limits that plug
   into the previous item above.
 - a new ANON_WRITE flag for writes that don't have a block number
   assigned to them at the iomap level, leaving the file system to do
   that work in the submission handler.  Note that btrfs wants something
   similar also for compressed I/O, which should be able to reuse this,
   maybe with minor tweaks.
 - passing private data to a few more helpers

The XFS changes to use this will be posted to the xfs list only to not
spam fsdevel too much.

* patches from https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de:
  iomap: pass private data to iomap_truncate_page
  iomap: pass private data to iomap_zero_range
  iomap: pass private data to iomap_page_mkwrite
  iomap: add a io_private field to struct iomap_ioend
  iomap: optionally use ioends for direct I/O
  iomap: factor out a iomap_dio_done helper
  iomap: move common ioend code to ioend.c
  iomap: split bios to zone append limits in the submission handlers
  iomap: add a IOMAP_F_ANON_WRITE flag
  iomap: simplify io_flags and io_type in struct iomap_ioend
  iomap: allow the file system to submit the writeback bios

Link: https://lore.kernel.org/r/20250206064035.2323428-2-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2025-02-06 13:02:22 +01:00
commit f87897339a
No known key found for this signature in database
GPG Key ID: 91C61BC06578DCA2
13 changed files with 447 additions and 232 deletions

View File

@ -246,6 +246,10 @@ The fields are as follows:
* **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
be set by the filesystem for its own purposes.
* **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
block assigned to it yet and the file system will do that in the bio
submission handler, splitting the I/O as needed.
These flags can be set by iomap itself during file operations.
The filesystem should supply an ``->iomap_end`` function if it needs
to observe these flags:

View File

@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows:
struct iomap_writeback_ops {
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset, unsigned len);
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
void (*discard_folio)(struct folio *folio, loff_t pos);
};
@ -306,13 +306,12 @@ The fields are as follows:
purpose.
This function must be supplied by the filesystem.
- ``prepare_ioend``: Enables filesystems to transform the writeback
ioend or perform any other preparatory work before the writeback I/O
is submitted.
- ``submit_ioend``: Allows the file systems to hook into writeback bio
submission.
This might include pre-write space accounting updates, or installing
a custom ``->bi_end_io`` function for internal purposes, such as
deferring the ioend completion to a workqueue to run metadata update
transactions from process context.
transactions from process context before submitting the bio.
This function is optional.
- ``discard_folio``: iomap calls this function after ``->map_blocks``
@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the
storage device.
Filesystems that need to update internal bookkeeping (e.g. unwritten
extent conversions) should provide a ``->prepare_ioend`` function to
extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_ioend::io_bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).

View File

@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
NULL);
}
#define GFS2_JTRUNC_REVOKES 8192

View File

@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o

View File

@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
#include "trace.h"
#include "../internal.h"
#define IOEND_BATCH_SIZE 4096
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@ -40,8 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
static struct bio_set iomap_ioend_bioset;
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@ -1395,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops)
const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
.private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@ -1461,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops)
const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@ -1469,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
@ -1493,11 +1491,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
return length;
}
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
.private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@ -1538,16 +1538,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
if (error) {
mapping_set_error(inode->i_mapping, error);
if (ioend->io_error) {
mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@ -1566,116 +1565,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
/*
* Ioend completion routine for merged bios. This can only be called from task
* contexts as merged ioends can be of unbound length. Hence we have to break up
* the writeback completions into manageable chunks to avoid long scheduler
* holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
* good batch processing throughput without creating adverse scheduler latency
* conditions.
*/
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
u32 completions;
might_sleep();
list_replace_init(&ioend->io_list, &tmp);
completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
if (completions > IOEND_BATCH_SIZE * 8) {
cond_resched();
completions = 0;
}
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
/*
* We can merge two adjacent ioends if they have the same set of work to do.
*/
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if (next->io_flags & IOMAP_F_BOUNDARY)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
return false;
if ((ioend->io_type == IOMAP_UNWRITTEN) ^
(next->io_type == IOMAP_UNWRITTEN))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
/*
* Do not merge physically discontiguous ioends. The filesystem
* completion functions will have to iterate the physical
* discontiguities even if we merge the ioends at a logical level, so
* we don't gain anything by merging physical discontiguities here.
*
* We cannot use bio->bi_iter.bi_sector here as it is modified during
* submission so does not point to the start sector of the bio at
* completion.
*/
if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
return false;
return true;
}
void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
{
struct iomap_ioend *next;
INIT_LIST_HEAD(&ioend->io_list);
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
io_list))) {
if (!iomap_ioend_can_merge(ioend, next))
break;
list_move_tail(&next->io_list, &ioend->io_list);
ioend->io_size += next->io_size;
}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int
iomap_ioend_compare(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
void
iomap_sort_ioends(struct list_head *ioend_list)
{
list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
iomap_finish_ioend(iomap_ioend_from_bio(bio),
blk_status_to_errno(bio->bi_status));
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
ioend->io_error = blk_status_to_errno(bio->bi_status);
iomap_finish_ioend_buffered(ioend);
}
/*
* Submit the final bio for an ioend.
* Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@ -1694,14 +1593,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
if (wpc->ops->prepare_ioend)
error = wpc->ops->prepare_ioend(wpc->ioend, error);
if (wpc->ops->submit_ioend) {
error = wpc->ops->submit_ioend(wpc, error);
} else {
if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
error = -EIO;
if (!error)
submit_bio(&wpc->ioend->io_bio);
}
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
} else {
submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
@ -1709,9 +1612,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode, loff_t pos)
struct writeback_control *wbc, struct inode *inode, loff_t pos,
u16 ioend_flags)
{
struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@ -1719,36 +1622,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
if (pos > wpc->iomap.offset)
wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = pos;
ioend->io_sector = bio->bi_iter.bi_sector;
wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
return ioend;
return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
u16 ioend_flags)
{
if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
(wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
if (iomap_sector(&wpc->iomap, pos) !=
if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
@ -1779,14 +1670,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
unsigned int ioend_flags = 0;
int error;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
if (wpc->iomap.type == IOMAP_UNWRITTEN)
ioend_flags |= IOMAP_IOEND_UNWRITTEN;
if (wpc->iomap.flags & IOMAP_F_SHARED)
ioend_flags |= IOMAP_IOEND_SHARED;
if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
ioend_flags |= IOMAP_IOEND_BOUNDARY;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@ -2062,11 +1962,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_buffered_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_buffered_init);

View File

@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
* Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "trace.h"
#include "../internal.h"
@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
else
} else {
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
}
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
!(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
void iomap_dio_bio_end_io(struct bio *bio)
/*
* Called when dio->ref reaches zero from an I/O completion.
*/
static void iomap_dio_done(struct iomap_dio *dio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (!atomic_dec_and_test(&dio->ref))
goto release_bio;
/*
* Synchronous dio, task itself will handle any completion work
* that needs after IO. All we need to do is wake the task.
*/
if (dio->wait_for_completion) {
/*
* Synchronous I/O, task itself will handle any completion work
* that needs after IO. All we need to do is wake the task.
*/
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
goto release_bio;
}
/*
* Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
*/
if (dio->flags & IOMAP_DIO_INLINE_COMP) {
} else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
goto release_bio;
}
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
* our completion that way to avoid an async punt to a workqueue.
*/
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
} else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
* schedule our completion that way to avoid an async punt to a
* workqueue.
*/
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
goto release_bio;
}
} else {
struct inode *inode = file_inode(iocb->ki_filp);
/*
* Async DIO completion that requires filesystem level
* completion work gets punted to a work queue to complete as
* the operation may require more IO to be issued to finalise
* filesystem metadata changes or guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
}
void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (atomic_dec_and_test(&dio->ref))
iomap_dio_done(dio);
/*
* Async DIO completion that requires filesystem level completion work
* gets punted to a work queue to complete as the operation may require
* more IO to be issued to finalise filesystem metadata changes or
* guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
&dio->aio.work);
release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@ -239,6 +244,47 @@ void iomap_dio_bio_end_io(struct bio *bio)
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
/*
 * Complete a direct I/O ioend.
 *
 * Records ioend->io_error in the dio, drops this bio's reference on the dio
 * and, if it was the last one, runs the dio completion via iomap_dio_done().
 * Afterwards the pages attached to the bio are released.
 *
 * Returns the number of bio vectors the ioend's bio carried, which the caller
 * accumulates to decide when to reschedule.
 */
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
{
struct iomap_dio *dio = ioend->io_bio.bi_private;
/*
 * Both values are sampled before the dio reference is dropped below.
 * NOTE(review): presumably because the dio must not be dereferenced once
 * its completion has run - confirm.
 */
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
u32 vec_count = ioend->io_bio.bi_vcnt;
if (ioend->io_error)
iomap_dio_set_error(dio, ioend->io_error);
if (atomic_dec_and_test(&dio->ref)) {
/*
* Try to avoid another context switch for the completion given
* that we are already called from the ioend completion
* workqueue, but never invalidate pages from this thread to
* avoid deadlocks with buffered I/O completions. Tough luck if
* you hit the tiny race with someone dirtying the range now
* between this check and the actual completion.
*/
if (!dio->iocb->ki_filp->f_mapping->nrpages) {
dio->flags |= IOMAP_DIO_INLINE_COMP;
dio->flags |= IOMAP_DIO_NO_INVALIDATE;
}
/* never hand the completion back to the submitter from here */
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
iomap_dio_done(dio);
}
if (should_dirty) {
bio_check_pages_dirty(&ioend->io_bio);
} else {
bio_release_pages(&ioend->io_bio, false);
bio_put(&ioend->io_bio);
}
/*
* Return the number of bvecs completed as even direct I/O completions
* do significant per-folio work and we'll still want to give up the
* CPU after a lot of completions.
*/
return vec_count;
}
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{

10
fs/iomap/internal.h Normal file
View File

@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _IOMAP_INTERNAL_H
#define _IOMAP_INTERNAL_H 1
#define IOEND_BATCH_SIZE 4096
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
#endif /* _IOMAP_INTERNAL_H */

216
fs/iomap/ioend.c Normal file
View File

@ -0,0 +1,216 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2024-2025 Christoph Hellwig.
*/
#include <linux/iomap.h>
#include <linux/list_sort.h>
#include "internal.h"
struct bio_set iomap_ioend_bioset;
EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
/*
 * Set up the ioend that wraps @bio for a write to @inode starting at
 * @file_offset, tagged with @ioend_flags (IOMAP_IOEND_*).
 *
 * The size and start sector are copied from the bio's current iterator.  The
 * completion count starts at one and no error, parent or private data is
 * recorded.  Returns the initialised ioend.
 */
struct iomap_ioend *iomap_init_ioend(struct inode *inode,
		struct bio *bio, loff_t file_offset, u16 ioend_flags)
{
	struct iomap_ioend *new_ioend = iomap_ioend_from_bio(bio);

	/* what is being written, and where */
	new_ioend->io_inode = inode;
	new_ioend->io_flags = ioend_flags;
	new_ioend->io_offset = file_offset;
	new_ioend->io_size = bio->bi_iter.bi_size;
	new_ioend->io_sector = bio->bi_iter.bi_sector;

	/* completion tracking starts out clean with a single reference */
	INIT_LIST_HEAD(&new_ioend->io_list);
	atomic_set(&new_ioend->io_remaining, 1);
	new_ioend->io_error = 0;
	new_ioend->io_parent = NULL;
	new_ioend->io_private = NULL;

	return new_ioend;
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);
/*
 * Account one completion against @ioend.
 *
 * A split ioend (one with io_parent set) only drops its own bio and forwards
 * the completion to its parent.  The first non-zero @error is latched in
 * io_error.  Once the last outstanding completion arrives the ioend is handed
 * to the direct or buffered finisher, whose return value is passed through;
 * until then 0 is returned.
 */
static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
	struct iomap_ioend *parent = ioend->io_parent;

	if (parent) {
		/* parent was loaded first; do not touch the split after the put */
		bio_put(&ioend->io_bio);
		ioend = parent;
	}

	if (error)
		cmpxchg(&ioend->io_error, 0, error);

	if (atomic_dec_and_test(&ioend->io_remaining)) {
		if (ioend->io_flags & IOMAP_IOEND_DIRECT)
			return iomap_finish_ioend_direct(ioend);
		return iomap_finish_ioend_buffered(ioend);
	}
	return 0;
}
/*
* Ioend completion routine for merged bios. This can only be called from task
* contexts as merged ioends can be of unbound length. Hence we have to break up
* the writeback completions into manageable chunks to avoid long scheduler
* holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
* good batch processing throughput without creating adverse scheduler latency
* conditions.
*/
void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
u32 completions;
might_sleep();
list_replace_init(&ioend->io_list, &tmp);
completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
if (completions > IOEND_BATCH_SIZE * 8) {
cond_resched();
completions = 0;
}
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
/*
 * Decide whether @next may be appended to @ioend for completion processing.
 * Two ioends are mergeable only if they require the same completion work and
 * are adjacent both in the file and on disk.
 */
static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
		struct iomap_ioend *next)
{
	/* both bios must carry the same status */
	if (next->io_bio.bi_status != ioend->io_bio.bi_status)
		return false;

	/* a boundary ioend is never merged into the one before it */
	if (next->io_flags & IOMAP_IOEND_BOUNDARY)
		return false;

	/* any difference in the no-merge flags forbids the merge */
	if ((ioend->io_flags ^ next->io_flags) & IOMAP_IOEND_NOMERGE_FLAGS)
		return false;

	/* @next has to start exactly where @ioend ends in the file */
	if (next->io_offset != ioend->io_offset + ioend->io_size)
		return false;

	/*
	 * Physically discontiguous ioends are not merged either: the
	 * filesystem completion handlers would have to walk each physical
	 * extent separately anyway, so a purely logical merge buys nothing.
	 *
	 * io_sector is used instead of bio->bi_iter.bi_sector because the
	 * latter is modified during submission and no longer points at the
	 * start of the bio by the time it completes.
	 */
	return next->io_sector ==
		ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT);
}
/*
 * Pull ioends off the front of @more_ioends and chain them onto @ioend for
 * as long as the first remaining entry can be merged.  Each merged entry is
 * moved to the tail of @ioend's io_list and its size is added to @ioend's
 * io_size.  Stops at the first entry that cannot be merged or when the list
 * runs empty.
 */
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
		struct list_head *more_ioends)
{
	INIT_LIST_HEAD(&ioend->io_list);

	for (;;) {
		struct iomap_ioend *candidate =
			list_first_entry_or_null(more_ioends,
					struct iomap_ioend, io_list);

		if (!candidate || !iomap_ioend_can_merge(ioend, candidate))
			return;

		list_move_tail(&candidate->io_list, &ioend->io_list);
		ioend->io_size += candidate->io_size;
	}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int iomap_ioend_compare(void *priv, const struct list_head *a,
const struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
/*
 * Sort the ioends linked on @ioend_list (through their io_list member) by
 * ascending io_offset, using iomap_ioend_compare() as the ordering.
 */
void iomap_sort_ioends(struct list_head *ioend_list)
{
list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
/*
 * Split up to the first @max_len bytes from @ioend if the ioend covers more
 * than @max_len bytes.
 *
 * If @is_append is set, the split will be based on the hardware limits for
 * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
 * limits don't allow the entire @max_len length.
 *
 * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
 * does not allow splitting REQ_OP_ZONE_APPEND bios. The file system has to
 * switch the operation after this call, but before submitting the bio.
 *
 * Returns a new ioend covering the front part that was split off, with
 * @ioend recorded as its io_parent and @ioend adjusted to cover only the
 * remainder.  Returns NULL if no split was performed, or an ERR_PTR() if
 * computing the split point or splitting the bio failed.
 */
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
unsigned int max_len, bool is_append)
{
struct bio *bio = &ioend->io_bio;
struct iomap_ioend *split_ioend;
unsigned int nr_segs;
int sector_offset;
struct bio *split;
if (is_append) {
struct queue_limits *lim = bdev_limits(bio->bi_bdev);
/* never exceed what the device accepts for a single zone append */
max_len = min(max_len,
lim->max_zone_append_sectors << SECTOR_SHIFT);
/* let the block layer pick the split point, in sectors */
sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
if (unlikely(sector_offset < 0))
return ERR_PTR(sector_offset);
/* NOTE(review): 0 presumably means the bio fits as is - confirm */
if (!sector_offset)
return NULL;
} else {
/* already within @max_len, nothing to split */
if (bio->bi_iter.bi_size <= max_len)
return NULL;
sector_offset = max_len >> SECTOR_SHIFT;
}
/* ensure the split ioend is still block size aligned */
sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
if (IS_ERR(split))
return ERR_CAST(split);
/* the split bio completes through the same handler as the original */
split->bi_private = bio->bi_private;
split->bi_end_io = bio->bi_end_io;
split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
ioend->io_flags);
split_ioend->io_parent = ioend;
/* the parent cannot finish until the split has completed as well */
atomic_inc(&ioend->io_remaining);
/* the parent now only covers what is left behind the split */
ioend->io_offset += split_ioend->io_size;
ioend->io_size -= split_ioend->io_size;
split_ioend->io_sector = ioend->io_sector;
/*
 * Only regular writes advance the parent's start sector.
 * NOTE(review): presumably the start sector of an append is not meaningful
 * until the write has completed - confirm.
 */
if (!is_append)
ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
return split_ioend;
}
EXPORT_SYMBOL_GPL(iomap_split_ioend);
/*
 * Boot-time setup of the bio set that ioends are allocated from.  The part
 * of struct iomap_ioend that precedes the embedded io_bio is passed to
 * bioset_init() as the front pad argument.
 */
static int __init iomap_ioend_init(void)
{
	const unsigned int nr_reserved = 4 * (PAGE_SIZE / SECTOR_SIZE);
	const unsigned int front_pad = offsetof(struct iomap_ioend, io_bio);

	return bioset_init(&iomap_ioend_bioset, nr_reserved, front_pad,
			BIOSET_NEED_BVECS);
}
fs_initcall(iomap_ioend_init);

View File

@ -114,7 +114,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
if (ioend->io_flags & IOMAP_IOEND_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
@ -125,9 +125,9 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
if (ioend->io_flags & IOMAP_F_SHARED)
if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_type == IOMAP_UNWRITTEN)
else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
@ -395,10 +395,11 @@ xfs_map_blocks(
}
static int
xfs_prepare_ioend(
struct iomap_ioend *ioend,
xfs_submit_ioend(
struct iomap_writepage_ctx *wpc,
int status)
{
struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@ -409,7 +410,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@ -417,10 +418,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
if (xfs_ioend_is_append(ioend) ||
(ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
if (status)
return status;
submit_bio(&ioend->io_bio);
return 0;
}
/*
@ -462,7 +467,7 @@ xfs_discard_folio(
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
.submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};

View File

@ -1511,7 +1511,8 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);

View File

@ -1497,7 +1497,7 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, NULL);
}
int
@ -1512,5 +1512,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
&xfs_buffered_write_iomap_ops);
&xfs_buffered_write_iomap_ops, NULL);
}

View File

@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);

View File

@ -56,6 +56,10 @@ struct vm_fault;
*
* IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
* never be merged with the mapping before it.
*
* IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
* assigned to it yet and the file system will do that in the bio submission
* handler, splitting the I/O as needed.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@ -68,6 +72,7 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
#define IOMAP_F_ANON_WRITE (1U << 7)
/*
* Flags set by the core iomap code during operations:
@ -111,6 +116,8 @@ struct iomap {
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
if (iomap->flags & IOMAP_F_ANON_WRITE)
return U64_MAX; /* invalid */
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
@ -306,12 +313,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
const struct iomap_ops *ops);
const struct iomap_ops *ops, void *private);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
@ -327,17 +333,43 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
* Flags for iomap_ioend->io_flags.
*/
/* shared COW extent */
#define IOMAP_IOEND_SHARED (1U << 0)
/* unwritten extent */
#define IOMAP_IOEND_UNWRITTEN (1U << 1)
/* don't merge into previous ioend */
#define IOMAP_IOEND_BOUNDARY (1U << 2)
/* is direct I/O */
#define IOMAP_IOEND_DIRECT (1U << 3)
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
(IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
/*
* Structure for writeback I/O completions.
*
* File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
* (for direct I/O) can split a bio generated by iomap. In that case the parent
* ioend it was split from is recorded in ioend->io_parent.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_type;
u16 io_flags; /* IOMAP_F_* */
u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of data within eof */
size_t io_size; /* size of the extent */
atomic_t io_remaining; /* completion defer count */
int io_error; /* stashed away status */
struct iomap_ioend *io_parent; /* parent for completions */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
void *io_private; /* file system private data */
struct bio io_bio; /* MUST BE LAST! */
};
@ -362,12 +394,14 @@ struct iomap_writeback_ops {
loff_t offset, unsigned len);
/*
* Optional, allows the file systems to perform actions just before
* submitting the bio and/or override the bio end_io handler for complex
* operations like copy on write extent manipulation or unwritten extent
* conversions.
* Optional, allows the file systems to hook into bio submission,
* including overriding the bi_end_io handler.
*
* Returns 0 if the bio was successfully submitted, or a negative
* error code if status was non-zero or another error happened and
* the bio could not be submitted.
*/
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
/*
* Optional, allows the file system to discard state on a page where
@ -383,6 +417,10 @@ struct iomap_writepage_ctx {
u32 nr_folios; /* folios added to the ioend */
};
struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
loff_t file_offset, u16 ioend_flags);
struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
@ -454,4 +492,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
extern struct bio_set iomap_ioend_bioset;
#endif /* LINUX_IOMAP_H */