mirror of
https://github.com/torvalds/linux.git
synced 2026-06-03 20:14:06 +02:00
Merge patch series "writeback: Avoid lockups when switching inodes"
Jan Kara <jack@suse.cz> says: This patch series addresses lockups reported by users when systemd unit reading lots of files from a filesystem mounted with lazytime mount option exits. See patch 3 for more details about the reproducer. There are two main issues why switching many inodes between wbs: 1) Multiple workers will be spawned to do the switching but they all contend on the same wb->list_lock making all the parallelism pointless and just wasting time. 2) Sorting of wb->b_dirty list by dirtied_time_when is inherently slow. Patches 1-3 address these problems, patch 4 adds a tracepoint for better observability of inode writeback switching. * patches from https://lore.kernel.org/20250912103522.2935-1-jack@suse.cz: writeback: Add tracepoint to track pending inode switches writeback: Avoid excessively long inode switching times writeback: Avoid softlockup when switching many inodes writeback: Avoid contention on wb->list_lock when switching inodes Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
commit
9426414f0d
|
|
@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
|
|||
}
|
||||
|
||||
struct inode_switch_wbs_context {
|
||||
struct rcu_work work;
|
||||
/* List of queued switching contexts for the wb */
|
||||
struct llist_node list;
|
||||
|
||||
/*
|
||||
* Multiple inodes can be switched at once. The switching procedure
|
||||
|
|
@ -378,7 +379,6 @@ struct inode_switch_wbs_context {
|
|||
* array embedded into struct inode_switch_wbs_context. Otherwise
|
||||
* an inode could be left in a non-consistent state.
|
||||
*/
|
||||
struct bdi_writeback *new_wb;
|
||||
struct inode *inodes[];
|
||||
};
|
||||
|
||||
|
|
@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
|
|||
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
|
||||
* the specific list @inode was on is ignored and the @inode is put on
|
||||
* ->b_dirty which is always correct including from ->b_dirty_time.
|
||||
* The transfer preserves @inode->dirtied_when ordering. If the @inode
|
||||
* was clean, it means it was on the b_attached list, so move it onto
|
||||
* the b_attached list of @new_wb.
|
||||
* If the @inode was clean, it means it was on the b_attached list, so
|
||||
* move it onto the b_attached list of @new_wb.
|
||||
*/
|
||||
if (!list_empty(&inode->i_io_list)) {
|
||||
inode->i_wb = new_wb;
|
||||
|
||||
if (inode->i_state & I_DIRTY_ALL) {
|
||||
struct inode *pos;
|
||||
|
||||
list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
|
||||
if (time_after_eq(inode->dirtied_when,
|
||||
pos->dirtied_when))
|
||||
break;
|
||||
/*
|
||||
* We need to keep b_dirty list sorted by
|
||||
* dirtied_time_when. However properly sorting the
|
||||
* inode in the list gets too expensive when switching
|
||||
* many inodes. So just attach inode at the end of the
|
||||
* dirty list and clobber the dirtied_time_when.
|
||||
*/
|
||||
inode->dirtied_time_when = jiffies;
|
||||
inode_io_list_move_locked(inode, new_wb,
|
||||
pos->i_io_list.prev);
|
||||
&new_wb->b_dirty);
|
||||
} else {
|
||||
inode_cgwb_move_to_attached(inode, new_wb);
|
||||
}
|
||||
|
|
@ -486,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
|
|||
return switched;
|
||||
}
|
||||
|
||||
static void inode_switch_wbs_work_fn(struct work_struct *work)
|
||||
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
|
||||
struct inode_switch_wbs_context *isw)
|
||||
{
|
||||
struct inode_switch_wbs_context *isw =
|
||||
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
|
||||
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
|
||||
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
|
||||
struct bdi_writeback *new_wb = isw->new_wb;
|
||||
unsigned long nr_switched = 0;
|
||||
struct inode **inodep;
|
||||
|
||||
|
|
@ -502,6 +501,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
|
|||
*/
|
||||
down_read(&bdi->wb_switch_rwsem);
|
||||
|
||||
inodep = isw->inodes;
|
||||
/*
|
||||
* By the time control reaches here, RCU grace period has passed
|
||||
* since I_WB_SWITCH assertion and all wb stat update transactions
|
||||
|
|
@ -512,6 +512,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
|
|||
* gives us exclusion against all wb related operations on @inode
|
||||
* including IO list manipulations and stat updates.
|
||||
*/
|
||||
relock:
|
||||
if (old_wb < new_wb) {
|
||||
spin_lock(&old_wb->list_lock);
|
||||
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
|
||||
|
|
@ -520,10 +521,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
|
|||
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
|
||||
}
|
||||
|
||||
for (inodep = isw->inodes; *inodep; inodep++) {
|
||||
while (*inodep) {
|
||||
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
|
||||
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
|
||||
nr_switched++;
|
||||
inodep++;
|
||||
if (*inodep && need_resched()) {
|
||||
spin_unlock(&new_wb->list_lock);
|
||||
spin_unlock(&old_wb->list_lock);
|
||||
cond_resched();
|
||||
goto relock;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&new_wb->list_lock);
|
||||
|
|
@ -543,6 +551,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
|
|||
atomic_dec(&isw_nr_in_flight);
|
||||
}
|
||||
|
||||
void inode_switch_wbs_work_fn(struct work_struct *work)
|
||||
{
|
||||
struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
|
||||
switch_work);
|
||||
struct inode_switch_wbs_context *isw, *next_isw;
|
||||
struct llist_node *list;
|
||||
|
||||
/*
|
||||
* Grab out reference to wb so that it cannot get freed under us
|
||||
* after we process all the isw items.
|
||||
*/
|
||||
wb_get(new_wb);
|
||||
while (1) {
|
||||
list = llist_del_all(&new_wb->switch_wbs_ctxs);
|
||||
/* Nothing to do? */
|
||||
if (!list)
|
||||
break;
|
||||
/*
|
||||
* In addition to synchronizing among switchers, I_WB_SWITCH
|
||||
* tells the RCU protected stat update paths to grab the i_page
|
||||
* lock so that stat transfer can synchronize against them.
|
||||
* Let's continue after I_WB_SWITCH is guaranteed to be
|
||||
* visible.
|
||||
*/
|
||||
synchronize_rcu();
|
||||
|
||||
llist_for_each_entry_safe(isw, next_isw, list, list)
|
||||
process_inode_switch_wbs(new_wb, isw);
|
||||
}
|
||||
wb_put(new_wb);
|
||||
}
|
||||
|
||||
static bool inode_prepare_wbs_switch(struct inode *inode,
|
||||
struct bdi_writeback *new_wb)
|
||||
{
|
||||
|
|
@ -572,6 +612,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
|
|||
return true;
|
||||
}
|
||||
|
||||
static void wb_queue_isw(struct bdi_writeback *wb,
|
||||
struct inode_switch_wbs_context *isw)
|
||||
{
|
||||
if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
|
||||
queue_work(isw_wq, &wb->switch_work);
|
||||
}
|
||||
|
||||
/**
|
||||
* inode_switch_wbs - change the wb association of an inode
|
||||
* @inode: target inode
|
||||
|
|
@ -585,6 +632,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
struct cgroup_subsys_state *memcg_css;
|
||||
struct inode_switch_wbs_context *isw;
|
||||
struct bdi_writeback *new_wb = NULL;
|
||||
|
||||
/* noop if seems to be already in progress */
|
||||
if (inode->i_state & I_WB_SWITCH)
|
||||
|
|
@ -609,40 +657,35 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
|
|||
if (!memcg_css)
|
||||
goto out_free;
|
||||
|
||||
isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
|
||||
new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
|
||||
css_put(memcg_css);
|
||||
if (!isw->new_wb)
|
||||
if (!new_wb)
|
||||
goto out_free;
|
||||
|
||||
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
|
||||
if (!inode_prepare_wbs_switch(inode, new_wb))
|
||||
goto out_free;
|
||||
|
||||
isw->inodes[0] = inode;
|
||||
|
||||
/*
|
||||
* In addition to synchronizing among switchers, I_WB_SWITCH tells
|
||||
* the RCU protected stat update paths to grab the i_page
|
||||
* lock so that stat transfer can synchronize against them.
|
||||
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
|
||||
*/
|
||||
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
|
||||
queue_rcu_work(isw_wq, &isw->work);
|
||||
trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
|
||||
wb_queue_isw(new_wb, isw);
|
||||
return;
|
||||
|
||||
out_free:
|
||||
atomic_dec(&isw_nr_in_flight);
|
||||
if (isw->new_wb)
|
||||
wb_put(isw->new_wb);
|
||||
if (new_wb)
|
||||
wb_put(new_wb);
|
||||
kfree(isw);
|
||||
}
|
||||
|
||||
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
|
||||
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
|
||||
struct inode_switch_wbs_context *isw,
|
||||
struct list_head *list, int *nr)
|
||||
{
|
||||
struct inode *inode;
|
||||
|
||||
list_for_each_entry(inode, list, i_io_list) {
|
||||
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
|
||||
if (!inode_prepare_wbs_switch(inode, new_wb))
|
||||
continue;
|
||||
|
||||
isw->inodes[*nr] = inode;
|
||||
|
|
@ -666,6 +709,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
|
|||
{
|
||||
struct cgroup_subsys_state *memcg_css;
|
||||
struct inode_switch_wbs_context *isw;
|
||||
struct bdi_writeback *new_wb;
|
||||
int nr;
|
||||
bool restart = false;
|
||||
|
||||
|
|
@ -678,12 +722,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
|
|||
|
||||
for (memcg_css = wb->memcg_css->parent; memcg_css;
|
||||
memcg_css = memcg_css->parent) {
|
||||
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
|
||||
if (isw->new_wb)
|
||||
new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
|
||||
if (new_wb)
|
||||
break;
|
||||
}
|
||||
if (unlikely(!isw->new_wb))
|
||||
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
|
||||
if (unlikely(!new_wb))
|
||||
new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
|
||||
|
||||
nr = 0;
|
||||
spin_lock(&wb->list_lock);
|
||||
|
|
@ -695,27 +739,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
|
|||
* bandwidth restrictions, as writeback of inode metadata is not
|
||||
* accounted for.
|
||||
*/
|
||||
restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
|
||||
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
|
||||
if (!restart)
|
||||
restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
|
||||
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
|
||||
&nr);
|
||||
spin_unlock(&wb->list_lock);
|
||||
|
||||
/* no attached inodes? bail out */
|
||||
if (nr == 0) {
|
||||
atomic_dec(&isw_nr_in_flight);
|
||||
wb_put(isw->new_wb);
|
||||
wb_put(new_wb);
|
||||
kfree(isw);
|
||||
return restart;
|
||||
}
|
||||
|
||||
/*
|
||||
* In addition to synchronizing among switchers, I_WB_SWITCH tells
|
||||
* the RCU protected stat update paths to grab the i_page
|
||||
* lock so that stat transfer can synchronize against them.
|
||||
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
|
||||
*/
|
||||
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
|
||||
queue_rcu_work(isw_wq, &isw->work);
|
||||
trace_inode_switch_wbs_queue(wb, new_wb, nr);
|
||||
wb_queue_isw(new_wb, isw);
|
||||
|
||||
return restart;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -152,6 +152,10 @@ struct bdi_writeback {
|
|||
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
|
||||
struct list_head b_attached; /* attached inodes, protected by list_lock */
|
||||
struct list_head offline_node; /* anchored at offline_cgwbs */
|
||||
struct work_struct switch_work; /* work used to perform inode switching
|
||||
* to this wb */
|
||||
struct llist_head switch_wbs_ctxs; /* queued contexts for
|
||||
* writeback switching */
|
||||
|
||||
union {
|
||||
struct work_struct release_work;
|
||||
|
|
|
|||
|
|
@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
|
|||
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
|
||||
}
|
||||
|
||||
void inode_switch_wbs_work_fn(struct work_struct *work);
|
||||
|
||||
#else /* CONFIG_CGROUP_WRITEBACK */
|
||||
|
||||
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
|
||||
|
|
|
|||
|
|
@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
|
|||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(inode_switch_wbs_queue,
|
||||
|
||||
TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
|
||||
unsigned int count),
|
||||
|
||||
TP_ARGS(old_wb, new_wb, count),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__array(char, name, 32)
|
||||
__field(ino_t, old_cgroup_ino)
|
||||
__field(ino_t, new_cgroup_ino)
|
||||
__field(unsigned int, count)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
|
||||
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
|
||||
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
|
||||
__entry->count = count;
|
||||
),
|
||||
|
||||
TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
|
||||
__entry->name,
|
||||
(unsigned long)__entry->old_cgroup_ino,
|
||||
(unsigned long)__entry->new_cgroup_ino,
|
||||
__entry->count
|
||||
)
|
||||
);
|
||||
|
||||
TRACE_EVENT(inode_switch_wbs,
|
||||
|
||||
TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
|
||||
|
|
|
|||
|
|
@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work)
|
|||
wb_exit(wb);
|
||||
bdi_put(bdi);
|
||||
WARN_ON_ONCE(!list_empty(&wb->b_attached));
|
||||
WARN_ON_ONCE(work_pending(&wb->switch_work));
|
||||
call_rcu(&wb->rcu, cgwb_free_rcu);
|
||||
}
|
||||
|
||||
|
|
@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
|
|||
wb->memcg_css = memcg_css;
|
||||
wb->blkcg_css = blkcg_css;
|
||||
INIT_LIST_HEAD(&wb->b_attached);
|
||||
INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
|
||||
init_llist_head(&wb->switch_wbs_ctxs);
|
||||
INIT_WORK(&wb->release_work, cgwb_release_workfn);
|
||||
set_bit(WB_registered, &wb->state);
|
||||
bdi_get(bdi);
|
||||
|
|
@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
|
|||
if (!ret) {
|
||||
bdi->wb.memcg_css = &root_mem_cgroup->css;
|
||||
bdi->wb.blkcg_css = blkcg_root_css;
|
||||
INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
|
||||
init_llist_head(&bdi->wb.switch_wbs_ctxs);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user